Ejemplo n.º 1
0
def statistic_company(df, mysql_url):
    """
    Build the company-dimension result tables and pass them to ``write_mysql``.

    Rows whose ``company_name`` or ``position_category`` is null are dropped
    before any table is computed.

    :param df: input DataFrame exposing ``company_name`` and
        ``position_category`` columns
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering: both columns must be present.
    df = df.filter(df.company_name.isNotNull()).filter(df.position_category.isNotNull())
    # NOTE(review): a "latest job" subset (work_index == work_len) was built and
    # persisted here although no table read it, so it is no longer created.
    # Confirm whether statistic_company_rank was meant to receive that subset.
    df.persist()
    try:
        # Analysis result tables, keyed by destination table name.
        # The company__work_year table is currently disabled.
        result_tables = {
            "company__rank": statistic_company_rank(df),
            "company__salary": statistic_company_salary(df),
            "company__gender": statistic_company_gender(df),
            "company__age": statistic_company_age(df),
            "company__address": statistic_company_address(df),
            "company__school": statistic_company_school(df),
            "company__major": statistic_company_major(df),
            "company__degree": statistic_company_degree(df),
            "company__prev_company": statistic_company_prev_company(df),
            "company__next_company": statistic_company_next_company(df),
            "company__duration": statistic_company_duration(df),
        }
        # Write the analysis results to the database while the cache is still
        # held: presumably the helpers return lazy DataFrames that are only
        # evaluated during the write -- TODO confirm.
        write_mysql(mysql_url, result_tables)
    finally:
        # Release the persisted data even when a helper or the write fails.
        df.unpersist()
Ejemplo n.º 2
0
def statistic_school_major(df, mysql_url):
    """
    Build the school + major result tables and pass them to ``write_mysql``.

    Rows are kept only when ``avg_salary`` exceeds ``MIN_SALARY`` and
    ``school_name``, ``major`` and ``degree`` are all non-null.

    :param df: input DataFrame exposing ``avg_salary``, ``school_name``,
        ``major``, ``degree``, ``work_index`` and ``work_len`` columns
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering.
    df = df.filter(df.avg_salary > MIN_SALARY)
    df = df.filter(df.school_name.isNotNull()).filter(
        df.major.isNotNull()).filter(df.degree.isNotNull())
    # Rows where work_index equals work_len; only the rank table reads this
    # subset (presumably the most recent job of each resume -- TODO confirm).
    latest_work_df = df.filter(df.work_index == df.work_len)
    # Persist both frames for reuse across the tables below.
    df.persist()
    latest_work_df.persist()
    try:
        # Analysis result tables, keyed by destination table name.
        # The school__major__work_year table is currently disabled.
        result_tables = {
            "school__major__rank": statistic_school_major_rank(latest_work_df),
            "school__major__position": statistic_school_major_position(df),
            "school__major__industry": statistic_school_major_industry(df),
            "school__major__company": statistic_school_major_company(df),
            "school__major__flow": statistic_school_major_flow(df),
            "school__major__position__rank":
                statistic_school_major_position_rank(df),
        }
        # Write the analysis results to the database while the caches are
        # still held: presumably the helpers return lazy DataFrames that are
        # only evaluated during the write -- TODO confirm.
        write_mysql(mysql_url, result_tables)
    finally:
        # Release the persisted data even when a helper or the write fails.
        df.unpersist()
        latest_work_df.unpersist()
Ejemplo n.º 3
0
def statistic_major(df, mysql_url):
    """
    Build the major-dimension result tables and pass them to ``write_mysql``.

    Rows are kept only when ``major`` and ``degree`` are non-null,
    ``salary_min`` exceeds ``MIN_SALARY`` and ``salary_max >= salary_min``.
    Every table is then computed from the subset where ``work_index`` equals
    ``work_len`` and ``edu_end_year`` is no older than ``LATEST_YEAR`` years
    before the current year.

    :param df: input DataFrame exposing the columns named above
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering.
    df = df.filter(df.major.isNotNull()).filter(df.degree.isNotNull())
    df = df.filter((df.salary_min > MIN_SALARY)).filter(df.salary_max >= df.salary_min)
    # Destinations of graduates whose education ended within the last
    # LATEST_YEAR years (the cutoff year is computed once, at call time).
    latest_work_df = df.filter(df.work_index == df.work_len) \
        .filter(df.edu_end_year >= datetime.datetime.now().year - LATEST_YEAR)

    # Only latest_work_df feeds the tables below, so it is the only frame
    # worth caching; persisting the wider df as well would hold a second copy
    # that nothing reads.
    latest_work_df.persist()
    try:
        # Analysis result tables, keyed by destination table name.
        # The major__work_year table is currently disabled.
        result_tables = {
            "major__rank": statistic_major_rank(latest_work_df),
            "major__gender": statistic_major_gender(latest_work_df),
            "major__address": statistic_major_address(latest_work_df),
            "major__company": statistic_major_company(latest_work_df),
            "major__industry": statistic_major_industry(latest_work_df),
            "major__position": statistic_major_position(latest_work_df),
        }
        # Write the analysis results to the database while the cache is still
        # held: presumably the helpers return lazy DataFrames that are only
        # evaluated during the write -- TODO confirm.
        write_mysql(mysql_url, result_tables)
    finally:
        # Release the persisted data even when a helper or the write fails.
        latest_work_df.unpersist()
Ejemplo n.º 4
0
def statistic_position_address(df, mysql_url):
    """
    Build the position + address result tables and pass them to ``write_mysql``.

    :param df: input DataFrame; rows with a null ``position_name`` or
        ``address`` are discarded first
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering: both columns must be present.
    has_position = df.position_name.isNotNull()
    has_address = df.address.isNotNull()
    df = df.filter(has_position).filter(has_address)
    df.persist()
    # Analysis result tables, keyed by destination table name.
    tables = {
        "position__address__compare": statistic_position_address_compare(df),
        "position__address__flow": statistic_position_address_flow(df),
        "position__address__salary_range": statistic_position_address_salary_range(df),
        "position__address__age_range": statistic_position_address_age_range(df),
        "position__address__gender": statistic_position_address_gender(df),
        "position__address__work_year_range": statistic_position_address_work_year_range(df),
        "position__address__degree": statistic_position_address_degree(df),
        "position__address__duration": statistic_position_address_duration(df),
        "position__address__industry": statistic_position_address_industry(df),
        "position__address__salary_work_year": statistic_position_address_salary_work_year(df),
        "position__address__prev_position": statistic_position_address_prev_position(df),
        "position__address__next_position": statistic_position_address_next_position(df),
        "position__address__change": statistic_position_address_change(df),
    }
    # Write the analysis results to the database, then drop the cache.
    write_mysql(mysql_url, tables)
    df.unpersist()
Ejemplo n.º 5
0
def statistic_dimension(df, mysql_url):
    """
    Compute the configured dimension tables and pass them to ``write_mysql``.

    Each configuration entry pairs a list of grouping columns with the tables
    derived from that grouping. Every grouping column must be non-null; a
    table may name one extra column, which must also be non-null and is
    appended to the grouping for that table only.

    Each table function is called as ``func(frame, columns, is_rank)`` where
    ``is_rank`` is true when the table name contains ``"rank"``.

    :param df: input DataFrame exposing every column named in the
        configuration below
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Salary statistic applied to every table configured below.
    median = statistic_median_salary
    # Only the major dimension is configured; school and school + major
    # tables are not produced by this function.
    parameters = [
        # Major-dimension analysis.
        (["major", "degree"], [
            ("major__rank", None, median),
            ("major__gender", "gender", median),
            ("major__address", "address", median),
            ("major__company", "company_name", median),
            ("major__industry", "industry", median),
            ("major__position", "position_name", median),
        ]),
    ]

    for groups, values in parameters:
        # Chain the filters so that *every* grouping column is non-null, not
        # just the last one in the list.
        mdf = df
        for field in groups:
            mdf = mdf.filter(mdf[field].isNotNull())
        mdf.persist()
        try:
            result_tables = dict()
            for table, result_field, func in values:
                if result_field:
                    # The extra column must be present and joins the grouping.
                    result_df = mdf.filter(mdf[result_field].isNotNull())
                    fields = groups + [result_field]
                else:
                    # No extra column: analyse the grouping columns alone.
                    result_df = mdf
                    fields = list(groups)
                result_tables[table] = func(result_df, fields, "rank" in table)
            write_mysql(mysql_url, result_tables)
        finally:
            # Release the cache even when a table function or the write fails.
            mdf.unpersist()
Ejemplo n.º 6
0
def statistic_dimension(df, mysql_url, parameters, func):
    """
    Run ``func`` once per parameter entry and pass the results to ``write_mysql``.

    Rows are kept only when ``salary_min`` exceeds ``MIN_SALARY``,
    ``salary_max >= salary_min`` and ``avg_salary`` is non-null.

    :param df: input DataFrame exposing the three salary columns above
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :param parameters: iterable of ``(groups, na_field, values)`` triples;
        ``values[0]`` is the destination table name and ``values[1:]`` are
        extra positional arguments for ``func``
    :param func: callable invoked as ``func(df, groups, na_field, *values[1:])``
    :return: None
    """
    # The minimum salary of the job must exceed MIN_SALARY and the salary
    # range must be well formed (max >= min).
    above_floor = df.salary_min > MIN_SALARY
    range_is_valid = df.salary_max >= df.salary_min
    has_average = df.avg_salary.isNotNull()
    df = df.filter(above_floor).filter(range_is_valid).filter(has_average)

    tables = {}
    df.persist()
    for group_columns, null_checked_field, spec in parameters:
        table_df = func(df, group_columns, null_checked_field, *spec[1:])
        tables[spec[0]] = table_df
    write_mysql(mysql_url, tables)
    df.unpersist()
Ejemplo n.º 7
0
def postgraduate(profile_df, education_df, work_df, mysql_url):
    """
    Study-abroad / postgraduate / employment analysis.

    Produces the ``school__major__postgraduate__ratio__v1`` and
    ``school__major__postgraduate__work_year__v1`` tables and passes them to
    ``write_mysql``.

    :param profile_df: basic resume information (not read by this function)
    :param education_df: resume education history
    :param work_df: resume work history
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Keep education rows with a known school, major and degree.
    education_df = education_df.filter(education_df.school_name.isNotNull()) \
        .filter(education_df.major.isNotNull()) \
        .filter(education_df.degree.isNotNull())
    education_df = education_df.select("resume_id", "school_name",
                                       "school_area", "major", "degree",
                                       "edu_index", "edu_end_date")
    # cal_postgraduate runs once per resume; the ``postgraduate`` and ``flag``
    # columns filtered on below are not in the select list above, so they come
    # from its output.
    education_df = education_df.groupby("resume_id").apply(cal_postgraduate)
    education_df = education_df.filter(
        education_df.postgraduate.isNotNull()).filter(
            education_df.flag.isNull())
    education_df.persist()
    try:
        # Compute the average salary of every job and drop low-salary rows.
        work_df = work_df.withColumn(
            "avg_salary",
            F.udf(cal_avg_salary, FloatType())(work_df.salary_min,
                                               work_df.salary_max))
        work_df = work_df.filter(work_df.avg_salary > MIN_SALARY)

        # Combine education history with work history.
        df = education_df.join(work_df, "resume_id")
        df = df.withColumn("work_month",
                           F.months_between(df.work_end_date, df.edu_end_date))
        # get_years presumably converts a month count to whole years -- TODO
        # confirm against its definition.
        df = df.withColumn("work_year",
                           F.udf(get_years, IntegerType())(df.work_month))
        df = df.filter(df.work_year >= 0).filter(df.work_year <= 40)

        # Analysis result tables, keyed by destination table name.
        result_tables = {
            "school__major__postgraduate__ratio__v1":
                postgraduate_ratio(education_df),
            "school__major__postgraduate__work_year__v1":
                postgraduate_work_year(df),
        }
        # Write the analysis results to the database while the cache is still
        # held: presumably the helpers return lazy DataFrames that are only
        # evaluated during the write -- TODO confirm.
        write_mysql(mysql_url, result_tables)
    finally:
        # Release the persisted data even when a step above fails.
        education_df.unpersist()
Ejemplo n.º 8
0
def statistic_person(df, mysql_url):
    """
    Build the person-dimension rank table and pass it to ``write_mysql``.

    :param df: input DataFrame; only rows whose ``avg_salary`` exceeds
        ``MIN_SALARY`` are analysed
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering.
    above_floor = df.avg_salary > MIN_SALARY
    df = df.filter(above_floor)
    df.persist()
    # Single result table, keyed by destination table name.
    tables = {"person__rank": statistic_person_rank(df)}
    # Write the analysis result to the database, then drop the cache.
    write_mysql(mysql_url, tables)
    df.unpersist()
Ejemplo n.º 9
0
def statistic_industry(df, mysql_url):
    """
    Build the industry-dimension result tables and pass them to ``write_mysql``.

    :param df: input DataFrame; rows with a null ``industry`` are discarded
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering.
    has_industry = df.industry.isNotNull()
    df = df.filter(has_industry)
    df.persist()
    # Analysis result tables, keyed by destination table name.
    tables = {
        "industry__rank": statistic_industry_rank(df),
        "industry__address": statistic_industry_address(df),
    }
    # Write the analysis results to the database, then drop the cache.
    write_mysql(mysql_url, tables)
    df.unpersist()
Ejemplo n.º 10
0
def statistic_special(df, mysql_url):
    """
    Special-case analyses that each need their own helper.

    The configuration maps an analysis type to a list of
    ``(groups, na_field, values)`` entries, where ``values[0]`` is the
    destination table name and ``values[1:]`` are extra positional arguments
    for helpers that accept them. All tables of one analysis type are passed
    to ``write_mysql`` together.

    :param df: input DataFrame exposing ``salary_min`` and ``salary_max``
        plus whatever columns the helpers read
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    :raises ValueError: if the configuration names an analysis type that has
        no helper wired up
    """

    config = {
        "company__age": [
            (["company_name"], "position_category", ("company__age", None, False, True)),
        ],
        "company__duration": [
            (["company_name"], "position_category", ("company__duration", None)),
        ],
        "position__address__duration": [
            (["position_name"], "address", ("position__address__duration", None, False, True))
        ],
        "address__flow": [
            (["industry"], None, ("industry__address__flow", "address,expect_area")),
            (["position_name"], None, ("position__address__flow", "address,expect_area"))
        ],
        "address_change": [
            (["position_name"], "address", ("position__address__change", None)),
        ]
    }

    # The minimum salary of the job must exceed MIN_SALARY and the salary
    # range must be well formed (max >= min).
    df = df.filter((df.salary_min > MIN_SALARY)).filter(df.salary_max >= df.salary_min)
    # The same frame feeds every analysis type, so cache it once for the
    # whole run instead of re-caching it per type.
    df.persist()
    try:
        for stat_type, parameters in config.items():
            result_tables = dict()
            for groups, na_field, values in parameters:
                if stat_type == 'company__age':
                    result_df = statistic_company_age(df)
                elif stat_type == 'company__duration':
                    result_df = statistic_company_duration(df)
                elif stat_type == 'position__address__duration':
                    result_df = statistic_position_address_duration(df)
                elif stat_type == 'address__flow':
                    result_df = statistic_address_flow(df, groups, na_field, *values[1:])
                elif stat_type == 'address_change':
                    # Without this branch the previous type's result would be
                    # written under the position__address__change table name.
                    # NOTE(review): assumes the helper accepts
                    # (df, groups, na_field, *extra) -- confirm its signature.
                    result_df = statistic_position_address_change(
                        df, groups, na_field, *values[1:])
                else:
                    # Never fall through with a stale result_df.
                    raise ValueError(
                        "no helper registered for analysis type %r" % (stat_type,))
                # Filter out invalid values in certain fields.
                result_tables[values[0]] = filter_invalid_value(result_df)
            write_mysql(mysql_url, result_tables)
    finally:
        # Release the cache even when a helper or a write fails.
        df.unpersist()
Ejemplo n.º 11
0
def statistic_industry_position_address(df, mysql_url):
    """
    Build the industry + position + address work-year table and pass it to
    ``write_mysql``.

    :param df: input DataFrame; rows with a null ``industry``,
        ``position_name`` or ``address`` are discarded
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering: all three columns must be present.
    has_industry = df.industry.isNotNull()
    has_position = df.position_name.isNotNull()
    has_address = df.address.isNotNull()
    df = df.filter(has_industry).filter(has_position).filter(has_address)
    df.persist()
    # Single result table, keyed by destination table name.
    tables = {
        "industry__position__address__work_year":
            statistic_industry_position_address_work_year(df),
    }
    # Write the analysis result to the database, then drop the cache.
    write_mysql(mysql_url, tables)
    df.unpersist()
Ejemplo n.º 12
0
def statistic_neighbour_special(df, mysql_url):
    """
    Compute the position/address change table and pass it to ``write_mysql``.

    The configuration maps an analysis type to ``(groups, na_field, values)``
    entries; ``values[0]`` is the destination table name and ``values[1:]``
    are forwarded to ``statistic_position_address_change`` as extra
    positional arguments.

    :param df: input DataFrame exposing ``salary_min`` and ``salary_max``
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    config = {
        "address_change": [
            (["position_name"], "address", ("position__address__change", None)),
        ]
    }

    # The minimum salary of the job must exceed MIN_SALARY and the salary
    # range must be well formed (max >= min).
    above_floor = df.salary_min > MIN_SALARY
    range_is_valid = df.salary_max >= df.salary_min
    df = df.filter(above_floor).filter(range_is_valid)

    # Only the entries are needed; the analysis-type key is not read.
    for entries in config.values():
        tables = {}
        df.persist()
        for group_columns, null_checked_field, spec in entries:
            changes = statistic_position_address_change(
                df, group_columns, null_checked_field, *spec[1:])
            # Filter out invalid values in certain fields.
            tables[spec[0]] = filter_invalid_value(changes)
        write_mysql(mysql_url, tables)
        df.unpersist()
Ejemplo n.º 13
0
Archivo: job.py Proyecto: reganzm/ai
def statistic_job(position_df, job_df, mysql_url):
    """
    Build the job-posting detail table and pass it to ``write_mysql``.

    :param position_df: not read at present; it only fed the
        job__position__relation table, which is disabled
    :param job_df: job-posting DataFrame; rows with a null ``position_name``
        are discarded
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    has_position = job_df.position_name.isNotNull()
    job_df = job_df.filter(has_position)

    # Detail table only. The relation table (job__position__relation) and the
    # column renames on position_df that prepared its input are disabled.
    tables = {"position__jd": statistic_job_jd(job_df)}

    # Write the analysis result to the database.
    write_mysql(mysql_url, tables)
Ejemplo n.º 14
0
def statistic_industry_address(df, mysql_url):
    """
    Build the industry + address result tables and pass them to ``write_mysql``.

    :param df: input DataFrame; rows with a null ``industry`` or ``address``
        are discarded
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    # Data filtering: both columns must be present.
    has_industry = df.industry.isNotNull()
    has_address = df.address.isNotNull()
    df = df.filter(has_industry).filter(has_address)
    df.persist()
    # Analysis result tables, keyed by destination table name.
    # The industry__address__work_year table is currently disabled.
    tables = {
        "industry__address__compare": statistic_industry_address_compare(df),
        "industry__address__flow": statistic_industry_address_flow(df),
        "industry__address__salary": statistic_industry_address_salary_range(df),
        "industry__address__age": statistic_industry_address_age_range(df),
        "industry__address__gender": statistic_industry_address_gender(df),
        "industry__address__degree": statistic_industry_address_degree(df),
        "industry__address__salary_work_year": statistic_industry_address_salary_work_year(df),
        "industry__address__rank": statistic_industry_address_rank(df),
    }
    # Write the analysis results to the database, then drop the cache.
    write_mysql(mysql_url, tables)
    df.unpersist()
Ejemplo n.º 15
0
def statistic_work_year(df, mysql_url):
    """
    Build one work-year table per dimension and pass them to ``write_mysql``.

    The input is first passed through ``flat_work_year``; each table is then
    produced by ``statistic_work_year_dimension`` with its own list of
    grouping columns.

    :param df: input DataFrame accepted by ``flat_work_year``
    :param mysql_url: database URL forwarded unchanged to ``write_mysql``
    :return: None
    """
    df = flat_work_year(df)
    df.persist()
    # (destination table name, grouping columns) for every dimension.
    dimensions = (
        ("major__work_year", ["major", "work_year", "degree"]),
        ("school__work_year", ["school_name", "work_year", "degree"]),
        ("company__work_year", ["company_name", "work_year", "position_category"]),
        ("industry__address__work_year", ["industry", "work_year", "address"]),
        ("position__address__work_year", ["position_name", "work_year", "address"]),
        ("school__major__work_year", ["school_name", "major", "work_year", "degree"]),
    )
    tables = {
        table_name: statistic_work_year_dimension(df, group_columns)
        for table_name, group_columns in dimensions
    }

    write_mysql(mysql_url, tables)
    df.unpersist()