Beispiel #1
0
def main():
    """
    Build the country table from all configured sources.

    Steps, in execution order:
    - Create spark session
    - Load the 'country' section of the scripts config
    - Read all source dataframes with meta
    - Merge the country names of the five sources
    - Correct country names using the configured mapping file
    - Generate 'country_id' as a row number ordered by 'country_name'
    - Write with the configured output meta
    :return: None
    """
    session = create_spark_session()

    settings = provide_config(get_config_path_from_cli()) \
        .get('scripts') \
        .get('country')
    mapping_path = settings.get('country_mapping_path')

    # read_data is expected to hand back exactly these five dataframes,
    # in this order; they are forwarded positionally to the merge step.
    (gdp, hci, press_freedom, temperatures, immigration) = read_data(
        session, config=settings)
    countries = merge_country_names(
        gdp, hci, press_freedom, temperatures, immigration)

    countries = correct_country_names(
        df=countries,
        country_col='country_name',
        country_mapping_path=mapping_path,
    )

    # The id is the position of the row when sorted by country name.
    # NOTE(review): the window has no partitionBy, so the ordering spans
    # the whole dataframe.
    id_column = F.row_number().over(Window.orderBy('country_name'))
    countries = countries.withColumn('country_id', id_column)

    write_with_meta(df=countries, df_meta=settings['output_meta'])
def main():
    """
    Prepare the immigration data set.

    Steps, in execution order:
    - Create spark session
    - Load the 'immigration' section of the scripts config
    - Read with input meta
    - Convert dates from sas format to datetime
    - Uppercase the 'i94port', 'i94addr', 'occup' and 'gender' columns
    - Rename dataframe
    - Correct country names using the configured mapping file
    - Get country id
    - Control input
    - Rename 'country_id' to 'origin_country_id'
    - Write with output meta
    :return: None
    """
    session = create_spark_session()

    cli_config_path = get_config_path_from_cli()
    scripts_section = provide_config(cli_config_path).get('scripts')
    settings = scripts_section.get('immigration')
    mapping_path = settings.get('country_mapping_path')

    columns_to_uppercase = ['i94port', 'i94addr', 'occup', 'gender']

    immigration = read_with_meta(session, df_meta=settings['input_meta'])
    immigration = convert_sas_to_date(df=immigration)
    immigration = uppercase_columns(df=immigration,
                                    col_list=columns_to_uppercase)
    immigration = rename(df=immigration)
    immigration = correct_country_names(
        df=immigration,
        country_col='country_name',
        country_mapping_path=mapping_path,
    )
    immigration = get_country_id(session, df=immigration, config=settings)
    immigration = control_input(df=immigration)

    # In this data set the country id refers to the origin country,
    # so the column is renamed accordingly before writing.
    immigration = immigration.withColumnRenamed('country_id',
                                                'origin_country_id')

    write_with_meta(df=immigration, df_meta=settings['output_meta'])
def main():
    """
    Prepare the temperatures-by-country data set.

    Steps, in execution order:
    - Create spark session
    - Load the 'temperatures_by_country' section of the scripts config
    - Read with input meta (header=True)
    - Uppercase the 'Country' column
    - Rename dataframe
    - Control input
    - Correct country names using the configured mapping file
    - Get country id
    - Add decade column derived from the 'date' column
    - Write with output meta
    :return: None
    """
    session = create_spark_session()

    all_settings = provide_config(get_config_path_from_cli())
    settings = all_settings.get('scripts').get('temperatures_by_country')
    mapping_path = settings.get('country_mapping_path')

    temperatures = read_with_meta(
        session, df_meta=settings['input_meta'], header=True)
    temperatures = uppercase_columns(df=temperatures, col_list=['Country'])
    temperatures = rename(df=temperatures)

    # Input control runs before the country names are corrected.
    temperatures = control_input(df=temperatures)
    temperatures = correct_country_names(
        df=temperatures,
        country_col='country_name',
        country_mapping_path=mapping_path,
    )
    temperatures = get_country_id(session, df=temperatures, config=settings)
    temperatures = add_decade_column(df=temperatures, date_col='date')

    write_with_meta(df=temperatures, df_meta=settings['output_meta'])
def main():
    """
    Prepare the human capital index data set.

    Steps, in execution order:
    - Create spark session
    - Load the 'human_capital_index' section of the scripts config
    - Read with input meta (header=True)
    - Uppercase the 'Country Name' column
    - Rename 'Country Name' to 'country_name'
    - Correct country names using the configured mapping file
    - Get country id
    - Convert wide dataframe to long (one row per country id and year,
      for the year columns '2010' through '2020')
    - Add 'human_capital_rank' per year, ordered by index with
      ascending=False
    - Write with output meta
    :return: None
    """
    session = create_spark_session()

    cli_config_path = get_config_path_from_cli()
    settings = provide_config(cli_config_path) \
        .get('scripts') \
        .get('human_capital_index')
    mapping_path = settings.get('country_mapping_path')

    wide = read_with_meta(session, df_meta=settings['input_meta'],
                          header=True)
    wide = uppercase_columns(df=wide, col_list=['Country Name'])
    wide = wide.withColumnRenamed("Country Name", "country_name")
    wide = correct_country_names(
        df=wide,
        country_col='country_name',
        country_mapping_path=mapping_path,
    )
    wide = get_country_id(session, df=wide, config=settings)

    # The yearly values live in columns named '2010' ... '2020'.
    year_columns = [str(year) for year in range(2010, 2021)]
    ranked = melt(
        df=wide,
        key_cols=['country_id'],
        value_cols=year_columns,
        var_name='year',
        value_name='human_capital_index',
    )
    ranked = add_rank_column(
        df=ranked,
        partition_col='year',
        order_by_col='human_capital_index',
        rank_col='human_capital_rank',
        ascending=False,
    )

    write_with_meta(df=ranked, df_meta=settings['output_meta'])