def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Control input
    - Correct country names
    - Get country id
    - Add decade column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'temperatures_by_country')
    country_mapping_path = config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True)
    df = uppercase_columns(df=df, col_list=['Country'])
    df = rename(df=df)
    df = control_input(df=df)
    df = correct_country_names(df=df,
                               country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = get_country_id(spark, df=df, config=config)
    df = add_decade_column(df=df, date_col='date')

    write_with_meta(df=df, df_meta=config['output_meta'])
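
# The get_country_id helper is not shown in this listing. A minimal sketch of
# what it might look like, assuming the country dimension produced by the
# country pipeline is referenced in the config under a hypothetical
# 'country_meta' key; not the project's actual implementation:
def get_country_id(spark, df, config):
    """Swap country_name for country_id by joining the country dimension."""
    # 'country_meta' is an assumed config key; adjust to the real one.
    country = read_with_meta(spark, df_meta=config['country_meta'])
    country = country.select('country_id', 'country_name')
    df = df.join(country, on='country_name', how='left')
    return df.drop('country_name')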
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read all dataframes with meta
    - Merge country names
    - Correct country names
    - Generate an id column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get('country')
    country_mapping_path = config.get('country_mapping_path')

    (gdp_per_capita, human_capital_index, press_freedom_index,
     temperatures_by_country, immigration) = read_data(spark, config=config)
    df = merge_country_names(gdp_per_capita, human_capital_index,
                             press_freedom_index, temperatures_by_country,
                             immigration)

    df = correct_country_names(df=df,
                               country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = df.withColumn('country_id',
                       F.row_number().over(Window.orderBy('country_name')))

    write_with_meta(df=df, df_meta=config['output_meta'])
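
# merge_country_names is not defined in this listing. A plausible sketch,
# assuming each input dataframe already exposes a country_name column and the
# goal is one distinct list of names; the real implementation may differ:
from functools import reduce

from pyspark.sql import DataFrame

def merge_country_names(*dfs):
    """Union the country_name columns of all inputs and de-duplicate."""
    names = [d.select('country_name') for d in dfs]
    return reduce(DataFrame.unionByName, names).drop_duplicates()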
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Convert dates from sas format to datetime
    - Uppercase columns
    - Rename dataframe
    - Correct country names
    - Get origin country id
    - Control input
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get('immigration')
    country_mapping_path = config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=config['input_meta'])
    df = convert_sas_to_date(df=df)
    df = uppercase_columns(df=df, col_list=['i94port', 'i94addr', 'occup', 'gender'])
    df = rename(df=df)
    df = correct_country_names(df=df, country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = get_country_id(spark, df=df, config=config)
    df = control_input(df=df)
    df = df.withColumnRenamed('country_id', 'origin_country_id')

    write_with_meta(df=df, df_meta=config['output_meta'])
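
# convert_sas_to_date is not shown above. SAS numeric dates count days since
# 1960-01-01, so a minimal sketch looks like the following; the column names
# ('arrdate', 'depdate') are assumptions based on the I94 immigration data:
import pyspark.sql.functions as F

def convert_sas_to_date(df, col_list=('arrdate', 'depdate')):
    """Convert SAS numeric dates (days since 1960-01-01) to Spark dates."""
    for column in col_list:
        df = df.withColumn(
            column,
            F.expr(f"date_add(to_date('1960-01-01'), cast({column} as int))"))
    return df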
def correct_country_names(
    df: DataFrame,
    country_col: str,
    country_mapping_path: str,
) -> DataFrame:
    """
    Replace corrupted country values with true ones.

    :param df: dataframe including country_name column
    :param country_col: Column name of country
    :param country_mapping_path: Path of mapping config
    :return: dataframe including country_name columns
    """
    column = country_col
    replace_dict = provide_config(country_mapping_path)
    corrupted_values = list(replace_dict.keys())
    # Build a literal map expression from the corrupted -> correct pairs.
    map_col = create_map([lit(x) for x in chain(*replace_dict.items())])
    # Strip any embedded double quotes from the country value.
    df = df.withColumn(column, F.regexp_replace(column, '"', ''))
    # Replace known corrupted values via the map; keep all others as-is.
    df = df.withColumn(
        column,
        F.when(F.col(column).isin(corrupted_values),
               map_col[df[column]]).otherwise(F.col(column)))
    df = df.filter(F.col(column).isNotNull())
    df = df.drop_duplicates()
    logging.info("Corrupted country values are replaced with correct ones")
    return df
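
# For reference, a self-contained demonstration of the mapping logic above,
# assuming an active SparkSession named spark; the dict literal stands in for
# the file that provide_config(country_mapping_path) would load:
replace_dict = {'USA': 'UNITED STATES', 'UK': 'UNITED KINGDOM'}
demo = spark.createDataFrame([('USA',), ('FRANCE',)], ['country_name'])
map_col = create_map([lit(x) for x in chain(*replace_dict.items())])
demo = demo.withColumn(
    'country_name',
    F.when(F.col('country_name').isin(list(replace_dict)),
           map_col[demo['country_name']]).otherwise(F.col('country_name')))
# 'USA' becomes 'UNITED STATES'; 'FRANCE' is left unchanged.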
def replace_ids_with_values(df: DataFrame,
                            mapping_config_path: str) -> DataFrame:
    """
    Replace ids with values in order to speed up analytic processes.
    :param df: immigration dataframe
    :param mapping_config_path: Path of id-value mapping config
    :return: immigration dataframe
    """
    mapping = provide_config(mapping_config_path)
    for column in mapping.keys():
        replace_dict = mapping.get(column)
        # Build a literal map expression from the id -> value pairs.
        map_col = create_map([lit(x) for x in chain(*replace_dict.items())])
        df = df.withColumn(column, map_col[df[column]])
        # Ids missing from the mapping become 'UNKNOWN'.
        df = df.fillna('UNKNOWN', column)
    logging.info("ID columns are replaced with values")
    return df
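
# Illustrative shape of the mapping config that provide_config would return
# for this helper (column -> {id -> value}); the keys and values here are
# invented for the example:
#     {'i94visa': {'1': 'BUSINESS', '2': 'PLEASURE', '3': 'STUDENT'}}
# With such a mapping, each id in the i94visa column is swapped for its label
# and unmapped ids fall back to 'UNKNOWN'.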
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Correct country names
    - Get country id
    - Convert wide dataframe to long
    - Add rank column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'human_capital_index')
    country_mapping_path = config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True)
    df = uppercase_columns(df=df, col_list=['Country Name'])
    df = df.withColumnRenamed("Country Name", "country_name")
    df = correct_country_names(df=df,
                               country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = get_country_id(spark, df=df, config=config)

    df_long = melt(df=df,
                   key_cols=['country_id'],
                   value_cols=[str(i) for i in list(range(2010, 2021))],
                   var_name='year',
                   value_name='human_capital_index')
    df_long = add_rank_column(df=df_long,
                              partition_col='year',
                              order_by_col='human_capital_index',
                              rank_col='human_capital_rank',
                              ascending=False)

    write_with_meta(df=df_long, df_meta=config['output_meta'])
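
# Neither melt nor add_rank_column is defined in this listing. Minimal
# sketches, assuming melt reshapes wide year columns into (key, year, value)
# rows and add_rank_column wraps a window rank; the real helpers may differ in
# details such as rank vs. dense_rank:
import pyspark.sql.functions as F
from pyspark.sql import Window

def melt(df, key_cols, value_cols, var_name, value_name):
    """Wide-to-long reshape via an exploded array of (variable, value) structs."""
    pairs = F.array(*[
        F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name))
        for c in value_cols])
    df = df.select(*key_cols, F.explode(pairs).alias('_pair'))
    return df.select(*key_cols, f'_pair.{var_name}', f'_pair.{value_name}')

def add_rank_column(df, partition_col, order_by_col, rank_col, ascending=True):
    """Rank rows within each partition by the given column."""
    order = F.asc(order_by_col) if ascending else F.desc(order_by_col)
    window = Window.partitionBy(partition_col).orderBy(order)
    return df.withColumn(rank_col, F.dense_rank().over(window))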
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get('us_cities_demographics')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True, sep=';')
    df = uppercase_columns(df=df, col_list=['City', 'State', 'Race'])
    df = rename(df=df)

    write_with_meta(df=df, df_meta=config['output_meta'])
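
# rename is used by several pipelines but not defined in this listing. A
# plausible sketch that normalizes headers such as 'Median Age' to
# 'median_age'; the real helper may instead apply an explicit mapping:
import re

def rename(df):
    """Normalize all column names to snake_case."""
    for column in df.columns:
        snake = re.sub(r'[\s-]+', '_', column.strip()).lower()
        df = df.withColumnRenamed(column, snake)
    return df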
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Replace ids with values
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'immigration_mapping')
    mapping_config_path = config.get('mapping_config_path')

    df = read_with_meta(spark, df_meta=config['input_meta'])
    df = replace_ids_with_values(df=df,
                                 mapping_config_path=mapping_config_path)

    write_with_meta(df=df, df_meta=config['output_meta'])
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Rename dataframe
    - Add decade column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'global_temperatures')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True)
    df = rename(df=df)
    df = add_decade_column(df=df, date_col='date')

    write_with_meta(df=df, df_meta=config['output_meta'])
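
# add_decade_column is not shown above. A minimal sketch, assuming the decade
# is the year truncated to its tens (e.g. 1994 -> 1990):
import pyspark.sql.functions as F

def add_decade_column(df, date_col):
    """Derive a decade column from a date column."""
    decade = (F.floor(F.year(F.col(date_col)) / 10) * 10).cast('int')
    return df.withColumn('decade', decade)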