def main(): """ Run pipeline: - Create spark session - Get config - Read with meta - Uppercase columns - Rename dataframe - Correct country names - Get country id - Control input - Add decade column - Write with meta :return: None """ spark = create_spark_session() config_path = get_config_path_from_cli() config = provide_config(config_path).get('scripts').get( 'temperatures_by_country') country_mapping_path = config.get('country_mapping_path') df = read_with_meta(spark, df_meta=config['input_meta'], header=True) df = uppercase_columns(df=df, col_list=['Country']) df = rename(df=df) df = control_input(df=df) df = correct_country_names(df=df, country_col='country_name', country_mapping_path=country_mapping_path) df = get_country_id(spark, df=df, config=config) df = add_decade_column(df=df, date_col='date') write_with_meta(df=df, df_meta=config['output_meta'])
def main(): """ Run pipeline: - Create spark session - Get config - Read all dataframes with meta - Merge country names - Correct country names - Generate an id column - Write with meta :return: None """ spark = create_spark_session() config_path = get_config_path_from_cli() config = provide_config(config_path).get('scripts').get('country') country_mapping_path = config.get('country_mapping_path') (gdp_per_capita, human_capital_index, press_freedom_index, temperatures_by_country, immigration) = read_data(spark, config=config) df = merge_country_names(gdp_per_capita, human_capital_index, press_freedom_index, temperatures_by_country, immigration) df = correct_country_names(df=df, country_col='country_name', country_mapping_path=country_mapping_path) df = df.withColumn('country_id', F.row_number().over(Window.orderBy('country_name'))) write_with_meta(df=df, df_meta=config['output_meta'])
def main(): """ Run pipeline: - Create spark session - Get config - Read with meta - Convert dates from sas format to datetime - Uppercase columns - Rename dataframe - Correct country names - Get origin country id - Control input - Write with meta :return: None """ spark = create_spark_session() config_path = get_config_path_from_cli() config = provide_config(config_path).get('scripts').get('immigration') country_mapping_path = config.get('country_mapping_path') df = read_with_meta(spark, df_meta=config['input_meta']) df = convert_sas_to_date(df=df) df = uppercase_columns(df=df, col_list=['i94port', 'i94addr', 'occup', 'gender']) df = rename(df=df) df = correct_country_names(df=df, country_col='country_name', country_mapping_path=country_mapping_path) df = get_country_id(spark, df=df, config=config) df = control_input(df=df) df = df.withColumnRenamed('country_id', 'origin_country_id') write_with_meta(df=df, df_meta=config['output_meta'])
def correct_country_names(
        df: DataFrame,
        country_col: str,
        country_mapping_path: str,
) -> DataFrame:
    """
    Replace corrupted country values with true ones.
    :param df: dataframe including country_name column
    :param country_col: column name of country
    :param country_mapping_path: path of mapping config
    :return: dataframe with corrected country_name column
    """
    column = country_col
    replace_dict = provide_config(country_mapping_path)
    corrupted_values = list(replace_dict.keys())
    map_col = create_map([lit(x) for x in chain(*replace_dict.items())])

    df = df.withColumn(column, F.regexp_replace(column, '"', ''))
    df = df.withColumn(
        column,
        F.when(F.col(column).isin(corrupted_values),
               map_col[df[column]]).otherwise(F.col(column)))
    df = df.filter(F.col(column).isNotNull())
    df = df.drop_duplicates()
    logging.info("Corrupted country values are replaced with true ones")
    return df
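# Usage sketch for correct_country_names (sample data and mapping file are
# hypothetical, for illustration only): provide_config(country_mapping_path)
# is expected to return a flat {corrupted_value: corrected_value} dict,
# e.g. {'TURKIYE': 'TURKEY'}.
#
#     df = spark.createDataFrame(
#         [('"TURKIYE"',), ('GERMANY',), (None,)], ['country_name'])
#     df = correct_country_names(df, country_col='country_name',
#                                country_mapping_path='country_mapping.yml')
#     # -> TURKEY, GERMANY; quote characters are stripped first and the
#     #    null row is filtered out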
def replace_ids_with_values(df: DataFrame,
                            mapping_config_path: str) -> DataFrame:
    """
    Replace ids with human-readable values to speed up analytical queries.
    :param df: immigration dataframe
    :param mapping_config_path: path of id-value mapping config
    :return: immigration dataframe
    """
    mapping = provide_config(mapping_config_path)
    for column in mapping.keys():
        replace_dict = mapping.get(column)
        map_col = create_map([lit(x) for x in chain(*replace_dict.items())])
        df = df.withColumn(column, map_col[df[column]])
        df = df.fillna('UNKNOWN', column)
    logging.info("ID columns are replaced with values")
    return df
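# The id-value mapping loaded by provide_config(mapping_config_path) is
# expected to be a nested dict keyed by column name. A hypothetical excerpt
# (illustrative keys and values, not the project's real config):
#
#     {'i94visa': {'1': 'BUSINESS', '2': 'PLEASURE', '3': 'STUDENT'},
#      'i94mode': {'1': 'AIR', '2': 'SEA', '3': 'LAND'}}
#
# For each listed column the ids are mapped to readable values, and anything
# left unmapped is filled with 'UNKNOWN'.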
def main(): """ Run pipeline: - Create spark session - Get config - Read with meta - Uppercase columns - Rename dataframe - Correct country names - Get country id - Convert wide dataframe to long - Add rank column - Write with meta :return: None """ spark = create_spark_session() config_path = get_config_path_from_cli() config = provide_config(config_path).get('scripts').get( 'human_capital_index') country_mapping_path = config.get('country_mapping_path') df = read_with_meta(spark, df_meta=config['input_meta'], header=True) df = uppercase_columns(df=df, col_list=['Country Name']) df = df.withColumnRenamed("Country Name", "country_name") df = correct_country_names(df=df, country_col='country_name', country_mapping_path=country_mapping_path) df = get_country_id(spark, df=df, config=config) df_long = melt(df=df, key_cols=['country_id'], value_cols=[str(i) for i in list(range(2010, 2021))], var_name='year', value_name='human_capital_index') df_long = add_rank_column(df=df_long, partition_col='year', order_by_col='human_capital_index', rank_col='human_capital_rank', ascending=False) write_with_meta(df=df_long, df_meta=config['output_meta'])
def main(): """ Run pipeline: - Create spark session - Get config - Read with meta - Uppercase columns - Rename dataframe - Write with meta :return: None """ spark = create_spark_session() config_path = get_config_path_from_cli() config = provide_config(config_path).get('scripts').get('us_cities_demographics') df = read_with_meta(spark, df_meta=config['input_meta'], header=True, sep=';') df = uppercase_columns(df=df, col_list=['City', 'State', 'Race']) df = rename(df=df) write_with_meta(df=df, df_meta=config['output_meta'])
def main(): """ Run pipeline: - Create spark session - Get config - Read with meta - Replace ids with values - Write with meta :return: None """ spark = create_spark_session() config_path = get_config_path_from_cli() config = provide_config(config_path).get('scripts').get( 'immigration_mapping') mapping_config_path = config.get('mapping_config_path') df = read_with_meta(spark, df_meta=config['input_meta']) df = replace_ids_with_values(df=df, mapping_config_path=mapping_config_path) write_with_meta(df=df, df_meta=config['output_meta'])
def main(): """ Run pipeline: - Create spark session - Get config - Read with meta - Rename dataframe - Add decade column - Write with meta :return: None """ spark = create_spark_session() config_path = get_config_path_from_cli() config = provide_config(config_path).get('scripts').get( 'global_temperatures') df = read_with_meta(spark, df_meta=config['input_meta'], header=True) df = rename(df=df) df = add_decade_column(df=df, date_col='date') write_with_meta(df, df_meta=config['output_meta'])