def main():
    """
    Run the temperatures-by-country pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Control input
    - Correct country names
    - Get country id
    - Add decade column
    - Write with meta

    :return: None
    """
    spark = create_spark_session()
    cli_config_path = get_config_path_from_cli()
    script_config = provide_config(cli_config_path).get('scripts').get(
        'temperatures_by_country')
    mapping_path = script_config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=script_config['input_meta'], header=True)
    df = uppercase_columns(df=df, col_list=['Country'])
    df = rename(df=df)
    df = control_input(df=df)
    df = correct_country_names(
        df=df,
        country_col='country_name',
        country_mapping_path=mapping_path,
    )
    df = get_country_id(spark, df=df, config=script_config)
    df = add_decade_column(df=df, date_col='date')
    write_with_meta(df=df, df_meta=script_config['output_meta'])
def main():
    """
    Run the immigration pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Convert dates from sas format to datetime
    - Uppercase columns
    - Rename dataframe
    - Correct country names
    - Get origin country id
    - Control input
    - Write with meta

    :return: None
    """
    spark = create_spark_session()
    cli_config_path = get_config_path_from_cli()
    script_config = provide_config(cli_config_path).get('scripts').get('immigration')
    mapping_path = script_config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=script_config['input_meta'])
    df = convert_sas_to_date(df=df)
    df = uppercase_columns(
        df=df, col_list=['i94port', 'i94addr', 'occup', 'gender'])
    df = rename(df=df)
    df = correct_country_names(
        df=df,
        country_col='country_name',
        country_mapping_path=mapping_path,
    )
    df = get_country_id(spark, df=df, config=script_config)
    df = control_input(df=df)
    # distinguish the immigrant's origin from other country references downstream
    df = df.withColumnRenamed('country_id', 'origin_country_id')
    write_with_meta(df=df, df_meta=script_config['output_meta'])
def read_data(spark, config: dict) -> tuple:
    """
    Read all dataframes that include country columns.

    :param spark: Spark session
    :param config: config including input meta for each source
    :return: dataframe tuple in the order (gdp_per_capita,
        human_capital_index, press_freedom_index,
        temperatures_by_country, immigration)
    """
    meta_keys = (
        'gdp_per_capita_meta',
        'human_capital_index_meta',
        'press_freedom_index_meta',
        'temperatures_by_country_meta',
        'immigration_meta',
    )
    # read each source the same way; order matches the documented return tuple
    return tuple(
        read_with_meta(spark, df_meta=config[key], header=True)
        for key in meta_keys
    )
def get_country_id(spark, df: DataFrame, config: Dict) -> DataFrame:
    """
    Replace the country name column with the country id from its dimension table.

    :param spark: Spark session
    :param df: dataframe containing a 'country_name' column
    :param config: script config dict; must contain 'country_meta'
        describing how to read the country dimension table
    :return: dataframe with 'country_name' dropped and the dimension
        table's columns (including the country id) joined in; rows with
        no match in the dimension table are discarded by the inner join
    """
    country = read_with_meta(spark, config['country_meta'])
    key_col = 'country_name'
    # inner join intentionally drops rows whose country is unknown
    df = df.join(country, on=key_col, how='inner')
    df = df.drop(key_col)
    logging.info("Country name is converted to id from dimension table")
    return df
def main():
    """
    Run the human-capital-index pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Correct country names
    - Get country id
    - Convert wide dataframe to long
    - Add rank column
    - Write with meta

    :return: None
    """
    spark = create_spark_session()
    cli_config_path = get_config_path_from_cli()
    script_config = provide_config(cli_config_path).get('scripts').get(
        'human_capital_index')
    mapping_path = script_config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=script_config['input_meta'], header=True)
    df = uppercase_columns(df=df, col_list=['Country Name'])
    df = df.withColumnRenamed("Country Name", "country_name")
    df = correct_country_names(
        df=df,
        country_col='country_name',
        country_mapping_path=mapping_path,
    )
    df = get_country_id(spark, df=df, config=script_config)

    # one wide column per year (2010..2020) -> long (country_id, year, value)
    year_columns = [str(year) for year in range(2010, 2021)]
    df_long = melt(
        df=df,
        key_cols=['country_id'],
        value_cols=year_columns,
        var_name='year',
        value_name='human_capital_index',
    )
    df_long = add_rank_column(
        df=df_long,
        partition_col='year',
        order_by_col='human_capital_index',
        rank_col='human_capital_rank',
        ascending=False,
    )
    write_with_meta(df=df_long, df_meta=script_config['output_meta'])
def main():
    """
    Run the US-cities-demographics pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Write with meta

    :return: None
    """
    spark = create_spark_session()
    cli_config_path = get_config_path_from_cli()
    script_config = provide_config(cli_config_path).get('scripts').get(
        'us_cities_demographics')

    # source file is semicolon-delimited
    df = read_with_meta(
        spark, df_meta=script_config['input_meta'], header=True, sep=';')
    df = uppercase_columns(df=df, col_list=['City', 'State', 'Race'])
    df = rename(df=df)
    write_with_meta(df=df, df_meta=script_config['output_meta'])
def main():
    """
    Run the immigration-mapping pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Replace ids with values
    - Write with meta

    :return: None
    """
    spark = create_spark_session()
    cli_config_path = get_config_path_from_cli()
    script_config = provide_config(cli_config_path).get('scripts').get(
        'immigration_mapping')
    id_mapping_path = script_config.get('mapping_config_path')

    df = read_with_meta(spark, df_meta=script_config['input_meta'])
    df = replace_ids_with_values(df=df, mapping_config_path=id_mapping_path)
    write_with_meta(df=df, df_meta=script_config['output_meta'])
def main():
    """
    Run the global-temperatures pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Rename dataframe
    - Add decade column
    - Write with meta

    :return: None
    """
    spark = create_spark_session()
    cli_config_path = get_config_path_from_cli()
    script_config = provide_config(cli_config_path).get('scripts').get(
        'global_temperatures')

    df = read_with_meta(spark, df_meta=script_config['input_meta'], header=True)
    df = rename(df=df)
    df = add_decade_column(df=df, date_col='date')
    write_with_meta(df, df_meta=script_config['output_meta'])