def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Control input
    - Correct country names
    - Get country id
    - Add decade column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()
    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'temperatures_by_country')
    country_mapping_path = config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True)
    df = uppercase_columns(df=df, col_list=['Country'])
    df = rename(df=df)
    df = control_input(df=df)
    df = correct_country_names(df=df, country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = get_country_id(spark, df=df, config=config)
    df = add_decade_column(df=df, date_col='date')
    write_with_meta(df=df, df_meta=config['output_meta'])
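The shared helpers called above are not shown here. As an illustration only, `add_decade_column` could be as small as the sketch below; this is a hypothetical implementation, assuming `date_col` is already a date or timestamp column, not the project's actual code.

from pyspark.sql import functions as F


def add_decade_column(df, date_col):
    # Hypothetical sketch: derive the decade (e.g. 1990, 2000) from the year
    # of `date_col`; assumes the column is already a date/timestamp type.
    return df.withColumn(
        'decade', (F.floor(F.year(F.col(date_col)) / 10) * 10).cast('int'))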
from pyspark.sql import Window
from pyspark.sql import functions as F


def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read all dataframes with meta
    - Merge country names
    - Correct country names
    - Generate an id column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()
    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get('country')
    country_mapping_path = config.get('country_mapping_path')

    (gdp_per_capita, human_capital_index, press_freedom_index,
     temperatures_by_country, immigration) = read_data(spark, config=config)
    df = merge_country_names(gdp_per_capita, human_capital_index,
                             press_freedom_index, temperatures_by_country,
                             immigration)
    df = correct_country_names(df=df, country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = df.withColumn('country_id',
                       F.row_number().over(Window.orderBy('country_name')))
    write_with_meta(df=df, df_meta=config['output_meta'])
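`merge_country_names` is not shown; a minimal sketch of what it might do, assuming each upstream dataframe exposes a `country_name` column, is to union the distinct names into one column before the `row_number`-based id is assigned:

from pyspark.sql import DataFrame


def merge_country_names(*dfs: DataFrame) -> DataFrame:
    # Hypothetical sketch: collect the distinct country names from every
    # upstream dataframe into a single one-column dataframe.
    names = [df.select('country_name').distinct() for df in dfs]
    out = names[0]
    for other in names[1:]:
        out = out.union(other)
    return out.distinct()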
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Convert dates from SAS format to datetime
    - Uppercase columns
    - Rename dataframe
    - Correct country names
    - Get origin country id
    - Control input
    - Write with meta
    :return: None
    """
    spark = create_spark_session()
    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get('immigration')
    country_mapping_path = config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=config['input_meta'])
    df = convert_sas_to_date(df=df)
    df = uppercase_columns(df=df,
                           col_list=['i94port', 'i94addr', 'occup', 'gender'])
    df = rename(df=df)
    df = correct_country_names(df=df, country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = get_country_id(spark, df=df, config=config)
    df = control_input(df=df)
    df = df.withColumnRenamed('country_id', 'origin_country_id')
    write_with_meta(df=df, df_meta=config['output_meta'])
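`convert_sas_to_date` is also not shown. SAS stores dates as the number of days since 1960-01-01, so a sketch of the conversion, assuming hypothetical column names such as `arrdate` and `depdate`, could be:

from pyspark.sql import functions as F


def convert_sas_to_date(df, date_cols=('arrdate', 'depdate')):
    # Hypothetical sketch: SAS numeric dates count days since 1960-01-01,
    # so shift that epoch by the stored offset to get a real date.
    for c in date_cols:
        df = df.withColumn(
            c, F.expr(f"date_add(to_date('1960-01-01'), cast({c} as int))"))
    return df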
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Correct country names
    - Get country id
    - Convert wide dataframe to long
    - Add rank column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()
    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'human_capital_index')
    country_mapping_path = config.get('country_mapping_path')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True)
    df = uppercase_columns(df=df, col_list=['Country Name'])
    df = df.withColumnRenamed("Country Name", "country_name")
    df = correct_country_names(df=df, country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = get_country_id(spark, df=df, config=config)
    df_long = melt(df=df,
                   key_cols=['country_id'],
                   value_cols=[str(i) for i in range(2010, 2021)],
                   var_name='year',
                   value_name='human_capital_index')
    df_long = add_rank_column(df=df_long,
                              partition_col='year',
                              order_by_col='human_capital_index',
                              rank_col='human_capital_rank',
                              ascending=False)
    write_with_meta(df=df_long, df_meta=config['output_meta'])
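Spark only gained a built-in `DataFrame.melt` in recent versions (3.4+), so a wide-to-long step like the one above is typically done by exploding an array of structs. The sketches below are hypothetical implementations of `melt` and `add_rank_column` under that assumption, not the project's actual helpers.

from pyspark.sql import Window
from pyspark.sql import functions as F


def melt(df, key_cols, value_cols, var_name='variable', value_name='value'):
    # Hypothetical sketch: build (name, value) structs for every value column,
    # explode them into rows, then flatten the struct back into two columns.
    pairs = F.array(*[
        F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name))
        for c in value_cols])
    return (df
            .withColumn('_pair', F.explode(pairs))
            .select(*key_cols,
                    F.col(f'_pair.{var_name}').alias(var_name),
                    F.col(f'_pair.{value_name}').alias(value_name)))


def add_rank_column(df, partition_col, order_by_col, rank_col, ascending=True):
    # Hypothetical sketch: rank rows within each partition, descending when
    # ascending=False (as used for human_capital_rank above).
    order = F.col(order_by_col).asc() if ascending else F.col(order_by_col).desc()
    window = Window.partitionBy(partition_col).orderBy(order)
    return df.withColumn(rank_col, F.dense_rank().over(window))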
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Uppercase columns
    - Rename dataframe
    - Write with meta
    :return: None
    """
    spark = create_spark_session()
    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'us_cities_demographics')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True,
                        sep=';')
    df = uppercase_columns(df=df, col_list=['City', 'State', 'Race'])
    df = rename(df=df)
    write_with_meta(df=df, df_meta=config['output_meta'])
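`uppercase_columns` is shared by several of these scripts; a plausible sketch, assuming it simply trims and uppercases the listed string columns in place, is:

from pyspark.sql import functions as F


def uppercase_columns(df, col_list):
    # Hypothetical sketch: trim whitespace and uppercase each listed column so
    # later joins on city/state/country strings are not case-sensitive.
    for c in col_list:
        df = df.withColumn(c, F.upper(F.trim(F.col(c))))
    return df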
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Replace ids with values
    - Write with meta
    :return: None
    """
    spark = create_spark_session()
    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'immigration_mapping')
    mapping_config_path = config.get('mapping_config_path')

    df = read_with_meta(spark, df_meta=config['input_meta'])
    df = replace_ids_with_values(df=df,
                                 mapping_config_path=mapping_config_path)
    write_with_meta(df=df, df_meta=config['output_meta'])
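`replace_ids_with_values` presumably maps coded id columns to human-readable labels from the mapping config; a sketch assuming the config is a JSON file of `{column: {id: label}}` (both the file format and the string-typed id columns are assumptions):

import json


def replace_ids_with_values(df, mapping_config_path):
    # Hypothetical sketch: load {column: {id: label}} mappings and replace the
    # coded values column by column. Assumes the id columns are strings; cast
    # them first if they are numeric.
    with open(mapping_config_path) as f:
        mappings = json.load(f)
    for col_name, mapping in mappings.items():
        df = df.replace({str(k): str(v) for k, v in mapping.items()},
                        subset=[col_name])
    return df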
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read with meta
    - Rename dataframe
    - Add decade column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()
    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get(
        'global_temperatures')

    df = read_with_meta(spark, df_meta=config['input_meta'], header=True)
    df = rename(df=df)
    df = add_decade_column(df=df, date_col='date')
    write_with_meta(df, df_meta=config['output_meta'])