def compare_FBS_results(fbs1_load, fbs2_load, ignore_metasources=False):
    """
    Compare the FlowAmounts of two FlowBySector datasets.

    Each dataset is loaded with flowsa.getFlowBySector, the two are
    outer-merged on their shared descriptor columns, and only the rows
    whose absolute FlowAmount difference does not round to 0 (2 decimal
    places) are kept.
    :param fbs1_load: identifier of the first (baseline) FBS, forwarded to
        flowsa.getFlowBySector (must be saved locally)
    :param fbs2_load: identifier of the second FBS, forwarded to
        flowsa.getFlowBySector (must be saved locally)
    :param ignore_metasources: bool, True to compare fbs without
        matching metasources
    :return: df of the differing rows with columns 'FlowAmount_fbs1',
        'FlowAmount_fbs2', 'FlowAmount_diff' (fbs2 - fbs1) and
        'Percent_Diff' (difference relative to fbs1, in percent)
    """
    import flowsa

    def _load(fbs_name, amount_col):
        # load one FBS and give its FlowAmount column a distinct name
        loaded = flowsa.getFlowBySector(fbs_name).rename(
            columns={'FlowAmount': amount_col})
        return replace_strings_with_NoneType(loaded)

    def _is_nonzero(diff):
        # differences that round to 0.00 are treated as "no difference"
        return round(abs(diff), 2) != 0

    df1 = _load(fbs1_load, 'FlowAmount_fbs1')
    df2 = _load(fbs2_load, 'FlowAmount_fbs2')

    # columns that identify a row in both dataframes
    merge_cols = ['Flowable', 'Class', 'SectorProducedBy',
                  'SectorConsumedBy', 'SectorSourceName', 'Context',
                  'Location', 'LocationSystem', 'Unit', 'FlowType',
                  'Year', 'MetaSources']
    if ignore_metasources:
        merge_cols = [col for col in merge_cols if col != 'MetaSources']

    # check units
    compare_df_units(df1, df2)

    left = df1[merge_cols + ['FlowAmount_fbs1']]
    right = df2[merge_cols + ['FlowAmount_fbs2']]
    df_m = pd.merge(left, right, how='outer')

    df_m['FlowAmount_diff'] = \
        df_m['FlowAmount_fbs2'] - df_m['FlowAmount_fbs1']
    df_m['Percent_Diff'] = \
        (df_m['FlowAmount_diff'] / df_m['FlowAmount_fbs1']) * 100

    keep = df_m['FlowAmount_diff'].apply(_is_nonzero)
    df_m = df_m[keep].reset_index(drop=True)

    # nothing left after filtering: the two datasets agree
    if len(df_m) == 0:
        vLog.debug('No differences between dataframes')
        return df_m

    vLog.debug('Differences exist between dataframes')
    sort_order = ['Location', 'SectorProducedBy', 'SectorConsumedBy',
                  'Flowable', 'Context']
    return df_m.sort_values(sort_order).reset_index(drop=True)
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector formatted
    dataframe from another package.
    :param k: str, The datasource name
    :param v: dictionary, The datasource parameters. 'data_format' selects
        the loader ('FBA', 'FBS' or 'FBS_outside_flowsa'); 'FBA' also
        reads 'class', 'year' and the optional 'source_fba_load_scale';
        'FBS_outside_flowsa' reads 'FBS_datapull_fxn' and 'parameters'
    :return: df returned by the selected loader
    :raises ValueError: if v['data_format'] is not a recognised format
    """
    data_format = v['data_format']

    if data_format == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter
        # dataframe, otherwise fall back to 'all'
        geo_level = v.get('source_fba_load_scale', 'all')
        log.info("Retrieving flowbyactivity for datasource %s in year %s",
                 k, v['year'])
        return flowsa.getFlowByActivity(flowclass=[v['class']],
                                        years=[v['year']],
                                        datasource=k,
                                        geographic_level=geo_level)

    if data_format == 'FBS':
        log.info("Retrieving flowbysector for datasource %s", k)
        return flowsa.getFlowBySector(k)

    if data_format == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource %s", k)
        # the data pull function is looked up by name in this module and
        # called with the unpacked 'parameters' list
        datapull_fxn = getattr(sys.modules[__name__],
                               v["FBS_datapull_fxn"])
        return datapull_fxn(*v['parameters'])

    # Previously this branch only logged and then hit `return flows_df`
    # with the name unbound, surfacing as an UnboundLocalError that hid
    # the real problem. Fail explicitly instead.
    log.error("Data format not specified in method file for datasource %s",
              k)
    raise ValueError(
        "Data format not specified in method file for datasource "
        + str(k))
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector formatted
    dataframe from another package.
    :param k: str, The datasource name
    :param v: dictionary, The datasource parameters. 'data_format' selects
        the loader ('FBA', 'FBS' or 'FBS_outside_flowsa'); 'FBA' also
        reads 'class', 'year' and the optional 'source_fba_load_scale';
        'FBS_outside_flowsa' reads 'FBS_datapull_fxn'
    :return: df of identified parquet
    :raises ValueError: if v['data_format'] is not a recognised format
    """
    data_format = v['data_format']

    if data_format == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter
        # dataframe, otherwise pass None
        geo_level = v.get('source_fba_load_scale')
        vLog.info("Retrieving flowbyactivity for datasource %s in year %s",
                  k, str(v['year']))
        return flowsa.getFlowByActivity(datasource=k,
                                        year=v['year'],
                                        flowclass=v['class'],
                                        geographic_level=geo_level)

    if data_format == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        return flowsa.getFlowBySector(k)

    if data_format == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        # the data pull function receives the full parameter dictionary
        datapull_fxn = dynamically_import_fxn(k, v["FBS_datapull_fxn"])
        return datapull_fxn(v)

    # Previously this branch only logged and then hit `return flows_df`
    # with the name unbound, surfacing as an UnboundLocalError that hid
    # the real problem. Fail explicitly instead.
    vLog.error(
        "Data format not specified in method file for datasource %s", k)
    raise ValueError(
        "Data format not specified in method file for datasource "
        + str(k))
def compare_remote_to_local_FBS_parquet(DataCommonsParquetName,
                                        LocalParquetName):
    """
    Compare a FlowBySector parquet on Data Commons to a parquet stored
    locally
    :param DataCommonsParquetName: name forwarded to
        flowsa.getFlowBySector with file_location='remote'
    :param LocalParquetName: name forwarded to flowsa.getFlowBySector
        for the local file
    :return: output of dataframe_difference(remote, local); when it has
        at least one row it is sorted by location, sectors, flowable and
        context and re-indexed
    """
    import flowsa
    from flowsa.flowbyfunctions import dataframe_difference

    # load remote file, then local file
    remote_fbs = flowsa.getFlowBySector(DataCommonsParquetName,
                                        file_location='remote')
    local_fbs = flowsa.getFlowBySector(LocalParquetName)

    # compare df
    df_diff = dataframe_difference(remote_fbs, local_fbs)

    # no differences: report and hand back the (empty) result untouched
    if len(df_diff) == 0:
        log.info('No differences between dataframes')
        return df_diff

    # differences: report and return them in a stable order
    log.info('Differences exist between dataframes')
    sort_order = ['Location', 'SectorProducedBy', 'SectorConsumedBy',
                  'Flowable', 'Context']
    return df_diff.sort_values(sort_order).reset_index(drop=True)
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector formatted
    dataframe from another package.
    :param k: str, The datasource name
    :param v: dictionary, The datasource parameters. 'data_format' selects
        the loader ('FBA', 'FBS' or 'FBS_outside_flowsa'); 'FBA' also
        reads 'class' and 'year'; 'FBS_outside_flowsa' reads
        'FBS_datapull_fxn' and 'parameters'
    :return: df returned by the selected loader
    :raises ValueError: if v['data_format'] is not a recognised format
    """
    data_format = v['data_format']

    if data_format == 'FBA':
        log.info("Retrieving flowbyactivity for datasource %s in year %s",
                 k, v['year'])
        return flowsa.getFlowByActivity(flowclass=[v['class']],
                                        years=[v['year']],
                                        datasource=k)

    if data_format == 'FBS':
        log.info("Retrieving flowbysector for datasource %s", k)
        return flowsa.getFlowBySector(k)

    if data_format == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource %s", k)
        # the data pull function is looked up by name in this module and
        # receives 'parameters' as a single argument
        datapull_fxn = getattr(sys.modules[__name__],
                               v["FBS_datapull_fxn"])
        return datapull_fxn(v['parameters'])

    # Previously this branch only logged and then hit `return flows_df`
    # with the name unbound, surfacing as an UnboundLocalError that hid
    # the real problem. Keep the logged text, but fail explicitly with
    # the actual cause.
    log.error("No parquet file found for datasource %s", k)
    raise ValueError(
        "Unrecognized data_format " + repr(data_format)
        + " for datasource " + str(k))
def load_source_dataframe(sourcename, source_dict,
                          download_FBA_if_missing=False):
    """
    Load the source dataframe. Data can be a FlowbyActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector formatted
    dataframe from another package.
    :param sourcename: str, The datasource name
    :param source_dict: dictionary, The datasource parameters.
        'data_format' selects the loader ('FBA', 'FBS' or
        'FBS_outside_flowsa'); 'FBA' also reads 'class', 'year' and the
        optional 'source_fba_load_scale'; 'FBS_outside_flowsa' reads
        'FBS_datapull_fxn'
    :param download_FBA_if_missing: Bool, forwarded to
        flowsa.getFlowByActivity; if True will download FBAs from
        Data Commons. Default is False.
    :return: df of identified parquet
    :raises ValueError: if source_dict['data_format'] is not a
        recognised format
    """
    data_format = source_dict['data_format']

    if data_format == 'FBA':
        # if yaml specifies a geoscale to load, use parameter
        # to filter dataframe, otherwise pass None
        geo_level = source_dict.get('source_fba_load_scale')
        vLog.info("Retrieving Flow-By-Activity for datasource %s in year %s",
                  sourcename, str(source_dict['year']))
        return flowsa.getFlowByActivity(
            datasource=sourcename,
            year=source_dict['year'],
            flowclass=source_dict['class'],
            geographic_level=geo_level,
            download_FBA_if_missing=download_FBA_if_missing)

    if data_format == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        return flowsa.getFlowBySector(sourcename)

    if data_format == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        # the data pull function receives the full parameter dictionary
        datapull_fxn = dynamically_import_fxn(
            sourcename, source_dict["FBS_datapull_fxn"])
        return datapull_fxn(source_dict)

    # Previously this branch only logged and then hit `return flows_df`
    # with the name unbound, surfacing as an UnboundLocalError that hid
    # the real problem. Fail explicitly instead.
    vLog.error(
        "Data format not specified in method "
        "file for datasource %s", sourcename)
    raise ValueError(
        "Data format not specified in method file for datasource "
        + str(sourcename))
# __init__.py (flowsa)
# !/usr/bin/env python3
# coding=utf-8
# [email protected]
"""
Example: retrieve stored data in the FlowBySector format.

flowsa.getFlowBySector is called with the name of an available method
(method files are found in flowsa/data/flowbysectormethods) and returns
a dataframe in flow by sector format.
"""
import flowsa

# name of the FlowBySector method to load
METHOD_NAME = 'Water_national_2015_m1'

water = flowsa.getFlowBySector(METHOD_NAME)
def get_fbs_subset(name):
    """
    Load a FlowBySector through flowsa.getFlowBySector
    :param name: value forwarded unchanged to flowsa.getFlowBySector
    :return: whatever flowsa.getFlowBySector returns for name
    """
    return flowsa.getFlowBySector(name)
# get_flows_by_sector.py (flowsa)
# !/usr/bin/env python3
# coding=utf-8
"""
Example: retrieve stored data in the FlowBySector format.

flowsa.getFlowBySector is called with the name of an available method
(method files are found in flowsa/data/flowbysectormethods) and returns
a dataframe in flow by sector format.
"""
import flowsa

# name of the FlowBySector method used throughout this example
METHOD_NAME = 'Water_national_2015_m1'

# see available FBS models
flowsa.seeAvailableFlowByModels('FBS')

# load FBS from local directory, if does not exist, method will run
fbs_water = flowsa.getFlowBySector(METHOD_NAME,
                                   download_FBAs_if_missing=True)

# collapse the FBS - output has 'Sector' column instead of
# 'SectorProducedBy' and 'SectorConsumedBy' columns
fbs_water_collapsed = flowsa.collapse_FlowBySector(METHOD_NAME)
# __init__.py (flowsa)
# !/usr/bin/env python3
# coding=utf-8
# [email protected]
"""
Example: retrieve stored data in the FlowBySector format.

flowsa.getFlowBySector is called with the name of an available method
(method files are found in flowsa/data/flowbysectormethods) and returns
a dataframe in flow by sector format.
"""
import flowsa

# name of the FlowBySector method used throughout this example
METHOD_NAME = 'Water_national_2015_m1'

# load a FBS from local folder
fbs_water = flowsa.getFlowBySector(METHOD_NAME)

# load a FBS from remote server
fbs_water_remote = flowsa.getFlowBySector(METHOD_NAME,
                                          file_location='remote')

# collapse the FBS - output has 'Sector' column instead of
# 'SectorProducedBy' and 'SectorConsumedBy' columns
fbs_water_collapsed = flowsa.getFlowBySector_collapsed(METHOD_NAME)
def test_get_flows_by_sector():
    """
    Smoke test: flowsa.getFlowBySector completes for the
    'Water_national_2015_m1' method without raising.
    """
    method_name = 'Water_national_2015_m1'
    # set function to download any FBAs that are missing
    flowsa.getFlowBySector(method_name, download_FBAs_if_missing=True)