Exemple #1
0
def compare_FBS_results(fbs1_load, fbs2_load, ignore_metasources=False):
    """
    Compare the FlowAmount values of two FlowBySector datasets, each
    loaded with flowsa.getFlowBySector
    :param fbs1_load: value passed to flowsa.getFlowBySector to load the
    first FBS
    :param fbs2_load: value passed to flowsa.getFlowBySector to load the
    second FBS
    :param ignore_metasources: bool, True to compare fbs without
    matching metasources
    :return: df, rows of the outer merge of the two FBS whose
    'FlowAmount_diff' (fbs2 - fbs1) does not round to 0 at two decimals,
    with 'FlowAmount_diff' and 'Percent_Diff' columns added. A NaN
    difference also passes the filter, so rows lacking an amount on one
    side are kept.
    """
    import flowsa

    def _load_fbs(fbs_name, amount_col):
        # rename FlowAmount so both amounts survive the merge side by side
        fbs = flowsa.getFlowBySector(fbs_name).rename(
            columns={'FlowAmount': amount_col})
        return replace_strings_with_NoneType(fbs)

    first = _load_fbs(fbs1_load, 'FlowAmount_fbs1')
    second = _load_fbs(fbs2_load, 'FlowAmount_fbs2')

    # columns that must match for two rows to be compared
    key_cols = [
        'Flowable', 'Class', 'SectorProducedBy', 'SectorConsumedBy',
        'SectorSourceName', 'Context', 'Location', 'LocationSystem', 'Unit',
        'FlowType', 'Year'
    ]
    if not ignore_metasources:
        key_cols.append('MetaSources')

    # check units
    compare_df_units(first, second)

    diffs = pd.merge(first[key_cols + ['FlowAmount_fbs1']],
                     second[key_cols + ['FlowAmount_fbs2']],
                     how='outer')
    diffs['FlowAmount_diff'] = \
        diffs['FlowAmount_fbs2'] - diffs['FlowAmount_fbs1']
    diffs['Percent_Diff'] = \
        diffs['FlowAmount_diff'] / diffs['FlowAmount_fbs1'] * 100

    # keep only rows whose absolute difference is non-zero at 2 decimals
    is_different = diffs['FlowAmount_diff'].apply(
        lambda amount: round(abs(amount), 2) != 0)
    diffs = diffs[is_different].reset_index(drop=True)

    # nothing differs: log and hand back the (empty) subset unchanged
    if len(diffs) == 0:
        vLog.debug('No differences between dataframes')
        return diffs

    vLog.debug('Differences exist between dataframes')
    sort_order = [
        'Location',
        'SectorProducedBy',
        'SectorConsumedBy',
        'Flowable',
        'Context',
    ]
    return diffs.sort_values(sort_order).reset_index(drop=True)
Exemple #2
0
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or FlowBySector
    parquet stored in flowsa, or a FlowBySector formatted dataframe from
    another package.
    :param k: str, The datasource name
    :param v: dictionary, The datasource parameters. Must contain
        'data_format'; the other keys read depend on that format.
    :return: the dataframe returned by the loader selected by
        v['data_format']
    :raises ValueError: if v['data_format'] is not 'FBA', 'FBS' or
        'FBS_outside_flowsa'
    """
    data_format = v['data_format']

    if data_format == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter
        # dataframe, otherwise load all geoscales
        geo_level = v.get('source_fba_load_scale', 'all')
        log.info("Retrieving flowbyactivity for datasource %s in year %s",
                 k, v['year'])
        return flowsa.getFlowByActivity(flowclass=[v['class']],
                                        years=[v['year']],
                                        datasource=k,
                                        geographic_level=geo_level)

    if data_format == 'FBS':
        log.info("Retrieving flowbysector for datasource %s", k)
        return flowsa.getFlowBySector(k)

    if data_format == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource %s", k)
        # the data pull function is looked up by name in this module
        datapull_fxn = getattr(sys.modules[__name__], v["FBS_datapull_fxn"])
        return datapull_fxn(*v['parameters'])

    # Unknown format: fail explicitly instead of falling through to a
    # return of a variable that was never assigned.
    log.error("Data format not specified in method file for datasource %s",
              k)
    raise ValueError(
        "Data format %r not recognized in method file for datasource %s"
        % (data_format, k))
Exemple #3
0
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param k: str, The datasource name
    :param v: dictionary, The datasource parameters. Must contain
        'data_format'; the other keys read depend on that format.
    :return: the dataframe returned by the loader selected by
        v['data_format']
    :raises ValueError: if v['data_format'] is not 'FBA', 'FBS' or
        'FBS_outside_flowsa'
    """
    data_format = v['data_format']

    if data_format == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter
        # dataframe; None is passed when no scale is specified
        geo_level = v.get('source_fba_load_scale')
        vLog.info("Retrieving flowbyactivity for datasource %s in year %s",
                  k, str(v['year']))
        return flowsa.getFlowByActivity(datasource=k,
                                        year=v['year'],
                                        flowclass=v['class'],
                                        geographic_level=geo_level)

    if data_format == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        return flowsa.getFlowBySector(k)

    if data_format == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        # the imported function receives the full parameter dictionary
        return dynamically_import_fxn(k, v["FBS_datapull_fxn"])(v)

    # Unknown format: fail explicitly instead of falling through to a
    # return of a variable that was never assigned.
    vLog.error(
        "Data format not specified in method file for datasource %s", k)
    raise ValueError(
        "Data format %r not recognized in method file for datasource %s"
        % (data_format, k))
Exemple #4
0
def compare_remote_to_local_FBS_parquet(DataCommonsParquetName,
                                        LocalParquetName):
    """
    Compare a FlowBySector parquet loaded with file_location='remote'
    to a FlowBySector parquet loaded from the default location
    :param DataCommonsParquetName: name passed to flowsa.getFlowBySector
        together with file_location='remote'
    :param LocalParquetName: name passed to flowsa.getFlowBySector with
        no file_location argument
    :return: result of dataframe_difference(remote, local); when it is
        not empty it is sorted by Location, SectorProducedBy,
        SectorConsumedBy, Flowable and Context with a fresh index
    """
    import flowsa
    from flowsa.flowbyfunctions import dataframe_difference

    # load both versions of the dataset
    remote_fbs = flowsa.getFlowBySector(DataCommonsParquetName,
                                        file_location='remote')
    local_fbs = flowsa.getFlowBySector(LocalParquetName)

    differences = dataframe_difference(remote_fbs, local_fbs)

    # no differences: log and return the empty result as is
    if len(differences) == 0:
        log.info('No differences between dataframes')
        return differences

    # differences: log and return them in a stable, readable order
    log.info('Differences exist between dataframes')
    sort_order = [
        'Location',
        'SectorProducedBy',
        'SectorConsumedBy',
        'Flowable',
        'Context',
    ]
    return differences.sort_values(sort_order).reset_index(drop=True)
Exemple #5
0
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or FlowBySector
    parquet stored in flowsa, or a FlowBySector formatted dataframe from
    another package.
    :param k: str, The datasource name
    :param v: dictionary, The datasource parameters. Must contain
        'data_format'; the other keys read depend on that format.
    :return: the dataframe returned by the loader selected by
        v['data_format']
    :raises ValueError: if v['data_format'] is not 'FBA', 'FBS' or
        'FBS_outside_flowsa'
    """
    data_format = v['data_format']

    if data_format == 'FBA':
        log.info("Retrieving flowbyactivity for datasource %s in year %s",
                 k, v['year'])
        return flowsa.getFlowByActivity(flowclass=[v['class']],
                                        years=[v['year']],
                                        datasource=k)

    if data_format == 'FBS':
        log.info("Retrieving flowbysector for datasource %s", k)
        return flowsa.getFlowBySector(k)

    if data_format == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource %s", k)
        # the data pull function is looked up by name in this module and
        # receives v['parameters'] as a single argument
        datapull_fxn = getattr(sys.modules[__name__], v["FBS_datapull_fxn"])
        return datapull_fxn(v['parameters'])

    # Unknown format: fail explicitly instead of falling through to a
    # return of a variable that was never assigned.
    log.error("No parquet file found for datasource %s", k)
    raise ValueError(
        "Data format %r not recognized for datasource %s"
        % (data_format, k))
Exemple #6
0
def load_source_dataframe(sourcename, source_dict,
                          download_FBA_if_missing=False):
    """
    Load the source dataframe. Data can be a FlowbyActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param sourcename: str, The datasource name
    :param source_dict: dictionary, The datasource parameters. Must
        contain 'data_format'; the other keys read depend on that format.
    :param download_FBA_if_missing: Bool, forwarded to
        flowsa.getFlowByActivity when the data format is 'FBA'.
        Default is False.
    :return: the dataframe returned by the loader selected by
        source_dict['data_format']
    :raises ValueError: if source_dict['data_format'] is not 'FBA',
        'FBS' or 'FBS_outside_flowsa'
    """
    data_format = source_dict['data_format']

    if data_format == 'FBA':
        # if yaml specifies a geoscale to load, use parameter
        # to filter dataframe; None is passed when no scale is specified
        geo_level = source_dict.get('source_fba_load_scale')
        vLog.info("Retrieving Flow-By-Activity for datasource %s in year %s",
                  sourcename, str(source_dict['year']))
        return flowsa.getFlowByActivity(
            datasource=sourcename,
            year=source_dict['year'],
            flowclass=source_dict['class'],
            geographic_level=geo_level,
            download_FBA_if_missing=download_FBA_if_missing)

    if data_format == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        return flowsa.getFlowBySector(sourcename)

    if data_format == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        # the imported function receives the full parameter dictionary
        datapull_fxn = dynamically_import_fxn(
            sourcename, source_dict["FBS_datapull_fxn"])
        return datapull_fxn(source_dict)

    # Unknown format: fail explicitly instead of falling through to a
    # return of a variable that was never assigned.
    vLog.error(
        "Data format not specified in method "
        "file for datasource %s", sourcename)
    raise ValueError(
        "Data format %r not recognized in method file for datasource %s"
        % (data_format, sourcename))
Exemple #7
0
# __init__.py (flowsa)
# !/usr/bin/env python3
# coding=utf-8
# [email protected]
"""
Example script: retrieve stored data in the FlowBySector format.

flowsa.getFlowBySector is called with a method name:
    :param methodname: string, Name of an available method for the given
                       class. Method files found in
                       flowsa/data/flowbysectormethods
    :return: dataframe in flow by sector format
"""

import flowsa

# load the FlowBySector for the 'Water_national_2015_m1' method
water = flowsa.getFlowBySector('Water_national_2015_m1')
Exemple #8
0
def get_fbs_subset(name):
    """
    Load a FlowBySector with flowsa.getFlowBySector
    :param name: value passed unchanged to flowsa.getFlowBySector
    :return: whatever flowsa.getFlowBySector returns for name
    """
    return flowsa.getFlowBySector(name)
Exemple #9
0
# get_flows_by_sector.py (flowsa)
# !/usr/bin/env python3
# coding=utf-8
"""
Example script: list the available FlowBySector models, retrieve one,
and retrieve its collapsed form.

flowsa.getFlowBySector is called with a method name:
    :param methodname: string, Name of an available method for the given class.
    Method files found in flowsa/data/flowbysectormethods
    :return: dataframe in flow by sector format

"""

import flowsa

# see available FBS models
flowsa.seeAvailableFlowByModels('FBS')

# load FBS from local directory; if it does not exist, the method will run.
# download_FBAs_if_missing=True presumably lets flowsa download any
# missing FBAs - confirm against the flowsa documentation
fbs_water = flowsa.getFlowBySector('Water_national_2015_m1',
                                   download_FBAs_if_missing=True)

# collapse the FBS - output has 'Sector' column instead of
# 'SectorProducedBy' and 'SectorConsumedBy' columns
fbs_water_collapsed = flowsa.collapse_FlowBySector('Water_national_2015_m1')
# __init__.py (flowsa)
# !/usr/bin/env python3
# coding=utf-8
# [email protected]
"""
Example script: retrieve a stored FlowBySector from the local folder and
from the remote server, and retrieve its collapsed form.

flowsa.getFlowBySector is called with a method name:
    :param methodname: string, Name of an available method for the given
                       class. Method files found in
                       flowsa/data/flowbysectormethods
    :return: dataframe in flow by sector format
"""

import flowsa

# load a FBS from local folder
fbs_water = flowsa.getFlowBySector('Water_national_2015_m1')

# load a FBS from remote server
fbs_water_remote = flowsa.getFlowBySector('Water_national_2015_m1',
                                          file_location='remote')

# collapse the FBS - output has 'Sector' column instead of
# 'SectorProducedBy' and 'SectorConsumedBy' columns
fbs_water_collapsed = flowsa.getFlowBySector_collapsed(
    'Water_national_2015_m1')
Exemple #11
0
def test_get_flows_by_sector():
    """
    Call flowsa.getFlowBySector for 'Water_national_2015_m1'; the test
    makes no assertions, so it only fails if the call raises
    """
    method_name = 'Water_national_2015_m1'
    # set function to download any FBAs that are missing
    flowsa.getFlowBySector(method_name, download_FBAs_if_missing=True)