Ejemplo n.º 1
0
    def _min_max_sd(self, forecast, tamconfig, data_sources):
        """Summarize the spread of the TAM sources for every row of the forecast.

           Arguments:
             forecast: the TAM forecast dataframe for all sources.
             tamconfig: the row from self.tamconfig to use; its 'source_until_2014' and
               'source_after_2014' entries name the sources used for the S.D column.
             data_sources: dict of dicts of datasources, as described in tam_ref_data_sources in
               the constructor

           Returns a DataFrame sharing the forecast's index, with columns 'Min', 'Max'
           and 'S.D'.
        """
        early_source = tamconfig['source_until_2014']
        late_source = tamconfig['source_after_2014']

        stats = pd.DataFrame(np.nan, index=forecast.index.copy(), columns=['Min', 'Max', 'S.D'])
        stats.loc[:, 'Min'] = forecast.dropna(axis='columns', how='all').min(axis=1)
        stats.loc[:, 'Max'] = forecast.max(axis=1)

        if forecast.empty:
            # Some solutions provide no data sources for PDS
            stats.loc[:, 'S.D'] = np.nan
            return stats

        # Rows through 2014 and rows from 2015 onward are each matched against their own
        # configured source name.
        periods = ((slice(None, 2014), early_source), (slice(2015, None), late_source))
        for rows, source_name in periods:
            matched = interpolation.matching_data_sources(
                data_sources=data_sources, name=source_name, groups_only=True)
            populated = forecast.loc[rows, matched].dropna(axis='columns', how='all')
            # Excel STDDEV.P is a whole population stddev, ddof=0
            deviation = populated.std(axis=1, ddof=0)
            deviation.name = 'S.D'
            stats.update(deviation)
        return stats
Ejemplo n.º 2
0
    def _min_max_sd(self, forecast, tamconfig, data_sources, region):
        """Summarize the spread of the TAM sources for every row of the forecast.

           Arguments:
             forecast: the TAM forecast dataframe for all sources.
             tamconfig: the row from self.tamconfig to use; its 'source_until_2014' and
               'source_after_2014' entries name the sources used for the S.D column.
             data_sources: dict of dicts of datasources, as described in tam_ref_data_sources in
               the constructor
             region: the name of the region to match, like 'OECD90' or 'Latin America'. Can
               be None, which will use the top level ('World') data_sources.

           Returns a DataFrame sharing the forecast's index, with columns 'Min', 'Max'
           and 'S.D'.
        """
        early_source = tamconfig['source_until_2014']
        late_source = tamconfig['source_after_2014']

        stats = pd.DataFrame(np.nan, index=forecast.index.copy(), columns=['Min', 'Max', 'S.D'])
        stats.loc[:, 'Min'] = forecast.dropna(axis='columns', how='all').min(axis=1)
        stats.loc[:, 'Max'] = forecast.max(axis=1)

        if forecast.empty:
            # Some solutions provide no data sources for PDS
            stats.loc[:, 'S.D'] = np.nan
            return stats

        if region is None:
            region_key = None
        else:
            region_key = f'Region: {region}'

        def record_deviation(rows, source_name):
            """Write the S.D of the sources matching source_name into the given rows."""
            matched = interpolation.matching_data_sources(
                data_sources=data_sources, name=source_name, groups_only=True,
                region_key=region_key)
            populated = forecast.loc[rows, matched].dropna(axis='columns', how='all')
            # Excel STDDEV.P is a whole population stddev, ddof=0
            deviation = populated.std(axis=1, ddof=0)
            deviation.name = 'S.D'
            stats.update(deviation)

        record_deviation(slice(None, 2014), early_source)
        record_deviation(slice(2015, None), late_source)
        return stats
Ejemplo n.º 3
0
    def _low_med_high(self, forecast, min_max_sd, tamconfig, data_sources):
        """Return the selected data sources as Medium, and N stddev away as Low and High.

           Arguments:
             forecast: DataFrame of all of the data sources, source name as the column name.
             min_max_sd: DataFrame whose 'S.D' column supplies the standard deviation per row.
             tamconfig: the row from self.tamconfig to use; 'source_until_2014',
               'source_after_2014', 'low_sd_mult' and 'high_sd_mult' are read from it.
             data_sources: dict of dicts of datasources, as described in tam_ref_data_sources in
               the constructor

           Returns a DataFrame sharing the forecast's index, with columns 'Low', 'Medium'
           and 'High'.
        """
        bands = pd.DataFrame(np.nan, index=forecast.index.copy(),
                             columns=['Low', 'Medium', 'High'])
        if forecast.empty:
            for label in ('Medium', 'Low', 'High'):
                bands.loc[:, label] = np.nan
            return bands

        # Rows through 2014 and rows from 2015 onward are each matched against their own
        # configured source name.
        periods = ((slice(None, 2014), 'source_until_2014'),
                   (slice(2015, None), 'source_after_2014'))
        for rows, config_key in periods:
            matched = interpolation.matching_data_sources(
                data_sources=data_sources, name=tamconfig[config_key], groups_only=False)
            if not matched:
                # Nothing matched: Medium stays NaN for these rows.
                continue
            selected = forecast.loc[rows, matched]
            if len(matched) > 1:
                # In Excel, the Mean computation is:
                # SUM($C521:$Q521)/COUNTIF($C521:$Q521,">0")
                #
                # The intent is to skip sources which are empty, but also means that
                # a source where the real data is 0.0 will not impact the Medium result.
                #
                # See this document for more information:
                # https://docs.google.com/document/d/19sq88J_PXY-y_EnqbSJDl0v9CdJArOdFLatNNUFhjEA/edit#heading=h.yvwwsbvutw2j
                #
                # We're matching the Excel behavior in the initial product. This decision can
                # be revisited later, when matching results from Excel is no longer required.
                # To revert, skip this masking step and average the selection directly.
                selected = selected.mask(selected == 0.0, np.nan)
            medium = selected.mean(axis=1)
            medium.name = 'Medium'
            bands.update(medium)

        spread = min_max_sd.loc[:, 'S.D']
        below = spread * tamconfig['low_sd_mult']
        above = spread * tamconfig['high_sd_mult']
        bands.loc[:, 'Low'] = bands.loc[:, 'Medium'] - below
        bands.loc[:, 'High'] = bands.loc[:, 'Medium'] + above
        return bands
Ejemplo n.º 4
0
 def _low_med_high(self, adoption_data, min_max_sd, adconfig, source,
                   data_sources):
     """Return the selected data sources as Medium, and N stddev away as Low and High.

     Arguments:
       adoption_data: DataFrame of adoption data; the columns returned by
         interpolation.matching_data_sources are selected from it.
       min_max_sd: DataFrame whose 'S.D' column supplies the standard deviation per row.
       adconfig: config row supporting .loc lookups of 'low_sd_mult' and 'high_sd_mult'.
       source: name passed to interpolation.matching_data_sources to pick the columns.
       data_sources: passed through to interpolation.matching_data_sources.

     Returns a DataFrame sharing adoption_data's index, with float columns 'Low',
     'Medium' and 'High'. All three are NaN when no data source matches.
     """
     # Seed the frame with NaN so the columns are float from the start. A frame built
     # without data has object columns, and pandas >= 2.0 keeps that dtype when values
     # are written with .loc[:, col] = ...
     result = pd.DataFrame(np.nan, index=adoption_data.index.copy(),
                           columns=['Low', 'Medium', 'High'])
     columns = interpolation.matching_data_sources(
         data_sources=data_sources, name=source, groups_only=False)
     if columns is None:
         # No matching sources: every column keeps its NaN seed.
         return result

     # In Excel, the Mean computation is:
     # SUM($C46:$Q46)/COUNTIF($C46:$Q46,">0")
     #
     # The intent is to skip sources which are empty, but also means that
     # a source where the real data is 0.0 will not impact the Medium result.
     #
     # See this document for more information:
     # https://docs.google.com/document/d/19sq88J_PXY-y_EnqbSJDl0v9CdJArOdFLatNNUFhjEA/edit#heading=h.yvwwsbvutw2j
     #
     # We're matching the Excel behavior in the initial product. This decision can
     # be revisited later, when matching results from Excel is no longer required.
     # To revert, use:    medium = adoption_data.loc[:, columns].mean(axis=1)
     selected = adoption_data.loc[:, columns]
     medium = selected.mask(lambda f: f == 0.0, np.nan).mean(axis=1)
     spread = min_max_sd.loc[:, 'S.D']
     result.loc[:, 'Medium'] = medium
     result.loc[:, 'Low'] = medium - (spread * adconfig.loc['low_sd_mult'])
     result.loc[:, 'High'] = medium + (spread * adconfig.loc['high_sd_mult'])
     return result
Ejemplo n.º 5
0
    def _min_max_sd(self, adoption_data, source, data_sources, region):
        """Return the min, max, and standard deviation for adoption data.

           Arguments:
             adoption_data: DataFrame of adoption data; Min and Max are taken across all of
               its columns for each row.
             source: name passed to interpolation.matching_data_sources to pick the columns
               used for the standard deviation.
             data_sources: passed through to interpolation.matching_data_sources.
             region: name of the region to match, or None. A non-None value is passed on
               as the key 'Region: <region>'.

           Returns a DataFrame sharing adoption_data's index, with float columns 'Min',
           'Max' and 'S.D'. 'S.D' is NaN when no data source matches.
        """
        # Seed the frame with NaN so the columns are float from the start. A frame built
        # without data has object columns, and pandas >= 2.0 keeps that dtype when values
        # are written with .loc[:, col] = ...
        result = pd.DataFrame(np.nan, index=adoption_data.index.copy(),
                              columns=['Min', 'Max', 'S.D'])
        result.loc[:, 'Min'] = adoption_data.min(axis=1)
        result.loc[:, 'Max'] = adoption_data.max(axis=1)

        region_key = None if region is None else f'Region: {region}'
        columns = interpolation.matching_data_sources(data_sources=data_sources, name=source,
                groups_only=False, region_key=region_key)
        if columns is None:
            # No matching sources: 'S.D' keeps its NaN seed.
            return result

        if len(columns) > 1:
            sd_input = adoption_data.loc[:, columns]
        else:
            # With at most one matching column the deviation is taken across every
            # column of adoption_data instead.
            sd_input = adoption_data
        # Excel STDDEV.P is a whole population stddev, ddof=0
        result.loc[:, 'S.D'] = sd_input.std(axis=1, ddof=0)
        return result