def _clean_multindex(data, time_index_name, interpolation_method=None, extrapolation_method=None, newindex=None, **kwargs):
    """Fill a multi-index frame along its time level, one group at a time.

    Duplicate index entries are collapsed with ``groupby(...).first()``, the
    time level is expanded to the union of ``newindex`` and the time values
    already present, each combination of the remaining levels is handed to
    ``TimeSeries._clean_multindex_helper``, and the result is reindexed on
    the time level with ``newindex``.

    Args:
        data: frame whose index has a level named ``time_index_name``.
        time_index_name: name of the index level holding the time values.
        interpolation_method: forwarded unchanged to the helper.
        extrapolation_method: forwarded unchanged to the helper.
        newindex: target time values; when None, every integer from the
            smallest to the largest time value present in ``data``.
        **kwargs: forwarded unchanged to the helper.

    Returns:
        The frame produced by the final
        ``util.reindex_df_level_with_new_elements`` call on ``newindex``.

    Raises:
        ValueError: if ``time_index_name`` is not an index level of ``data``.
    """
    if time_index_name not in data.index.names:
        raise ValueError('Time_index_name must match one of the index level names if cleaning a multi-index dataframe')

    # TODO: duplicate values should raise an error when doing data validation
    # Until then, duplicated index entries are collapsed here.
    deduped = data.groupby(level=data.index.names).first()
    present_times = deduped.index.get_level_values(time_index_name)

    if newindex is None:
        newindex = np.arange(min(present_times), max(present_times) + 1, dtype=int)
    elif not isinstance(newindex, np.ndarray):
        # newindex is used to calculate extrap_index by a method that takes an array
        newindex = np.array(newindex, dtype=int)

    # Work on the union of requested and existing time values so that data
    # falling outside of newindex can still be used.
    wholeindex = np.array(sorted(set(newindex).union(present_times)), dtype=int)

    expanded = util.reindex_df_level_with_new_elements(deduped, time_index_name, wholeindex)
    other_levels = tuple(name for name in expanded.index.names if name != time_index_name)
    filled = expanded.groupby(level=other_levels).apply(
        TimeSeries._clean_multindex_helper,
        time_index_name=time_index_name,
        newindex=wholeindex,
        interpolation_method=interpolation_method,
        extrapolation_method=extrapolation_method,
        **kwargs)
    return util.reindex_df_level_with_new_elements(filled, time_index_name, newindex)
def _clean_multindex(data, time_index_name, interpolation_method=None, extrapolation_method=None, newindex=None, **kwargs):
    """Fill a multi-index frame along its time level, one group at a time.

    The time level is expanded to the union of ``newindex`` and the time
    values already present, each combination of the remaining levels is
    handed to ``TimeSeries._clean_multindex_helper``, and the result is
    reindexed on the time level with ``newindex``.

    Args:
        data: frame whose index has a level named ``time_index_name``.
        time_index_name: name of the index level holding the time values.
        interpolation_method: forwarded unchanged to the helper.
        extrapolation_method: forwarded unchanged to the helper.
        newindex: target time values; when None, the sorted unique time
            values already present in ``data`` (not a filled min..max range).
        **kwargs: forwarded unchanged to the helper.

    Returns:
        The frame produced by the final
        ``util.reindex_df_level_with_new_elements`` call on ``newindex``.

    Raises:
        ValueError: if ``time_index_name`` is not an index level of ``data``.
    """
    if time_index_name not in data.index.names:
        raise ValueError('Time_index_name must match one of the index level names if cleaning a multi-index dataframe')

    present_times = data.index.get_level_values(time_index_name)

    if newindex is None:
        newindex = np.array(sorted(set(present_times)), dtype=int)
    elif not isinstance(newindex, np.ndarray):
        # newindex is used to calculate extrap_index by a method that takes an array
        newindex = np.array(newindex, dtype=int)

    # Work on the union of requested and existing time values so that data
    # falling outside of newindex can still be used.
    wholeindex = np.array(sorted(set(newindex).union(present_times)), dtype=int)

    # Add the missing time values to the frame before filling.
    expanded = util.reindex_df_level_with_new_elements(data, time_index_name, wholeindex)
    other_levels = tuple(name for name in expanded.index.names if name != time_index_name)
    filled = expanded.groupby(level=other_levels).apply(
        TimeSeries._clean_multindex_helper,
        time_index_name=time_index_name,
        newindex=wholeindex,
        interpolation_method=interpolation_method,
        extrapolation_method=extrapolation_method,
        **kwargs)
    return util.reindex_df_level_with_new_elements(filled, time_index_name, newindex)
def _clean_multindex(data, time_index_name, interpolation_method=None, extrapolation_method=None, newindex=None, **kwargs):
    """Clean a multi-index frame along the level named ``time_index_name``.

    Steps, in order: collapse duplicate index entries with
    ``groupby(...).first()``; widen the time level to cover both ``newindex``
    and the existing time values; apply ``TimeSeries._clean_multindex_helper``
    to each group formed by the non-time levels; reindex the time level with
    ``newindex``.

    Args:
        data: frame whose index has a level named ``time_index_name``.
        time_index_name: name of the index level holding the time values.
        interpolation_method: passed through to the helper.
        extrapolation_method: passed through to the helper.
        newindex: target time values; None means the integer range from the
            minimum to the maximum existing time value, inclusive.
        **kwargs: passed through to the helper.

    Returns:
        The result of the last ``util.reindex_df_level_with_new_elements``
        call, which receives ``newindex``.

    Raises:
        ValueError: if ``time_index_name`` is not an index level of ``data``.
    """
    level_names = data.index.names
    if time_index_name not in level_names:
        raise ValueError(
            'Time_index_name must match one of the index level names if cleaning a multi-index dataframe'
        )

    # TODO: duplicate values should raise an error when doing data validation
    # For now duplicates are removed by keeping groupby(...).first().
    unique_rows = data.groupby(level=data.index.names).first()
    time_values = unique_rows.index.get_level_values(time_index_name)

    if newindex is None:
        first_time, last_time = min(time_values), max(time_values)
        newindex = np.arange(first_time, last_time + 1, dtype=int)
    elif not isinstance(newindex, np.ndarray):
        # newindex is used to calculate extrap_index by a method that takes an array
        newindex = np.array(newindex, dtype=int)

    # Include existing time values too, so data that falls outside of
    # newindex can still be used.
    combined_times = set(newindex).union(time_values)
    wholeindex = np.array(sorted(combined_times), dtype=int)

    widened = util.reindex_df_level_with_new_elements(
        unique_rows, time_index_name, wholeindex)
    group_levels = tuple(
        level for level in widened.index.names if level != time_index_name)
    cleaned = widened.groupby(level=group_levels).apply(
        TimeSeries._clean_multindex_helper,
        time_index_name=time_index_name,
        newindex=wholeindex,
        interpolation_method=interpolation_method,
        extrapolation_method=extrapolation_method,
        **kwargs)
    return util.reindex_df_level_with_new_elements(
        cleaned, time_index_name, newindex)
def _setup_and_validate(self):
    """Initialise the object and build ``self.values`` from ``self.raw_values``.

    When ``self.raw_values`` is None, ``_setup_zero_constraints()`` is called
    and nothing else happens. Otherwise the GAUs are validated, the raw
    values are cleaned over ``cfg.supply_years``, both geography levels are
    reindexed with ``cfg.dispatch_geographies``, NaNs are replaced by 0 and
    the frame is sorted by its index.
    """
    Abstract.__init__(self, self.id, primary_key='id', data_id_key='parent_id')

    if self.raw_values is None:
        self._setup_zero_constraints()
        return

    self._validate_gaus()
    self.values = self.clean_timeseries(
        attr='raw_values',
        inplace=False,
        time_index=cfg.supply_years,
        time_index_name='year',
        interpolation_method=self.interpolation_method,
        extrapolation_method=self.extrapolation_method)

    # fill in any missing combinations of geographies
    for level_name in ('geography_from', 'geography_to'):
        self.values = util.reindex_df_level_with_new_elements(
            self.values, level_name, cfg.dispatch_geographies)
    self.values = self.values.fillna(0)
    # DataFrame.sort() was removed from pandas; with no columns given it
    # sorted by index labels, which is exactly what sort_index() does.
    self.values = self.values.sort_index()
def _add_missing_geographies(self, df, current_geography, current_data_type):
    """Pad ``df`` with the full set of elements for ``current_geography``.

    The frame is returned untouched unless ``current_data_type`` is
    ``'total'`` and the number of elements found on the geography level
    differs from ``len(cfg.geo.geographies_unfiltered[current_geography])``.
    In that case the level is reindexed with the unfiltered element list
    using ``fill_value=np.nan``.
    """
    present_count = len(util.get_elements_from_level(df, current_geography))
    expected_elements = cfg.geo.geographies_unfiltered[current_geography]

    if current_data_type != 'total' or present_count == len(expected_elements):
        return df

    # Only totals are padded; the original rationale is that other data
    # types cannot simply be filled in. Note the fill value here is NaN.
    return util.reindex_df_level_with_new_elements(
        df, current_geography, expected_elements, fill_value=np.nan)
def _clean_multindex(data, time_index_name, interpolation_method=None, extrapolation_method=None, newindex=None, **kwargs):
    """Clean a multi-index frame along the level named ``time_index_name``.

    Duplicate index entries are collapsed with ``groupby(...).first()``, the
    time level is reindexed with ``newindex``, each combination of the
    remaining levels is handed to ``TimeSeries._clean_multindex_helper``, and
    finally the time values that were in the original level but not in
    ``newindex`` are dropped.

    Args:
        data: frame with a MultiIndex containing ``time_index_name``.
        time_index_name: name of the index level holding the time values.
        interpolation_method: forwarded unchanged to the helper.
        extrapolation_method: forwarded unchanged to the helper.
        newindex: target time values; when None, every integer from the
            smallest to the largest value of the time level.
        **kwargs: forwarded unchanged to the helper.

    Returns:
        The cleaned frame after the surplus time values have been dropped.

    Raises:
        ValueError: if ``time_index_name`` is not an index level of ``data``.
    """
    if time_index_name not in data.index.names:
        raise ValueError(
            'Time_index_name must match one of the index level names if cleaning a multi-index dataframe')

    # remove duplicates
    deduped = data.groupby(level=data.index.names).first()
    time_levels = deduped.index.levels[deduped.index.names.index(time_index_name)]

    if newindex is None:
        level_values = np.array(time_levels, dtype=int)
        newindex = np.arange(min(level_values), max(level_values) + 1, dtype=int)
    elif not isinstance(newindex, np.ndarray):
        # newindex is used to calculate extrap_index by a method that takes an array
        newindex = np.array(newindex, dtype=int)

    # Time values present in the data but not requested; removed at the end.
    surplus_times = list(np.setdiff1d(time_levels, newindex))

    # Add new levels to data for missing time indices
    expanded = util.reindex_df_level_with_new_elements(deduped, time_index_name, newindex)
    other_levels = tuple(name for name in expanded.index.names if name != time_index_name)
    cleaned = expanded.groupby(level=other_levels).apply(
        TimeSeries._clean_multindex_helper,
        time_index_name=time_index_name,
        newindex=newindex,
        interpolation_method=interpolation_method,
        extrapolation_method=extrapolation_method,
        **kwargs)
    cleaned.drop(surplus_times, level=time_index_name, inplace=True)
    return cleaned
def calculate(self, vintages, years):
    """Store the vintages/years, remap, and pad the primary geography level.

    After ``remap`` runs against ``GeoMapper.supply_primary_geography``, a
    ``supply_node`` index level holding ``self.supply_node`` is appended to
    ``self.values``, and the primary geography level is reindexed with
    ``GeoMapper.geography_to_gau[...]`` using a fill value of 0.0.
    """
    self.vintages = vintages
    self.years = years

    self.remap(converted_geography=GeoMapper.supply_primary_geography)

    # Tag every row with this object's supply node and make it an index level.
    self.values['supply_node'] = self.supply_node
    self.values.set_index('supply_node', append=True, inplace=True)

    geography = GeoMapper.supply_primary_geography
    self.values = util.reindex_df_level_with_new_elements(
        self.values,
        geography,
        GeoMapper.geography_to_gau[geography],
        fill_value=0.0)
def _add_missing_level_elements_to_foreign_gaus(df, current_geography):
    """Reindex every non-geography, non-time level with its own elements.

    Each index level other than ``current_geography`` and the time level
    reported by ``GeoMapper._get_df_time_index_name`` is passed to
    ``util.reindex_df_level_with_new_elements`` together with the unique
    values it already holds. NaNs in the result are replaced with 0 and the
    frame is sorted by its index.

    Args:
        df: frame to expand (assumed to be a DataFrame -- TODO confirm).
        current_geography: name of the geography index level to skip.

    Returns:
        The expanded, zero-filled, index-sorted frame.
    """
    y_or_v = GeoMapper._get_df_time_index_name(df)
    skipped_levels = (current_geography, y_or_v)

    # df.index.names is evaluated once, before df is rebound inside the loop.
    for index_name in df.index.names:
        if index_name in skipped_levels:
            continue
        needed_elements = list(set(df.index.get_level_values(index_name)))
        df = util.reindex_df_level_with_new_elements(df, index_name, needed_elements)

    # DataFrame.sort() was removed from pandas; with no columns given it
    # sorted by index labels, which is exactly what sort_index() does.
    return df.fillna(0).sort_index()
def _add_missing_level_elements_to_foreign_gaus(df, current_geography):
    """Expand each remaining index level to all of its existing elements.

    The geography level ``current_geography`` and the time level returned by
    ``GeoMapper._get_df_time_index_name`` are left alone. Every other level
    is reindexed, via ``util.reindex_df_level_with_new_elements``, with the
    unique values found on that level. Afterwards NaNs become 0 and the
    frame is sorted by its index.

    Args:
        df: frame to expand (assumed to be a DataFrame -- TODO confirm).
        current_geography: name of the geography index level to skip.

    Returns:
        The expanded, zero-filled, index-sorted frame.
    """
    y_or_v = GeoMapper._get_df_time_index_name(df)

    # The list of names is taken once, before df is rebound inside the loop.
    for index_name in df.index.names:
        if index_name == current_geography or index_name == y_or_v:
            continue
        needed_elements = list(set(df.index.get_level_values(index_name)))
        df = util.reindex_df_level_with_new_elements(
            df, index_name, needed_elements)

    # sort_index() replaces the removed DataFrame.sort(); the latter sorted
    # by index labels when called without columns, so the result is the same.
    df = df.fillna(0).sort_index()
    return df
def standardize_time_across_timezones(self, attr='values', inplace=True):
    """Reindex the frame stored in ``attr`` onto one common hourly index.

    ``self.final_dates_index`` is rebuilt as an hourly range that starts at
    ``self.active_dates_index[0]``, has the same length as
    ``self.active_dates_index`` and uses ``self.dispatch_outputs_timezone``.
    A copy of the frame is reindexed on ``'weather_datetime'`` with it, and
    gaps are back-filled within each group of the remaining levels, then
    forward-filled.

    Args:
        attr: name of the attribute holding the frame to standardize.
        inplace: when True the result is stored back on ``attr`` and None is
            returned; otherwise the result is returned.

    Returns:
        The standardized frame when ``inplace`` is False, else None.
    """
    self.final_dates_index = pd.date_range(
        self.active_dates_index[0],
        periods=len(self.active_dates_index),
        freq='H',
        tz=self.dispatch_outputs_timezone)

    source = getattr(self, attr)
    df = util.reindex_df_level_with_new_elements(
        source.copy(), 'weather_datetime', self.final_dates_index)

    # Group by the levels of the frame actually being processed. The names
    # used to be read from self.values, which is wrong whenever attr points
    # at a frame with different index levels.
    levels = [n for n in source.index.names if n != 'weather_datetime']

    # NOTE(review): only the bfill is applied per group; the trailing ffill
    # runs over the whole frame, as in the original code.
    df = df.groupby(level=levels).fillna(method='bfill').fillna(method='ffill')

    if inplace:
        setattr(self, attr, df)
        return None
    return df
def _setup_and_validate(self):
    """Build ``self.values`` from ``self.raw_values``.

    When ``self.raw_values`` is None, ``_setup_zero_constraints()`` is called
    and an empty string is returned. Otherwise the raw values are cleaned
    over ``cfg.supply_years``, the ``gau_from`` and ``gau_to`` levels are
    reindexed with ``GeoMapper.dispatch_geographies``, NaNs are replaced by 0
    and the frame is sorted by its index (nothing is returned on this path).
    """
    if self.raw_values is None:
        self._setup_zero_constraints()
        return ""

    # self._validate_gaus()  (validation is disabled in this revision)
    self.values = self.clean_timeseries(
        attr='raw_values',
        inplace=False,
        time_index=cfg.supply_years,
        time_index_name='year',
        interpolation_method=self.interpolation_method,
        extrapolation_method=self.extrapolation_method)

    # fill in any missing combinations of geographies
    for gau_level in ('gau_from', 'gau_to'):
        self.values = util.reindex_df_level_with_new_elements(
            self.values, gau_level, GeoMapper.dispatch_geographies)

    self.values = self.values.fillna(0).sort_index()
def calculate(self, vintages, years):
    """Store the vintages/years, remap, and pad the primary geography level.

    ``self.input_type`` is set to ``'intensity'`` before ``remap()`` is
    called. A ``supply_node`` index level holding ``self.supply_node_id`` is
    then appended to ``self.values``, and the ``cfg.primary_geography`` level
    is reindexed with ``cfg.geo.geographies[...]`` using a fill value of 0.0.
    """
    self.vintages = vintages
    self.years = years
    self.input_type = 'intensity'

    self.remap()

    # Tag every row with this object's supply node and make it an index level.
    self.values['supply_node'] = self.supply_node_id
    self.values.set_index('supply_node', append=True, inplace=True)

    geography = cfg.primary_geography
    self.values = util.reindex_df_level_with_new_elements(
        self.values,
        geography,
        cfg.geo.geographies[geography],
        fill_value=0.0)
def incorporate_foreign_gaus(self, df, current_geography, data_type, map_key, keep_oth_index_over_oth_gau=False, zero_out_negatives=True):
    """Fold foreign GAUs found in ``df`` into a new, combined geography.

    Args:
        df: frame indexed by ``current_geography`` plus other levels.
        current_geography: name of the geography level of ``df``.
        data_type: ``'total'`` or ``'intensity'``; totals are netted out per
            foreign GAU, intensities only trigger a debug log message.
        map_key: forwarded to ``_update_dataframe_totals_after_foreign_gau``.
        keep_oth_index_over_oth_gau: when index levels contain NaN, prefer
            dropping the foreign GAUs over dropping those levels.
        zero_out_negatives: forwarded to
            ``_update_dataframe_totals_after_foreign_gau``.

    Returns:
        ``(df, geography_name)``. The geography name is ``current_geography``
        when nothing was incorporated, otherwise the newly built name.

    Raises:
        ValueError: if a GAU impacted by one foreign GAU is itself foreign.
    """
    _native, _current, foreign_gaus = self.get_native_current_foreign_gaus(df, current_geography)

    # Nothing to do without foreign gaus, or when they are switched off.
    if not (foreign_gaus and cfg.include_foreign_gaus):
        return df, current_geography

    time_level = GeoMapper._get_df_time_index_name(df)

    nan_mask = [np.isnan(row) for row in df.index.get_values()]
    nan_level_positions = set(np.nonzero(nan_mask)[1])
    levels_with_nan = [df.index.names[i] for i in nan_level_positions]

    # An index level with nan typically indicates that one of the foreign
    # gaus didn't have all the index levels. Two options then remain:
    # (1) ignore the foreign gau, (2) get rid of the other index.
    if levels_with_nan and (keep_oth_index_over_oth_gau or data_type == 'intensity'):
        return self.filter_foreign_gaus(df, current_geography), current_geography

    assert time_level not in levels_with_nan
    assert current_geography not in levels_with_nan
    # levels with nan have to be eliminated before moving on
    df = util.remove_df_levels(df, levels_with_nan)

    # add missing level indices for foreign gaus; this must be done before
    # the years are filled in because a fill value of zero is used
    df = self._add_missing_level_elements_to_foreign_gaus(df, current_geography)

    # all index level combinations need all years for this to work correctly
    native_only = self.filter_foreign_gaus(df, current_geography)
    reference_years = sorted(set(native_only.index.get_level_values(time_level).values))
    df = util.reindex_df_level_with_new_elements(df, time_level, reference_years)

    base_gaus = np.array(self.values.index.get_level_values(current_geography), dtype=int)
    for foreign_gau in foreign_gaus:
        foreign_geography = self.gau_to_geography[foreign_gau]
        in_foreign_gau = self.values.index.get_level_values(foreign_geography) == foreign_gau
        positions = np.nonzero(in_foreign_gau)[0]

        # Record which gaus are displaced before overwriting them in place.
        impacted_gaus = list(set(base_gaus[positions]))
        base_gaus[positions] = foreign_gau

        for impacted in impacted_gaus:
            if impacted in foreign_gaus:
                raise ValueError('foreign gaus in the database cannot overlap geographically')

        # if the data_type is a total, the total needs to be netted out
        if data_type == 'total':
            df = self._update_dataframe_totals_after_foreign_gau(
                df, current_geography, foreign_geography, impacted_gaus,
                foreign_gau, map_key, zero_out_negatives)
        elif data_type == 'intensity':
            logging.debug('Foreign GAUs with intensities is not yet implemented, totals will not be conserved')

    assert not any([any(np.isnan(row)) for row in df.index.get_values()])

    new_geography_name = self.make_new_geography_name(current_geography, list(foreign_gaus))
    df.index = df.index.rename(new_geography_name, level=current_geography)
    if new_geography_name not in self.geographies:
        self.add_new_geography(new_geography_name, base_gaus)
    return df, new_geography_name
def process_shape(self, active_dates_index=None, time_slice_elements=None):
    """Build ``self.values`` for the active dates and run the shape pipeline.

    Args:
        active_dates_index: dates the shape must cover; required.
        time_slice_elements: optional precomputed time-slice elements; when
            None they come from ``Shapes.create_time_slice_elements``.

    Raises:
        ValueError: if ``active_dates_index`` is None, or if the resulting
            values still contain NaN for either shape type.
    """
    # Validate first: taking len() of None used to raise a TypeError before
    # this intended error could ever be reached.
    if active_dates_index is None:
        raise ValueError('processing a shape requires an active date index')

    self.num_active_years = len(active_dates_index)/8766.
    self.active_dates_index = active_dates_index
    self.time_slice_elements = Shapes.create_time_slice_elements(active_dates_index) if time_slice_elements is None else time_slice_elements

    if self.shape_type == 'weather date':
        # this step is slow, consider replacing
        self.values = util.reindex_df_level_with_new_elements(self.raw_values, 'weather_datetime', active_dates_index)
        if self.values.isnull().values.any():
            raise ValueError('Weather data did not give full coverage of the active dates')

    elif self.shape_type == 'time slice':
        self.values = self.create_empty_shape_data()

        non_time_elements_in_levels = [list(util.get_elements_from_level(self.values, e)) for e in self._non_time_keys]
        time_elements_in_levels = [list(util.get_elements_from_level(self.values, e)) for e in self._active_time_keys]
        # Loop-invariant: the allowed elements for each key position.
        allowed_elements = non_time_elements_in_levels + time_elements_in_levels

        for ind, value in self.raw_values.iterrows():
            non_time_portion = [ind[self._non_time_dict[e]] for e in self._non_time_keys]
            time_portion = [ind[self._active_time_dict[e]] for e in self._active_time_keys]
            # Skip raw rows whose keys are not present in the empty shape.
            if not np.all([s in l for s, l in zip(non_time_portion + time_portion, allowed_elements)]):
                continue

            indexer = tuple(non_time_portion + time_portion + [slice(None)])
            if self.shape_unit_type == 'energy':
                # Spread the value evenly over the rows matched by the slice.
                len_slice = len(self.values.loc[indexer])
                self.values.loc[indexer] = value[0]/float(len_slice)*self.num_active_years
            elif self.shape_unit_type == 'power':
                self.values.loc[indexer] = value[0]

        if self.values.isnull().values.any():
            raise ValueError('Shape time slice data did not give full coverage of the active dates')
        # reindex to remove the helper columns
        self.values.index = self.values.index.droplevel(self._active_time_keys)

    self.values = self.values.swaplevel('weather_datetime', -1).sort_index()
    self.geomap_to_time_zone()
    self.localize_shapes()
    self.standardize_time_across_timezones()
    self.geomap_to_primary_geography()
    self.sum_over_time_zone()
    self.normalize()
    self.add_timeshift_type()
def standardize_shape_type(self, raw_values):
    """Convert ``raw_values`` into a frame covering the active dates.

    For ``'weather date'`` shapes the frame is reindexed on
    ``'weather_datetime'`` with ``self.active_dates_index``; remaining gaps
    are interpolated per group and then forward/back filled. For
    ``'time slice'`` shapes the raw values are merged onto the empty shape
    frame, energy shapes are converted to power, and the helper time levels
    (all active time keys except ``'hydro_year'``) are dropped.

    Args:
        raw_values: frame holding the raw shape data.

    Returns:
        The standardized frame with ``'weather_datetime'`` swapped to the
        last index level and sorted by index.

    Raises:
        ValueError: if the data does not fully cover the active dates, if an
            energy shape uses a ``'week'`` time slice, or if
            ``self.shape_type`` is neither supported value.
    """
    if self.shape_type == 'weather date':
        # this step is slow, consider replacing
        final_data = util.reindex_df_level_with_new_elements(
            raw_values, 'weather_datetime', self.active_dates_index)
        if final_data.isnull().values.any():
            # do some interpolation to fill missing values; if that still
            # doesn't remove the NaNs, we raise an error
            other_levels = [
                name for name in final_data.index.names
                if name != 'weather_datetime'
            ]
            final_data = final_data.groupby(level=other_levels).apply(
                pd.DataFrame.interpolate).ffill().bfill()
            if final_data.isnull().values.any():
                raise ValueError(
                    'Weather data for shape {} did not give full coverage of the active dates:\n {}'
                    .format(self.name, final_data[final_data.isnull().values]))

    elif self.shape_type == 'time slice':
        final_data = self.create_empty_shape_data(raw_values)
        final_data = pd.merge(final_data, raw_values.reset_index(), how='left')
        final_data = final_data.set_index(
            [c for c in final_data.columns if c != 'value']).sort_index()

        if self.shape_unit_type == 'energy':
            if 'week' in self._active_time_keys:
                raise ValueError(
                    'Shape unit type energy with week timeslice is not recommended due to edge effects'
                )
            final_data = self.convert_energy_to_power(final_data)

        if final_data.isnull().values.any():
            # print() with one argument behaves the same on Python 2 and 3;
            # the former print statement was a syntax error on Python 3.
            print(final_data[final_data.isnull().values])
            raise ValueError(
                'Shape {} time slice data did not give full coverage of the active dates.'
                .format(self.name))

        # reindex to remove the helper columns
        active_time_keys_keep_hydro_year = list(
            set(self._active_time_keys) - set(['hydro_year']))
        final_data.index = final_data.index.droplevel(
            active_time_keys_keep_hydro_year)
        # drop any duplicates
        final_data = final_data.groupby(
            level=final_data.index.names).first()

    else:
        raise ValueError(
            '{} shape_type must be "weather date" or "time slice", not {}'.
            format(self.name, self.shape_type))

    final_data = final_data.swaplevel('weather_datetime', -1).sort_index()
    return final_data
def standardize_time_across_timezones(self, df):
    """Reindex ``df`` onto the active dates expressed at one fixed UTC offset.

    The offset is the ``dispatch_outputs_timezone`` UTC offset plus its DST
    component, both evaluated at 2015-01-01 and expressed in minutes. A copy
    of ``df`` is reindexed on ``'weather_datetime'`` with the resulting
    index, back-filled within each group of the remaining levels, then
    forward-filled, and rows that are still null are removed.

    Args:
        df: frame with a ``'weather_datetime'`` index level; must not
            contain nulls (checked with an assert).

    Returns:
        The standardized frame.
    """
    output_tz = pytz.timezone(cfg.getParam('dispatch_outputs_timezone'))
    reference_date = DT.datetime(2015, 1, 1)
    offset_minutes = (output_tz.utcoffset(reference_date) + output_tz.dst(reference_date)).total_seconds() / 60.
    fixed_offset_index = pd.DatetimeIndex(
        self.active_dates_index_unique, tz=pytz.FixedOffset(offset_minutes))

    # With a hydro year the reindex below can introduce NaNs, which are
    # removed afterwards; the input itself must be free of them.
    assert not df.isnull().any().any()

    shifted = util.reindex_df_level_with_new_elements(
        df.copy(), 'weather_datetime', fixed_offset_index)
    other_levels = [name for name in df.index.names if name != 'weather_datetime']

    shifted = shifted.groupby(level=other_levels).fillna(method='bfill')
    shifted = shifted.fillna(method='ffill')

    still_null = shifted.isnull().values
    return shifted[~still_null]
def process_shape(self):
    """Build ``self.values`` for the active dates and run the shape pipeline.

    ``self.raw_values`` is deleted at the end because it can be very large.

    Raises:
        ValueError: if time-slice data does not fully cover the active dates.
    """
    logging.info(' shape: ' + self.name)
    self.num_active_years = num_active_years(self.active_dates_index)

    if self.shape_type == 'weather date':
        # this step is slow, consider replacing
        self.values = util.reindex_df_level_with_new_elements(self.raw_values, 'weather_datetime', self.active_dates_index)
        self.values = self.values.replace(np.nan, 0)
        # NOTE(review): every NaN was just replaced with 0, so this coverage
        # check can no longer fire. Kept so it becomes active again if the
        # replace above is removed -- confirm which behaviour is intended.
        if self.values.isnull().values.any():
            raise ValueError('Weather data for shape {} did not give full coverage of the active dates'.format(self.name))

    elif self.shape_type == 'time slice':
        self.values = self.create_empty_shape_data()

        non_time_elements_in_levels = [list(util.get_elements_from_level(self.values, e)) for e in self._non_time_keys]
        time_elements_in_levels = [list(util.get_elements_from_level(self.values, e)) for e in self._active_time_keys]
        # Loop-invariant: the allowed elements for each key position.
        allowed_elements = non_time_elements_in_levels + time_elements_in_levels

        for ind, value in self.raw_values.iterrows():
            non_time_portion = [ind[self._non_time_dict[e]] for e in self._non_time_keys]
            time_portion = [ind[self._active_time_dict[e]] for e in self._active_time_keys]
            # Skip raw rows whose keys are not present in the empty shape.
            if not np.all([s in l for s, l in zip(non_time_portion + time_portion, allowed_elements)]):
                continue

            indexer = tuple(non_time_portion + time_portion + [slice(None)])
            if self.shape_unit_type == 'energy':
                # Spread the value evenly over the rows matched by the slice.
                len_slice = len(self.values.loc[indexer])
                self.values.loc[indexer] = value[0]/float(len_slice)*self.num_active_years
            elif self.shape_unit_type == 'power':
                self.values.loc[indexer] = value[0]

        if self.values.isnull().values.any():
            raise ValueError('Shape time slice data did not give full coverage of the active dates')
        # reindex to remove the helper columns
        self.values.index = self.values.index.droplevel(self._active_time_keys)

    # DataFrame.sort() was removed from pandas; with no columns given it
    # sorted by index labels, which is exactly what sort_index() does.
    self.values = cfg.geo.filter_extra_geos_from_df(self.values.swaplevel('weather_datetime', -1).sort_index())
    self.geomap_to_time_zone()
    self.localize_shapes()
    self.standardize_time_across_timezones()
    self.geomap_to_primary_geography()
    self.sum_over_time_zone()
    self.normalize()
    self.add_timeshift_type()
    # raw values can be very large, so we delete it in this one case
    del self.raw_values
def process_shape(self, active_dates_index=None, time_slice_elements=None):
    """Build ``self.values`` for the active dates and run the shape pipeline.

    Args:
        active_dates_index: dates the shape must cover; required.
        time_slice_elements: optional precomputed time-slice elements; when
            None they come from ``Shapes.create_time_slice_elements``.

    Raises:
        ValueError: if ``active_dates_index`` is None, or if the resulting
            values still contain NaN for either shape type.
    """
    # Validate first: taking len() of None used to raise a TypeError before
    # this intended error could ever be reached.
    if active_dates_index is None:
        raise ValueError('processing a shape requires an active date index')

    self.num_active_years = len(active_dates_index)/8766.
    self.active_dates_index = active_dates_index
    self.time_slice_elements = Shapes.create_time_slice_elements(active_dates_index) if time_slice_elements is None else time_slice_elements

    if self.shape_type == 'weather date':
        self.convert_index_to_datetime('raw_values', 'weather_datetime')
        # NOTE(review): an older comment here spoke of reindexing with a day
        # on either side for time zone shifts, but the reindex below uses
        # active_dates_index as given -- confirm whether padding is needed.
        # this step is slow, consider replacing
        self.values = util.reindex_df_level_with_new_elements(self.raw_values, 'weather_datetime', active_dates_index)
        if self.values.isnull().values.any():
            raise ValueError('Weather data did not give full coverage of the active dates')

    elif self.shape_type == 'time slice':
        self.values = self.create_empty_shape_data()

        for ind, value in self.raw_values.iterrows():
            non_time_portion = [ind[self._non_time_dict[e]] for e in self._non_time_keys]
            time_portion = [ind[self._active_time_dict[e]] for e in self._active_time_keys]
            indexer = tuple(non_time_portion + time_portion + [slice(None)])
            if self.shape_unit_type == 'energy':
                # Spread the value evenly over the rows matched by the slice.
                len_slice = len(self.values.loc[indexer])
                self.values.loc[indexer] = value[0]/float(len_slice)*self.num_active_years
            elif self.shape_unit_type == 'power':
                self.values.loc[indexer] = value[0]

        if self.values.isnull().values.any():
            raise ValueError('Shape time slice data did not give full coverage of the active dates')
        # reindex to remove the helper columns
        self.values.index = self.values.index.droplevel(self._active_time_keys)
        # The bare self.values.sort() call discarded its result (the frame
        # method is not in-place), so nothing was sorted; keep the result.
        self.values = self.values.sort_index()

    self.geomap_to_time_zone()
    self.localize_shapes()
    self.geomap_to_primary_geography()
    self.sum_over_time_zone()
    self.normalize()
def process_shape(self, active_dates_index=None, time_slice_elements=None):
    """Build ``self.values`` for the active dates and run the shape pipeline.

    Args:
        active_dates_index: dates the shape must cover; required.
        time_slice_elements: optional precomputed time-slice elements; when
            None they come from ``Shapes.create_time_slice_elements``.

    Raises:
        ValueError: if ``active_dates_index`` is None, or if the resulting
            values still contain NaN for either shape type.
    """
    # The None check must come before len(): previously len(None) raised a
    # TypeError and this ValueError was unreachable.
    if active_dates_index is None:
        raise ValueError(
            'processing a shape requires an active date index')

    self.num_active_years = len(active_dates_index) / 8766.
    self.active_dates_index = active_dates_index
    self.time_slice_elements = Shapes.create_time_slice_elements(
        active_dates_index
    ) if time_slice_elements is None else time_slice_elements

    if self.shape_type == 'weather date':
        # this step is slow, consider replacing
        self.values = util.reindex_df_level_with_new_elements(
            self.raw_values, 'weather_datetime', active_dates_index)
        if self.values.isnull().values.any():
            raise ValueError(
                'Weather data did not give full coverage of the active dates'
            )

    elif self.shape_type == 'time slice':
        self.values = self.create_empty_shape_data()

        non_time_elements_in_levels = [
            list(util.get_elements_from_level(self.values, e))
            for e in self._non_time_keys
        ]
        time_elements_in_levels = [
            list(util.get_elements_from_level(self.values, e))
            for e in self._active_time_keys
        ]
        # Loop-invariant: the allowed elements for each key position.
        allowed_elements = non_time_elements_in_levels + time_elements_in_levels

        for ind, value in self.raw_values.iterrows():
            non_time_portion = [
                ind[self._non_time_dict[e]] for e in self._non_time_keys
            ]
            time_portion = [
                ind[self._active_time_dict[e]]
                for e in self._active_time_keys
            ]
            # Skip raw rows whose keys are not present in the empty shape.
            if not np.all([
                    s in l for s, l in zip(non_time_portion + time_portion,
                                           allowed_elements)
            ]):
                continue

            indexer = tuple(non_time_portion + time_portion + [slice(None)])
            if self.shape_unit_type == 'energy':
                # Spread the value evenly over the rows matched by the slice.
                len_slice = len(self.values.loc[indexer])
                self.values.loc[indexer] = value[0] / float(
                    len_slice) * self.num_active_years
            elif self.shape_unit_type == 'power':
                self.values.loc[indexer] = value[0]

        if self.values.isnull().values.any():
            raise ValueError(
                'Shape time slice data did not give full coverage of the active dates'
            )
        # reindex to remove the helper columns
        self.values.index = self.values.index.droplevel(
            self._active_time_keys)

    self.values = self.values.swaplevel('weather_datetime', -1).sort_index()
    self.geomap_to_time_zone()
    self.localize_shapes()
    self.standardize_time_across_timezones()
    self.geomap_to_primary_geography()
    self.sum_over_time_zone()
    self.normalize()
    self.add_timeshift_type()
def incorporate_foreign_gaus(self, df, current_geography, data_type, map_key, keep_oth_index_over_oth_gau=False, zero_out_negatives=True):
    """Fold foreign GAUs found in ``df`` into a new, combined geography.

    Args:
        df: frame indexed by ``current_geography`` plus other levels.
        current_geography: name of the geography level of ``df``.
        data_type: ``'total'`` or ``'intensity'``; totals are netted out per
            foreign GAU, intensities only trigger a warning log message.
        map_key: forwarded to ``_update_dataframe_totals_after_foreign_gau``.
        keep_oth_index_over_oth_gau: when index levels contain NaN, prefer
            dropping the foreign GAUs over dropping those levels.
        zero_out_negatives: forwarded to
            ``_update_dataframe_totals_after_foreign_gau``.

    Returns:
        ``(df, geography_name)``. The geography name is ``current_geography``
        when nothing was incorporated, otherwise the newly built name.

    Raises:
        ValueError: if a GAU impacted by one foreign GAU is itself foreign.
    """
    gau_groups = self.get_native_current_foreign_gaus(df, current_geography)
    _native_gaus, _current_gaus, foreign_gaus = gau_groups

    # Bail out when there are no foreign gaus or they are disabled.
    if not (foreign_gaus and cfg.include_foreign_gaus):
        return df, current_geography

    y_or_v = GeoMapper._get_df_time_index_name(df)

    row_nan_flags = [np.isnan(row) for row in df.index.get_values()]
    index_with_nans = [
        df.index.names[position]
        for position in set(np.nonzero(row_nan_flags)[1])
    ]

    # A nan in an index level typically indicates that one of the foreign
    # gaus didn't have all the index levels. The choices are then to
    # (1) ignore the foreign gau or (2) get rid of the other index.
    drop_foreign_instead = keep_oth_index_over_oth_gau or data_type == 'intensity'
    if index_with_nans and drop_foreign_instead:
        return self.filter_foreign_gaus(df, current_geography), current_geography

    assert y_or_v not in index_with_nans
    assert current_geography not in index_with_nans
    # levels with nan have to be eliminated before moving on
    df = util.remove_df_levels(df, index_with_nans)

    # add missing level indices for foreign gaus; this must happen before
    # the years are filled in because a fill value of zero is used
    df = self._add_missing_level_elements_to_foreign_gaus(
        df, current_geography)

    # every index level combination needs all years for this to work
    df_no_foreign_gaus = self.filter_foreign_gaus(df, current_geography)
    df_years = sorted(
        set(df_no_foreign_gaus.index.get_level_values(y_or_v).values))
    df = util.reindex_df_level_with_new_elements(df, y_or_v, df_years)

    base_gaus = np.array(
        self.values.index.get_level_values(current_geography), dtype=int)

    for foreign_gau in foreign_gaus:
        foreign_geography = self.gau_to_geography[foreign_gau]
        matches = self.values.index.get_level_values(foreign_geography) == foreign_gau
        index = np.nonzero(matches)[0]

        # Capture the displaced gaus before they are overwritten in place.
        impacted_gaus = list(set(base_gaus[index]))
        base_gaus[index] = foreign_gau

        overlapping = any(impacted in foreign_gaus for impacted in impacted_gaus)
        if overlapping:
            raise ValueError(
                'foreign gaus in the database cannot overlap geographically'
            )

        # if the data_type is a total, the total needs to be netted out
        if data_type == 'total':
            df = self._update_dataframe_totals_after_foreign_gau(
                df, current_geography, foreign_geography, impacted_gaus,
                foreign_gau, map_key, zero_out_negatives)
        elif data_type == 'intensity':
            logging.warning(
                'Foreign GAUs with intensities is not yet implemented, totals will not be conserved'
            )

    assert not any([any(np.isnan(row)) for row in df.index.get_values()])

    new_geography_name = self.make_new_geography_name(
        current_geography, list(foreign_gaus))
    df.index = df.index.rename(new_geography_name, level=current_geography)
    if new_geography_name not in self.geographies:
        self.add_new_geography(new_geography_name, base_gaus)
    return df, new_geography_name