def __init__(self, df, aggregates, dimensions, weight_col,
             convergence_rate=1e-6, max_iteration=50, closure=1e-4):
    """Store the fitting inputs and create the underlying ``ipfn`` solver.

    Every argument is kept on the instance under the same name. The solver
    is then built from those stored values, with ``closure`` forwarded as
    the solver's ``rate_tolerance`` and ``verbose`` fixed to 2.

    Args:
        df: Seed data handed to ``ipfn.ipfn`` as its first argument.
        aggregates: Target marginals handed to ``ipfn.ipfn``.
        dimensions: Dimension specification handed to ``ipfn.ipfn``.
        weight_col: Forwarded as the solver's ``weight_col``.
        convergence_rate: Forwarded as the solver's ``convergence_rate``.
        max_iteration: Forwarded as the solver's ``max_iteration``.
        closure: Forwarded as the solver's ``rate_tolerance``.
    """
    self.df = df
    self.aggregates = aggregates
    self.dimensions = dimensions
    self.weight_col = weight_col
    self.convergence_rate = convergence_rate
    self.max_iteration = max_iteration
    self.closure = closure

    # The solver is created eagerly; it is stored under the attribute
    # name ``ipfn`` (the module-level ``ipfn`` name is unaffected).
    self.ipfn = ipfn.ipfn(
        self.df,
        self.aggregates,
        self.dimensions,
        weight_col=self.weight_col,
        convergence_rate=self.convergence_rate,
        max_iteration=self.max_iteration,
        rate_tolerance=self.closure,
        verbose=2,
    )
def construct(self, a, b, relative=False, cf=np.asarray([])):
    """Fit a ``len(a)`` x ``len(b)`` matrix to the marginal sums `a` and `b`.

    A matrix of ones is adjusted with ``ipfn`` using `b` as the aggregate
    for axis 1 and `a` as the aggregate for axis 0. The result is stored in
    ``self.values``, ``self.to_pandas()`` is called, and the matrix is
    returned.

    Args:
        a: Marginal used for axis 0; must support ``.sum()`` and ``len()``.
        b: Marginal used for axis 1; must support ``.sum()`` and ``len()``.
        relative: When truthy, both marginals are divided by their own sum
            before fitting.
        cf: Optional conversion factor. When non-empty, the unscaled result
            is kept in ``self.raw``, the result is multiplied by `cf`, and
            `cf` is kept in ``self.conversion_factor``.

    Returns:
        The fitted (and, if `cf` is given, scaled) matrix.
    """
    a_i, b_i = (a / a.sum(), b / b.sum()) if relative else (a, b)

    # The consistency check always uses the raw marginals, not the
    # normalised ones.
    if not np.allclose(a.sum(), b.sum(), rtol=0.01):
        print('Marginal sums are not align\nTable example:')
        # NOTE(review): the diagram text below is reproduced exactly as it
        # appears in this file; it looks like its line breaks were lost at
        # some point — confirm against the upstream source.
        print(
            u""" E.g.: Sector---------------+ +---+---+---+ | R | 5 | 6 | 7 | | +---+---+---+---+ V | . | . | . | 8 | Fuel Type | . | . | . | 6 | | . | . | . | 4 | +---+---+---+---+ TI ∑ R = ∑ TI """)
        print("a = ", a)
        print("b = ", b)
    elif self.verbose:
        print('Marginal sums are align!')

    seed = np.ones((len(a_i), len(b_i)))
    solver = ipfn(seed, [b_i, a_i], [[1], [0]],
                  verbose=(1 if self.verbose else 0))
    outcome = solver.iteration()
    # With a non-zero verbose level the solver hands back a tuple whose
    # first element is the matrix.
    fitted = outcome[0] if isinstance(outcome, tuple) else outcome

    if self.verbose:
        print(u"a = ∑ m_i; {}".format(np.allclose(fitted.sum(1), a_i)))
        print(u"b = ∑ m_j; {}".format(np.allclose(fitted.sum(0), b_i)))

    if len(cf) > 0:
        self.raw = fitted
        fitted = fitted * cf
        self.conversion_factor = cf

    self.values = fitted
    # Keep the pandas representation in sync with the new values.
    self.to_pandas()
    return fitted
def tripDistribution(tripGeneration, costMatrix):
    """Distribute trips between zones by fitting the cost matrix with IPF.

    The cost matrix is reshaped to long form (``ozone``, ``dzone``,
    ``total``) and fitted with ``ipfn`` so that totals per ``ozone`` match
    ``tripGeneration['Production']`` and totals per ``dzone`` match
    ``tripGeneration['Attraction']``.

    Neither input is modified: earlier versions added an ``ozone`` column to
    the caller's ``costMatrix`` (so a second call with the same frame
    failed) and renamed the index shared by both marginal Series.

    Args:
        tripGeneration: Mapping/DataFrame with ``'Production'`` and
            ``'Attraction'`` entries indexed by zone.
        costMatrix: Wide DataFrame of costs. Origin labels are taken from
            its column labels by position, so it is assumed to be square
            with rows in the same order as columns — TODO confirm.

    Returns:
        A DataFrame pivoted back to wide form, indexed by ``ozone`` with one
        column per ``dzone``.
    """
    # ``assign`` works on a copy, leaving the caller's frame untouched.
    long_costs = costMatrix.assign(ozone=costMatrix.columns)
    long_costs = long_costs.melt(id_vars=['ozone'])
    long_costs.columns = ['ozone', 'dzone', 'total']

    # ``rename_axis`` returns a new Series with its own index, so naming one
    # marginal cannot overwrite the name of the other (both share the
    # index object of ``tripGeneration`` otherwise).
    production = tripGeneration['Production'].rename_axis('ozone')
    attraction = tripGeneration['Attraction'].rename_axis('dzone')

    aggregates = [production, attraction]
    dimensions = [['ozone'], ['dzone']]

    IPF = ipfn.ipfn(long_costs, aggregates, dimensions)
    trips = IPF.iteration()
    return trips.pivot(index='ozone', columns='dzone', values='total')
def gravity_model(self, trip_generation, cost_matrix):
    """Distribute trips between zones by fitting the cost matrix with IPF.

    The cost matrix is reshaped to long form (``ozone``, ``dzone``,
    ``total``) and fitted with ``ipfn`` so that totals per ``ozone`` match
    ``trip_generation['Production']`` and totals per ``dzone`` match
    ``trip_generation['Attraction']``.

    Neither input is modified: earlier versions added an ``ozone`` column to
    the caller's ``cost_matrix`` (so a second call with the same frame
    failed) and renamed the index shared by both marginal Series.

    Args:
        trip_generation: Mapping/DataFrame with ``'Production'`` and
            ``'Attraction'`` entries indexed by zone.
        cost_matrix: Wide DataFrame of costs. Origin labels are taken from
            its column labels by position, so it is assumed to be square
            with rows in the same order as columns — TODO confirm.

    Returns:
        A DataFrame pivoted back to wide form, indexed by ``ozone`` with one
        column per ``dzone``.
    """
    # ``assign`` works on a copy, leaving the caller's frame untouched.
    long_costs = cost_matrix.assign(ozone=cost_matrix.columns)
    long_costs = long_costs.melt(id_vars=['ozone'])
    long_costs.columns = ['ozone', 'dzone', 'total']

    # ``rename_axis`` gives each marginal its own named index instead of
    # renaming the single index object both Series share.
    production = trip_generation['Production'].rename_axis('ozone')
    attraction = trip_generation['Attraction'].rename_axis('dzone')

    aggregates = [production, attraction]
    dimensions = [['ozone'], ['dzone']]

    IPF = ipfn.ipfn(long_costs, aggregates, dimensions)
    trips = IPF.iteration()
    return trips.pivot(index='ozone', columns='dzone', values='total')
def from_ipfn(zones, cost_dataframe):
    """Build an ``OriginDestinationDataFrame`` by fitting costs with IPF.

    The cost frame is reshaped to long form (``origin_zone``,
    ``destination_zone``, ``total``) and fitted with ``ipfn`` so that totals
    per origin match ``zones["Production"]`` and totals per destination
    match ``zones["Attraction"]``. The fitted table is pivoted to wide form,
    stacked, and wrapped in ``OriginDestinationDataFrame``.

    Neither input is modified: earlier versions added an ``origin_zone``
    column to the caller's ``cost_dataframe`` and renamed the index shared
    by both marginal Series.

    Args:
        zones: Mapping/DataFrame with ``"Production"`` and ``"Attraction"``
            entries indexed by zone.
        cost_dataframe: Wide DataFrame of costs whose index holds the origin
            zone labels and whose columns hold the destination labels.

    Returns:
        ``OriginDestinationDataFrame`` built from the stacked, fitted table.
    """
    # TODO fix this method
    # ``assign`` works on a copy, leaving the caller's frame untouched.
    long_costs = cost_dataframe.assign(origin_zone=cost_dataframe.index)
    long_costs = long_costs.melt(id_vars=["origin_zone"])
    long_costs.columns = ["origin_zone", "destination_zone", "total"]

    # ``rename_axis`` gives each marginal its own named index instead of
    # renaming the single index object both Series share.
    production = zones["Production"].rename_axis("origin_zone")
    attraction = zones["Attraction"].rename_axis("destination_zone")

    aggregates = [production, attraction]
    dimensions = [["origin_zone"], ["destination_zone"]]

    IPF = ipfn.ipfn(long_costs, aggregates, dimensions)
    trips = IPF.iteration()
    wide = trips.pivot(
        index="origin_zone", columns="destination_zone", values="total")
    return OriginDestinationDataFrame(wide.stack())
def update(self, b, dimension=1):
    """Re-fit the stored matrix against a new marginal `b`.

    A copy of ``self.values`` is passed to ``ipfn`` with `b` as the only
    aggregate, applied to axis `dimension`. The fitted matrix replaces
    ``self.values`` and ``self.to_pandas()`` is called afterwards.

    Args:
        b: Target marginal for the chosen axis.
        dimension: Axis of the matrix that `b` constrains (default 1).
    """
    if self.verbose:
        print("updating table")

    solver = ipfn(self.values.copy(), [b], [[dimension]],
                  verbose=self.verbose)
    outcome = solver.iteration()
    # A tuple result carries the matrix as its first element.
    self.values = outcome[0] if isinstance(outcome, tuple) else outcome

    # Keep the pandas representation in sync with the new values.
    self.to_pandas()
def __compute(self, strata, year, state):
    """Run the IPF step for one stratum of the ``year``/``state`` marginals.

    For ``RefinerIO.STRATA1`` the race, sex and age marginals are derived
    from ``self.marginals[year][state]``, fitted against
    ``self.seed_matrix``, and then replaced by the per-category totals of
    the fitted table (after ``__remove_rows``).

    For ``RefinerIO.STRATA2`` the education marginal is derived as well and
    a four-way fit is run using the ``m_race`` / ``m_sex`` / ``m_age_cat``
    attributes already on the instance (this method sets them in the
    STRATA1 branch). The fitted table is post-processed, tagged with
    ``FIPS`` and ``YEAR`` columns, and accumulated into
    ``self.ipf_acs_data``.

    Any other ``strata`` value does nothing beyond the marginal lookup.

    Args:
        strata: Stratum selector, compared against ``RefinerIO.STRATA1``
            and ``RefinerIO.STRATA2``.
        year: Key into ``self.marginals``; also concatenated after ``"20"``
            to form the ``YEAR`` value, so it must be a string.
        state: Key into ``self.marginals[year]``.
    """
    # Function-scope import: ``DataFrame.append`` no longer exists in
    # pandas >= 2.0, so accumulation below goes through ``pd.concat``.
    import pandas as pd

    mar = self.marginals[year][state]

    if strata == RefinerIO.STRATA1:
        self.__ui.display_text_box.AppendText(
            "\nIPF by race, gender and age..\n")

        self.m_sex = self.__sex_marginals(mar)
        self.m_age_cat = self.__age_marginals(mar)
        self.m_race = self.__race_marginals(mar)

        aggregates = [self.m_race, self.m_sex, self.m_age_cat]
        dimension = [["RACE_ETH"], ["SEX_"], ["AGE_CAT"]]
        ipf = ipfn.ipfn(self.seed_matrix, aggregates, dimension)
        df = ipf.iteration(self.__ui)
        df = self.__remove_rows(df)

        # Replace the input marginals with the totals of the fitted table.
        self.m_age_cat = df.groupby("AGE_CAT")["total"].sum()
        self.m_sex = df.groupby("SEX_")["total"].sum()
        self.m_race = df.groupby("RACE_ETH")["total"].sum()

    elif strata == RefinerIO.STRATA2:
        self.__ui.display_text_box.AppendText(
            "\nIPF by race, gender, age and education..\n")

        self.m_edu = self.__edu_marginals(mar)

        # Console dump of the four marginals, each preceded by a blank
        # line and followed by one trailing blank line.
        for marginal in (self.m_race, self.m_sex, self.m_age_cat,
                         self.m_edu):
            print()
            print(marginal)
        print()

        aggregates = [self.m_race, self.m_sex, self.m_age_cat, self.m_edu]
        dimension = [["RACE_ETH"], ["SEX_"], ["AGE_CAT"], ["EDU"]]
        ipf = ipfn.ipfn(self.seed_matrix, aggregates, dimension)
        df = ipf.iteration(self.__ui)

        fips = int(mar["ID2"])
        df_education = self.__merge_education(df=df)
        df_education = self.__add_column(
            df=df_education, col_name="FIPS", value=fips, index=0)
        df_education = self.__add_column(
            df=df_education, col_name="YEAR", value="20" + year, index=1)
        df_education = self.__fill_dataframe(
            df=df_education, year=year, fips=fips)

        print()
        print(df_education)

        if self.ipf_acs_data.empty:
            self.ipf_acs_data = df_education
        else:
            self.ipf_acs_data = pd.concat(
                [self.ipf_acs_data, df_education], ignore_index=True)
def survey_cleanup_weighting(path, file):
    """Load a survey CSV, clean it, and attach post-stratification weights.

    Steps:
      1. Read ``path + file``, drop duplicate worker IDs, rename the long
         question columns, derive ``age_years``, ``current_freq`` and the
         binned ``age`` column, drop ``'Other'`` gender rows, and collapse
         education into ``college`` / ``not_college``.
      2. Fit a sample joint distribution over gender x educ x age from the
         sample marginal counts with ``ipfn`` (random Poisson seed table).
      3. Fit a population joint distribution from the reference marginals
         hard-coded below (random uniform seed table).
      4. Use population / sample cell ratios as weights, merge them onto
         the respondents, and rescale so the mean weight is 1.

    Args:
        path: Directory prefix; concatenated directly with `file`, so it
            must already end with a path separator.
        file: CSV file name.

    Returns:
        The cleaned DataFrame with an added ``wt`` column. The first three
        rows are also printed.

    Raises:
        AssertionError: If the reference distribution does not line up with
            the categories found in the data or does not sum to 1.
    """
    df = pd.read_csv(path + file)

    # One response per survey taker.
    df.drop_duplicates('What is your Amazon Worker ID?', inplace=True)

    # Human-readable column names.
    df.rename(
        {
            'Before the coronavirus crisis, how often did you eat out at a sit-down restaurant?':
            'precovid',
            'Currently how often do you eat out at a sit-down restaurant?':
            'current',
            "Next year how often do you think you'll eat out at a sit-down restaurant?":
            'postcovid',
            'What is your gender?': 'gender',
            'What is the highest level of education you have completed?':
            'educ'
        },
        axis=1,
        inplace=True)

    df['age_years'] = 2020 - df['In what year were you born?']

    # Minimum number of times per month implied by each answer.
    # NOTE(review): 'Multiple time per month' (singular "time") is kept as
    # found; confirm it matches the survey's answer text.
    df['current_freq'] = df['current'].map({
        'Less than once per month': 0,
        'Once a month': 1,
        'Multiple time per month': 3,
        'Once a week': 4,
        'Multiple times per week': 8
    })

    # Bin age, then drop the birth-year column.
    df['age'] = pd.cut(
        df['age_years'],
        bins=[-1, 24, 34, 44, 54, 64, 1000],
        labels=['18-24', '25-34', '35-44', '45-54', '55-64', '65+'])
    df.drop(['In what year were you born?'], axis=1, inplace=True)

    # Too few 'Other' responses to estimate. Copy so the column
    # assignments below act on an independent frame rather than a slice.
    df = df.loc[df.gender != 'Other', :].copy()

    # Collapse education into college / not college; unmapped answers
    # become NaN and are caught by the category check further down.
    # NOTE(review): the 'Post-graduate ...' key is assumed to be a single
    # line with one space after "e.g." — confirm against the CSV header.
    df['educ'] = df.educ.map({
        'College graduate': 'college',
        'Post-graduate (e.g. MS, MBA, PhD, etc.)': 'college',
        'High school': 'not_college',
        'Did not finish high school': 'not_college'
    })

    # Re-number rows after the drops above.
    df.reset_index(drop=True, inplace=True)

    # ---- Weighting: sample joint distribution ----
    np.random.seed(1)
    bias_vars = ['gender', 'educ', 'age']
    # 1-d marginal counts, one array per variable, in sorted label order.
    samp_aggs = [
        df[col].value_counts().sort_index().values for col in bias_vars
    ]
    dims = [[i] for i in range(len(samp_aggs))]

    # Random count-like seed table; its shape follows the marginals
    # ((2, 2, 6) for the categories used here).
    samp_shape = tuple(len(agg) for agg in samp_aggs)
    samp_init = np.random.poisson(
        np.mean([np.mean(x) for x in samp_aggs]), samp_shape) + 1
    ipf = ipfn(samp_init, samp_aggs, dims)
    samp_dist = ipf.iteration()
    samp_dist = samp_dist / np.sum(samp_dist)  # normalise to a distribution

    # ---- Weighting: population joint distribution ----
    np.random.seed(1)
    from collections import OrderedDict
    # Keys must stay in the same order as ``bias_vars``.
    refdist = OrderedDict({
        'gender': {
            'Female': .52,
            'Male': .48
        },
        'educ': {
            'college': .35,
            'not_college': .65
        },
        # .001 was added to '65+' so the shares sum to 1
        'age': {
            '18-24': .141,
            '25-34': .194,
            '35-44': .187,
            '45-54': .194,
            '55-64': .160,
            '65+': .124
        }
    })

    # Sanity checks on the reference distribution.
    assert list(refdist) == bias_vars  # names in right order?
    for k, v in refdist.items():
        assert set(v.keys()) == set(df[k])  # unique values all correct?
        # Tolerant comparison: the shares are decimal fractions whose float
        # sum is not guaranteed to be exactly 1.0.
        assert np.isclose(sum(v.values()), 1.)  # a proper distribution?

    ref_aggs = [pd.Series(v).sort_index().values for v in refdist.values()]
    # Random probability-like seed table, unif(.25, .75).
    ref_shape = tuple(len(agg) for agg in ref_aggs)
    ref_init = np.random.uniform(.25, .75, ref_shape)
    ipf = ipfn(ref_init, ref_aggs, dims)  # same dims as the sample fit
    pop_dist = ipf.iteration()

    # ---- Weights table ----
    wt_arr = pop_dist / samp_dist
    # Build rows directly instead of writing strings into a float frame.
    # Labels are walked in sorted order, the order used for the marginal
    # arrays above (identical to the literal order of ``refdist`` here).
    wt_rows = [
        (gender, educ, age, wt_arr[i, j, k])
        for i, gender in enumerate(sorted(refdist['gender']))
        for j, educ in enumerate(sorted(refdist['educ']))
        for k, age in enumerate(sorted(refdist['age']))
    ]
    wt_df = pd.DataFrame(wt_rows, columns=list(refdist.keys()) + ['wt'])

    # Attach the weights and rescale them to mean 1.
    df = pd.merge(df, wt_df, on=['gender', 'educ', 'age'])
    df['wt'] = df.wt / df.wt.mean()

    # Questions added in wave 2.
    df.rename(
        {
            'Currently, about how often do you order takeout?':
            'how_often_takeout',
            'What is your annual household income?': 'household_income'
        },
        axis=1,
        inplace=True)

    print(df.head(3))
    return df