Beispiel #1
0
    def __init__(self, df, aggregates, dimensions, weight_col,
                 convergence_rate=1e-6, max_iteration=50, closure=1e-4):
        """Store the raking inputs and build the underlying ipfn solver.

        `closure` is forwarded to ipfn as its `rate_tolerance` stopping
        criterion; `verbose=2` asks ipfn for its most detailed output.
        """
        # Keep every input on the instance for later inspection/re-runs.
        self.df, self.aggregates = df, aggregates
        self.dimensions, self.weight_col = dimensions, weight_col
        self.convergence_rate = convergence_rate
        self.max_iteration = max_iteration
        self.closure = closure

        self.ipfn = ipfn.ipfn(
            self.df,
            self.aggregates,
            self.dimensions,
            weight_col=self.weight_col,
            convergence_rate=self.convergence_rate,
            max_iteration=self.max_iteration,
            rate_tolerance=self.closure,
            verbose=2,
        )
Beispiel #2
0
    def construct(self, a, b, relative=False, cf=None):
        """Construct a matrix based on known marginal sums.

        Fits an all-ones seed matrix of shape (len(a), len(b)) with IPF so
        that its row sums match ``a`` and its column sums match ``b``.

        Parameters
        ----------
        a, b : numpy arrays
            Target row and column marginal totals.
        relative : bool, optional
            If True, each marginal is first normalised to a distribution.
        cf : numpy array, optional
            Element-wise conversion factors applied to the fitted matrix.
            Defaults to an empty array (no conversion).  A ``None`` sentinel
            replaces the old ``np.asarray([])`` default, which was evaluated
            once at function-definition time (mutable-default pitfall).

        Returns
        -------
        numpy array
            The fitted (and possibly converted) matrix; also stored on
            ``self.values`` and mirrored to pandas via ``self.to_pandas()``.
        """
        if cf is None:
            cf = np.asarray([])

        if relative:
            a_i = a / a.sum()
            b_i = b / b.sum()
        else:
            a_i = a
            b_i = b

        # Warn (but do not abort) when the two marginals disagree by >1%.
        if np.allclose(a.sum(), b.sum(), rtol=0.01):
            if self.verbose:
                print('Marginal sums are align!')
        else:
            print('Marginal sums are not align\nTable example:')
            print(
                u"""
          E.g.:
          Sector---------------+
          +---+---+---+        |
        R | 5 | 6 | 7 |        |
          +---+---+---+---+    V
          | . | . | . | 8 | Fuel Type
          | . | . | . | 6 |
          | . | . | . | 4 |
          +---+---+---+---+
                        TI

          ∑ R = ∑ TI
                """)
            print("a = ", a)
            print("b = ", b)

        # Uniform seed; IPF rakes axis 1 toward b_i and axis 0 toward a_i.
        m = np.ones((len(a_i), len(b_i)))
        IPF = ipfn(m, [b_i, a_i], [[1], [0]], verbose=(1 if self.verbose else 0))
        m = IPF.iteration()

        # Some ipfn versions return a (matrix, converged, ...) tuple.
        if isinstance(m, tuple):
            n = m[0]
        else:
            n = m

        if self.verbose:
            print(u"a = ∑ m_i; {}".format(np.allclose(n.sum(1), a_i)))
            print(u"b = ∑ m_j; {}".format(np.allclose(n.sum(0), b_i)))

        if len(cf) > 0:
            self.raw = n  # keep the unconverted fit for reference
            n = n * cf
            self.conversion_factor = cf

        self.values = n

        # update pandas
        self.to_pandas()

        return n
Beispiel #3
0
def tripDistribution(tripGeneration, costMatrix):
  """Balance trips with IPF so origin/destination sums match the
  Production and Attraction columns of `tripGeneration`."""
  # Reshape the square cost matrix to long form (note: mutates the
  # caller's frame by adding an 'ozone' column, as the original did).
  costMatrix['ozone'] = costMatrix.columns
  long_costs = costMatrix.melt(id_vars=['ozone'])
  long_costs.columns = ['ozone', 'dzone', 'total']

  # Marginal targets, indexed so ipfn can match them to the dimensions.
  prod_targets = tripGeneration['Production']
  prod_targets.index.name = 'ozone'
  attr_targets = tripGeneration['Attraction']
  attr_targets.index.name = 'dzone'

  solver = ipfn.ipfn(long_costs,
                     [prod_targets, attr_targets],
                     [['ozone'], ['dzone']])
  fitted = solver.iteration()
  return fitted.pivot(index='ozone', columns='dzone', values='total')
Beispiel #4
0
 def gravity_model(self, trip_generation, cost_matrix):
     """IPF-based gravity model: rake the cost matrix until its row and
     column totals match Production and Attraction."""
     # Wide square matrix -> long (ozone, dzone, total) records; the
     # 'ozone' column is added to the caller's frame in place.
     cost_matrix['ozone'] = cost_matrix.columns
     long_df = cost_matrix.melt(id_vars=['ozone'])
     long_df.columns = ['ozone', 'dzone', 'total']

     origins = trip_generation['Production']
     origins.index.name = 'ozone'
     destinations = trip_generation['Attraction']
     destinations.index.name = 'dzone'

     solver = ipfn.ipfn(long_df,
                        [origins, destinations],
                        [['ozone'], ['dzone']])
     balanced = solver.iteration()
     return balanced.pivot(index='ozone', columns='dzone', values='total')
Beispiel #5
0
 def from_ipfn(zones, cost_dataframe):
     """Build an OriginDestinationDataFrame by IPF-balancing the cost
     matrix against the zones' Production/Attraction totals."""
     # TODO fix this method
     # Long-format the matrix; note the in-place column addition.
     cost_dataframe["origin_zone"] = cost_dataframe.index
     long_costs = cost_dataframe.melt(id_vars=["origin_zone"])
     long_costs.columns = ["origin_zone", "destination_zone", "total"]

     prod = zones["Production"]
     prod.index.name = "origin_zone"
     attr = zones["Attraction"]
     attr.index.name = "destination_zone"

     solver = ipfn.ipfn(long_costs,
                        [prod, attr],
                        [["origin_zone"], ["destination_zone"]])
     balanced = solver.iteration()
     wide = balanced.pivot(index="origin_zone",
                           columns="destination_zone",
                           values="total")
     return OriginDestinationDataFrame(wide.stack())
Beispiel #6
0
    def update(self, b, dimension=1):
        """Update inputs of matrix `m` given the outputs `b`."""
        if self.verbose:
            print("updating table")

        # Re-balance a copy of the current table so its sums along
        # `dimension` match the new marginal `b`.
        solver = ipfn(self.values.copy(), [b], [[dimension]],
                      verbose=self.verbose)
        fitted = solver.iteration()

        # Some ipfn versions return a (matrix, ...) tuple; keep the matrix.
        self.values = fitted[0] if isinstance(fitted, tuple) else fitted

        # update pandas
        self.to_pandas()
Beispiel #7
0
    def __compute(self, strata, year, state):
        """Run IPF for one (year, state) cell of the stored marginals.

        For STRATA1 the seed matrix is raked on race, sex and age, and the
        refitted marginals are stored back on ``self``.  For STRATA2 an
        education dimension is added (reusing the STRATA1 marginals as
        targets) and the processed result is accumulated into
        ``self.ipf_acs_data``.  Progress is echoed to the UI text box.
        """
        mar = self.marginals[year][state]
        if strata == RefinerIO.STRATA1:
            self.__ui.display_text_box.AppendText(
                "\nIPF by race, gender and age..\n")

            # Target marginals extracted from the raw marginals record.
            self.m_sex = self.__sex_marginals(mar)
            self.m_age_cat = self.__age_marginals(mar)
            self.m_race = self.__race_marginals(mar)

            aggregates = [self.m_race, self.m_sex, self.m_age_cat]
            dimension = [["RACE_ETH"], ["SEX_"], ["AGE_CAT"]]

            ipf = ipfn.ipfn(self.seed_matrix, aggregates, dimension)
            df = ipf.iteration(self.__ui)

            df = self.__remove_rows(df)

            # Recompute the marginals from the fitted table (after row
            # removal) so the STRATA2 pass can use them as its targets.
            self.m_age_cat = df.groupby("AGE_CAT")["total"].sum()
            self.m_sex = df.groupby("SEX_")["total"].sum()
            self.m_race = df.groupby("RACE_ETH")["total"].sum()

        elif strata == RefinerIO.STRATA2:
            self.__ui.display_text_box.AppendText(
                "\nIPF by race, gender, age and education..\n")
            self.m_edu = self.__edu_marginals(mar)

            # Debug dumps of the four marginal targets.
            print()
            print(self.m_race)
            print()

            print(self.m_sex)
            print()

            print(self.m_age_cat)
            print()

            print(self.m_edu)
            print()

            aggregates = [self.m_race, self.m_sex, self.m_age_cat, self.m_edu]
            dimension = [["RACE_ETH"], ["SEX_"], ["AGE_CAT"], ["EDU"]]

            ipf = ipfn.ipfn(self.seed_matrix, aggregates, dimension)
            df = ipf.iteration(self.__ui)

            df_education = self.__merge_education(df=df)

            # df_some_college = self.__remove_education(df=df)

            # Tag rows with the FIPS code and year.  "20" + year assumes
            # `year` is a two-digit string — TODO confirm against callers.
            df_education = self.__add_column(df=df_education,
                                             col_name="FIPS",
                                             value=int(mar["ID2"]),
                                             index=0)
            df_education = self.__add_column(df=df_education,
                                             col_name="YEAR",
                                             value="20" + year,
                                             index=1)

            df_education = self.__fill_dataframe(df=df_education,
                                                 year=year,
                                                 fips=int(mar["ID2"]))

            print()
            print(df_education)

            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # switch to pd.concat([...], ignore_index=True) when upgrading.
            if self.ipf_acs_data.empty:
                self.ipf_acs_data = df_education
            else:
                self.ipf_acs_data = self.ipf_acs_data.append(df_education,
                                                             ignore_index=True)
Beispiel #8
0
def survey_cleanup_weighting(path, file):
    """Load a survey CSV, clean and recode responses, and attach
    post-stratification weights (IPF raking on gender, education, age).

    Parameters
    ----------
    path, file : str
        Directory (with trailing separator) and filename; concatenated as-is.

    Returns
    -------
    pandas.DataFrame
        Cleaned survey rows with a mean-normalised weight column ``wt``.
    """
    df = pd.read_csv(path + file)
    # dropping duplicate survey takers and columns don't need
    df.drop_duplicates('What is your Amazon Worker ID?', inplace=True)
    #df.drop(['What is your Amazon Worker ID?', 'Unnamed: 9', 'Timestamp', 'In what zip code do you live?'], axis=1, inplace=True)
    # making human readable names
    df.rename(
        {
            'Before the coronavirus crisis, how often did you eat out at a sit-down restaurant?':
            'precovid',
            'Currently how often do you eat out at a sit-down restaurant?':
            'current',
            "Next year how often do you think you'll eat out at a sit-down restaurant?":
            'postcovid',
            'What is your gender?':
            'gender',
            'What is the highest level of education you have completed?':
            'educ'
        },
        axis=1,
        inplace=True)
    # Age in years as of the 2020 survey wave.
    df['age_years'] = (2020 - df['In what year were you born?'])
    #creating category for min number of times currently eat out per category per month
    df['current_freq'] = df.current
    df['current_freq'] = df.current_freq.map({
        'Less than once per month': 0,
        'Once a month': 1,
        'Multiple time per month': 3,
        'Once a week': 4,
        'Multiple times per week': 8
    })
    # get age, bin, and drop year column
    df['age'] = pd.cut(
        2020 - df['In what year were you born?'],
        bins=[-1, 24, 34, 44, 54, 64, 1000],
        labels=['18-24', '25-34', '35-44', '45-54', '55-64', '65+'])
    df.drop(['In what year were you born?'], axis=1, inplace=True)
    # too few responses to estimate
    df = df.loc[df.gender != 'Other', :]
    # grouping educ into college, not college
    df['educ'] = df.educ.map({
        'College graduate': 'college',
        'Post-graduate (e.g. MS, MBA, PhD, etc.)': 'college',
        'High school': 'not_college',
        'Did not finish high school': 'not_college'
    })
    # fixing indexing after dropping some rows
    df.reset_index(drop=True, inplace=True)
    # Weighting: estimate the sample's joint gender x educ x age
    # distribution from its 1-d marginals via IPF, do the same for the
    # population reference marginals, and weight by their ratio.
    np.random.seed(1)

    bias_vars = ['gender', 'educ', 'age']
    samp_aggs = [
        df[col].value_counts().sort_index().values for col in bias_vars
    ]  # marginal counts
    dims = [[i] for i in range(len(samp_aggs))]  # i'm using 1-d marginals
    # print(aggs, dims)

    # random initial values, using poisson to get count-like values
    # NOTE(review): the (2, 2, 6) shape is hard-coded to match the
    # category counts of gender/educ/age above — confirm if categories change.
    samp_init = np.random.poisson(np.mean([np.mean(x) for x in samp_aggs]),
                                  (2, 2, 6)) + 1

    ipf = ipfn(samp_init, samp_aggs, dims)
    samp_dist = ipf.iteration()
    samp_dist = samp_dist / np.sum(
        samp_dist)  # normalizing it to a proper distribution

    # repeating for population marginals
    np.random.seed(1)
    from collections import OrderedDict
    # ['gender', 'educ', 'age']
    refdist = OrderedDict({
        'gender': {
            'Female': .52,
            'Male': .48
        },
        'educ': {
            'college': .35,
            'not_college': .65
        },
        # i added .001 to '65+' to make it exactly sum to 1
        'age': {
            '18-24': .141,
            '25-34': .194,
            '35-44': .187,
            '45-54': .194,
            '55-64': .160,
            '65+': .124
        }
    })
    ## checking that everything looks ok
    for i, k in enumerate(refdist.keys()):
        assert bias_vars[i] == k  # names in right order?
    for k, v in refdist.items():
        # NOTE(review): exact float == 1. comparison is fragile;
        # math.isclose would be safer if the reference values change.
        assert set(v.keys()) == set(df[k])  # unique values all correct?
        assert sum(v.values()) == 1.  # each a proper distribution?
    ref_aggs = [pd.Series(v).sort_index().values for v in refdist.values()]
    # random initial values, using unif(.25,.75) to get probability-like values
    ref_init = np.random.uniform(.25, .75, (2, 2, 6))
    ipf = ipfn(ref_init, ref_aggs, dims)  # reusing same dims
    pop_dist = ipf.iteration()
    # creating table for weights: population share / sample share per cell
    wt_arr = pop_dist / samp_dist

    # Flatten the 3-d weight array into a mergeable long-format table.
    # Relies on refdist key order matching the array axes (gender, educ, age)
    # and on value_counts().sort_index() matching the sorted ref categories.
    dimnames = [df[col].value_counts().sort_index().index for col in bias_vars]
    wt_df = pd.DataFrame(np.zeros(
        (np.prod(wt_arr.shape), len(refdist) + 1)))  # +1 for wt column
    wt_df.columns = list(refdist.keys()) + ['wt']

    l = 0
    for i, f in enumerate(refdist['gender'].keys()):
        for j, e in enumerate(refdist['educ'].keys()):
            for k, a in enumerate(refdist['age'].keys()):
                wt_df.iloc[l, :len(refdist)] = [f, e, a]
                wt_df.iloc[l, len(refdist)] = wt_arr[i, j, k]
                l += 1
    #Adding the weights back to the data frame
    df = pd.merge(df, wt_df, on=['gender', 'educ', 'age'])
    # Normalise weights to mean 1 across respondents.
    df['wt'] = df.wt / df.wt.mean()
    # new questions added for wave2
    df.rename(
        {
            'Currently, about how often do you order takeout?':
            'how_often_takeout'
        },
        axis=1,
        inplace=True)
    df.rename({'What is your annual household income?': 'household_income'},
              axis=1,
              inplace=True)
    print(df.head(3))
    return df