def test_get_data_single_table(orca_session):
    """
    Request a single table by name, leaving every other argument at its default.

    """
    households = utils.get_data(tables='households')

    assert len(households) == 3
def run(self):
    """
    Run the model step: simulate a choice for every observation and use the
    results to update a column.

    Alternatives that appear in the estimation data but not in the model
    expression will not be available for simulation.

    Predicted probabilities come from PyLogit. The Monte Carlo simulation of
    choices is carried out directly in this method. (This functionality will
    move to ChoiceModels.)

    The predicted probabilities and simulated choices are stored on the object
    for interactive use (`probabilities` as a pd.DataFrame and `choices` as a
    pd.Series), but they are not persisted in the dictionary representation of
    the model step.

    """
    # Every key of the model expression other than the intercept is requested
    # from the data as an extra column
    expr_cols = [name for name, _ in self.model_expression.items()
                 if name != 'intercept']

    df = get_data(tables=self.out_tables,
                  fallback_tables=self.tables,
                  filters=self.out_filters,
                  extra_columns=expr_cols)

    long_df = self._to_long(df, 'predict')

    num_obs = len(df)
    num_alts = len(self._get_alts())

    # Predictions from the underlying model: an ndarray with the same length as
    # the long-format df, holding the choice probability of each alternative
    probs = self.model.predict(long_df)

    # Generate choices by adapting an approach from UrbanSim MNL
    # https://github.com/UDST/choicemodels/blob/master/choicemodels/mnl.py#L578-L583
    # The reshape relies on the long-format rows being grouped by observation,
    # with num_alts consecutive rows for each one.
    cumulative = probs.reshape((num_obs, num_alts)).cumsum(axis=1)
    draws = np.random.random(num_obs)

    # One uniform draw per observation, subtracted from that observation's row
    margin = cumulative - draws[:, np.newaxis]

    # Truncating (margin + 1.0) to an integer replaces negative values with 0
    # and positive values with 1, so that argmax returns the position of the
    # first positive value.
    # NOTE(review): a row whose margins are all negative (e.g. the cumulative
    # sum falls short of the draw) truncates to all zeros, and argmax then
    # returns position 0.
    flags = (margin + 1.0).astype('i4')
    choice_ix = np.argmax(flags, axis=1)

    # Convert per-row positions into positions within the flat long-format data
    flat_ix = choice_ix + (np.arange(num_obs) * num_alts)
    choices = long_df._alt_id.values.take(flat_ix)

    # Save results to the class object (via the DataFrames to include indexes)
    long_df['_probability'] = probs
    self.probabilities = long_df[['_obs_id', '_alt_id', '_probability']]

    df['_choices'] = choices
    self.choices = df._choices

    # Save to Orca
    update_column(table=self.out_tables,
                  fallback_table=self.tables,
                  column=self.out_column,
                  fallback_column=self.choice_column,
                  data=self.choices)
def test_get_data_bad_columns(orca_session):
    """
    A name in the model expression that matches no column should be ignored.

    """
    result = utils.get_data(
        tables=['households', 'buildings'],
        model_expression='tenure ~ pop + potato',
    )

    assert set(result.columns) == {'tenure', 'pop'}
def test_alternative_filters_for_alts_as_list(m_alts_as_list):
    """
    Check that the default alternative filters produce the correct data subset
    for every submodel.

    """
    model = m_alts_as_list
    model.defaults.alt_filters = 'altval_2 < 0.5'
    model.build_submodels()

    for _name, submodel in model.submodels.items():
        filtered = get_data(tables=submodel.alternatives,
                            filters=submodel.alt_filters)
        assert filtered['altval_2'].max() < 0.5
def test_get_data(orca_session):
    """
    General test: multiple tables, binding filters, and an extra column.

    """
    result = utils.get_data(
        tables=['households', 'buildings'],
        model_expression='tenure ~ pop',
        filters=['age > 20', 'age < 50'],
        extra_columns='zone_id',
    )

    assert set(result.columns) == {'tenure', 'pop', 'age', 'zone_id'}
    assert len(result) == 2
def run(self):
    """
    Save a table to disk.

    Saving a table to an HDF store requires providing a ``key`` that will be
    used to identify the table in the store. We'll use the Orca table name,
    unless you provide a different ``key`` in the ``extra_settings``.

    The ``extra_settings`` are copied before use, so filling in the default
    ``key`` never modifies the settings stored on the object.

    Raises
    ------
    ValueError
        If ``output_type`` is not 'csv' or 'hdf', or if ``table`` or ``path``
        is None.

    Returns
    -------
    None

    """
    if self.output_type not in ['csv', 'hdf']:
        raise ValueError("Please provide an output type of 'csv' or 'hdf'")

    if self.table is None:
        raise ValueError("Please provide the table name")

    if self.path is None:
        raise ValueError("Please provide a file path")

    # Work on a copy: the HDF branch may add a default 'key', and writing it
    # into the object's own settings dict would leave a stale key behind if
    # the table name is changed later.
    kwargs = {} if self.extra_settings is None else dict(self.extra_settings)

    df = get_data(tables=self.table,
                  filters=self.filters,
                  extra_columns=self.columns)

    if self.output_type == 'csv':
        df.to_csv(self.get_dynamic_filepath(), **kwargs)

    elif self.output_type == 'hdf':
        # Default the store key to the table name unless one was supplied
        kwargs.setdefault('key', self.table)
        df.to_hdf(self.get_dynamic_filepath(), **kwargs)
def fit(self):
    """
    Fit the model; save and report results. This uses PyLogit via ChoiceModels.

    The `fit()` method can be run as many times as desired. Results will not be
    saved with Orca or ModelManager until the `register()` method is run.

    """
    # Every key of the model expression other than the intercept is requested
    # from the data, together with the choice column
    expr_cols = [name for name, _ in self.model_expression.items()
                 if name != 'intercept']

    df = get_data(tables=self.tables,
                  filters=self.filters,
                  extra_columns=expr_cols + [self.choice_column])

    long_df = self._to_long(df)

    # Start from all-zero coefficients when none were provided or when the
    # provided list does not match the parameter count
    param_count = self._get_param_count()
    if self.initial_coefs is None or len(self.initial_coefs) != param_count:
        self.initial_coefs = np.zeros(param_count).tolist()

    model = MultinomialLogit(data=long_df,
                             observation_id_col='_obs_id',
                             choice_col='_chosen',
                             model_expression=self.model_expression,
                             model_labels=self.model_labels,
                             alternative_id_col='_alt_id',
                             initial_coefs=self.initial_coefs)

    results = model.fit()

    self.name = self._generate_name()
    self.summary_table = str(results.report_fit())
    print(self.summary_table)

    # The PyLogit fitted model object is needed for prediction, so keep it
    self.model = results.get_raw_results()