def run_multiprocess(self, resources):
    """Run the simulation year by year, each year through
    self._run_each_year_as_separate_process().

    Entries read from 'resources' (wrapped into a Resources object first):
        'profile_filename'  - optional; handed to every yearly run and written
                              back into the local Resources copy at the end.
        'cache_directory'   - must be present; if None, the directory reported
                              by SimulationState() is used and stored back.
        'years'             - indexable; first and last items are the start and
                              end year (inclusive).
        'seed'              - optional root seed, defaults to NO_SEED.
        '_seed_dictionary_' - optional mapping year -> seed. If given, it is
                              used instead of drawing seeds from the root seed.

    The loop stops at the first year whose run reports failure.
    self._notify_stopped() is called afterwards in either case.
    """
    resources = Resources(resources)
    profiler_name = resources.get("profile_filename", None)
    if resources['cache_directory'] is not None:
        cache_directory = resources['cache_directory']
    else:
        cache_directory = SimulationState().get_cache_directory()
        ### TODO: Get rid of this! There is absolutely no good reason to be
        ### changing the Configuration!
        resources['cache_directory'] = cache_directory

    log_file = os.path.join(cache_directory, 'run_multiprocess.log')
    logger.enable_file_logging(log_file)

    start_year = resources["years"][0]
    end_year = resources["years"][-1]
    years = range(start_year, end_year + 1)
    nyears = end_year - start_year + 1
    root_seed = resources.get("seed", NO_SEED)
    if resources.get('_seed_dictionary_', None) is not None:
        # This is added by the RunManager to ensure reproducibility including restarted runs
        seed_dict = resources.get('_seed_dictionary_')
        # Build a real list first: array() over a lazy map object would give a
        # 0-d object array that cannot be indexed by year position.
        seed_array = array([seed_dict[year] for year in years])
    else:
        seed(root_seed)
        seed_array = randint(1, 2**30, nyears)

    logger.log_status("Running simulation for years %d thru %d" % (start_year, end_year))
    logger.log_status("Simulation root seed: %s" % root_seed)

    for iyear, year in enumerate(years):
        success = self._run_each_year_as_separate_process(
            iyear, year,
            seed=seed_array[iyear],
            resources=resources,
            profiler_name=profiler_name,
            log_file=log_file)
        if not success:
            break

    self._notify_stopped()
    if profiler_name is not None:  # insert original value
        resources["profile_filename"] = profiler_name
    logger.log_status("Done running simulation for years %d thru %d" % (start_year, end_year))
def _compute_vacancy_and_total_units_variables(self, location_set, project_types, resources=None):
    """Choose and compute, for every project type, the vacancy variable and
    the total-units variable on 'location_set'.

    The variable names are taken from 'resources' under the keys
    '<ptype>_vacant_variable' and '<ptype>_total_units_variable'. If a key is
    missing, a default expression is built from the dataset name of
    'location_set' and self.project_specific_units[ptype]. The chosen names
    are stored in self.variable_for_vacancy and self.variable_for_total_units.
    """
    compute_resources = Resources(resources)
    compute_resources.merge({"debug": self.debug})
    self.variable_for_vacancy = {}
    self.variable_for_total_units = {}
    for project_type in project_types:
        units = self.project_specific_units[project_type]
        default_vacancy = "urbansim_zone.%s.vacant_%s" % (
            location_set.get_dataset_name(), units)
        vacancy_name = compute_resources.get(
            "%s_vacant_variable" % project_type, default_vacancy)
        self.variable_for_vacancy[project_type] = vacancy_name

        default_total = "%s.aggregate(urbansim_zone.building.total_%s)" % (
            location_set.get_dataset_name(), units)
        total_name = compute_resources.get(
            "%s_total_units_variable" % project_type, default_total)
        self.variable_for_total_units[project_type] = total_name

        # Both variables are computed right away, one project type at a time.
        location_set.compute_variables(
            [vacancy_name, total_name],
            dataset_pool=self.dataset_pool,
            resources=compute_resources)
class RunSimulationFromMysql:
    """Prepare, run and clean up a simulation whose base-year data is cached
    from a scenario database through a forked process."""

    def prepare_for_simulation(self, run_configuration, cache_directory=None):
        """Set up simulation state, session configuration and base-year cache.

        'run_configuration' is wrapped into a Resources object and kept in
        self.config. 'cache_directory' is passed to SimulationState as the
        base cache directory. If the configuration contains
        'estimation_database_configuration', that database is created when
        the server does not have it yet.
        """
        self.config = Resources(run_configuration)
        self.simulation_state = SimulationState(
            new_instance=True,
            base_cache_dir=cache_directory,
            start_time=self.config.get("base_year", 0)
        )

        ### TODO: Get rid of this! There is no good reason to be changing the
        ###       Configuration.
        if self.config["cache_directory"] is None:
            self.config["cache_directory"] = self.simulation_state.get_cache_directory()

        SessionConfiguration(
            new_instance=True,
            package_order=self.config["dataset_pool_configuration"].package_order,
            in_storage=AttributeCache(),
        )

        ForkProcess().fork_new_process(
            self.config["creating_baseyear_cache_configuration"].cache_scenario_database,
            self.config
        )

        # Create output database (normally done by run manager)
        if "estimation_database_configuration" in self.config:
            db_config = self.config["estimation_database_configuration"]
            db_server = DatabaseServer(db_config)
            if not db_server.has_database(db_config.database_name):
                db_server.create_database(db_config.database_name)

    def run_simulation(self, simulation_instance=None):
        """Run the model system on self.config inside a logger block.

        A new ModelSystem is created when 'simulation_instance' is None.
        The logger block is closed even if the run raises.
        """
        logger.start_block("Simulation on database %s"
                           % self.config["scenario_database_configuration"].database_name)
        try:
            if simulation_instance is None:
                simulation_instance = ModelSystem()
            simulation_instance.run(self.config)
            # simulation_instance.run_multiprocess(self.config, is_run_subset=True)
        finally:
            logger.end_block()
        logger.log_status("Data cache in %s" % self.simulation_state.get_cache_directory())

    def cleanup(self, remove_cache, remove_output_database):
        """Remove all outputs of this simulation.

        remove_cache - if true, the cache directory named in the configuration
            is deleted. It is also passed to the simulation state as
            'delete_cache'.
        remove_output_database - if true and the configuration contains
            'estimation_database_configuration', that database is dropped.
        """
        self.simulation_state.remove_singleton(delete_cache=remove_cache)
        # Remove SessionConfiguration singleton, if it exists
        Singleton().remove_singleton_for_class(SessionConfiguration)

        # Only touch the cache when asked to; previously the directory was
        # deleted even with remove_cache=False.
        if remove_cache:
            cache_dir = self.config["cache_directory"]
            if os.path.exists(cache_dir):
                rmtree(cache_dir)

        if remove_output_database and ("estimation_database_configuration" in self.config):
            db_config = self.config["estimation_database_configuration"]
            db_server = DatabaseServer(db_config)
            db_server.drop_database(db_config.database_name)

    def prepare_and_run(self, run_configuration, simulation_instance=None,
                        remove_cache=True, remove_output_database=False):
        """Prepare, run and clean up in one call.

        'remove_output_database' is new and defaults to False, so the output
        database is kept unless the caller asks for its removal. Before, the
        call to cleanup() omitted this required argument and raised TypeError.
        """
        self.prepare_for_simulation(run_configuration)
        self.run_simulation(simulation_instance)
        self.cleanup(remove_cache, remove_output_database)
class RunSimulation(object):
    """Prepare, run and clean up a simulation; the base-year cache is filled
    either from a database (forked process) or through CacheFltData."""

    def prepare_for_simulation(self, config, cache_directory=None):
        """Create simulation state and session configuration, then build the
        base-year cache.

        The base cache directory comes from the 'cache_directory_root' of the
        'creating_baseyear_cache_configuration' entry; the 'cache_directory'
        argument is not used by this method.
        """
        self.config = Resources(config)
        cache_root = self.config[
            'creating_baseyear_cache_configuration'].cache_directory_root
        start_time = self.config.get('base_year', 0)
        self.simulation_state = SimulationState(new_instance=True,
                                                base_cache_dir=cache_root,
                                                start_time=start_time)

        ### TODO: Get rid of this! There is no good reason to be changing the
        ###       Configuration.
        if self.config['cache_directory'] is None:
            self.config['cache_directory'] = \
                self.simulation_state.get_cache_directory()

        package_order = self.config['dataset_pool_configuration'].package_order
        SessionConfiguration(new_instance=True,
                             package_order=package_order,
                             in_storage=AttributeCache())

        if not config['creating_baseyear_cache_configuration'].cache_from_database:
            CacheFltData().run(self.config)
        else:
            ForkProcess().fork_new_process(
                self.config['creating_baseyear_cache_configuration'].cache_scenario_database,
                self.config)

    def run_simulation(self, simulation_instance=None):
        """Run 'simulation_instance' (a new ModelSystem if None) on
        self.config and log where the data cache is."""
        if simulation_instance is None:
            simulation_instance = ModelSystem()
        simulation_instance.run(self.config)
        #simulation_instance.run_multiprocess(self.config, is_run_subset=True)
        cache_location = self.simulation_state.get_cache_directory()
        logger.log_status("Data cache in %s" % cache_location)

    def cleanup(self, remove_cache=True):
        """Remove all outputs of this simulation."""
        self.simulation_state.remove_singleton(delete_cache=remove_cache)
        SessionConfiguration().remove_singleton()
        if not remove_cache:
            return
        cache_dir = self.config['cache_directory']
        if os.path.exists(cache_dir):
            rmtree(cache_dir)

    def prepare_and_run(self, run_configuration, simulation_instance=None, remove_cache=True):
        """Call prepare_for_simulation, run_simulation and cleanup in turn."""
        self.prepare_for_simulation(run_configuration)
        self.run_simulation(simulation_instance)
        self.cleanup(remove_cache)
def run_multiprocess(self, resources):
    """Simulate every year of the configured range, delegating each year to
    self._run_each_year_as_separate_process().

    'resources' is wrapped into a Resources object. Entries used:
        'profile_filename'  - optional; passed to each yearly run and put back
                              into the local Resources copy once the loop ends.
        'cache_directory'   - key must exist; when its value is None the
                              directory of SimulationState() is used and
                              stored under this key.
        'years'             - indexable; items [0] and [-1] give the first and
                              last simulated year (inclusive).
        'seed'              - optional root seed (default NO_SEED).
        '_seed_dictionary_' - optional mapping year -> seed that replaces the
                              randomly drawn per-year seeds.

    A failing year stops the loop. self._notify_stopped() runs afterwards
    regardless of the outcome.
    """
    resources = Resources(resources)
    profiler_name = resources.get("profile_filename", None)
    if resources['cache_directory'] is not None:
        cache_directory = resources['cache_directory']
    else:
        cache_directory = SimulationState().get_cache_directory()
        ### TODO: Get rid of this! There is absolutely no good reason to be
        ### changing the Configuration!
        resources['cache_directory'] = cache_directory

    log_file = os.path.join(cache_directory, 'run_multiprocess.log')
    logger.enable_file_logging(log_file)

    start_year = resources["years"][0]
    end_year = resources["years"][-1]
    simulated_years = range(start_year, end_year + 1)
    nyears = end_year - start_year + 1
    root_seed = resources.get("seed", NO_SEED)
    if resources.get('_seed_dictionary_', None) is not None:
        # This is added by the RunManager to ensure reproducibility including restarted runs
        seed_dict = resources.get('_seed_dictionary_')
        # A list comprehension (not map) so that array() always receives a
        # sequence and returns a 1-d array indexable by iyear.
        seed_array = array([seed_dict[year] for year in simulated_years])
    else:
        seed(root_seed)
        seed_array = randint(1, 2**30, nyears)

    logger.log_status("Running simulation for years %d thru %d" % (start_year, end_year))
    logger.log_status("Simulation root seed: %s" % root_seed)

    for iyear, year in enumerate(simulated_years):
        success = self._run_each_year_as_separate_process(
            iyear, year,
            seed=seed_array[iyear],
            resources=resources,
            profiler_name=profiler_name,
            log_file=log_file)
        if not success:
            break

    self._notify_stopped()
    if profiler_name is not None:  # insert original value
        resources["profile_filename"] = profiler_name
    logger.log_status("Done running simulation for years %d thru %d" % (start_year, end_year))
class RunSimulationFromMysql:
    """Runs a simulation after caching the base-year data from a scenario
    database in a forked process, and removes its outputs afterwards."""

    def prepare_for_simulation(self, run_configuration, cache_directory=None):
        """Create the simulation state, the session configuration and the
        base-year cache.

        'run_configuration' is copied into a Resources object stored in
        self.config. 'cache_directory' becomes the base cache directory of the
        SimulationState. A database named by
        'estimation_database_configuration' is created if the configuration
        has that entry and the server lacks the database.
        """
        self.config = Resources(run_configuration)
        self.simulation_state = SimulationState(new_instance=True,
                                                base_cache_dir=cache_directory,
                                                start_time=self.config.get('base_year', 0))

        ### TODO: Get rid of this! There is no good reason to be changing the
        ###       Configuration.
        if self.config['cache_directory'] is None:
            self.config['cache_directory'] = self.simulation_state.get_cache_directory()

        SessionConfiguration(new_instance=True,
                             package_order=self.config['dataset_pool_configuration'].package_order,
                             in_storage=AttributeCache())

        ForkProcess().fork_new_process(
            self.config['creating_baseyear_cache_configuration'].cache_scenario_database,
            self.config)

        # Create output database (normally done by run manager)
        if 'estimation_database_configuration' in self.config:
            db_config = self.config['estimation_database_configuration']
            db_server = DatabaseServer(db_config)
            if not db_server.has_database(db_config.database_name):
                db_server.create_database(db_config.database_name)

    def run_simulation(self, simulation_instance=None):
        """Run the model system (a fresh ModelSystem when none is given) on
        self.config; the logger block is always closed."""
        logger.start_block('Simulation on database %s'
                           % self.config['scenario_database_configuration'].database_name)
        try:
            if simulation_instance is None:
                simulation_instance = ModelSystem()
            simulation_instance.run(self.config)
            #simulation_instance.run_multiprocess(self.config, is_run_subset=True)
        finally:
            logger.end_block()
        logger.log_status("Data cache in %s" % self.simulation_state.get_cache_directory())

    def cleanup(self, remove_cache, remove_output_database):
        """Remove all outputs of this simulation.

        remove_cache - passed on as 'delete_cache' to the simulation state;
            if true, the configured cache directory is deleted as well.
        remove_output_database - if true, the database named in
            'estimation_database_configuration' (when configured) is dropped.
        """
        self.simulation_state.remove_singleton(delete_cache=remove_cache)
        # Remove SessionConfiguration singleton, if it exists
        Singleton().remove_singleton_for_class(SessionConfiguration)

        # Respect remove_cache: the directory used to be deleted
        # unconditionally, even when the caller asked to keep the cache.
        if remove_cache:
            cache_dir = self.config['cache_directory']
            if os.path.exists(cache_dir):
                rmtree(cache_dir)

        if remove_output_database and ('estimation_database_configuration' in self.config):
            db_config = self.config['estimation_database_configuration']
            db_server = DatabaseServer(db_config)
            db_server.drop_database(db_config.database_name)

    def prepare_and_run(self, run_configuration, simulation_instance=None,
                        remove_cache=True, remove_output_database=False):
        """Prepare, run and clean up in a single call.

        cleanup() needs both flags; the former one-argument call raised
        TypeError. The new 'remove_output_database' parameter defaults to
        False, i.e. the output database is kept unless requested otherwise.
        """
        self.prepare_for_simulation(run_configuration)
        self.run_simulation(simulation_instance)
        self.cleanup(remove_cache, remove_output_database)
def _compute_vacancy_variables(self, location_set, dev_model_configs, resources):
    """Pick and compute the vacancy variable of every project type.

    For each key of 'dev_model_configs' the entry 'units' is remembered in
    self.units_variable. The vacancy variable name is read from 'resources'
    (key '<project_type>_vacant_variable'), with a default built from the
    dataset name of 'location_set' and the units name. The name is stored in
    self.variable_for_vacancy and computed on 'location_set'.
    """
    compute_resources = Resources(resources)
    compute_resources.merge({"debug": self.debug})
    self.units_variable = {}
    self.variable_for_vacancy = {}
    for ptype in dev_model_configs:
        units = dev_model_configs[ptype]["units"]
        self.units_variable[ptype] = units
        default_name = "urbansim.%s.vacant_%s" % (
            location_set.get_dataset_name(), units)
        vacancy_name = compute_resources.get(
            "%s_vacant_variable" % ptype, default_name)
        self.variable_for_vacancy[ptype] = vacancy_name
        # Computed immediately, one project type at a time.
        location_set.compute_variables([vacancy_name],
                                       resources=compute_resources)
def run(self, data, coefficients, resources=None):
    """
    Like linear_utilities, but in addition it runs linear utilities for
    modified data: each variable in turn is set to its 5% and 95% quantile
    while all other variables are kept at their median. The resulting
    utilities (rows 0 and 1) and their difference (row 2) are collected in a
    local table.

    The file name can be passed in resources - entry 'utilities_diagnose_file'
    (default "util"); if resources has a non-None 'submodel' entry it is
    appended to the name.

    NOTE: the calls that wrote the table to that file are commented out, so
    at present the table is computed but not written; only the status message
    is logged.

    Raises StandardError if 'data' has fewer than 3 dimensions.
    Returns the result of linear_utilities.run on the unmodified data.
    """
    if data.ndim < 3:
        raise StandardError("Argument 'data' must be a 3D numpy array.")

    if not isinstance(resources, Resources):
        resources = Resources(resources)
    nobs, neqs, nvar = data.shape
    medians = zeros(nvar, dtype=float32)
    quant = zeros((2, nvar), dtype=float32)
    data_with_medians = array(data[0, :, :])
    for ivar in range(nvar):
        # compute median and quantiles for each variable
        medians[ivar], quant[0, ivar], quant[1, ivar] = quantile(
            data[:, :, ivar].ravel(), array([0.5, 0.05, 0.95]))
        data_with_medians[:, ivar] = medians[ivar]

    file_name = resources.get("utilities_diagnose_file", "util")
    if resources.get("submodel", None) is not None:
        file_name = "%s_submodel_%s" % (file_name, resources.get("submodel", 1))

    diagnose_utilities = zeros((3, nvar), dtype=float32)
    for ivar in range(nvar):  # iterate over variables
        for iquant in [0, 1]:  # 0 for 5% quantile, 1 for 95% quantile
            # copy of the median data with one variable moved to its quantile
            mod_data = array(data_with_medians).reshape(1, neqs, nvar)
            mod_data[0, :, ivar] = quant[iquant, ivar]
            utility = linear_utilities.run(self, mod_data, coefficients, resources)
            diagnose_utilities[iquant, ivar] = utility[0, 0]
    diagnose_utilities[2, :] = diagnose_utilities[1, :] - diagnose_utilities[0, :]

    # Writing of the table is disabled; kept for reference:
    #coef_names = resources.get("coefficient_names", ['x%s' % x for x in arange(nvar)+1])
    #write_to_text_file(file_name, coef_names, delimiter=' ')
    #write_table_to_text_file( file_name, diagnose_utilities, mode='ab')
    logger.log_status("Diagnosed utilities written into %s." % file_name)
    return linear_utilities.run(self, data, coefficients, resources)
def _compute_vacancy_and_total_units_variables(self, location_set, project_types, resources=None):
    """Determine and compute the vacancy and total-units variables of
    'location_set' for each project type.

    Names come from 'resources' ('<ptype>_vacant_variable' and
    '<ptype>_total_units_variable'). Missing entries fall back to expressions
    built from the dataset name of 'location_set' and
    self.project_specific_units[ptype]. Results are kept in
    self.variable_for_vacancy and self.variable_for_total_units.
    """
    compute_resources = Resources(resources)
    compute_resources.merge({"debug": self.debug})
    vacancy_names = self.variable_for_vacancy = {}
    total_names = self.variable_for_total_units = {}
    for kind in project_types:
        unit_name = self.project_specific_units[kind]

        vacancy_key = "%s_vacant_variable" % kind
        vacancy_default = "urbansim_zone.%s.vacant_%s" % (
            location_set.get_dataset_name(), unit_name)
        vacancy_names[kind] = compute_resources.get(vacancy_key, vacancy_default)

        total_key = "%s_total_units_variable" % kind
        total_default = "%s.aggregate(urbansim_zone.building.total_%s)" % (
            location_set.get_dataset_name(), unit_name)
        total_names[kind] = compute_resources.get(total_key, total_default)

        # compute both variables for this project type
        to_compute = [vacancy_names[kind], total_names[kind]]
        location_set.compute_variables(to_compute,
                                       dataset_pool=self.dataset_pool,
                                       resources=compute_resources)
def _compute_vacancy_variables(self, location_set, dev_model_configs, resources):
    """Select and compute one vacancy variable per project type.

    self.units_variable receives the 'units' entry of each project type's
    configuration. self.variable_for_vacancy receives the variable name found
    in 'resources' under '<project_type>_vacant_variable', or a default
    derived from the dataset name of 'location_set' and the units name. Each
    variable is computed on 'location_set'.
    """
    compute_resources = Resources(resources)
    compute_resources.merge({"debug": self.debug})
    units_by_type = self.units_variable = {}
    vacancy_by_type = self.variable_for_vacancy = {}
    for kind in dev_model_configs:
        units_by_type[kind] = dev_model_configs[kind]['units']
        resource_key = "%s_vacant_variable" % kind
        fallback = "urbansim.%s.vacant_%s" % (location_set.get_dataset_name(),
                                              units_by_type[kind])
        vacancy_by_type[kind] = compute_resources.get(resource_key, fallback)
        # compute the variable for this project type
        location_set.compute_variables([vacancy_by_type[kind]],
                                       resources=compute_resources)
class RunSimulation(object):
    """Runs a simulation whose base-year cache is created from a database
    (in a forked process) or by CacheFltData, depending on the configuration."""

    def prepare_for_simulation(self, config, cache_directory=None):
        """Set up SimulationState and SessionConfiguration and create the
        base-year cache.

        The 'cache_directory' argument is accepted but not used here; the base
        cache directory is the 'cache_directory_root' attribute of the
        'creating_baseyear_cache_configuration' entry.
        """
        self.config = Resources(config)
        base_cache_dir = self.config['creating_baseyear_cache_configuration'].cache_directory_root
        self.simulation_state = SimulationState(
            new_instance=True,
            base_cache_dir=base_cache_dir,
            start_time=self.config.get('base_year', 0))

        ### TODO: Get rid of this! There is no good reason to be changing the
        ###       Configuration.
        if self.config['cache_directory'] is None:
            state_cache_dir = self.simulation_state.get_cache_directory()
            self.config['cache_directory'] = state_cache_dir

        SessionConfiguration(
            new_instance=True,
            package_order=self.config['dataset_pool_configuration'].package_order,
            in_storage=AttributeCache())

        cache_from_database = config['creating_baseyear_cache_configuration'].cache_from_database
        if cache_from_database:
            cache_task = self.config['creating_baseyear_cache_configuration'].cache_scenario_database
            ForkProcess().fork_new_process(cache_task, self.config)
        else:
            CacheFltData().run(self.config)

    def run_simulation(self, simulation_instance=None):
        """Run the given model system, or a new ModelSystem, on self.config."""
        model_system = simulation_instance
        if model_system is None:
            model_system = ModelSystem()
        model_system.run(self.config)
        #simulation_instance.run_multiprocess(self.config, is_run_subset=True)
        logger.log_status("Data cache in %s" % self.simulation_state.get_cache_directory())

    def cleanup(self, remove_cache=True):
        """Remove all outputs of this simulation."""
        self.simulation_state.remove_singleton(delete_cache=remove_cache)
        SessionConfiguration().remove_singleton()
        if remove_cache:
            cache_path = self.config['cache_directory']
            if os.path.exists(cache_path):
                rmtree(cache_path)

    def prepare_and_run(self, run_configuration, simulation_instance=None, remove_cache=True):
        """Prepare, run and clean up in one call."""
        self.prepare_for_simulation(run_configuration)
        self.run_simulation(simulation_instance)
        self.cleanup(remove_cache)
def run(self,
        config = None, ### TODO: Get rid of this parameter!
        unroll_gridcells = None, ### TODO: Get rid of this parameter!
        cache_directory = None,
        base_year = None,
        creating_baseyear_cache_configuration = None,
        debuglevel = None,
        ):
    """
    Copy large baseyear datasets from MySQL into cache.

    Every argument left at None is looked up in 'config' (wrapped into a
    Resources object). The copy is done by CoreCacheScenarioDatabase;
    afterwards self.prepare_data_before_baseyear is called with the cache
    directory, the base year and the base-year cache configuration.
    """
    settings = Resources(config)

    # The lookups are kept in this order and stay conditional, so that a
    # missing configuration entry fails exactly where it did before.
    unroll_gridcells = (
        settings['creating_baseyear_cache_configuration'].unroll_gridcells
        if unroll_gridcells is None else unroll_gridcells)
    cache_directory = (
        settings['cache_directory']
        if cache_directory is None else cache_directory)
    base_year = settings['base_year'] if base_year is None else base_year
    creating_baseyear_cache_configuration = (
        copy.deepcopy(settings['creating_baseyear_cache_configuration'])
        if creating_baseyear_cache_configuration is None
        else creating_baseyear_cache_configuration)
    debuglevel = settings.get('debuglevel', 3) if debuglevel is None else debuglevel

    CoreCacheScenarioDatabase().run(settings)

    self.prepare_data_before_baseyear(cache_directory,
                                      base_year,
                                      creating_baseyear_cache_configuration)
def run(
        self,
        config=None,  ### TODO: Get rid of this parameter!
        unroll_gridcells=None,  ### TODO: Get rid of this parameter!
        cache_directory=None,
        base_year=None,
        creating_baseyear_cache_configuration=None,
        debuglevel=None,
):
    """
    Copy large baseyear datasets from MySQL into cache.

    Arguments that are None are filled in from 'config', which is first
    wrapped into a Resources object. CoreCacheScenarioDatabase performs the
    copy; self.prepare_data_before_baseyear is called afterwards.
    """
    resources = Resources(config)

    # Defaults are resolved one by one, in the original order.
    if unroll_gridcells is None:
        baseyear_entry = resources['creating_baseyear_cache_configuration']
        unroll_gridcells = baseyear_entry.unroll_gridcells
    if cache_directory is None:
        cache_directory = resources['cache_directory']
    if base_year is None:
        base_year = resources['base_year']
    if creating_baseyear_cache_configuration is None:
        baseyear_entry = resources['creating_baseyear_cache_configuration']
        creating_baseyear_cache_configuration = copy.deepcopy(baseyear_entry)
    if debuglevel is None:
        debuglevel = resources.get('debuglevel', 3)

    cacher = CoreCacheScenarioDatabase()
    cacher.run(resources)

    self.prepare_data_before_baseyear(
        cache_directory, base_year, creating_baseyear_cache_configuration)
class Estimator(GenericModelExplorer):
    """Estimate, re-estimate and run prediction for one model, driving it
    through a ModelSystem instance."""

    def __init__(self, config=None, save_estimation_results=False):
        """Set up simulation state, session configuration and model system.

        config - configuration with a non-None 'cache_directory' entry and a
            'dataset_pool_configuration' entry. The model to estimate is taken
            from 'model_name' if present, otherwise it is the first
            dictionary entry of 'models' whose value is, or contains,
            "estimate".
        save_estimation_results - if True, estimate() and reestimate() call
            save_results().

        Raises KeyError if 'config' is None or has no usable 'cache_directory'.
        """
        if config is None or 'cache_directory' not in config or config['cache_directory'] is None:
            raise KeyError("The cache directory must be specified in the "
                "given configuration, giving the filesystem path to the cache "
                "directory containing the data with which to estimate. Please "
                "check that your configuration contains the 'cache_directory' "
                "entry and that it is not None.")

        self.simulation_state = SimulationState(new_instance=True)
        self.simulation_state.set_cache_directory(config['cache_directory'])

        SessionConfiguration(new_instance=True,
                             package_order=config['dataset_pool_configuration'].package_order,
                             in_storage=AttributeCache())
        self.config = Resources(config)
        self.save_estimation_results = save_estimation_results
        self.debuglevel = self.config.get("debuglevel", 4)
        self.model_system = ModelSystem()
        self.agents_index_for_prediction = None

        models = self.config.get('models', [])

        self.model_name = None
        if "model_name" in config:
            self.model_name = config["model_name"]
        else:
            for model in models:
                if isinstance(model, dict):
                    # list() keeps this working where keys() is a view
                    model_name = list(model.keys())[0]
                    if (model[model_name] == "estimate") or (isinstance(model[model_name], list)
                        and ("estimate" in model[model_name])):
                        self.model_name = model_name
                        break

        estimate_config_changes = self.config.get('config_changes_for_estimation', {}).get('estimate_config', {})
        if len(estimate_config_changes) > 0:
            change = Resources({'models_configuration': {self.model_name: {'controller': {'init': {'arguments': {}}}}}})
            estimate_config_str = self.config['models_configuration'].get(self.model_name, {}).get('controller', {}).get('init', {}).get('arguments', {}).get('estimate_config', '{}')
            estimate_config = Resources({})
            try:
                # NOTE(security): eval() of a configuration string; only safe
                # as long as configurations come from a trusted source.
                # Wrapped in Resources because a plain dict has no merge().
                estimate_config = Resources(eval(estimate_config_str))
            except Exception:
                # best effort: fall back to an empty configuration
                logger.log_warning("Could not evaluate 'estimate_config': %s" % estimate_config_str)
            estimate_config.merge(estimate_config_changes)
            self.config.merge(change)
            self.config['models_configuration'][self.model_name]['controller']['init']['arguments']['estimate_config'] = 'Resources(%s)' % estimate_config

    def estimate(self, out_storage=None):
        """Run the model system on the configuration, extract coefficients and
        specification, and save results if requested at construction."""
        self.model_system.run(self.config, write_datasets_to_cache_at_end_of_year=False)
        self.extract_coefficients_and_specification()

        if self.save_estimation_results:
            self.save_results(out_storage=out_storage)

    def reestimate(self, specification_module_name=None, specification_dict=None, out_storage=None, type=None, submodels=None):
        """specification_module_name is name of a module that contains a dictionary called
        'specification'. If it is not given, the argument specification_dict must be given which is a dictionary object.
        'type' is the name of model member, such as 'commercial', 'residential'. The specification dictionary
        is expected to have an entry of this name. If 'submodels' is given (list or a number),
        the restimation is done only for those submodels.

        Note that unwanted submodels are deleted from the given dictionary in place.
        Raises ValueError if a requested submodel is not in the specification.
        """
        if specification_module_name is not None:
            # Import and reload the module without building code strings.
            # A non-empty 'fromlist' makes __import__ return the named
            # (sub)module itself rather than the top-level package.
            try:
                reload_module = reload  # Python 2 builtin
            except NameError:
                from importlib import reload as reload_module
            spec_module = __import__(specification_module_name, globals(), locals(), ['specification'])
            spec_module = reload_module(spec_module)
            specification_dict = spec_module.specification

        if type is not None:
            specification_dict = specification_dict[type]
        if submodels is not None:  # remove all submodels but the given ones from specification
            submodels_to_be_deleted = list(specification_dict.keys())
            if not isinstance(submodels, list):
                submodels = [submodels]
            for sm in submodels:
                if sm not in submodels_to_be_deleted:
                    raise ValueError("Submodel %s not in the specification." % sm)
                submodels_to_be_deleted.remove(sm)
            if "_definition_" in submodels_to_be_deleted:
                submodels_to_be_deleted.remove("_definition_")
            for sm in submodels_to_be_deleted:
                del specification_dict[sm]
        self.specification = EquationSpecification(specification_dict=specification_dict)
        new_namespace = self.model_system.run_year_namespace
        keys_coeff_spec = self.get_keys_for_coefficients_and_specification()
        new_namespace[keys_coeff_spec["specification"]] = self.specification
        self.coefficients, coeff_dict_dummy = self.model_system.do_process(new_namespace)
        ## update run_year_namespce since it's not been updated by do_process
        self.model_system.run_year_namespace = new_namespace
        self.model_system.run_year_namespace[keys_coeff_spec["coefficients"]] = self.coefficients

        ## this gets coeff and spec from run_year_namespce and is only updated in _run_year method
        #self.extract_coefficients_and_specification()
        if self.save_estimation_results:
            self.save_results(out_storage=out_storage)

    def predict(self, predicted_choice_id_name, agents_index=None):
        """ Run prediction. Currently makes sense only for choice models.

        The predicted choices are stored in the agents' attribute
        'predicted_choice_id_name' (-1 for agents outside 'agents_index').
        The agents' current choice attribute is restored afterwards, also
        when the prediction fails.
        Returns True on success, False if an error was logged.
        """
        # Create temporary configuration where all words 'estimate' are replaced by 'run'
        # NOTE(review): Resources(self.config) looks like a shallow copy, in
        # which case the nested assignments below also change self.config - confirm.
        tmp_config = Resources(self.config)

        if self.agents_index_for_prediction is None:
            self.agents_index_for_prediction = self.get_agent_set_index().copy()

        if agents_index is None:
            agents_index = self.agents_index_for_prediction

        run_arguments = tmp_config['models_configuration'][self.model_name]['controller']['run']['arguments']
        run_arguments['coefficients'] = "coeff_est"
        run_arguments['agents_index'] = "agents_index"
        run_arguments['chunk_specification'] = "{'nchunks':1}"

        ### save specification and coefficients to cache (no matter the save_estimation_results flag)
        ### so that the prepare_for_run method could load specification and coefficients from there
        #output_configuration = self.config['output_configuration']
        #del self.config['output_configuration']
        #self.save_results()

        #self.config['output_configuration'] = output_configuration

        #self.model_system.run_year_namespace["coefficients"] = self.coefficients
        #del tmp_config['models_configuration'][self.model_name]['controller']['prepare_for_run']

        try:
            run_year_namespace = copy.copy(self.model_system.run_year_namespace)
        except Exception:
            logger.log_error("The estimate() method must be run first")
            return False

        try:
            agents = self.get_agent_set()
            choice_id_name = self.get_choice_set().get_id_name()[0]
            # save current locations of agents
            current_choices = agents.get_attribute(choice_id_name).copy()
            dummy_data = zeros(current_choices.size, dtype=current_choices.dtype) - 1
            agents.modify_attribute(name=choice_id_name, data=dummy_data)  # reset all choices
            try:
                run_year_namespace["process"] = "run"
                run_year_namespace["coeff_est"] = self.coefficients
                run_year_namespace["agents_index"] = agents_index
                run_year_namespace["processmodel_config"] = tmp_config['models_configuration'][self.model_name]['controller']['run']
                new_choices = self.model_system.do_process(run_year_namespace)
            finally:
                # put the real choices back even if the run failed, so the
                # agents are never left with the -1 placeholders
                agents.modify_attribute(name=choice_id_name, data=current_choices)

            #self.model_system.run(tmp_config, write_datasets_to_cache_at_end_of_year=False)
            #new_choices = agents.get_attribute(choice_id_name).copy()
            dummy_data[agents_index] = new_choices
            if predicted_choice_id_name not in agents.get_known_attribute_names():
                agents.add_primary_attribute(name=predicted_choice_id_name, data=dummy_data)
            else:
                agents.modify_attribute(name=predicted_choice_id_name, data=dummy_data)
            logger.log_status("Predictions saved into attribute " + predicted_choice_id_name)
            return True
        except Exception as e:
            logger.log_error("Error encountered in prediction: %s" % e)
            logger.log_stack_trace()

        return False
class InteractionDataset(Dataset): """Class serves as a holder of interaction variables.""" def __init__(self, resources=None, dataset1=None, dataset2=None, index1=None, index2=None, dataset_name=None, debug=None): """ Argument 'resources' is of type Resources. It is merged with arguments. It should contain: dataset1 - agent class dataset2 - class of the choice dataset Optional: index1 - 1D array, indices of dataset1 index2 - If 2D array: row i contains indices of individuals of dataset2 that belong to i-th individual of dataset1[index1]. If 1D array: indices of individuals of dataset2 for all individuals of dataset1[index1]. dataset_name - subdirectory in which implementation of the interaction variables is placed (default "") dataset1.resources and dataset2.resources should contain key 'dataset_name' (see Dataset.get_dataset_name()). """ self.resources = Resources(resources) self.resources.merge_if_not_None({ "dataset1":dataset1, "dataset2":dataset2, "index1":index1, "index2":index2, "dataset_name":dataset_name, "debug":debug}) self.attribute_boxes = {} self.attribute_names = [] self.debug = self.resources.get("debug", 0) if not isinstance(self.debug, DebugPrinter): self.debug = DebugPrinter(self.debug) self.resources.check_obligatory_keys(["dataset1", "dataset2"]) self.dataset1 = self.resources["dataset1"] self.dataset2 = self.resources["dataset2"] self.index1 = self.resources.get("index1", None) self.index2 = self.resources.get("index2", None) self.dataset_name = self.resources.get("dataset_name", None) if self.dataset_name == None: self.dataset_name = self.dataset1.get_dataset_name() + '_x_' + self.dataset2.get_dataset_name() self._primary_attribute_names=[] self.index1_mapping = {} if self.index1 <> None: self.index1_mapping = do_id_mapping_dict_from_array(self.index1) self._id_names = None # for compatibility with Dataset self.variable_factory = VariableFactory() self._aliases = {} # for compatibility with Dataset def _ensure_id_attribute_is_loaded(self): 
pass def get_attribute(self, name): """ Return an array of the (by the argument name) given attribute. """ if not isinstance(name, VariableName): attr_name = VariableName(name) else: attr_name = name alias = attr_name.get_alias() dataset_name = attr_name.get_dataset_name() if not (alias in self.get_attribute_names()): if dataset_name == self.get_dataset(1).dataset_name: index = self.get_2d_index_of_dataset1() return self.get_dataset(1).get_attribute_by_index(attr_name, index) if dataset_name == self.get_dataset(2).dataset_name: index = self.get_2d_index() return self.get_dataset(2).get_attribute_by_index(attr_name, index) if alias in self.get_dataset(1).get_known_attribute_names(): index = self.get_2d_index_of_dataset1() return self.get_dataset(1).get_attribute_by_index(attr_name, index) if alias in self.get_dataset(2).get_known_attribute_names(): index = self.get_2d_index() return self.get_dataset(2).get_attribute_by_index(attr_name, index) self._raise_error(NameError, "Variable %s not found!" % alias) return self.attribute_boxes[alias].get_data() def get_attribute_of_dataset(self, name, dataset_number=1): """ Return values of attribute given by 'name' belonging to the given dataset, possibly filtred by the corresponding indes. It is a 1d array of size reduced_n or reduced_m. """ index = self.get_index(dataset_number) if index <> None: return self.get_dataset(dataset_number).get_attribute_by_index(name, index) return self.get_dataset(dataset_number).get_attribute(name) def get_id_attribute_of_dataset(self, dataset_number=1): """Like 'get_attribute_of_dataset' where name is the id_name of the given dataset. """ index = self.get_index(dataset_number) if index <> None: return self.get_dataset(dataset_number).get_id_attribute()[index] return self.get_dataset(dataset_number).get_id_attribute() def add_primary_attribute(self, data, name): """ Add values given in argument 'data' to the dataset as an attribute 'name'. 
'data' should be an array of the same size as the dataset. If this attribute already exists, its values are overwritten. The attribute is marked as a primary attribute. """ if not isinstance(data, ndarray): data=array(data) if data.shape[0] <> self.size()[0][0] or data.shape[1] <> self.size()[0][1]: logger.log_warning("In add_primary_attribute: Mismatch in sizes of the argument 'data' and the InteractionDataset object.") self.add_attribute(data, name, metadata=AttributeType.PRIMARY) def _compute_if_needed(self, name, dataset_pool, resources=None, quiet=False, version=None): """ Compute variable given by the argument 'name' only if this variable has not been computed before. Check first if this variable belongs to dataset1 or dataset2. dataset_pool holds available datasets. """ if not isinstance(name, VariableName): variable_name = VariableName(name) else: variable_name = name short_name = variable_name.get_alias() if (short_name in self.get_attribute_names()) and (self.are_dependent_variables_up_to_date( variable_name, version=version)): return version #nothing to be done dataset_name = variable_name.get_dataset_name() if dataset_name == self.get_dataset_name(): new_version = self._compute_one_variable(variable_name, dataset_pool, resources) else: owner_dataset, index = self.get_owner_dataset_and_index(dataset_name) if owner_dataset is None: self._raise_error(StandardError, "Cannot find variable '%s'\nin either dataset or in the interaction set." 
% variable_name.get_expression()) owner_dataset.compute_variables([variable_name], dataset_pool, resources=resources, quiet=True) new_version = self.add_attribute(data = owner_dataset.get_attribute_by_index(variable_name, index), name = variable_name, metadata = AttributeType.COMPUTED) attribute_box = owner_dataset._get_attribute_box(variable_name) variable = attribute_box.get_variable_instance() my_attribute_box = self._get_attribute_box(variable_name) my_attribute_box.set_variable_instance(variable) return new_version def get_owner_dataset_and_index(self, dataset_name): if dataset_name == self.dataset1.get_dataset_name(): return (self.dataset1, self.get_2d_index_of_dataset1()) elif dataset_name == self.dataset2.get_dataset_name(): return (self.dataset2, self.get_2d_index()) return (None, None) def are_dependent_variables_up_to_date(self, variable_name, version): """ Return True if the version of this variable correspond to versions of all dependent variables, otherwise False. That is, if any of the dependent variable must be recomputed, the method returns False. 
""" short_name = variable_name.get_alias() if short_name in self.get_primary_attribute_names(): return self.is_version(short_name, version) dataset_name = variable_name.get_dataset_name() owner_name = variable_name.get_dataset_name() if owner_name == self.dataset1.get_dataset_name(): owner_dataset = self.dataset1 elif owner_name == self.dataset2.get_dataset_name(): owner_dataset = self.dataset2 else: owner_dataset = self if not(dataset_name == owner_dataset.get_dataset_name()): self._raise_mismatch_dataset_name_error(variable_name) if owner_dataset is self: attribute_box = owner_dataset._get_attribute_box(variable_name) if attribute_box is None: return False variable = attribute_box.get_variable_instance() res = variable.are_dependent_variables_up_to_date(version) return not(False in res) return owner_dataset.are_dependent_variables_up_to_date(variable_name, version) def _prepare_dataset_pool_for_variable(self, dataset_pool=None, resources=None): dataset_pool, compute_resources = Dataset._prepare_dataset_pool_for_variable(self, dataset_pool, resources) dataset1_name = "dataset1" dataset2_name = "dataset2" dataset1 = self.get_dataset(1) dataset2 = self.get_dataset(2) if dataset1 <> None: dataset1_name=dataset1.get_dataset_name() if dataset2 <> None: dataset2_name=dataset2.get_dataset_name() dataset_pool.add_datasets_if_not_included({dataset1_name: dataset1, dataset2_name: dataset2}) return dataset_pool, compute_resources def get_n(self): """Return size of dataset 1. """ return self.dataset1.size() def get_m(self): """Return size of dataset 2. 
""" return self.dataset2.size() def get_reduced_n(self): if self.index1 == None: return self.get_n() if isinstance(self.index1, ndarray): return self.index1.shape[0] return self.get_n() def get_reduced_m(self): if self.index2 == None: return self.get_m() if isinstance(self.index2, ndarray): if self.index2.ndim == 1: return self.index2.shape[0] else: return self.index2.shape[1] return self.get_m() def size(self): return [(self.get_reduced_n(), self.get_reduced_m()), (self.get_n(), self.get_m())] def get_dataset(self, nr): if (nr == 1): return self.dataset1 if (nr == 2): return self.dataset2 return None def get_dataset_named(self, name): if name==self.dataset1.get_dataset_name(): return self.dataset1 if name==self.dataset2.get_dataset_name(): return self.dataset2 raise ValueError, 'trying to get an interaction set component named %s but it does not exist' % name def get_index(self, nr): if (nr == 1): return self.index1 if (nr == 2): return self.index2 return None def attribute_sum(self, name): """Return the sum of values of the given attribute. """ return (ma.ravel(self.get_attribute(name))).sum() def attribute_average(self, name): """Return the value of the given attribute averaged over the dataset. """ return ma.average(ma.ravel(self.get_attribute(name))) def summary(self, names, resources=None): """Print a marginal summary of the attributes given in the list 'names'. """ print "Summary\t\tsum\t\taverage" print "------------------------------------------------" if not isinstance(names,list): names = [names] for item in names: if not (item.get_alias() in self.get_attribute_names()): self.compute_variables([item], resources=resources) print item + "\t" + str(self.attribute_sum(item.alias))\ + "\t" + str(round(self.attribute_average(item.get_alias(),5))) def get_2d_dataset_attribute(self, name): """ Return a 2D array of the attribute given by 'name'. It is assumed to be an attribute of dataset2. 
The method should serve the purpose of preparing 1D arrays for computing intraction operations (between dataset1 and dataset2) by transfering them to the corresponding 2D array. The resulting array is of size n x m, where m is either the attribute length of dataset2, or, if index2 is a 1D array, its length, or, if index2 is a 2D array, the number of columns. n is size of dataset1 or of index1 if given. If index2 is None, all values of the given attribute are repeated n times. """ dataset = self.get_dataset(2) index = self.get_2d_index() return dataset.get_attribute_by_index(name, index) def get_2d_index(self): n = self.get_reduced_n() m = self.get_reduced_m() if self.index2 == None: index = indices((n,m))[1] elif isinstance(self.index2, ndarray): if self.index2.ndim == 1: # one-dim array index = repeat(reshape(self.index2,(1,self.index2.shape[0])), n, 0) else: index = self.index2 else: self._raise_error(StandardError, "'index2' has incompatible type. It should be a numpy array or None.") if (index.shape[0] <> n) or (index.shape[1] <> m): self._raise_error(StandardError, "'index2' has wrong dimensions.") return index def get_2d_index_of_dataset1(self): n = self.get_reduced_n() m = self.get_reduced_m() index = self.get_index(1) if index == None: index = arange(n) return repeat(reshape(index, (index.size,1)), m, 1) def create_logit_data(self, coefficients, index=None): """It creates a data array corresponding to specified coefficients (=coefficients connected to a specification) as one variable per column. 'coefficients' is of type "SpecifiedCoefficientsFor1Submodel". If 'index' is not None, it is considered as index (1D array) of dataset1 determining which individuals should be considered. Return a 3D array (nobservations|len(index) x nequations x nvariables). 
""" shape = coefficients.getshape() neqs, nvar = shape[0:2] other_dims = () if len(shape) > 2: other_dims = shape[2:] nparenteqs = coefficients.parent.nequations() if (neqs <> self.get_reduced_m()) and (nparenteqs <> self.get_reduced_m()): self._raise_error(StandardError, "create_logit_data: Mismatch in number of equations and size of dataset2.") if index <> None: nobs = index.size else: nobs = self.get_reduced_n() index = arange(nobs) variables = coefficients.get_full_variable_names() mapping = coefficients.get_coefficient_mapping() # Fill the x array from data array data_shape = tuple([nobs,neqs,nvar] + list(other_dims)) try: x = zeros(data_shape, dtype=float32) except: # in case it fails due to memory allocation error logger.log_warning("Not enough memory. Deleting not used attributes.", tags=["memory", "logit"]) var_names = map(lambda x: x.get_alias(), variables) self.dataset1.unload_not_used_attributes(var_names) self.dataset2.unload_not_used_attributes(var_names) collect() x = zeros(data_shape, dtype=float32) if (len(variables) <= 0) or (nobs <= 0): return x for ivar in range(nvar): # Iterate over variables if variables[ivar].is_constant_or_reserved_name(): c = where(mapping[:,ivar] < 0, 0.0, 1) x[:,:,ivar] = c else: data = ma.filled(self.get_attribute(variables[ivar]),0.0)[index,] if neqs < nparenteqs: data = take(data, coefficients.get_equations_index(), axis=1) if x.ndim > 3: data = resize(data, tuple(list(x.shape[0:2]) + list(other_dims))) x[:,:,ivar] = data return x def create_logit_data_from_beta_alt(self, coefficients, index=None): """It creates a data array corresponding to specified coefficients (=coefficients connected to a specification) as one coefficient per column. (Thus there can be multiple columns of one variable.) 'coefficients' is of type "SpecifiedCoefficientsFor1Submodel". If 'index' is not None, it is considered as index (1D array) of dataset1 determining which individuals should be considered. 
It puts zeros on spots where the corresponding coefficient is zero. It is meant to be used for preparing data for estimation. Return a 3D array (nobservations|len(index) x nequations x ncoefficients). """ shape = coefficients.getshape() neqs, nvar = shape[0:2] other_dims = () if len(shape) > 2: other_dims = shape[2:] nparenteqs = coefficients.parent.nequations() if (neqs <> self.get_reduced_m()) and (nparenteqs <> self.get_reduced_m()): self._raise_error(StandardError, "create_logit_data: Mismatch in number of equations and size of dataset2.") mapping = coefficients.get_coefmap_alt() ncoef = mapping.size if index <> None: nobs = index.size else: nobs = self.get_reduced_n() index = arange(nobs) variables = coefficients.get_variable_names_from_alt() # Fill the x array from data array data_shape = tuple([nobs,neqs,ncoef] + list(other_dims)) try: x = zeros(data_shape, dtype=float32) except: # in case it fails due to memory allocation error logger.log_warning("Not enough memory. Deleting not used attributes.", tags=["memory", "logit"]) self.dataset1.unload_not_used_attributes(unique(variables)) self.dataset2.unload_not_used_attributes(unique(variables)) collect() x = zeros(data_shape, dtype=float32) if (len(variables) <= 0) or (nobs <= 0): return x coefvalues = coefficients.get_beta_alt() for ivar in range(len(variables)): # Iterate over variables if coefficients.is_variable_constant_or_reserved_name(variables[ivar]): c = where(coefvalues[:,ivar] == 0, 0.0, 1) x[:,:,ivar] = c else: data = ma.filled(self.get_attribute(variables[ivar]),0.0)[index,] if neqs < nparenteqs: data = take(data, coefficients.get_equations_index(), axis=1) if x.ndim > 3: data = reshape(data, tuple(list(x.shape[0:2]) + len(other_dims)*[1])) for iodim in range(len(other_dims)): data = repeat(data, other_dims[iodim], axis=2+iodim) x[:,:,ivar] = data w = where(coefvalues[:,ivar] == 0) if x.ndim > 3: x[:,w[0], ivar, w[1:]] = 0.0 else: x[:,w,ivar] = 0.0 return x def 
modify_logit_data_for_estimation(self, data, choice, constants_positions=array([], dtype='int32')): """Modify the variable columns for alternative specific constants. It is set to one for choices where the actual choice have been made, otherwise zeros. 'data' is a 3D array (output of create_logit_data). 'choice' is a 1D array containing indices of the actual choices (within the sampled choice set) for each agent that was included in the data array. 'constants_positions' is an array with indices of the alternative specific constants within the data array. """ nobs, neqs, nvar = data.shape if where(choice<0)[0].size > 0: self._raise_error(StandardError, "There are no choices for some agents. Check argument 'choice'.") if constants_positions.size > 0: for const in constants_positions: data[:,:,const] = 0 data[arange(nobs), choice, const] = 1 return data def get_attribute_by_choice(self, name, choices, resources=None): """ 'name' is an attribute of dataset2, 'choices' is 1D array - choices[i] represents a choice (index of attribute 'name' among the values index2[i,]) for individual i of dataset1[index1]. If name == None, indices belonging to dataset2 are returned. The method returns 1D array - the actual values of the choices. """ if choices.size <> self.get_n(): self._raise_error(StandardError, "get_attribute_by_choice: Argument 'choices' must be the same size as dataset1") resources.merge_with_defaults(self.resources) if name == None: twoDattr = self.get_2d_index() else: twoDattr = self.get_2d_dataset_attribute(name, resources) return take_choices(twoDattr, choices) def is_same_as(self, name1, name2): """Test equality of 2 variables. 'name1' is an attribute of dataset1, 'name2' is an attribute of 'dataset2'. Return a 2D array. """ self.load_datasets() attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1)) return attr1 == self.get_2d_dataset_attribute(name2) def is_less_or_equal(self, name1, name2): """Test if attribute 'name1' (attr. 
of dataset1) is <= than attr. 'name2' (attr. 'dataset2'). Return a 2D array. """ self.load_datasets() attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1)) return attr1 <= self.get_2d_dataset_attribute(name2) def is_greater_or_equal(self, name1, name2): """est if attribute 'name1' (attr. of dataset1) is >= than attr. 'name2' (attr. 'dataset2'). Return a 2D array. """ self.load_datasets() attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1)) return attr1 >= self.get_2d_dataset_attribute(name2) def multiply(self, name1, name2): """Multiply 2 variables. 'name1' is an attribute of dataset1, 'name2' is an attribute of 'dataset2'. Return a 2D array. """ self.load_datasets() attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1)) return attr1 * self.get_2d_dataset_attribute(name2) def divide(self, name1, name2): """ Divide variable 'name1' (attribute of dataset1) by variable 'name2' (attribute of 'dataset2'). Return a masked 2D array. """ self.load_datasets() attr2 = reshape(self.get_attribute_of_dataset(name2),(self.get_reduced_n(), 1)) return self.get_2d_dataset_attribute(name1) / ma.masked_where(attr2 == 0.0, attr2.astype(float32)) def match_agent_attribute_to_choice(self, name, dataset_pool=None): """ Return a tuple where the first element is a 2D array of the attribute 'name_{postfix}'. It is assumed to be an attribute of dataset1 (possibly computed). {postfix} is created either by values of the attribute 'name' of dataset2 (if it has any such attribute), or by the id values of dataset2. The second value of the resulting tuple is a list of dependent variables. 
""" if 'name' in self.get_dataset(2).get_known_attribute_names(): name_postfix = self.get_attribute_of_dataset('name', 2) else: name_postfix = self.get_id_attribute_of_dataset(2) name_postfix_alt = self.get_id_attribute_of_dataset(2) dependencies = [] for i in range(self.get_reduced_m()): full_name = VariableName("%s_%s" % (name, name_postfix[i])) if full_name.get_dataset_name() is None: full_name = VariableName("%s.%s" % (self.get_dataset(1).get_dataset_name(), full_name.get_expression())) try: self.get_dataset(1).compute_variables(full_name, dataset_pool=dataset_pool) except: full_name = VariableName("%s_%s" % (name, name_postfix_alt[i])) if full_name.get_dataset_name() is None: full_name = VariableName("%s.%s" % (self.get_dataset(1).get_dataset_name(), full_name.get_expression())) self.get_dataset(1).compute_variables(full_name, dataset_pool=dataset_pool) dependencies.append(full_name.get_expression()) if i == 0: result = self.get_attribute(full_name) else: result[:,i] = self.get_attribute_of_dataset(full_name, 1) return result, dependencies def load_datasets(self): if self.dataset1.size() <= 0: self.dataset1.get_id_attribute() if self.dataset2.size() <= 0: self.dataset2.get_id_attribute() def get_index1_idx(self, ids): id = asarray(ids) try: return array(map(lambda x: self.index1_mapping[x], ids)) except: return None def get_dependent_datasets(self, variables): """Return a list of dataset names that the given variables depend on.""" result = [] for variable in variables: try: result = result + self.get_dataset(1).get_dependent_datasets(variables=[variable], quiet=True) except: try: result = result + self.get_dataset(2).get_dependent_datasets(variables=[variable], quiet=True) except: result = result + get_dependency_datasets(variables=[variable]) result = get_distinct_list(result) for i in [1,2]: # remove dependencies on datasets of this interaction, since it is implicitly given dataset_name = self.get_dataset(i).get_dataset_name() if dataset_name in result: 
result.remove(dataset_name) return result def _raise_error(self, error, msg): raise error("In interaction set '%s': %s'" % (self.name(), msg)) def name(self): return "%s -> %s" % (self.dataset1.get_dataset_name(), self.dataset2.get_dataset_name()) def get_mask(self, index): """index is an array of size reduced_n. The method returns array of 1's and 0's (of size reduced_n x reduced_m) where 0's are on rows determined by index. """ mask = ones((self.get_reduced_n(), self.get_reduced_m()), dtype="int32") for i in index: mask[i,:] = 0 return mask def interact_attribute_with_condition(self, attribute, condition, filled_value=0.0, do_logical_not=False): """Creates a 2D array (reduced_n x reduced_m) with values of 'attribute' on spots where values of the 'condition' attribute are > 0. All other spots have 'filled_value'. 'attribute' is an attribute name of the second dataset, condition is an attribute name of teh first dataset. If 'do_logical_not' is True, the condition is negated. """ cond_values = self.get_attribute_of_dataset(condition) if do_logical_not: cond_values = logical_not(cond_values) index = where(cond_values > 0)[0] mask = self.get_mask(index) return ma.filled(ma.masked_array(self.get_2d_dataset_attribute(attribute), mask=mask), filled_value) def create_and_check_qualified_variable_name(self, name): """Convert name to a VariableName if it isn't already, and add dataset_name to the VariableName if it is missing. If it already has a dataset_name, make sure it is the same as the name of this dataset. """ if isinstance(name, VariableName): vname = name else: vname = VariableName(name) if vname.get_dataset_name() is None: vname.set_dataset_name(self.get_dataset_name()) else: self._check_dataset_name(vname) return vname def get_flatten_dataset(self): """Creates a new dataset that is a 1D version of this dataset. All attributes are flattened. Id name is a combination of the two id attributes. 
""" storage = StorageFactory().get_storage('dict_storage') table_name = '%s_flatten' % self.get_dataset_name() data = {} for attr in self.get_known_attribute_names(): data[attr] = self.get_attribute(attr).ravel() ids = [] for i in [1,2]: id_name = self.get_dataset(i).get_id_name()[0] ids.append(id_name) if id_name not in data.keys(): data[id_name] = self.get_attribute(id_name).ravel() storage.write_table( table_name=table_name, table_data=data ) dataset = Dataset(in_storage=storage, id_name=ids, dataset_name=table_name, in_table_name=table_name) return dataset def _check_dataset_name(self, vname): """check that name is the name of this dataset or one of its components""" name = vname.get_dataset_name() dataset_names = set([self.get_dataset_name()] + list(self.get_dataset(i).get_dataset_name() for i in [1,2])) if name not in dataset_names: raise ValueError, "When checking dataset name of '%s': different dataset names for variable and dataset or a component: '%s' <> '%s'" % (vname.get_expression(), name, dataset_names) def add_mnl_bias_correction_term(self, probability, sampled_index, bias_attribute_name='__mnl_bias_correction_term'): """Compute and add an MNL bias correction term introduced by sampling. 'probability' is a probability array of the whole choice set. 'sampled_index' is an index of elements within the 'probability' array determining the sampled set of alternatives. The computed term is added to the interaction set as an additional attribute, using the name given in 'bias_attribute_name'. This method is mainly to be used by Samplers classes. """ lnprob = ln(probability) ln1minusprob = ln(1-probability) bias_term = ln1minusprob.sum() - \ take(ln1minusprob, sampled_index).sum(axis=1).reshape((self.get_reduced_n(),1)) + \ take(lnprob, sampled_index).sum(axis=1).reshape((self.get_reduced_n(),1)) - \ take(lnprob, sampled_index) self.add_attribute(bias_term, bias_attribute_name)
def create_from_parcel_and_development_template(parcel_dataset,
                                                development_template_dataset,
                                                parcel_index=None,
                                                template_index=None,
                                                filter_attribute=None,
                                                consider_constraints_as_rules=True,
                                                template_opus_path="urbansim_parcel.development_template",
                                                proposed_units_variable="urbansim_parcel.development_project_proposal.units_proposed",
                                                dataset_pool=None,
                                                resources=None):
    """create development project proposals from parcel and development_template_dataset,
    parcel_index - 1D array, indices of parcel_dataset. Status of the proposals is set to 'tentative'.
    template_index - index to templates that are available to create proposals;
    filter_attribute - variable that is used to filter proposals;

    If a development constraint table exists, create proposal dataset include only proposals that are allowed by constraints,
    otherwise, create a proposal dataset with Cartesian product of parcels x templates

    Return the proposal dataset, or None if 'parcel_index' is given but empty.
    Proposals with zero proposed units are dropped, unless no proposal has
    a positive number of units, in which case the set is left as it is.
    """
    resources = Resources(resources)
    debug = resources.get("debug", 0)
    if not isinstance(debug, DebugPrinter):
        debug = DebugPrinter(debug)

    if parcel_index is not None and parcel_index.size <= 0:
        logger.log_warning("parcel index for creating development proposals is of size 0. No proposals will be created.")
        return None

    storage = StorageFactory().get_storage('dict_storage')
    current_year = SimulationState().get_current_time()

    def _get_data(parcel_ids, template_ids):
        # Table content for the proposals: one row per (parcel, template) pair.
        return {
            "proposal_id": arange(1, parcel_ids.size+1, 1),
            "parcel_id" : parcel_ids,
            "template_id": template_ids,
            "start_year": array(parcel_ids.size*[current_year]),
            "status_id": resize(array([DevelopmentProjectProposalDataset.id_tentative], dtype="int16"),
                                parcel_ids.size)
            }

    def _create_project_proposals(parcel_ids, template_ids):
        # Write the table into the dict storage and wrap it in a proposal dataset.
        storage.write_table(table_name='development_project_proposals',
                            table_data=_get_data(parcel_ids, template_ids))
        development_project_proposals = DevelopmentProjectProposalDataset(
            resources=Resources(resources),
            dataset1=parcel_dataset,
            dataset2=development_template_dataset,
            index1=parcel_index,
            index2=template_index,
            in_storage=storage,
            in_table_name='development_project_proposals',
            )
        return development_project_proposals

    def _compute_filter(proposals):
        # Indices of proposals with a positive filter value, or None without a filter.
        if filter_attribute is None:
            return None
        proposals.compute_variables(filter_attribute, dataset_pool=dataset_pool,
                                    resources=Resources(resources))
        return where(proposals.get_attribute(filter_attribute) > 0)[0]

    def _subset_by_filter(proposals):
        # Reduce the proposals to those passing the filter (if there is one).
        filter_index = _compute_filter(proposals)
        if filter_index is not None:
            proposals.subset_by_index(filter_index, flush_attributes_if_not_loaded=False)
        return proposals

    if parcel_index is not None:
        index1 = parcel_index
    else:
        index1 = arange(parcel_dataset.size())

    if template_index is not None:
        index2 = template_index
    else:
        index2 = arange(development_template_dataset.size())

    # Best effort: any failure to obtain the constraint dataset (including
    # dataset_pool being None) means "no constraints".
    has_constraint_dataset = True
    try:
        constraints = dataset_pool.get_dataset("development_constraint")
        constraints.load_dataset_if_not_loaded()
    except Exception:
        has_constraint_dataset = False

    if has_constraint_dataset:
        constraint_types = unique(constraints.get_attribute("constraint_type"))  # unit_per_acre, far etc
        development_template_dataset.compute_variables(
            ["%s.%s" % (template_opus_path, ctype) for ctype in constraint_types], dataset_pool)
        parcel_dataset.get_development_constraints(constraints, dataset_pool,
                                                   index=index1,
                                                   consider_constraints_as_rules=consider_constraints_as_rules)
        generic_land_use_type_ids = development_template_dataset.compute_variables(
            "urbansim_parcel.development_template.generic_land_use_type_id",
            dataset_pool=dataset_pool)

    parcel_ids = parcel_dataset.get_id_attribute()
    template_ids = development_template_dataset.get_id_attribute()

    # Collect the per-template pieces and concatenate once after the loop.
    # The leading empty int32 arrays keep the result defined when index2 is empty.
    parcel_id_chunks = [array([], dtype="int32")]
    template_id_chunks = [array([], dtype="int32")]
    logger.start_block("Combine parcels, templates and constraints")
    try:
        for i_template in index2:
            this_template_id = template_ids[i_template]
            fit_indicator = ones(index1.size, dtype=bool)
            if has_constraint_dataset:
                generic_land_use_type_id = generic_land_use_type_ids[i_template]
                for constraint_type, constraint in parcel_dataset.development_constraints[generic_land_use_type_id].items():
                    # density converted to constraint variable name
                    template_attribute = development_template_dataset.get_attribute(constraint_type)[i_template]
                    if template_attribute == 0:
                        continue
                    min_constraint = constraint[:, 0].copy()
                    max_constraint = constraint[:, 1].copy()
                    ## treat -1 as unconstrainted
                    w_unconstr = min_constraint == -1
                    if w_unconstr.any():
                        min_constraint[w_unconstr] = template_attribute

                    w_unconstr = max_constraint == -1
                    if w_unconstr.any():
                        max_constraint[w_unconstr] = template_attribute

                    fit_indicator = logical_and(fit_indicator,
                                                logical_and(template_attribute >= min_constraint,
                                                            template_attribute <= max_constraint))

                    if constraint_type == "units_per_acre":
                        res_units_capacity = parcel_dataset.get_attribute("parcel_sqft")[index1] * max_constraint / 43560.0
                        debug.print_debug("template_id %s (GLU ID %s) max total residential capacity %s, %s of them fit constraints " % (this_template_id, generic_land_use_type_id, res_units_capacity.sum(), (res_units_capacity * fit_indicator).sum() ),
                                          12)
                    else:
                        non_res_capacity = parcel_dataset.get_attribute("parcel_sqft")[index1] * max_constraint
                        debug.print_debug("template_id %s (GLU ID %s) max total non residential capacity %s, %s of them fit constraints " % (this_template_id, generic_land_use_type_id, non_res_capacity.sum(), (non_res_capacity * fit_indicator).sum() ),
                                          12)

            parcel_id_chunks.append(parcel_ids[index1[fit_indicator]])
            template_id_chunks.append(resize(array([this_template_id]), fit_indicator.sum()))
    finally:
        # close the logger block even if combining fails
        logger.end_block()
    proposal_parcel_ids = concatenate(parcel_id_chunks)
    proposal_template_ids = concatenate(template_id_chunks)

    proposals = _create_project_proposals(proposal_parcel_ids, proposal_template_ids)
    proposals = _subset_by_filter(proposals)

    # eliminate proposals with zero units_proposed
    # NOTE(review): if no proposal has units_proposed > 0, nothing is removed - confirm this is intended.
    units_proposed = proposals.compute_variables([proposed_units_variable], dataset_pool=dataset_pool)
    where_up_greater_zero = where(units_proposed > 0)[0]
    if where_up_greater_zero.size > 0:
        proposals.subset_by_index(where_up_greater_zero, flush_attributes_if_not_loaded=False)

    logger.log_status("proposal set created with %s proposals." % proposals.size())
    #proposals.flush_dataset_if_low_memory_mode()
    return proposals
def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
        include_chosen_choice=None, with_replacement=True, resources=None, dataset_pool=None):
    """ this function samples number of sample_size (scalar value) alternatives from dataset2
    for agent set specified by dataset1.
    If index1 is not None, only samples alternatives for agents with indices in index1;
    if index2 is not None, only samples alternatives from indices in index2.
    sample_size specifies number of alternatives to be sampled for each agent.
    weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
    array of the same length as index2 or 2d array of shape (index1.size, index2.size).
    (In this sampler the weights actually used are category frequencies obtained from
    get_category_and_frequency; the 'weight' argument is only stored in the resources.)

    Agents are processed by agent category. The alternatives of each category are drawn with
    probabilities proportional to the frequency of the alternatives' categories.
    Sampling without replacement is not implemented and raises NotImplementedError.

    If include_chosen_choice is true, the first column of the sampled set is the agent's
    current choice and only sample_size - 1 alternatives are sampled.

    Return an interaction dataset with the attributes '__sampling_probability' and
    'chosen_choice', or the tuple (None, None) if index1 or index2 is empty.

    Also refer to document of interaction_dataset"""

    if dataset_pool is None:
        sc = SessionConfiguration()
        try:
            dataset_pool = sc.get_dataset_pool()
        except Exception:
            dataset_pool = DatasetPool(sc.package_order)

    local_resources = Resources(resources)
    local_resources.merge_if_not_None(
            {"dataset1": dataset1, "dataset2": dataset2,
            "index1":index1, "index2": index2,
            "sample_size": sample_size, "weight": weight,
            "with_replacement": with_replacement,
            "include_chosen_choice": include_chosen_choice})

    local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])
    agent = local_resources["dataset1"]
    choice = local_resources["dataset2"]
    index1 = local_resources.get("index1", None)
    if index1 is None:
        index1 = arange(agent.size())
    index2 = local_resources.get("index2", None)
    if index2 is None:
        index2 = arange(choice.size())

    if index1.size == 0 or index2.size == 0:
        err_msg = "either choice size or agent size is zero, return None"
        logger.log_warning(err_msg)
        return (None, None)

    agent_category_definition = local_resources.get("agent_category_definition", [])
    choice_category_definition = local_resources.get("choice_category_definition", [])
    agent_filter_attribute = local_resources.get("agent_filter_attribute", None)
    category_inflating_factor = local_resources.get("category_inflating_factor", 10)

    frequency, unique_agent_category_id, unique_choice_category_id, agent_category_id, choice_category_id = \
            get_category_and_frequency(agent, agent_category_definition,
                                       choice, choice_category_definition,
                                       agent_filter_attribute, category_inflating_factor,
                                       dataset_pool=dataset_pool)

    include_chosen_choice = local_resources.get("include_chosen_choice", False)
    chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
    chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=-1)
    chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)

    J = local_resources["sample_size"]
    if include_chosen_choice:
        # one slot is reserved for the chosen alternative
        J = J - 1
    local_resources.merge_with_defaults({'with_replacement': with_replacement})
    with_replacement = local_resources.get("with_replacement")

    sampled_index = empty((index1.size, J), dtype="int32")
    sampling_prob = empty((index1.size, J), dtype="float64")
    # Sampling probability of each agent's chosen alternative, filled per category
    # below, since every category has its own probability vector.
    sampling_prob_for_chosen_choices = zeros((index1.size, 1), dtype="float64")

    _digitize, _where, _normalize = digitize, where, normalize
    _ncumsum, _rand, _searchsorted = ncumsum, rand, searchsorted   #speed hack
    for i in range(unique_agent_category_id.size):
        category_id = unique_agent_category_id[i]
        agents_in_this_category = _where(agent_category_id[index1] == category_id)[0]
        num_agents = agents_in_this_category.size
        if num_agents == 0:
            continue
        ## divide frequency by the mean frequency to avoid overflow
        weights = frequency[i, _digitize(choice_category_id[index2], unique_choice_category_id)-1] / frequency[i, :].mean()
        prob = _normalize(weights)
        # inverse-CDF sampling: J draws (with replacement) for each agent of the category
        index = _searchsorted(_ncumsum(prob), _rand(num_agents * J)).reshape(-1, J)

        if not with_replacement:
            # A rejection scheme (sort each row, redraw the repeated entries, give up
            # after a maximum number of iterations) was sketched here but never enabled.
            raise NotImplementedError("Sample without replacement is not implemented for this sampler yet.")

        sampled_index[agents_in_this_category, :] = index
        sampling_prob[agents_in_this_category, :] = prob[index]
        if include_chosen_choice:
            sampling_prob_for_chosen_choices[agents_in_this_category, 0] = \
                    take(prob, chosen_choice_index_to_index2[agents_in_this_category])

    sampled_index = index2[sampled_index]
    if include_chosen_choice:
        sampled_index = column_stack((chosen_choice_index[:,newaxis], sampled_index))
    # Allocate after the chosen-choice column has been added, so that the attribute
    # has the same shape as the sampled set.
    is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
    if include_chosen_choice:
        is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1

        ## if chosen choice chosen is unplaced has the sampling prob is 0
        # NOTE(review): a chosen alternative that exists but lies outside index2 is
        # not zeroed here (same as before) - confirm whether it should be.
        sampling_prob_for_chosen_choices[where(chosen_choice_index==UNPLACED_ID)[0],] = 0.0
        sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])

    interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
    interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
    interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

    return interaction_dataset
def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10,
        weight=None, include_chosen_choice=False, with_replacement=False,
        resources=None, dataset_pool=None):
    """Sample alternatives from dataset2 for the agents of dataset1.

    Arguments (each may also be supplied through 'resources'; explicit
    non-None arguments are merged into a local copy of 'resources'):

    dataset1, dataset2 -- agent set and choice set (obligatory).
    index1 -- indices of the agents to sample for; all agents if None.
    index2 -- indices of the alternatives to sample from; all if None.
    sample_size -- number of alternatives per agent (obligatory).  When
        'include_chosen_choice' is true, the chosen alternative takes the
        first column and only sample_size - 1 alternatives are sampled.
    weight -- sampling weight: an attribute/variable name of dataset2, an
        interaction variable name, a 1d array (shared by all agents) or a
        2d array (one row per agent); uniform weights if None.
    include_chosen_choice -- prepend each agent's current choice.
    with_replacement -- sample with replacement.
    dataset_pool -- pool used for computing variables; if None it is taken
        from the SessionConfiguration, falling back to a new DatasetPool.

    Returns the interaction dataset built by create_interaction_dataset,
    with the attributes '__sampling_probability' and 'chosen_choice'
    added, or None if either index is empty.

    Raises ValueError if a weight name cannot be resolved or the weight
    size matches neither dataset2 nor index2, and TypeError for an
    unsupported weight type.

    Also refer to the documentation of interaction_dataset.
    """
    if dataset_pool is None:
        try:
            sc = SessionConfiguration()
            dataset_pool = sc.get_dataset_pool()
        except Exception:
            # Best effort: without a usable session configuration fall back
            # to an empty pool (but let KeyboardInterrupt/SystemExit through).
            dataset_pool = DatasetPool()

    local_resources = Resources(resources)
    local_resources.merge_if_not_None({
        "dataset1": dataset1,
        "dataset2": dataset2,
        "index1": index1,
        "index2": index2,
        "sample_size": sample_size,
        "weight": weight,
        "with_replacement": with_replacement,
        "include_chosen_choice": include_chosen_choice
    })
    local_resources.check_obligatory_keys(
        ['dataset1', 'dataset2', 'sample_size'])

    agent = local_resources["dataset1"]
    index1 = local_resources.get("index1", None)
    if index1 is None:
        index1 = arange(agent.size())
    choice = local_resources["dataset2"]
    index2 = local_resources.get("index2", None)
    if index2 is None:
        index2 = arange(choice.size())

    if index1.size == 0 or index2.size == 0:
        err_msg = "either choice size or agent size is zero, return None"
        logger.log_warning(err_msg)
        return None

    include_chosen_choice = local_resources.get("include_chosen_choice",
                                                False)
    J = local_resources["sample_size"]
    if include_chosen_choice:
        # The chosen alternative occupies one of the sample_size columns.
        J = J - 1

    with_replacement = local_resources.get("with_replacement")

    # Resolve 'weight' into an array and record its rank (1 or 2).
    weight = local_resources.get("weight", None)
    if isinstance(weight, str):
        if weight in choice.get_known_attribute_names():
            weight = choice.get_attribute(weight)
            rank_of_weight = 1
        else:
            varname = VariableName(weight)
            if varname.get_dataset_name() == choice.get_dataset_name():
                weight = choice.compute_variables(
                    weight, dataset_pool=dataset_pool)
                rank_of_weight = 1
            elif varname.get_interaction_set_names() is not None:
                ## weights can be an interaction variable
                interaction_dataset = InteractionDataset(local_resources)
                weight = interaction_dataset.compute_variables(
                    weight, dataset_pool=dataset_pool)
                rank_of_weight = 2
                assert (len(weight.shape) >= rank_of_weight)
            else:
                err_msg = ("weight is neither a known attribute name "
                           "nor a simple variable from the choice dataset "
                           "nor an interaction variable: '%s'" % weight)
                logger.log_error(err_msg)
                raise ValueError(err_msg)
    elif isinstance(weight, ndarray):
        rank_of_weight = weight.ndim
    elif not weight:
        ## weight is None (or another falsy non-array value): uniform weights
        weight = ones(index2.size)
        rank_of_weight = 1
    else:
        err_msg = "unkown weight type"
        logger.log_error(err_msg)
        raise TypeError(err_msg)

    # A weight given for the whole choice set is reduced to index2.
    if (weight.size != index2.size) and (
            weight.shape[rank_of_weight - 1] != index2.size):
        if weight.shape[rank_of_weight - 1] == choice.size():
            if rank_of_weight == 1:
                weight = take(weight, index2)
            if rank_of_weight == 2:
                weight = take(weight, index2, axis=1)
        else:
            err_msg = "weight array size doesn't match to size of dataset2 or its index"
            logger.log_error(err_msg)
            raise ValueError(err_msg)

    prob = normalize(weight)

    # Current choice of each agent, as an index into the choice set and as
    # a position within index2 (UNPLACED_ID where not found).
    chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
    chosen_choice_index = choice.try_get_id_index(
        chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
    chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2,
                                           index_if_not_found=UNPLACED_ID)

    if rank_of_weight == 1:
        # 1d weights: every agent shares the same weights for the choices.
        replace = with_replacement
        non_zero_counts = nonzerocounts(weight)
        if non_zero_counts < J:
            logger.log_warning(
                "weight array dosen't have enough non-zero counts, use sample with replacement"
            )
            replace = True
        if non_zero_counts > 0:
            sampled_index = prob2dsample(
                index2,
                sample_size=(index1.size, J),
                prob_array=prob,
                exclude_index=chosen_choice_index_to_index2,
                replace=replace,
                return_index=True)
        else:
            # all alternatives have a zero weight: nothing can be sampled
            sampled_index = zeros((index1.size, 0), dtype=DTYPE)
    elif rank_of_weight == 2:
        # 2d weights: one row of probabilities per agent.
        sampled_index = zeros((index1.size, J), dtype=DTYPE) - 1
        for i in range(index1.size):
            i_prob = prob[i, :]
            if nonzerocounts(i_prob) < J:
                # NOTE(review): the warning announces a fall-back to sampling
                # with replacement, but probsample_noreplace is called
                # regardless and is given no replacement flag; neither
                # 'with_replacement' nor the fall-back has any effect in this
                # branch.  Confirm the intended behaviour.
                logger.log_warning(
                    "weight array dosen't have enough non-zero counts, use sample with replacement"
                )
            # exclude_index passed to probsample_noreplace needs to be
            # indexed to index2
            sampled_index[i, :] = probsample_noreplace(
                index2,
                sample_size=J,
                prob_array=i_prob,
                exclude_index=chosen_choice_index_to_index2[i],
                return_index=True)

    # NOTE(review): take() without an axis indexes the flattened array; for
    # 2d 'prob' this presumably does not pick the per-agent row -- verify.
    sampling_prob = take(prob, sampled_index)
    # Keep the positions within index2/prob before translating them into
    # indices of the choice set.
    sampled_index_within_prob = sampled_index.copy()
    sampled_index = index2[sampled_index]

    if include_chosen_choice:
        # The chosen alternative becomes column 0 of every agent's sample.
        sampled_index = column_stack(
            (chosen_choice_index[:, newaxis], sampled_index))
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1
        ## this is necessary because prob is indexed to index2, not to the
        ## choice set (as is chosen_choice_index)
        sampling_prob_for_chosen_choices = take(
            prob, chosen_choice_index_to_index2[:, newaxis])
        ## if the chosen choice equals unplaced_id then the sampling prob is 0
        sampling_prob_for_chosen_choices[where(
            chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
        sampling_prob = column_stack(
            [sampling_prob_for_chosen_choices, sampling_prob])
    else:
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")

    interaction_dataset = self.create_interaction_dataset(
        dataset1, dataset2, index1, sampled_index)
    interaction_dataset.add_attribute(sampling_prob,
                                      '__sampling_probability')
    interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

    if local_resources.get("include_mnl_bias_correction_term", False):
        if include_chosen_choice:
            sampled_index_within_prob = column_stack(
                (chosen_choice_index_to_index2[:, newaxis],
                 sampled_index_within_prob))
        interaction_dataset.add_mnl_bias_correction_term(
            prob, sampled_index_within_prob)

    return interaction_dataset
class RegressionModel(ChunkModel):
    """Chunk model that predicts values for a dataset with a regression
    procedure and estimates the coefficients of such a regression.

    The regression procedure is obtained from RegressionModelFactory; the
    estimation procedure is loaded through ModelComponentCreator.  Data and
    coefficient names of the last run/estimation are kept per submodel in
    'self.data' and 'self.coefficient_names'.
    """

    model_name = "Regression Model"
    model_short_name = "RM"

    def __init__(self,
                 regression_procedure="opus_core.linear_regression",
                 submodel_string=None,
                 run_config=None,
                 estimate_config=None,
                 debuglevel=0,
                 dataset_pool=None):
        """'regression_procedure' is the name passed to
        RegressionModelFactory; StandardError is raised if no procedure is
        obtained.  'run_config' and 'estimate_config' (Resources, dict or
        None) are the default configurations for run() and estimate().
        """
        self.debug = DebugPrinter(debuglevel)
        self.dataset_pool = self.create_dataset_pool(dataset_pool)

        self.regression = RegressionModelFactory().get_model(
            name=regression_procedure)
        if self.regression is None:
            raise StandardError("No regression procedure given.")

        self.submodel_string = submodel_string
        self.run_config = self._as_resources(run_config)
        self.estimate_config = self._as_resources(estimate_config)

        self.data = {}
        self.coefficient_names = {}
        ChunkModel.__init__(self)
        self.get_status_for_gui().initialize_pieces(
            3,
            pieces_description=array(
                ['initialization', 'computing variables', 'submodel: 1']))

    @staticmethod
    def _as_resources(config):
        """Return 'config' as Resources: None becomes an empty Resources and
        a plain dict is wrapped; anything else is returned unchanged."""
        if config is None:
            return Resources()
        if not isinstance(config, Resources) and isinstance(config, dict):
            return Resources(config)
        return config

    def _compute_filter_index(self, dataset, dataset_filter,
                              filter_threshold):
        """Return the indices where 'dataset_filter' computed on 'dataset'
        exceeds 'filter_threshold', or None if either is not given."""
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables(
                [dataset_filter], dataset_pool=self.dataset_pool)
            return where(filter_values > filter_threshold)[0]
        return None

    def run(self,
            specification,
            coefficients,
            dataset,
            index=None,
            chunk_specification=None,
            data_objects=None,
            run_config=None,
            initial_values=None,
            procedure=None,
            debuglevel=0):
        """'specification' is of type EquationSpecification,
        'coefficients' is of type Coefficients,
        'dataset' is of type Dataset,
        'index' are indices of individuals in dataset for which the model
        runs. If it is None, the whole dataset is considered.
        'chunk_specification' determines number of chunks in which the
        simulation is processed.
        'data_objects' is a dictionary where each key is the name of an data
        object ('zone', ...) and its value is an object of class Dataset.
        'run_config' is of type Resources, it gives additional arguments for
        the run.
        If 'procedure' is given, it overwrites the regression_procedure of
        the constructor.
        'initial_values' is an array of the initial values of the results. It
        will be overwritten by the results for those elements that are
        handled by the model (defined by submodels in the specification). By
        default the results are initialized with 0.
        'debuglevel' overwrites the constructor 'debuglevel'.

        Returns the result of ChunkModel.run.
        """
        self.debug.flag = debuglevel
        run_config = self._as_resources(run_config)
        self.run_config = run_config.merge_with_defaults(self.run_config)
        self.run_config.merge({"debug": self.debug})

        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        if procedure is not None:
            self.regression = RegressionModelFactory().get_model(
                name=procedure)

        # Load the dataset and settle the index *before* sizing and filling
        # the initial values, so that they are built for the loaded dataset
        # and an explicit index array.
        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())

        if initial_values is None:
            self.initial_values = zeros((dataset.size(), ), dtype=float32)
        else:
            self.initial_values = zeros((dataset.size(), ),
                                        dtype=initial_values.dtype)
            self.initial_values[index] = initial_values

        result = ChunkModel.run(self,
                                chunk_specification,
                                dataset,
                                index,
                                float32,
                                specification=specification,
                                coefficients=coefficients)
        return result

    def run_chunk(self, index, dataset, specification, coefficients):
        """Run the regression for the agents in 'index', submodel by
        submodel, and return the outcome array (initial values where no
        submodel applies).

        Raises ValueError if the regression data of a submodel contain NaN
        or Inf.
        """
        self.specified_coefficients = SpecifiedCoefficients().create(
            coefficients, specification, neqs=1)
        compute_resources = Resources({"debug": self.debug})
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(
            submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels,
                                     self.submodel_string,
                                     dataset,
                                     index,
                                     dataset_pool=self.dataset_pool,
                                     resources=compute_resources)
        variables = self.specified_coefficients.get_full_variable_names_without_constants(
        )
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables,
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        coef = {}
        outcome = self.initial_values[index].copy()
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(
                self.specified_coefficients, submodel)
            self.coefficient_names[submodel] = coef[
                submodel].get_coefficient_names_without_constant()[0, :]
            self.debug.print_debug(
                "Compute regression for submodel " + str(submodel), 4)
            self.increment_current_status_piece()
            self.data[submodel] = dataset.create_regression_data(
                coef[submodel],
                index=index[self.observations_mapping[submodel]])

            # Column positions (i.e. variables) holding invalid values.
            nan_index = where(isnan(self.data[submodel]))[1]
            inf_index = where(isinf(self.data[submodel]))[1]
            if nan_index.size > 0:
                nan_var_index = unique(nan_index)
                raise ValueError(
                    "NaN(Not A Number) is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable."
                    % coef[submodel].get_variable_names()[nan_var_index])
            if inf_index.size > 0:
                inf_var_index = unique(inf_index)
                raise ValueError(
                    "Inf is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable."
                    % coef[submodel].get_variable_names()[inf_var_index])

            if (self.data[submodel].shape[0] > 0) and (
                    self.data[submodel].size > 0):
                # observations for this submodel available
                outcome[self.observations_mapping[submodel]] = \
                    self.regression.run(
                        self.data[submodel],
                        coef[submodel].get_coefficient_values()[0, :],
                        resources=self.run_config).astype(outcome.dtype)
        return outcome

    def correct_infinite_values(self,
                                dataset,
                                outcome_attribute_name,
                                maxvalue=1e+38,
                                clip_all_larger_values=False):
        """Check if the model resulted in infinite values. If yes, print
        warning and clip the values to maxvalue.
        If clip_all_larger_values is True, all values larger than maxvalue
        are clip to maxvalue.
        """
        infidx = where(
            dataset.get_attribute(outcome_attribute_name) == inf)[0]
        if infidx.size > 0:
            logger.log_warning("Infinite values in %s. Clipped to %s." %
                               (outcome_attribute_name, maxvalue))
            dataset.set_values_of_one_attribute(outcome_attribute_name,
                                                maxvalue, infidx)
        if clip_all_larger_values:
            idx = where(
                dataset.get_attribute(outcome_attribute_name) > maxvalue)[0]
            if idx.size > 0:
                logger.log_warning(
                    "Values in %s larger than %s. Clipped to %s." %
                    (outcome_attribute_name, maxvalue, maxvalue))
                dataset.set_values_of_one_attribute(outcome_attribute_name,
                                                    maxvalue, idx)

    def estimate(self,
                 specification,
                 dataset,
                 outcome_attribute,
                 index=None,
                 procedure=None,
                 data_objects=None,
                 estimate_config=None,
                 debuglevel=0):
        """'specification' is of type EquationSpecification,
        'dataset' is of type Dataset,
        'outcome_attribute' - string that determines the dependent variable,
        'index' are indices of individuals in dataset for which the model
        runs. If it is None, the whole dataset is considered.
        'procedure' - name of the estimation procedure. If it is None, there
        should be an entry "estimation" in 'estimate_config' that determines
        the procedure. The class must have a method 'run' that takes as
        arguments 'data', 'regression_procedure' and 'resources'. It returns
        a dictionary with entries 'estimators', 'standard_errors' and
        't_values' (all 1D numpy arrays).
        'data_objects' is a dictionary where each key is the name of an data
        object ('zone', ...) and its value is an object of class Dataset.
        'estimate_config' is of type Resources, it gives additional arguments
        for the estimation procedure.
        'debuglevel' overwrites the class 'debuglevel'.

        Returns the tuple (coefficients, estimated_coef), where
        estimated_coef maps each estimated submodel to the result of the
        estimation procedure; (None, None) if there are no observations.
        """
        self.debug.flag = debuglevel
        estimate_config = self._as_resources(estimate_config)
        self.estimate_config = estimate_config.merge_with_defaults(
            self.estimate_config)
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)

        self.procedure = procedure
        if self.procedure is None:
            self.procedure = self.estimate_config.get("estimation", None)
        if self.procedure is not None:
            self.procedure = ModelComponentCreator().get_model_component(
                self.procedure)
        else:
            logger.log_warning(
                "No estimation procedure given, or problems with loading the corresponding module."
            )

        compute_resources = Resources({"debug": self.debug})
        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())
        if not isinstance(index, ndarray):
            index = array(index)

        # should be a proportion of the agent_set
        estimation_size_agents = self.estimate_config.get(
            "estimation_size_agents", None)
        if estimation_size_agents is None:
            estimation_size_agents = 1.0
        else:
            # between 0 and 1
            estimation_size_agents = max(min(estimation_size_agents, 1.0),
                                         0.0)

        if estimation_size_agents < 1.0:
            self.debug.print_debug("Sampling agents for estimation ...", 3)
            estimation_idx = sample_noreplace(
                arange(index.size), int(index.size * estimation_size_agents))
        else:
            estimation_idx = arange(index.size)
        estimation_idx = index[estimation_idx]
        self.debug.print_debug(
            "Number of observations for estimation: " +
            str(estimation_idx.size), 2)
        if estimation_idx.size <= 0:
            self.debug.print_debug("Nothing to be done.", 2)
            return (None, None)

        coefficients = create_coefficient_from_specification(specification)
        specified_coefficients = SpecifiedCoefficients().create(
            coefficients, specification, neqs=1)
        submodels = specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(
            submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(
            submodels,
            self.submodel_string,
            dataset,
            estimation_idx,
            dataset_pool=self.dataset_pool,
            resources=compute_resources,
            submodel_size_max=self.estimate_config.get(
                'submodel_size_max', None))
        variables = specified_coefficients.get_full_variable_names_without_constants(
        )
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables,
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)

        coef = {}
        estimated_coef = {}
        self.outcome = {}
        dataset.compute_variables([outcome_attribute],
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        # Built from the 'estimate_config' passed to this call (after the
        # Resources conversion above), not from the merged
        # self.estimate_config.
        regression_resources = Resources(estimate_config)
        regression_resources.merge({"debug": self.debug})
        outcome_variable_name = VariableName(outcome_attribute)
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(
                specified_coefficients, submodel)
            self.increment_current_status_piece()
            logger.log_status("Estimate regression for submodel " +
                              str(submodel),
                              tags=["estimate"],
                              verbosity_level=2)
            logger.log_status("Number of observations: " +
                              str(self.observations_mapping[submodel].size),
                              tags=["estimate"],
                              verbosity_level=2)
            self.data[
                submodel] = dataset.create_regression_data_for_estimation(
                    coef[submodel],
                    index=estimation_idx[self.observations_mapping[submodel]])
            self.coefficient_names[submodel] = coef[
                submodel].get_coefficient_names_without_constant()[0, :]
            if (self.data[submodel].shape[0] > 0) and (
                    self.data[submodel].size > 0) and (self.procedure
                                                        is not None):
                # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(
                    outcome_variable_name.get_alias(),
                    estimation_idx[self.observations_mapping[submodel]])
                regression_resources.merge(
                    {"outcome": self.outcome[submodel]})
                regression_resources.merge({
                    "coefficient_names":
                    self.coefficient_names[submodel].tolist(),
                    "constant_position":
                    coef[submodel].get_constants_positions()
                })
                estimated_coef[submodel] = self.procedure.run(
                    self.data[submodel],
                    self.regression,
                    resources=regression_resources)
                # Copy whatever the procedure returned into the submodel's
                # coefficient object.
                if "estimators" in estimated_coef[submodel].keys():
                    coef[submodel].set_coefficient_values(
                        estimated_coef[submodel]["estimators"])
                if "standard_errors" in estimated_coef[submodel].keys():
                    coef[submodel].set_standard_errors(
                        estimated_coef[submodel]["standard_errors"])
                if "other_measures" in estimated_coef[submodel].keys():
                    for measure in estimated_coef[submodel][
                            "other_measures"].keys():
                        coef[submodel].set_measure(
                            measure, estimated_coef[submodel]
                            ["other_measures"][measure])
                if "other_info" in estimated_coef[submodel].keys():
                    for info in estimated_coef[submodel]["other_info"]:
                        coef[submodel].set_other_info(
                            info,
                            estimated_coef[submodel]["other_info"][info])
        coefficients.fill_coefficients(coef)

        self.save_predicted_values_and_errors(specification,
                                              coefficients,
                                              dataset,
                                              outcome_variable_name,
                                              index=index,
                                              data_objects=data_objects)
        return (coefficients, estimated_coef)

    def prepare_for_run(self,
                        dataset=None,
                        dataset_filter=None,
                        filter_threshold=0,
                        **kwargs):
        """Return (specification, coefficients, index); 'index' holds the
        elements of 'dataset' whose 'dataset_filter' value exceeds
        'filter_threshold', or is None if no dataset or filter is given."""
        spec, coef = prepare_specification_and_coefficients(**kwargs)
        index = self._compute_filter_index(dataset, dataset_filter,
                                           filter_threshold)
        return (spec, coef, index)

    def prepare_for_estimate(self,
                             dataset=None,
                             dataset_filter=None,
                             filter_threshold=0,
                             **kwargs):
        """Return (specification, index); 'index' holds the elements of
        'dataset' whose 'dataset_filter' value exceeds 'filter_threshold',
        or is None if no dataset or filter is given."""
        spec = get_specification_for_estimation(**kwargs)
        index = self._compute_filter_index(dataset, dataset_filter,
                                           filter_threshold)
        return (spec, index)

    def get_data_as_dataset(self, submodel=-2):
        """Like get_all_data, but the returning value is a Dataset containing
        attributes that correspond to the data columns. Their names are
        coefficient names.  Returns None if no data or names are available.
        """
        all_data = self.get_all_data(submodel)
        if all_data is None:
            return None
        names = self.get_coefficient_names(submodel)
        if names is None:
            return None
        dataset_data = {}
        for i in range(names.size):
            dataset_data[names[i]] = all_data[:, i].reshape(
                all_data.shape[0])
        # One-based running id for the rows.
        dataset_data["id"] = arange(all_data.shape[0]) + 1
        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name='dataset', table_data=dataset_data)
        ds = Dataset(in_storage=storage, id_name="id",
                     in_table_name='dataset')
        return ds

    def save_predicted_values_and_errors(self,
                                         specification,
                                         coefficients,
                                         dataset,
                                         outcome_variable,
                                         index=None,
                                         data_objects=None):
        """If 'save_predicted_values_and_errors' is set in the estimate
        configuration, run the model with the estimated coefficients and
        store the predictions and residuals on 'dataset' as the primary
        attributes 'predicted_<alias>' and 'residuals_<alias>' (zero outside
        'index'), where <alias> is the alias of 'outcome_variable'.
        """
        if self.estimate_config.get('save_predicted_values_and_errors',
                                    False):
            logger.log_status('Computing predicted values and residuals.')
            original_values = dataset.get_attribute_by_index(
                outcome_variable, index)
            predicted_values = zeros(dataset.size(), dtype='float32')
            predicted_values[index] = self.run_after_estimation(
                specification,
                coefficients,
                dataset,
                index=index,
                data_objects=data_objects)
            predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias(
            )
            dataset.add_primary_attribute(name=predicted_attribute_name,
                                          data=predicted_values)
            dataset.flush_attribute(predicted_attribute_name)
            predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias(
            )
            error_values = zeros(dataset.size(), dtype='float32')
            error_values[index] = (original_values -
                                   predicted_values[index]).astype(
                                       error_values.dtype)
            dataset.add_primary_attribute(
                name=predicted_error_attribute_name, data=error_values)
            dataset.flush_attribute(predicted_error_attribute_name)
            logger.log_status(
                'Predicted values saved as %s (for the %s dataset)' %
                (predicted_attribute_name, dataset.get_dataset_name()))
            logger.log_status(
                'Residuals saved as %s (for the %s dataset)' %
                (predicted_error_attribute_name, dataset.get_dataset_name()))

    def export_estimation_data(self,
                               submodel=-2,
                               file_name='./estimation_data_regression.txt',
                               delimiter='\t'):
        """Write the outcome and the estimation data of 'submodel' as a
        delimited text table with a header row.

        The output file is '<root of file_name>_submodel_<submodel>.txt';
        the extension of 'file_name' itself is not used.
        """
        import os
        from numpy import newaxis
        data = concatenate((self.outcome[submodel][..., newaxis],
                            self.get_all_data(submodel=submodel)),
                           axis=1)
        header = ['outcome'] + self.get_coefficient_names(submodel).tolist()
        nrows = data.shape[0]
        file_name_root = os.path.splitext(file_name)[0]
        out_file = "%s_submodel_%s.txt" % (file_name_root, submodel)
        # The context manager guarantees the file is flushed and closed,
        # also when writing fails.
        with open(out_file, 'w') as fh:
            fh.write(delimiter.join(header) + '\n')  # file header
            for row in range(nrows):
                line = [str(x) for x in data[row, ]]
                fh.write(delimiter.join(line) + '\n')
        print('Data written into %s' % out_file)

    def run_after_estimation(self, *args, **kwargs):
        """Run the model after an estimation; delegates to run()."""
        return self.run(*args, **kwargs)

    def _get_status_total_pieces(self):
        """Total number of status pieces: chunk pieces times GUI pieces."""
        return ChunkModel._get_status_total_pieces(
            self) * self.get_status_for_gui().get_total_number_of_pieces()

    def _get_status_current_piece(self):
        """Current status piece across chunks and GUI pieces."""
        return ChunkModel._get_status_current_piece(
            self) * self.get_status_for_gui().get_total_number_of_pieces(
            ) + self.get_status_for_gui().get_current_piece()

    def _get_status_piece_description(self):
        """Chunk piece description followed by the GUI piece description."""
        return "%s %s" % (
            ChunkModel._get_status_piece_description(self),
            self.get_status_for_gui().get_current_piece_description())

    def get_specified_coefficients(self):
        """Return the SpecifiedCoefficients created by the last run_chunk."""
        return self.specified_coefficients
def create_from_parcel_and_development_template(
        parcel_dataset,
        development_template_dataset,
        parcel_index=None,
        template_index=None,
        filter_attribute=None,
        consider_constraints_as_rules=True,
        template_opus_path="urbansim_parcel.development_template",
        proposed_units_variable="urbansim_parcel.development_project_proposal.units_proposed",
        dataset_pool=None,
        resources=None):
    """Create development project proposals from parcels and development
    templates.  Status of the proposals is set to 'tentative'.

    parcel_index - 1D array, indices of parcel_dataset (all parcels if None);
    template_index - index to templates that are available to create
        proposals (all templates if None);
    filter_attribute - variable that is used to filter proposals: only
        proposals with a value > 0 are kept;
    consider_constraints_as_rules - passed on to
        parcel_dataset.get_development_constraints;
    template_opus_path - package path prefixed to each constraint type to
        form the template variable that is computed;
    proposed_units_variable - variable giving the proposed units; proposals
        with zero units are dropped.

    If a development constraint table exists, the created proposal dataset
    includes only proposals that are allowed by constraints, otherwise it is
    the Cartesian product of parcels x templates.

    Returns the proposal dataset, or None if 'parcel_index' is empty.
    """
    resources = Resources(resources)
    debug = resources.get("debug", 0)
    if not isinstance(debug, DebugPrinter):
        debug = DebugPrinter(debug)

    if parcel_index is not None and parcel_index.size <= 0:
        logger.log_warning(
            "parcel index for creating development proposals is of size 0. No proposals will be created."
        )
        return None

    storage = StorageFactory().get_storage('dict_storage')
    current_year = SimulationState().get_current_time()

    def _get_data(parcel_ids, template_ids):
        # Table columns of the proposal set: one row per (parcel, template).
        return {
            "proposal_id": arange(1, parcel_ids.size + 1, 1),
            "parcel_id": parcel_ids,
            "template_id": template_ids,
            "start_year": array(parcel_ids.size * [current_year]),
            "status_id": resize(
                array([DevelopmentProjectProposalDataset.id_tentative],
                      dtype="int16"), parcel_ids.size)
        }

    def _create_project_proposals(parcel_ids, template_ids):
        # Write the proposal table and wrap it into a proposal dataset.
        storage.write_table(table_name='development_project_proposals',
                            table_data=_get_data(parcel_ids, template_ids))
        development_project_proposals = DevelopmentProjectProposalDataset(
            resources=Resources(resources),
            dataset1=parcel_dataset,
            dataset2=development_template_dataset,
            index1=parcel_index,
            index2=template_index,
            in_storage=storage,
            in_table_name='development_project_proposals',
        )
        return development_project_proposals

    def _compute_filter(proposals):
        # Indices of proposals passing the filter; None if there is no filter.
        if filter_attribute is not None:
            proposals.compute_variables(filter_attribute,
                                        dataset_pool=dataset_pool,
                                        resources=Resources(resources))
            filter_index = where(
                proposals.get_attribute(filter_attribute) > 0)[0]
            return filter_index
        return None

    def _subset_by_filter(proposals):
        # Reduce the proposals to those passing the filter (if any).
        filter_index = _compute_filter(proposals)
        if filter_index is not None:
            proposals.subset_by_index(filter_index,
                                      flush_attributes_if_not_loaded=False)
        return proposals

    if parcel_index is not None:
        index1 = parcel_index
    else:
        index1 = arange(parcel_dataset.size())

    if template_index is not None:
        index2 = template_index
    else:
        index2 = arange(development_template_dataset.size())

    # Best effort: any failure to obtain the constraint dataset (including
    # dataset_pool being None) means "no constraints".  Only Exception is
    # caught, so KeyboardInterrupt/SystemExit still propagate.
    has_constraint_dataset = True
    try:
        constraints = dataset_pool.get_dataset("development_constraint")
        constraints.load_dataset_if_not_loaded()
    except Exception:
        has_constraint_dataset = False

    if has_constraint_dataset:
        # unit_per_acre, far etc
        constraint_types = unique(
            constraints.get_attribute("constraint_type"))
        development_template_dataset.compute_variables(
            ["%s.%s" % (template_opus_path, x) for x in constraint_types],
            dataset_pool)

        parcel_dataset.get_development_constraints(
            constraints,
            dataset_pool,
            index=index1,
            consider_constraints_as_rules=consider_constraints_as_rules)
        generic_land_use_type_ids = development_template_dataset.compute_variables(
            "urbansim_parcel.development_template.generic_land_use_type_id",
            dataset_pool=dataset_pool)

    parcel_ids = parcel_dataset.get_id_attribute()
    template_ids = development_template_dataset.get_id_attribute()

    # Per-template pieces, concatenated once after the loop.  The empty
    # int32 seeds keep the result dtype the same as repeatedly concatenating
    # onto an empty int32 array.
    parcel_id_chunks = [array([], dtype="int32")]
    template_id_chunks = [array([], dtype="int32")]
    logger.start_block("Combine parcels, templates and constraints")
    try:
        for i_template in index2:
            this_template_id = template_ids[i_template]
            # Parcels (within index1) on which this template is allowed.
            fit_indicator = ones(index1.size, dtype="bool")
            if has_constraint_dataset:
                generic_land_use_type_id = generic_land_use_type_ids[
                    i_template]
                for constraint_type, constraint in parcel_dataset.development_constraints[
                        generic_land_use_type_id].iteritems():
                    # density converted to constraint variable name
                    template_attribute = development_template_dataset.get_attribute(
                        constraint_type)[i_template]
                    if template_attribute == 0:
                        continue
                    # Column 0 holds the minimum, column 1 the maximum.
                    min_constraint = constraint[:, 0].copy()
                    max_constraint = constraint[:, 1].copy()
                    ## treat -1 as unconstrained
                    w_unconstr = min_constraint == -1
                    if w_unconstr.any():
                        min_constraint[w_unconstr] = template_attribute

                    w_unconstr = max_constraint == -1
                    if w_unconstr.any():
                        max_constraint[w_unconstr] = template_attribute

                    fit_indicator = logical_and(
                        fit_indicator,
                        logical_and(template_attribute >= min_constraint,
                                    template_attribute <= max_constraint))

                    # Capacities are only reported in the debug output.
                    if constraint_type == "units_per_acre":
                        res_units_capacity = parcel_dataset.get_attribute(
                            "parcel_sqft")[index1] * max_constraint / 43560.0
                        debug.print_debug(
                            "template_id %s (GLU ID %s) max total residential capacity %s, %s of them fit constraints "
                            % (this_template_id, generic_land_use_type_id,
                               res_units_capacity.sum(),
                               (res_units_capacity * fit_indicator).sum()),
                            12)
                    else:
                        non_res_capacity = parcel_dataset.get_attribute(
                            "parcel_sqft")[index1] * max_constraint
                        debug.print_debug(
                            "template_id %s (GLU ID %s) max total non residential capacity %s, %s of them fit constraints "
                            % (this_template_id, generic_land_use_type_id,
                               non_res_capacity.sum(),
                               (non_res_capacity * fit_indicator).sum()), 12)

            parcel_id_chunks.append(parcel_ids[index1[fit_indicator]])
            template_id_chunks.append(
                resize(array([this_template_id]), fit_indicator.sum()))
    finally:
        # Close the logger block even if combining fails.
        logger.end_block()

    proposal_parcel_ids = concatenate(parcel_id_chunks)
    proposal_template_ids = concatenate(template_id_chunks)

    proposals = _create_project_proposals(proposal_parcel_ids,
                                          proposal_template_ids)
    proposals = _subset_by_filter(proposals)

    # eliminate proposals with zero units_proposed
    # NOTE(review): if no proposal has units_proposed > 0 the set is left
    # unchanged rather than emptied -- presumably to avoid an empty dataset;
    # confirm.
    units_proposed = proposals.compute_variables([proposed_units_variable],
                                                 dataset_pool=dataset_pool)
    where_up_greater_zero = where(units_proposed > 0)[0]
    if where_up_greater_zero.size > 0:
        proposals.subset_by_index(where_up_greater_zero,
                                  flush_attributes_if_not_loaded=False)

    logger.log_status("proposal set created with %s proposals." %
                      proposals.size())
    return proposals
def run(self, dataset1, dataset2, index1=None, index2=None, stratum=None, weight=None,
        sample_size=1, sample_size_from_each_stratum=None,
        sample_size_from_chosen_stratum=None, sample_rate=None,
        include_chosen_choice=False, resources=None, with_replacement=False,
        dataset_pool=None, **kwargs):
    """Sample alternatives from dataset2, stratum by stratum, for agents of dataset1.

    All keyword arguments below are merged into a copy of 'resources' with
    merge_if_not_None and then read back from it, so they may also be supplied
    through 'resources' under the key of the same name.

    dataset1 -- agent dataset (obligatory).
    dataset2 -- dataset of alternatives (obligatory).
    index1 -- indices of the agents to sample for; all agents if None.
    index2 -- indices of the alternatives to sample from; all alternatives if None.
    stratum -- obligatory; either an attribute name of dataset2 or an array
        that is indexed by dataset2 indices. Alternatives whose stratum equals
        NO_STRATUM_ID are not sampled.
    weight -- sampling weight: an attribute name of dataset2, a 1d array of the
        same length as index2 (or as dataset2), a 2d array of shape
        (index1.size, index2.size), or None for equal weights.
    sample_size -- number of alternatives sampled from each stratum; overridden
        by sample_size_from_each_stratum if that is not None.
    sample_size_from_chosen_stratum -- if given, the number of alternatives
        sampled from the stratum of each agent's current choice.
    sample_rate -- not implemented; any non-None value raises NotImplementedError.
    include_chosen_choice -- if true, the agent's current choice is prepended as
        column 0 of the sampled index and flagged in 'chosen_choice'.
    with_replacement -- only stored in the resources by this method.
    dataset_pool -- resolved here if None, but not otherwise used by this method.

    Returns the interaction dataset of the sampled (agent, alternative) pairs
    with attributes '__sampling_probability', 'chosen_choice' and 'stratum_id',
    or the tuple (None, None) when index1 or index2 is empty.

    Raises TypeError for an unsupported weight type, ValueError if the weight
    size matches neither index2 nor dataset2, StandardError if no stratum is
    defined, and NotImplementedError if sample_rate is given.

    Also refer to the documentation of interaction_dataset.
    """
    if dataset_pool is None:
        try:
            sc = SessionConfiguration()
            dataset_pool = sc.get_dataset_pool()
        except Exception:
            # No usable session configuration: fall back to an empty pool.
            dataset_pool = DatasetPool()

    local_resources = Resources(resources)
    local_resources.merge_if_not_None({
        "dataset1": dataset1,
        "dataset2": dataset2,
        "index1": index1,
        "index2": index2,
        "with_replacement": with_replacement,
        "stratum": stratum,
        "weight": weight,
        "sample_size": sample_size,
        "sample_size_from_each_stratum": sample_size_from_each_stratum,
        "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
        "sample_rate": sample_rate,
        "include_chosen_choice": include_chosen_choice,
    })
    local_resources.check_obligatory_keys(['dataset1', 'dataset2'])

    # Read both datasets back from the resources so that they may be supplied
    # either as arguments or through 'resources'.
    agent = local_resources["dataset1"]
    choice = local_resources["dataset2"]

    index1 = local_resources.get("index1", None)
    if index1 is None:
        # presumably forces the id attribute to be loaded before size() is used -- TODO confirm
        agent.get_id_attribute()
        index1 = arange(agent.size())
    index2 = local_resources.get("index2", None)
    if index2 is None:
        choice.get_id_attribute()
        index2 = arange(choice.size())

    if index1.size == 0 or index2.size == 0:
        err_msg = "either choice size or agent size is zero, return None"
        logger.log_warning(err_msg)
        return (None, None)

    include_chosen_choice = local_resources.get("include_chosen_choice", False)

    # --- sampling weights -------------------------------------------------
    weight = local_resources.get("weight", None)
    if isinstance(weight, str):
        choice.compute_variables(weight, resources=local_resources)
        weight = choice.get_attribute(weight)
        rank_of_weight = 1
    elif isinstance(weight, ndarray):
        rank_of_weight = weight.ndim
    elif weight is None:
        weight = ones(index2.size)
        rank_of_weight = 1
    else:
        err_msg = "unknown weight type"
        logger.log_error(err_msg)
        raise TypeError(err_msg)

    last_axis = rank_of_weight - 1
    if (weight.size != index2.size) and (weight.shape[last_axis] != index2.size):
        if weight.shape[last_axis] == choice.size():
            # Weights cover all of dataset2: keep only the alternatives in
            # index2. Taking along the last axis keeps a 2d weight array 2d.
            weight = take(weight, index2, axis=last_axis)
        else:
            err_msg = "weight array size doesn't match to size of dataset2 or its index"
            logger.log_error(err_msg)
            raise ValueError(err_msg)

    prob = normalize(weight)

    # --- strata -----------------------------------------------------------
    stratum = local_resources.get("stratum", None)
    if stratum is None:
        raise StandardError("'stratum' must be defined for stratified sampling.")
    if isinstance(stratum, str):
        choice.compute_variables(stratum, resources=local_resources)
        stratum = choice.get_attribute(stratum)

    # Current choice of every agent in index1, as an index into dataset2
    # (-1 if the id is not found) and as a position within index2.
    chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
    chosen_choice_index = choice.try_get_id_index(chosen_choice_id,
                                                  return_value_if_not_found=-1)
    chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2,
                                           index_if_not_found=UNPLACED_ID)

    ##TODO: check all chosen strata are in selectable strata,
    # i.e. chosen_choice_index is in index2
    chosen_stratum = ones(chosen_choice_index.size, dtype=DTYPE) * NO_STRATUM_ID
    has_choice = chosen_choice_index != -1
    chosen_stratum[has_choice] = stratum[chosen_choice_index[has_choice]]

    selectable_strata = stratum[index2]
    unique_strata = unique(selectable_strata)
    unique_strata = unique_strata[unique_strata != NO_STRATUM_ID]

    # --- sample sizes -----------------------------------------------------
    sample_size = local_resources.get("sample_size", None)
    sample_size_from_each_stratum = local_resources.get("sample_size_from_each_stratum", None)
    if sample_size_from_each_stratum is None:
        sample_size_from_each_stratum = sample_size
    strata_sample_size = ones(unique_strata.size, dtype=DTYPE) * sample_size_from_each_stratum

    sample_rate = local_resources.get("sample_rate", None)
    if sample_rate is not None:
        raise NotImplementedError("sample_rate is not implemented yet.")
        ##TODO: to be finished
        #num_elements_in_strata = histogram(selectable_strata, unique_strata)
        #strata_sample_size = round(num_elements_in_strata * sample_rate)

    sample_size_from_chosen_stratum = local_resources.get("sample_size_from_chosen_stratum", None)

    # --- sampling ---------------------------------------------------------
    if sample_size_from_chosen_stratum is None and not include_chosen_choice:
        # Every agent uses the same (stratum, sample size) pairs.
        strata_sample_pairs = array(
            [[stratum_id, size] for stratum_id, size in zip(unique_strata, strata_sample_size)])
        if rank_of_weight == 1:
            sampled_index = self._sample_by_stratum(
                index1, index2, selectable_strata, prob,
                chosen_choice_index_to_index2, strata_sample_pairs)
        elif rank_of_weight == 2:
            sampled_index = self._sample_by_agent_and_stratum(
                index1, index2, selectable_strata, prob,
                chosen_choice_index_to_index2, strata_sample_pairs)
    else:
        # The sample size of the chosen stratum differs per agent, so build
        # one table of (stratum, sample size) pairs for each agent.
        strata_sample_setting = zeros((index1.size, unique_strata.size, 2), dtype=DTYPE)
        for i in range(index1.size):
            agents_strata_sample_size = strata_sample_size.copy()
            in_chosen_stratum = unique_strata == chosen_stratum[i]
            if sample_size_from_chosen_stratum is None:
                ## include_chosen_choice is True here: sample one less from the
                ## chosen stratum, since the chosen choice itself is added later
                agents_strata_sample_size[in_chosen_stratum] -= 1
            else:
                agents_strata_sample_size[in_chosen_stratum] = sample_size_from_chosen_stratum
            strata_sample_setting[i, ...] = array(
                [[stratum_id, size]
                 for stratum_id, size in zip(unique_strata, agents_strata_sample_size)])

        sampled_index = self._sample_by_agent_and_stratum(
            index1, index2, selectable_strata, prob,
            chosen_choice_index_to_index2, strata_sample_setting)

    # NOTE(review): self._sampling_probability and self._stratum_id are not
    # assigned in this method; presumably the _sample_by_* helpers set them -- confirm.
    if include_chosen_choice:
        # The chosen choice becomes column 0 of the sampled index.
        sampled_index = concatenate((chosen_choice_index[:, newaxis], sampled_index), axis=1)
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

        # Probability of the chosen choice within its stratum; stays -1 for
        # agents whose chosen stratum is not among the selectable strata.
        chosen_probability = zeros((chosen_choice_index.size,), dtype=float32) - 1
        for stratum_id in unique_strata:
            w = chosen_stratum == stratum_id
            chosen_probability[w] = (
                prob[chosen_choice_index[w]]
                / prob[selectable_strata == stratum_id].sum()).astype(float32)
        self._sampling_probability = concatenate(
            (chosen_probability[:, newaxis], self._sampling_probability), axis=1)
        self._stratum_id = concatenate(
            (chosen_stratum[:, newaxis], self._stratum_id), axis=1)
    else:
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")

    interaction_dataset = self.create_interaction_dataset(agent, choice, index1, sampled_index)
    interaction_dataset.add_attribute(self._sampling_probability, '__sampling_probability')
    interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
    interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

    return interaction_dataset
class PandasDataset(Dataset):
    """
    This is under construction.
    It is an attempt to have an analogue of an Opus Dataset that uses a pandas
    DataFrame. The actual data is stored in the attribute 'df', a DataFrame
    indexed by the dataset's unique identifier.
    The dataset can be created from the same inputs as an Opus dataset.
    Alternatively, it can be created from an existing Opus dataset using the
    constructor PandasClassFactory.
    """

    def __init__(self, create_from_data=True, **kwargs):
        """Create the dataset; with create_from_data=False nothing is loaded
        or initialised. 'kwargs' are passed on to create_from_data."""
        if create_from_data:
            self.create_from_data(**kwargs)

    def create_from_data(self, resources=None, id_name=None, in_storage=None,
                         dataset_name=None, out_storage=None, in_table_name=None,
                         out_table_name=None):
        """Initialise the dataset and load the table 'in_table_name' from
        'in_storage' into self.df, indexed by the id attribute(s).

        Arguments are merged into a copy of 'resources' with merge_if_not_None;
        'dataset_name' defaults to "dataset".
        """
        self.resources = Resources(resources)
        self.resources.merge_if_not_None({
            "id_name": id_name,
            "dataset_name": dataset_name,
            "in_storage": in_storage,
            "out_storage": out_storage,
            "in_table_name": in_table_name,
            "out_table_name": out_table_name})
        self.resources.merge_with_defaults({"dataset_name": "dataset"})
        self.dataset_name = self.resources.get("dataset_name", None)
        self.attribute_cache = AttributeCache()
        self._aliases = {}
        self._id_names = self.resources.get("id_name", [])
        if not isinstance(self._id_names, list):
            self._id_names = [self._id_names]
        self.variable_factory = VariableFactory()
        self.debug = self.resources.get("debug", 0)
        self.df = pd.DataFrame(
            self.resources.get('in_storage').load_table(self.resources.get('in_table_name')))
        # Taken before set_index, so the id column(s) are still listed here.
        self._primary_attribute_names = self.get_attribute_names()
        self.df.set_index(self._id_names, inplace=True)
        self.attribute_boxes = {}
        for attr in self._primary_attribute_names:
            self.attribute_boxes[attr] = self._pandas_attribute_box(attr, AttributeType.PRIMARY)
        self.n = self.df.shape[0]

    def _pandas_attribute_box(self, attr, attribute_type):
        """Return an in-memory, version-0 AttributeBox for attribute 'attr'.
        The box itself holds no data; the values live in self.df."""
        return AttributeBox(self, [],
                            variable_name=self.create_and_check_qualified_variable_name(attr),
                            type=attribute_type,
                            is_in_memory=True,
                            header=None,
                            version=0)

    def __getitem__(self, attr):
        """ dataset[attr] """
        return self.get_attribute(attr)

    def __setitem__(self, attr, values):
        """ dataset[attr] = values

        Writes directly into self.df; no attribute box is created or updated.
        """
        self.df[attr] = values

    def get_attribute(self, name):
        """Return the values of attribute 'name' (a string or a VariableName)
        as an array. For an id attribute the index values are returned."""
        if isinstance(name, VariableName):
            name = name.get_alias()
        else:
            name = VariableName(name).get_alias()
        if name in self.get_id_name():
            return self.get_id_attribute()
        return self.df[name].values

    def get_id_attribute(self):
        """Return the values of the DataFrame index."""
        return self.df.index.values

    def get_attribute_by_id(self, name, id):
        """Return the value(s) of attribute 'name' for the given id(s)."""
        return self.df[name][id]

    def get_attribute_names(self):
        """Return the column labels of self.df (the index is not included)."""
        return self.df.columns

    def _do_flush_attribute(self, name):
        """For now don't do anything."""
        pass

    def load_dataset(self, resources=None, attributes=None, in_storage=None,
                     in_table_name=None, lowercase=None, **kwargs):
        """Replace self.df with data loaded from table 'in_table_name' of
        'in_storage'.

        'attributes' selects the columns to load (default '*'). If the storage
        also contains a table '<in_table_name>.computed', its columns are added
        and registered as COMPUTED attributes; all other loaded columns are
        registered as PRIMARY. 'in_storage' and 'in_table_name' are obligatory,
        either as arguments or through the resources.
        """
        # set defaults
        attributes_default = '*'
        lower_default = 1  # if 1, use lowercase for attribute names

        # merge arguments with dictionaries and add missing entries
        local_resources = Resources(self.resources)
        if resources is not None:
            local_resources.merge_if_not_None(resources)
        local_resources.merge_if_not_None({"attributes": attributes,
                                           "in_storage": in_storage,
                                           "in_table_name": in_table_name,
                                           "lowercase": lowercase})
        local_resources.merge_with_defaults({"attributes": attributes_default,
                                             "lowercase": lower_default,
                                             })

        # check obligatory entries
        local_resources.check_obligatory_keys(["in_storage", "in_table_name"])

        # prepare for loading
        in_storage = local_resources["in_storage"]
        if not self._is_hidden_id():
            local_resources.merge({"id_name": self._id_names})
        table_name = local_resources['in_table_name']
        column_names = local_resources['attributes']

        chunked_attributes = self.chunk_columns(storage=in_storage,
                                                table_name=table_name,
                                                column_names=column_names,
                                                nchunks=1)
        # flatten list, keeping only columns that exist in the table
        available_columns = in_storage.get_column_names(table_name)
        column_names = [name for name in chunked_attributes[0] if name in available_columns]
        data = in_storage.load_table(table_name=table_name, column_names=column_names)
        self.df = pd.DataFrame(data)
        self.df.set_index(self._id_names, inplace=True)

        data_computed = {}
        computed_table_name = table_name + ".computed"
        if computed_table_name in in_storage.get_table_names():
            available_computed = in_storage.get_column_names(computed_table_name)
            column_names_computed = [name for name in column_names if name in available_computed]
            data_computed = in_storage.load_table(table_name=computed_table_name,
                                                  column_names=column_names_computed)
            dfcomp = pd.DataFrame(data_computed)
            dfcomp.set_index(self._id_names, inplace=True)
            # pd.concat expects a sequence of frames; axis=1 places the
            # computed columns next to the primary ones, aligned on the index.
            self.df = pd.concat([self.df, dfcomp], axis=1)

        for attr in data:
            # do not store id_name every time
            if not ((attr in self._id_names) and (attr in self.attribute_boxes)):
                self.attribute_boxes[attr] = self._pandas_attribute_box(attr, AttributeType.PRIMARY)

        for attr in data_computed:
            # do not store id_name every time
            if not ((attr in self._id_names) and (attr in self.attribute_boxes)):
                self.attribute_boxes[attr] = self._pandas_attribute_box(attr, AttributeType.COMPUTED)

        self.n = self.df.shape[0]

    def add_attribute(self, data, name, metadata=2):
        """Add values given in argument 'data' to dataset as an attribute 'name' as type 'metadata'.
        If this attribute already exists, its values are overwritten.
        'metadata' should be of type AttributeType (PRIMARY=1, COMPUTED=2).
        The method increments and returns the version number of the attribute.
        """
        if not (isinstance(data, ndarray) or is_masked_array(data)):
            data = array(data)
        name = self.create_and_check_qualified_variable_name(name)
        short_name = name.get_alias()
        if short_name in self.get_attribute_names():
            self.attribute_boxes[short_name].set_is_in_memory(True)
            self.attribute_boxes[short_name].set_type(metadata)
        else:
            self.attribute_boxes[short_name] = AttributeBox(self, data=[], variable_name=name,
                                                            type=metadata)
        if metadata == AttributeType.PRIMARY:
            self._add_to_primary_attribute_names(short_name)
        self.df[short_name] = data
        self.__increment_version(short_name)
        return self.get_version(short_name)

    def attribute_sum(self, name):
        """Return the sum of values of the attribute 'name'.
        """
        return self.df[name].sum()

    def attribute_average(self, name):
        """Return the value of the given attribute averaged over the dataset.
        """
        return self.df[name].mean()

    def summary(self, index=None):
        """Return the pandas describe() statistics of the whole dataset, or of
        the selection self.df[index] if 'index' is given."""
        if index is not None:
            return self.df[index].describe()
        return self.df.describe()

    def size(self):
        """Return size of the dataset."""
        return self.df.shape[0]

    def get_data_element_by_id(self, id, all_attributes=False):
        """Return an object of class DataElement of the given identifier id. See get_data_element."""
        # get_data_element accepts extra options only as keywords.
        return self.get_data_element(id, all_attributes=all_attributes)

    def get_data_element(self, id, **kwargs):
        """Return an object of class DataElement of the given index.

        Every column of self.df becomes an attribute of the returned object;
        'kwargs' are accepted but ignored.
        """
        element = DataElement()
        for col in self.get_attribute_names():
            setattr(element, col, self.df[col][id])
        return element

    def subset_by_ids(self, ids, **kwargs):
        """Shrink the dataset to the rows whose identifiers are given by 'ids'.
        The removed data are then lost.
        """
        self.df = self.df.loc[ids]
        self.n = self.df.shape[0]

    def aggregate_dataset_over_ids(self, dataset, function='sum', attribute_name=None, constant=None):
        """Aggregate attribute (given by 'attribute_name') of the given 'dataset' over
        self by applying the given function. The dataset is expected to have an attribute of the same
        name as the unique identifier of self. If attribute_name is not given, the
        argument 'constant' must be given, which is either a scalar or a numpy array. If it
        is a scalar, for each individual to be counted the constant value is taken into the function;
        if it is a numpy array of the same size as dataset, the value in the same index as
        individual is counted into the function.

        'function' is the name of a numpy function (looked up with getattr on np).
        Returns the result of the pandas groupby aggregation.

        NOTE: when 'constant' is used, the values are written into dataset.df
        as column '__constant__'; masked values of an existing attribute are
        replaced by NaN in dataset.df as well.
        """
        workdf = dataset.df
        if attribute_name is None:
            # 'is None' rather than '== None': the latter is elementwise on an ndarray.
            if constant is None:
                self._raise_error(StandardError,
                                  "Either 'attribute_name' or 'constant' must be given.")
            elif isinstance(constant, ndarray):
                if constant.size != dataset.size():
                    self._raise_error(StandardError,
                                      "constant's size (%d) must be of the same as dataset's size (%d)"
                                      % (constant.size, dataset.size()))
                values = constant
            else:
                values = resize(array([constant]), dataset.size())
            attribute_name = '__constant__'
            workdf[attribute_name] = values
        else:
            if is_masked_array(dataset[attribute_name]):
                # do not consider masked elements in the computation
                workdf[attribute_name] = ma.filled(workdf[attribute_name], NaN)
        #logger.start_block('Aggregate Pandas')
        grouped = workdf.groupby(self.get_id_name())[attribute_name]
        f = getattr(np, function)
        res = grouped.aggregate(f)
        #logger.end_block()
        return res

    def get_join_data(self, dataset, name, join_attribute=None, return_value_if_not_found=None, **kwargs):
        """Does a join on a attribute of two datasets (self and 'dataset').
        'join_attribute' specifies the join attribute of self. If this is None it is
        assumed to be identical to dataset._id_names which is the join attribute of 'dataset'.
        The method returns values of the attribute 'name' (which is an attribute of 'dataset')
        for the joined ids, i.e. the resulting array should have the same size as self.

        'name' may be one attribute name or a list of names; a DataFrame with
        one column per name is returned. Values that have no match are set to
        'return_value_if_not_found'; if that is None, a default for the dtype
        kind of the attribute is used ('' for strings, False for booleans,
        -1 for signed integers, 0 for unsigned integers, -1.0 for floats).
        """
        default_return_values_by_type = {'S': '', 'U': '', 'b': False, 'i': -1, 'u': 0, 'f': -1.0}
        jattr = join_attribute
        if jattr is None:
            jattr = dataset.get_id_name()
        if not isinstance(jattr, list):
            jattr = [jattr]
        if not isinstance(name, list):
            name = [name]
        #logger.start_block('Disaggregate Pandas')
        result = self.df[jattr].join(dataset.df[name], on=jattr)[name]
        #result = dataset.df[name].loc[self.df[jattr[0]]]
        #logger.end_block()
        for attr in result.columns:
            if result[attr].dtype == object:
                result[attr] = result[attr].astype(dataset.df[attr].dtype)
            missing = np.isnan(result[attr].values)
            if missing.any():
                k = dataset.df[attr].values.dtype.kind
                if return_value_if_not_found is None and k in default_return_values_by_type:
                    val = default_return_values_by_type[k]
                else:
                    val = return_value_if_not_found
                # Assign through .loc on the frame itself; chained assignment
                # (result[attr].iloc[...] = val) may write to a temporary copy.
                result.loc[missing, attr] = val
        return result

    def __set_version(self, name, version):
        self.attribute_boxes[name].set_version(version)

    def __increment_version(self, name):
        """Set the version of attribute 'name' to 0 if it has none yet,
        otherwise increase it by one."""
        if self.get_version(name) is None:
            self.__set_version(name, 0)
        else:
            self.__set_version(name, self.get_version(name) + 1)
def run(self, building_set, building_types_table, vacancy_table, year, location_set,
        building_categories=None, dataset_pool=None, resources=None):
    """Add new buildings to 'building_set' until target vacancy rates are met.

    For every building type listed in 'building_types_table' the current
    vacancy is compared with the target rate stored in 'vacancy_table' for
    'year' (attribute 'target_total_<type>_vacancy'). If more units are
    needed, existing buildings of that type are sampled at random (with
    repetition) and copies of their size and improvement value are added as
    new buildings until the required number of units is reached. New buildings
    get location id 0 and land value 0.

    building_set -- dataset of buildings; modified in place.
    building_types_table -- dataset with attributes 'name', 'is_residential'
        and 'units' (the building_set variable giving the size of a building).
    vacancy_table -- dataset of target vacancy rates, indexed by year.
    year -- year for which buildings are created ('year_built' of new buildings).
    location_set -- dataset of locations over which vacancy is computed.
    building_categories -- if given, stored in
        building_set.resources['building_categories'].
    dataset_pool, resources -- passed on to compute_variables.

    Returns the number of buildings added to building_set.
    """
    building_types = building_types_table.get_attribute("name")
    building_id_name = building_set.get_id_name()[0]
    location_id_name = location_set.get_id_name()[0]
    location_dataset_name = location_set.get_dataset_name()
    new_buildings = {
        building_id_name: array([], dtype=building_set.get_data_type(building_id_name)),
        "building_type_id": array([], dtype=building_set.get_data_type("building_type_id", int8)),
        "year_built": array([], dtype=building_set.get_data_type("year_built", int32)),
        "sqft": array([], dtype=building_set.get_data_type("sqft", int32)),
        "residential_units": array([], dtype=building_set.get_data_type("residential_units", int32)),
        "improvement_value": array([], dtype=building_set.get_data_type("improvement_value", float32)),
        "land_value": array([], dtype=building_set.get_data_type("land_value", float32)),
        location_id_name: array([], dtype=building_set.get_data_type(location_id_name, int32)),
    }
    max_id = building_set.get_id_attribute().max()
    buildings_set_size_orig = building_set.size()

    # Created once, before the loop: it does not depend on the building type
    # and is also needed after the loop, even if every building type is skipped.
    compute_resources = Resources(resources)
    compute_resources.merge({"debug": self.debug})

    for itype, building_type in enumerate(building_types):  # iterate over building types
        type_code = building_types_table.get_id_attribute()[itype]
        is_residential = building_types_table.get_attribute("is_residential")[itype]
        vacancy_attribute = 'target_total_%s_vacancy' % building_type
        if vacancy_attribute not in vacancy_table.get_known_attribute_names():
            logger.log_warning("No target vacancy for building type '%s'. Transition model for this building type skipped." % building_type)
            continue
        vacancy_table.get_attribute(vacancy_attribute)  # ensures that the attribute is loaded
        target_vacancy_rate = getattr(vacancy_table.get_data_element_by_id(year),
                                      vacancy_attribute)

        units_attribute = building_types_table.get_attribute('units')[itype]

        # determine current-year vacancy rates
        if is_residential:
            default_vacancy_variable = "urbansim.%s.vacant_%s_units_from_buildings" % (
                location_dataset_name, building_type)
        else:
            default_vacancy_variable = "urbansim.%s.vacant_%s_sqft_from_buildings" % (
                location_dataset_name, building_type)
        variable_for_vacancy = compute_resources.get(
            "%s_vacant_variable" % building_type, default_vacancy_variable)
        location_set.compute_variables(
            [variable_for_vacancy,
             "urbansim.%s.buildings_%s_space" % (location_dataset_name, building_type)],
            dataset_pool=dataset_pool, resources=compute_resources)

        vacant_units_sum = location_set.get_attribute(variable_for_vacancy).sum()
        units_sum = float(location_set.get_attribute("buildings_%s_space" % building_type).sum())
        vacant_rate = self.safe_divide(vacant_units_sum, units_sum)

        # Units to add so that vacancy reaches the target once they exist:
        # (vacant + x) / (units + x) = target  =>  x = (target*units - vacant) / (1 - target)
        should_develop_units = int(round(max(
            0, (target_vacancy_rate * units_sum - vacant_units_sum) / (1 - target_vacancy_rate))))
        logger.log_status(building_type + ": vacant units: %d, should be vacant: %f, sum units: %d"
                          % (vacant_units_sum, target_vacancy_rate * units_sum, units_sum))

        if not should_develop_units:
            logger.log_note(("Will not build any " + building_type + " units, because the current vacancy of %d units\n"
                             + "is more than the %d units desired for the vacancy rate of %f.")
                            % (vacant_units_sum,
                               target_vacancy_rate * units_sum,
                               target_vacancy_rate))
            continue

        improvement_value = building_set.compute_variables(
            "urbansim.%s.%s_improvement_value" % (building_set.get_dataset_name(), building_type),
            dataset_pool=dataset_pool, resources=compute_resources)
        average_improvement_value = improvement_value.sum() / units_sum

        # create buildings
        is_building_type = building_set.compute_variables(
            "urbansim.building.is_building_type_%s" % building_type,
            dataset_pool=dataset_pool, resources=compute_resources)
        units_of_this_type = building_set.compute_variables(
            units_attribute, dataset_pool=dataset_pool, resources=compute_resources)
        units_of_this_type = units_of_this_type * is_building_type
        units_without_zeros_idx = where(units_of_this_type > 0)[0]
        history_values_without_zeros = units_of_this_type[units_without_zeros_idx]
        if history_values_without_zeros.size == 0:
            # Nothing to sample from: the mean size would be NaN and the
            # random draw below would have an empty range.
            logger.log_warning("No existing buildings of type '%s' to sample from. Transition model for this building type skipped." % building_type)
            continue
        # Buildings without a positive improvement value get the average value.
        history_improvement_values_without_zeros = where(
            improvement_value[units_without_zeros_idx] > 0,
            improvement_value[units_without_zeros_idx],
            average_improvement_value)
        mean_size = history_values_without_zeros.mean()
        idx = array([], dtype="int32")
        # Ensure that there are some development projects to choose from.
        num_of_projects_to_select = max(10, int(should_develop_units / mean_size))
        # Keep drawing existing buildings until their cumulative size reaches
        # the required number of units; draws beyond the limit are dropped.
        while True:
            idx = concatenate((idx, randint(0, history_values_without_zeros.size,
                                            size=num_of_projects_to_select)))
            csum = history_values_without_zeros[idx].cumsum()
            idx = idx[where(csum <= should_develop_units)]
            if csum[-1] >= should_develop_units:
                break
        nbuildings = idx.size
        new_buildings["building_type_id"] = concatenate(
            (new_buildings["building_type_id"], type_code * ones(nbuildings)))
        new_buildings["year_built"] = concatenate(
            (new_buildings["year_built"], year * ones(nbuildings)))
        new_max_id = max_id + nbuildings
        new_buildings[building_id_name] = concatenate(
            (new_buildings[building_id_name], arange(max_id + 1, new_max_id + 1)))
        max_id = new_max_id
        new_buildings["improvement_value"] = concatenate(
            (new_buildings["improvement_value"], history_improvement_values_without_zeros[idx]))

        if is_residential:
            target_size_attribute = "residential_units"
            zero_attribute = "sqft"
        else:
            target_size_attribute = "sqft"
            zero_attribute = "residential_units"
        new_buildings[target_size_attribute] = concatenate(
            (new_buildings[target_size_attribute], history_values_without_zeros[idx]))
        new_buildings[zero_attribute] = concatenate(
            (new_buildings[zero_attribute], zeros(nbuildings)))
        new_buildings[location_id_name] = concatenate(
            (new_buildings[location_id_name], zeros(nbuildings)))
        new_buildings["land_value"] = concatenate(
            (new_buildings["land_value"], zeros(nbuildings)))
        logger.log_status("Creating %s %s of %s %s buildings."
                          % (history_values_without_zeros[idx].sum(),
                             target_size_attribute, nbuildings, building_type))

    building_set.add_elements(new_buildings, require_all_attributes=False)
    if building_categories:  # should be a dictionary of categories for each building type
        building_set.resources['building_categories'] = building_categories
    # add submodel attribute: recompute the size category of every building
    # type and store it as a primary attribute
    category_variables = ["urbansim.%s.size_category_%s" % (building_set.get_dataset_name(), name)
                          for name in building_types]
    for category_var in category_variables:
        var = VariableName(category_var)
        if var.get_alias() in building_set.get_known_attribute_names():
            building_set.delete_one_attribute(var)
        building_set.compute_variables(var, dataset_pool=dataset_pool,
                                       resources=compute_resources)
        building_set.add_primary_attribute(building_set.get_attribute(var), var.get_alias())

    difference = building_set.size() - buildings_set_size_orig
    return difference
def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
        include_chosen_choice=None, with_replacement=True, resources=None, dataset_pool=None):
    """Sample 'sample_size' alternatives from dataset2 for each agent of
    dataset1, with sampling weights that depend on the agent's category.

    If index1 is not None, only samples alternatives for agents with indices in index1;
    if index2 is not None, only samples alternatives from indices in index2.
    sample_size specifies the number of alternatives to be sampled for each agent
    (including the chosen one if include_chosen_choice is true).

    The weight of an alternative for an agent is the frequency returned by
    get_category_and_frequency for the pair (agent category, choice category).
    The category definitions are read from 'resources' under the keys
    'agent_category_definition', 'choice_category_definition',
    'agent_filter_attribute' and 'category_inflating_factor'.
    The argument 'weight' is only stored in the resources by this method.

    Only sampling with replacement is implemented; with_replacement=False
    raises NotImplementedError.

    Returns the interaction dataset of the sampled pairs with attributes
    '__sampling_probability' and 'chosen_choice', or the tuple (None, None)
    when index1 or index2 is empty.

    Also refer to the documentation of interaction_dataset.
    """
    if dataset_pool is None:
        sc = SessionConfiguration()
        try:
            dataset_pool = sc.get_dataset_pool()
        except Exception:
            dataset_pool = DatasetPool(sc.package_order)

    local_resources = Resources(resources)
    local_resources.merge_if_not_None({
        "dataset1": dataset1,
        "dataset2": dataset2,
        "index1": index1,
        "index2": index2,
        "sample_size": sample_size,
        "weight": weight,
        "with_replacement": with_replacement,
        "include_chosen_choice": include_chosen_choice,
    })
    local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])

    agent = local_resources["dataset1"]
    choice = local_resources["dataset2"]
    index1 = local_resources.get("index1", None)
    if index1 is None:
        index1 = arange(agent.size())
    index2 = local_resources.get("index2", None)
    if index2 is None:
        index2 = arange(choice.size())

    if index1.size == 0 or index2.size == 0:
        err_msg = "either choice size or agent size is zero, return None"
        logger.log_warning(err_msg)
        return (None, None)

    agent_category_definition = local_resources.get("agent_category_definition", [])
    choice_category_definition = local_resources.get("choice_category_definition", [])
    agent_filter_attribute = local_resources.get("agent_filter_attribute", None)
    category_inflating_factor = local_resources.get("category_inflating_factor", 10)

    frequency, unique_agent_category_id, unique_choice_category_id, agent_category_id, choice_category_id = \
        get_category_and_frequency(agent, agent_category_definition,
                                   choice, choice_category_definition,
                                   agent_filter_attribute, category_inflating_factor,
                                   dataset_pool=dataset_pool)

    include_chosen_choice = local_resources.get("include_chosen_choice", False)
    # Current choice of every agent in index1, as an index into dataset2
    # (-1 if the id is not found) and as a position within index2.
    chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
    chosen_choice_index = choice.try_get_id_index(chosen_choice_id,
                                                  return_value_if_not_found=-1)
    chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2,
                                           index_if_not_found=UNPLACED_ID)

    J = local_resources["sample_size"]
    if include_chosen_choice:
        # one column is reserved for the chosen choice
        J = J - 1

    local_resources.merge_with_defaults({'with_replacement': with_replacement})
    with_replacement = local_resources.get("with_replacement")

    sampled_index = empty((index1.size, J), dtype=DTYPE)
    sampling_prob = empty((index1.size, J), dtype="float64")
    # Sampling probability of each agent's chosen choice, filled per agent
    # category inside the loop so that it uses that category's probabilities.
    sampling_prob_for_chosen_choices = zeros((index1.size, 1), dtype="float64")

    _digitize, _where, _normalize = digitize, where, normalize
    _ncumsum, _rand, _searchsorted = ncumsum, rand, searchsorted  # speed hack

    # Neither of these depends on the agent category.
    category_of_selected_agents = agent_category_id[index1]
    choice_category_position = _digitize(choice_category_id[index2],
                                         unique_choice_category_id) - 1

    for i in range(unique_agent_category_id.size):
        category_id = unique_agent_category_id[i]
        agents_in_this_category = _where(category_of_selected_agents == category_id)[0]
        num_agents = agents_in_this_category.size
        if num_agents == 0:
            continue
        if not with_replacement:
            # Sampling without replacement (re-drawing repeated alternatives)
            # has not been implemented for this sampler.
            raise NotImplementedError("Sample without replacement is not implemented for this sampler yet.")

        ## divide frequency by the mean frequency to avoid overflow
        weights = frequency[i, choice_category_position] / frequency[i, :].mean()
        prob = _normalize(weights)
        # Inverse-CDF sampling: J uniform draws per agent of this category.
        index = _searchsorted(_ncumsum(prob), _rand(num_agents * J)).reshape(-1, J)

        sampled_index[agents_in_this_category, :] = index
        sampling_prob[agents_in_this_category, :] = prob[index]
        if include_chosen_choice:
            sampling_prob_for_chosen_choices[agents_in_this_category, 0] = \
                prob[chosen_choice_index_to_index2[agents_in_this_category]]

    # translate positions within index2 into indices of dataset2
    sampled_index = index2[sampled_index]

    is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
    if include_chosen_choice:
        # The chosen choice becomes column 0 of the sampled index.
        sampled_index = column_stack((chosen_choice_index[:, newaxis], sampled_index))
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1
        ## if the chosen choice is unplaced, its sampling prob is 0
        sampling_prob_for_chosen_choices[where(chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
        sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])

    interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
    interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
    interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

    return interaction_dataset
def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
        include_chosen_choice=False, with_replacement=False, resources=None, dataset_pool=None):
    """Sample alternatives from dataset2 for the agents of dataset1, using 'weight' as sampling weight.

    Arguments:
    dataset1 -- agent dataset.
    dataset2 -- choice dataset the alternatives are sampled from.
    index1 -- indices of agents to sample for; if None, all agents are used.
    index2 -- indices of alternatives that may be sampled; if None, all alternatives are used.
    sample_size -- (scalar) number of alternatives per agent. If include_chosen_choice is True,
        one of them is the agent's chosen choice and only sample_size - 1 are sampled.
    weight -- either an attribute/variable name (of dataset2, or an interaction variable),
        or a 1d array of the same length as index2 (or dataset2),
        or a 2d array of shape (index1.size, index2.size). None or an empty string means equal weights.
    include_chosen_choice -- if True, the chosen choice is put into the first column of the sample.
    with_replacement -- whether sampling is done with replacement (1d weights).
    resources -- additional settings; explicitly passed non-None arguments are merged into them.
    dataset_pool -- pool used for computing variables; if None, the session's pool is used,
        falling back to a new DatasetPool.

    Returns the interaction dataset created by self.create_interaction_dataset, with the attributes
    '__sampling_probability' and 'chosen_choice' added, or None if index1 or index2 is empty.

    Raises TypeError for an unsupported weight type and ValueError if the weight size matches
    neither index2 nor dataset2.
    Also refer to document of interaction_dataset.
    """
    if dataset_pool is None:
        try:
            sc = SessionConfiguration()
            dataset_pool = sc.get_dataset_pool()
        except Exception:
            # best effort: no usable session configuration, work with a fresh pool
            dataset_pool = DatasetPool()

    local_resources = Resources(resources)
    local_resources.merge_if_not_None(
        {"dataset1": dataset1, "dataset2": dataset2,
         "index1": index1, "index2": index2,
         "sample_size": sample_size, "weight": weight,
         "with_replacement": with_replacement,
         "include_chosen_choice": include_chosen_choice})
    local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])

    agent = local_resources["dataset1"]
    index1 = local_resources.get("index1", None)
    if index1 is None:
        index1 = arange(agent.size())
    choice = local_resources["dataset2"]
    index2 = local_resources.get("index2", None)
    if index2 is None:
        index2 = arange(choice.size())

    if index1.size == 0 or index2.size == 0:
        err_msg = "either choice size or agent size is zero, return None"
        logger.log_warning(err_msg)
        return None

    include_chosen_choice = local_resources.get("include_chosen_choice", False)
    J = local_resources["sample_size"]
    if include_chosen_choice:
        # the chosen choice takes one of the sample_size slots
        J = J - 1
    with_replacement = local_resources.get("with_replacement")

    weight = local_resources.get("weight", None)
    if isinstance(weight, str):
        if weight in choice.get_known_attribute_names():
            weight = choice.get_attribute(weight)
            rank_of_weight = 1
        elif VariableName(weight).get_dataset_name() == choice.get_dataset_name():
            weight = choice.compute_variables(weight, dataset_pool=dataset_pool)
            rank_of_weight = 1
        else:
            ## weights can be an interaction variable
            interaction_dataset = InteractionDataset(local_resources)
            weight = interaction_dataset.compute_variables(weight, dataset_pool=dataset_pool)
            rank_of_weight = 2
    elif isinstance(weight, ndarray):
        rank_of_weight = weight.ndim
    elif not weight:
        ## weight is None or empty string
        weight = ones(index2.size)
        rank_of_weight = 1
    else:
        err_msg = "unkown weight type"
        logger.log_error(err_msg)
        raise TypeError(err_msg)

    if (weight.size != index2.size) and (weight.shape[rank_of_weight-1] != index2.size):
        if weight.shape[rank_of_weight-1] == choice.size():
            # weight covers the whole choice set; reduce it to the selectable alternatives
            if rank_of_weight == 1:
                weight = take(weight, index2)
            if rank_of_weight == 2:
                weight = take(weight, index2, axis=1)
        else:
            err_msg = "weight array size doesn't match to size of dataset2 or its index"
            logger.log_error(err_msg)
            raise ValueError(err_msg)

    prob = normalize(weight)

    chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
    chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
    # prob is indexed to index2, so the chosen choices must be expressed in index2 positions as well
    chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)

    if rank_of_weight == 1:
        # if weight_array is 1d, then each agent shares the same weight for choices
        replace = with_replacement
        if nonzerocounts(weight) < J:
            logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
            replace = True
        sampled_index = prob2dsample(index2, sample_size=(index1.size, J),
                                     prob_array=prob, exclude_index=chosen_choice_index_to_index2,
                                     replace=replace, return_index=True)

    if rank_of_weight == 2:
        sampled_index = zeros((index1.size, J), dtype="int32") - 1
        for i in range(index1.size):
            i_prob = prob[i, :]
            if nonzerocounts(i_prob) < J:
                # NOTE(review): the warning announces sampling with replacement, but this branch
                # always calls probsample_noreplace (as the original code did) -- confirm intent.
                logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
            # exclude_index passed to probsample_noreplace needs to be indexed to index2
            sampled_index[i, :] = probsample_noreplace(index2, sample_size=J, prob_array=i_prob,
                                                       exclude_index=chosen_choice_index_to_index2[i],
                                                       return_index=True)

    if rank_of_weight == 2:
        # each agent has its own row of probabilities; take() without an axis would read the
        # flattened array and return the first agent's probabilities for everybody
        agent_rows = arange(index1.size)[:, newaxis]
        sampling_prob = prob[agent_rows, sampled_index]
    else:
        sampling_prob = take(prob, sampled_index)
    sampled_index = index2[sampled_index]

    is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
    if include_chosen_choice:
        sampled_index = column_stack((chosen_choice_index[:, newaxis], sampled_index))
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1
        ## prob is indexed to index2, not to the choice set (as is chosen_choice_index)
        if rank_of_weight == 2:
            sampling_prob_for_chosen_choices = prob[arange(index1.size)[:, newaxis],
                                                    chosen_choice_index_to_index2[:, newaxis]]
        else:
            sampling_prob_for_chosen_choices = take(prob, chosen_choice_index_to_index2[:, newaxis])
        ## if chosen choice equals unplaced_id then the sampling prob is 0
        sampling_prob_for_chosen_choices[where(chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
        sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])

    interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
    interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
    interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

    return interaction_dataset
def run(self, dataset1, dataset2, index1=None, index2=None, stratum=None, weight=None,
        sample_size=1, sample_size_from_each_stratum=None, sample_size_from_chosen_stratum=None,
        sample_rate=None, include_chosen_choice=False, resources=None,
        with_replacement=False, dataset_pool=None, **kwargs):
    """Sample alternatives from dataset2, stratum by stratum, for the agents of dataset1.

    Arguments:
    dataset1 -- agent dataset.
    dataset2 -- choice dataset the alternatives are sampled from.
    index1 -- indices of agents to sample for; if None, all agents are used.
    index2 -- indices of alternatives that may be sampled; if None, all alternatives are used.
    stratum -- attribute/variable name of dataset2 or an array giving the stratum of each alternative.
        It is obligatory.
    weight -- sampling weight: an attribute/variable name of dataset2, a 1d array of the same length
        as index2 (or dataset2), or a 2d array of shape (index1.size, index2.size).
        None means equal weights.
    sample_size -- number of alternatives to be sampled from each stratum; it is overwritten by
        sample_size_from_each_stratum if that is not None.
    sample_size_from_chosen_stratum -- if given, number of alternatives sampled from the stratum
        of the agent's chosen choice. If it is None and include_chosen_choice is True,
        one alternative less is sampled from the chosen stratum.
    sample_rate -- not implemented; any non-None value raises NotImplementedError.
    include_chosen_choice -- if True, the chosen choice is put into the first column of the sample.
    resources -- additional settings; explicitly passed non-None arguments are merged into them.
    dataset_pool -- if None, the session's pool is looked up, falling back to a new DatasetPool.

    Returns the interaction dataset created by self.create_interaction_dataset, with the attributes
    '__sampling_probability', 'chosen_choice' and 'stratum_id' added. If index1 or index2 is empty,
    the tuple (None, None) is returned.

    Raises TypeError for an unsupported weight type, ValueError if the weight size matches neither
    index2 nor dataset2, and StandardError if no stratum is given.
    Also refer to document of interaction_dataset.
    """
    if dataset_pool is None:
        try:
            sc = SessionConfiguration()
            dataset_pool = sc.get_dataset_pool()
        except Exception:
            # best effort: no usable session configuration, work with a fresh pool
            dataset_pool = DatasetPool()

    local_resources = Resources(resources)
    local_resources.merge_if_not_None(
        {"dataset1": dataset1, "dataset2": dataset2,
         "index1": index1, "index2": index2,
         "with_replacement": with_replacement,
         "stratum": stratum, "weight": weight,
         "sample_size": sample_size,
         "sample_size_from_each_stratum": sample_size_from_each_stratum,
         "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
         "sample_rate": sample_rate,
         "include_chosen_choice": include_chosen_choice})
    local_resources.check_obligatory_keys(['dataset1', 'dataset2'])

    index1 = local_resources.get("index1", None)
    agent = dataset1
    if index1 is None:
        agent.get_id_attribute()
        index1 = arange(agent.size())
    choice = local_resources["dataset2"]
    index2 = local_resources.get("index2", None)
    if index2 is None:
        choice.get_id_attribute()
        index2 = arange(choice.size())

    if index1.size == 0 or index2.size == 0:
        err_msg = "either choice size or agent size is zero, return None"
        logger.log_warning(err_msg)
        return (None, None)

    include_chosen_choice = local_resources.get("include_chosen_choice", False)

    weight = local_resources.get("weight", None)
    if isinstance(weight, str):
        choice.compute_variables(weight, resources=local_resources)
        weight = choice.get_attribute(weight)
        rank_of_weight = 1
    elif isinstance(weight, ndarray):
        rank_of_weight = weight.ndim
    elif weight is None:
        weight = ones(index2.size)
        rank_of_weight = 1
    else:
        err_msg = "unknown weight type"
        logger.log_error(err_msg)
        raise TypeError(err_msg)

    if (weight.size != index2.size) and (weight.shape[rank_of_weight-1] != index2.size):
        if weight.shape[rank_of_weight-1] == choice.size():
            # NOTE(review): take() without an axis flattens a 2d weight -- confirm the 2d case.
            weight = take(weight, index2)
        else:
            err_msg = "weight array size doesn't match to size of dataset2 or its index"
            logger.log_error(err_msg)
            raise ValueError(err_msg)

    prob = normalize(weight)

    stratum = local_resources.get("stratum", None)
    if stratum is None:
        raise StandardError("'stratum' must be defined for stratified sampling.")
    if isinstance(stratum, str):
        choice.compute_variables(stratum, resources=local_resources)
        stratum = choice.get_attribute(stratum)

    chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
    chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=-1)
    # positions of the chosen choices within index2 (prob and selectable_strata are indexed that way)
    chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)

    ##TODO: check all chosen strata are in selectable strata, i.e. chosen_choice_index is in index2
    chosen_stratum = ones(chosen_choice_index.size, dtype=DTYPE) * NO_STRATUM_ID
    has_chosen_choice = where(chosen_choice_index != -1)
    chosen_stratum[has_chosen_choice] = stratum[chosen_choice_index[has_chosen_choice]]
    selectable_strata = stratum[index2]
    unique_strata = unique(selectable_strata)
    unique_strata = unique_strata[where(unique_strata != NO_STRATUM_ID)]

    sample_size = local_resources.get("sample_size", None)
    sample_size_from_each_stratum = local_resources.get("sample_size_from_each_stratum", None)
    if sample_size_from_each_stratum is None:
        sample_size_from_each_stratum = sample_size
    strata_sample_size = ones(unique_strata.size, dtype=DTYPE) * sample_size_from_each_stratum

    sample_rate = local_resources.get("sample_rate", None)
    if sample_rate is not None:
        raise NotImplementedError("sample_rate is not implemented yet.")
        ##TODO: to be finished
        #num_elements_in_strata = histogram(selectable_strata, unique_strata)
        #strata_sample_size = round(num_elements_in_strata * sample_rate)

    sample_size_from_chosen_stratum = local_resources.get("sample_size_from_chosen_stratum", None)
    if sample_size_from_chosen_stratum is None and not include_chosen_choice:
        # every agent uses the same (stratum, sample size) pairs
        strata_sample_pairs = array([[s, n] for s, n in zip(unique_strata, strata_sample_size)])
        if rank_of_weight == 1:
            sampled_index = self._sample_by_stratum(index1, index2, selectable_strata, prob,
                                                    chosen_choice_index_to_index2, strata_sample_pairs)
        elif rank_of_weight == 2:
            sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                              chosen_choice_index_to_index2, strata_sample_pairs)
    else:
        # the sample size of the chosen stratum differs by agent
        strata_sample_setting = zeros((index1.size, unique_strata.size, 2), dtype=DTYPE)
        for i in range(index1.size):
            agents_strata_sample_size = copy.copy(strata_sample_size)
            in_chosen_stratum = where(unique_strata == chosen_stratum[i])
            if sample_size_from_chosen_stratum is None:
                ## include_chosen_choice is True here: sample one less from the chosen stratum
                agents_strata_sample_size[in_chosen_stratum] -= 1
            else:
                agents_strata_sample_size[in_chosen_stratum] = sample_size_from_chosen_stratum
            strata_sample_pairs = array([[s, n] for s, n in zip(unique_strata, agents_strata_sample_size)])
            strata_sample_setting[i, ...] = strata_sample_pairs
        sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                          chosen_choice_index_to_index2, strata_sample_setting)

    is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
    if include_chosen_choice:
        # the chosen choice is attached as the first column
        sampled_index = concatenate((chosen_choice_index[:, newaxis], sampled_index), axis=1)
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        # NOTE(review): missing choices are marked with -1 above but compared against UNPLACED_ID
        # here -- the two only agree if UNPLACED_ID is -1; confirm.
        is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

        chosen_probability = zeros((chosen_choice_index.size,), dtype=float32) - 1
        for stratum_id in unique_strata:
            w = chosen_stratum == stratum_id
            # prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            chosen_probability[w] = (prob[chosen_choice_index_to_index2[w]] /
                                     prob[selectable_strata == stratum_id].sum()).astype(float32)
        self._sampling_probability = concatenate((chosen_probability[:, newaxis],
                                                  self._sampling_probability), axis=1)
        self._stratum_id = concatenate((chosen_stratum[:, newaxis], self._stratum_id), axis=1)

    interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
    interaction_dataset.add_attribute(self._sampling_probability, '__sampling_probability')
    interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
    interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

    return interaction_dataset
class RegressionModel(ChunkModel):
    """Model that runs or estimates a regression, separately for each submodel of a specification.

    'run' computes outcomes chunk by chunk (via ChunkModel.run and run_chunk),
    'estimate' estimates coefficients using an estimation procedure.
    """

    model_name = "Regression Model"
    model_short_name = "RM"

    def __init__(self, regression_procedure="opus_core.linear_regression",
                 submodel_string=None,
                 run_config=None, estimate_config=None, debuglevel=0, dataset_pool=None):
        """'regression_procedure' is the name of the regression procedure passed to RegressionModelFactory.
        'submodel_string' is passed to map_agents_to_submodels for assigning agents to submodels.
        'run_config' and 'estimate_config' are of type Resources (or dict) and serve as defaults
        for the configurations passed to 'run' and 'estimate'.
        Raises StandardError if no regression procedure could be created.
        """
        self.debug = DebugPrinter(debuglevel)

        self.dataset_pool = self.create_dataset_pool(dataset_pool)

        self.regression = RegressionModelFactory().get_model(name=regression_procedure)
        if self.regression is None:
            raise StandardError("No regression procedure given.")

        self.submodel_string = submodel_string

        self.run_config = run_config
        if self.run_config is None:
            self.run_config = Resources()
        if not isinstance(self.run_config, Resources) and isinstance(self.run_config, dict):
            self.run_config = Resources(self.run_config)

        self.estimate_config = estimate_config
        if self.estimate_config is None:
            self.estimate_config = Resources()
        if not isinstance(self.estimate_config, Resources) and isinstance(self.estimate_config, dict):
            self.estimate_config = Resources(self.estimate_config)

        self.data = {}
        self.coefficient_names = {}
        ChunkModel.__init__(self)
        self.get_status_for_gui().initialize_pieces(3, pieces_description=array(
            ['initialization', 'computing variables', 'submodel: 1']))

    def run(self, specification, coefficients, dataset, index=None, chunk_specification=None,
            data_objects=None, run_config=None, initial_values=None, procedure=None, debuglevel=0):
        """'specification' is of type EquationSpecification,
            'coefficients' is of type Coefficients,
            'dataset' is of type Dataset,
            'index' are indices of individuals in dataset for which
                        the model runs. If it is None, the whole dataset is considered.
            'chunk_specification' determines  number of chunks in which the simulation is processed.
            'data_objects' is a dictionary where each key is the name of an data object
            ('zone', ...) and its value is an object of class  Dataset.
           'run_config' is of type Resources, it gives additional arguments for the run.
           If 'procedure' is given, it overwrites the regression_procedure of the constructor.
           'initial_values' is an array of the initial values of the results. It will be overwritten
           by the results for those elements that are handled by the model (defined by submodels in the specification).
           By default the results are initialized with 0.
            'debuglevel' overwrites the constructor 'debuglevel'.
        Returns the result of ChunkModel.run.
        """
        self.debug.flag = debuglevel
        if run_config is None:
            run_config = Resources()
        if not isinstance(run_config, Resources) and isinstance(run_config, dict):
            run_config = Resources(run_config)
        self.run_config = run_config.merge_with_defaults(self.run_config)
        self.run_config.merge({"debug": self.debug})
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.dataset_pool.replace_dataset(dataset.get_dataset_name(), dataset)
        if procedure is not None:
            self.regression = RegressionModelFactory().get_model(name=procedure)

        # Make sure the data are loaded and the index is resolved before anything is sized
        # by dataset.size() or indexed by 'index'.
        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())

        if initial_values is None:
            self.initial_values = zeros((dataset.size(),), dtype=float32)
        else:
            self.initial_values = zeros((dataset.size(),), dtype=initial_values.dtype)
            self.initial_values[index] = initial_values

        result = ChunkModel.run(self, chunk_specification, dataset, index, float32,
                                specification=specification, coefficients=coefficients)
        return result

    def run_chunk(self, index, dataset, specification, coefficients):
        """Compute the regression outcome for the individuals given by 'index'.

        Returns a copy of the initial values for 'index', in which the entries of individuals
        mapped to a submodel are replaced by the result of the regression procedure.
        NaN and Inf values in the regression data are replaced via nan_to_num (with a warning).
        """
        self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1)
        compute_resources = Resources({"debug": self.debug})
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels, self.submodel_string, dataset, index,
                                     dataset_pool=self.dataset_pool, resources=compute_resources)
        variables = self.specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources=compute_resources)
        coef = {}
        outcome = self.initial_values[index].copy()
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients, submodel)
            self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0, :]
            self.debug.print_debug("Compute regression for submodel " + str(submodel), 4)
            self.increment_current_status_piece()
            self.data[submodel] = dataset.create_regression_data(
                coef[submodel], index=index[self.observations_mapping[submodel]])
            # column indices (i.e. variables) that produced invalid values
            nan_index = where(isnan(self.data[submodel]))[1]
            inf_index = where(isinf(self.data[submodel]))[1]
            vnames = asarray(coef[submodel].get_variable_names())
            if nan_index.size > 0:
                nan_var_index = unique(nan_index)
                self.data[submodel] = nan_to_num(self.data[submodel])
                logger.log_warning("NaN(Not A Number) is returned from variable %s; it is replaced with %s." % (vnames[nan_var_index], nan_to_num(nan)))
            if inf_index.size > 0:
                inf_var_index = unique(inf_index)
                self.data[submodel] = nan_to_num(self.data[submodel])
                logger.log_warning("Inf is returned from variable %s; it is replaced with %s." % (vnames[inf_var_index], nan_to_num(inf)))

            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0):
                # observations for this submodel available
                outcome[self.observations_mapping[submodel]] = \
                    self.regression.run(self.data[submodel], coef[submodel].get_coefficient_values()[0, :],
                                        resources=self.run_config).astype(outcome.dtype)
        return outcome

    def correct_infinite_values(self, dataset, outcome_attribute_name, maxvalue=1e+38, clip_all_larger_values=False):
        """Check if the model resulted in infinite values. If yes,
        print warning and clip the values to maxvalue.
        If clip_all_larger_values is True, all values larger than maxvalue are clip to maxvalue.
        """
        infidx = where(dataset.get_attribute(outcome_attribute_name) == inf)[0]

        if infidx.size > 0:
            logger.log_warning("Infinite values in %s. Clipped to %s." % (outcome_attribute_name, maxvalue))
            dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, infidx)
        if clip_all_larger_values:
            idx = where(dataset.get_attribute(outcome_attribute_name) > maxvalue)[0]
            if idx.size > 0:
                logger.log_warning("Values in %s larger than %s. Clipped to %s." % (outcome_attribute_name, maxvalue, maxvalue))
                dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, idx)

    def estimate(self, specification, dataset, outcome_attribute, index=None, procedure=None, data_objects=None,
                 estimate_config=None, debuglevel=0):
        """'specification' is of type EquationSpecification,
            'dataset' is of type Dataset,
            'outcome_attribute' - string that determines the dependent variable,
            'index' are indices of individuals in dataset for which
                    the model runs. If it is None, the whole dataset is considered.
            'procedure' - name of the estimation procedure. If it is None,
                there should be an entry "estimation" in 'estimate_config' that determines the procedure. The class
                must have a method 'run' that takes as arguments 'data', 'regression_procedure' and 'resources'.
                It returns a dictionary with entries 'estimators', 'standard_errors' and 't_values' (all 1D numpy arrays).
            'data_objects' is a dictionary where each key is the name of an data object
                    ('zone', ...) and its value is an object of class  Dataset.
            'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure.
            'debuglevel' overwrites the class 'debuglevel'.
        Returns a tuple (coefficients, estimated_coef), or (None, None) if there are no observations.
        """
        self.debug.flag = debuglevel
        if estimate_config is None:
            estimate_config = Resources()
        if not isinstance(estimate_config, Resources) and isinstance(estimate_config, dict):
            estimate_config = Resources(estimate_config)
        self.estimate_config = estimate_config.merge_with_defaults(self.estimate_config)
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.procedure = procedure
        if self.procedure is None:
            self.procedure = self.estimate_config.get("estimation", None)
        if self.procedure is not None:
            self.procedure = ModelComponentCreator().get_model_component(self.procedure)
        else:
            logger.log_warning("No estimation procedure given, or problems with loading the corresponding module.")

        compute_resources = Resources({"debug": self.debug})
        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())
        if not isinstance(index, ndarray):
            index = array(index)

        # should be a proportion of the agent_set
        estimation_size_agents = self.estimate_config.get("estimation_size_agents", None)
        if estimation_size_agents is None:
            estimation_size_agents = 1.0
        else:
            estimation_size_agents = max(min(estimation_size_agents, 1.0), 0.0)  # between 0 and 1

        if estimation_size_agents < 1.0:
            self.debug.print_debug("Sampling agents for estimation ...", 3)
            estimation_idx = sample_noreplace(arange(index.size),
                                              int(index.size * estimation_size_agents))
        else:
            estimation_idx = arange(index.size)

        estimation_idx = index[estimation_idx]
        self.debug.print_debug("Number of observations for estimation: " + str(estimation_idx.size), 2)
        if estimation_idx.size <= 0:
            self.debug.print_debug("Nothing to be done.", 2)
            return (None, None)

        coefficients = create_coefficient_from_specification(specification)
        self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1)
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels, self.submodel_string, dataset, estimation_idx,
                                     dataset_pool=self.dataset_pool, resources=compute_resources,
                                     submodel_size_max=self.estimate_config.get('submodel_size_max', None))
        variables = self.specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources=compute_resources)

        coef = {}
        estimated_coef = {}
        self.outcome = {}
        dataset.compute_variables([outcome_attribute], dataset_pool=self.dataset_pool, resources=compute_resources)
        regression_resources = Resources(estimate_config)
        regression_resources.merge({"debug": self.debug})
        outcome_variable_name = VariableName(outcome_attribute)
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients, submodel)
            self.increment_current_status_piece()
            logger.log_status("Estimate regression for submodel " + str(submodel),
                              tags=["estimate"], verbosity_level=2)
            submodel_idx = estimation_idx[self.observations_mapping[submodel]]
            self.data[submodel] = dataset.create_regression_data_for_estimation(coef[submodel],
                                                                                index=submodel_idx)
            self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0, :]
            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0) and (self.procedure is not None):
                # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(outcome_variable_name.get_alias(),
                                                                        submodel_idx)
                regression_resources.merge({"outcome": self.outcome[submodel]})
                regression_resources.merge({"coefficient_names": self.coefficient_names[submodel].tolist(),
                                            "constant_position": coef[submodel].get_constants_positions()})
                regression_resources.merge({"submodel": submodel})
                estimated_coef[submodel] = self.procedure.run(self.data[submodel], self.regression,
                                                              resources=regression_resources)
                result = estimated_coef[submodel]
                if "estimators" in result.keys():
                    coef[submodel].set_coefficient_values(result["estimators"])
                if "standard_errors" in result.keys():
                    coef[submodel].set_standard_errors(result["standard_errors"])
                if "other_measures" in result.keys():
                    for measure in result["other_measures"].keys():
                        coef[submodel].set_measure(measure, result["other_measures"][measure])
                if "other_info" in result.keys():
                    for info in result["other_info"]:
                        coef[submodel].set_other_info(info, result["other_info"][info])
        coefficients.fill_coefficients(coef)
        self.specified_coefficients.coefficients = coefficients
        self.save_predicted_values_and_errors(specification, coefficients, dataset, outcome_variable_name,
                                              index=index, data_objects=data_objects)

        return (coefficients, estimated_coef)

    def prepare_for_run(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        """Return a tuple (specification, coefficients, index). 'index' contains the indices of 'dataset'
        where 'dataset_filter' is larger than 'filter_threshold', or None if no dataset or filter is given.
        The remaining arguments are passed to prepare_specification_and_coefficients.
        """
        spec, coef = prepare_specification_and_coefficients(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, coef, index)

    def prepare_for_estimate(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        """Return a tuple (specification, index). 'index' contains the indices of 'dataset'
        where 'dataset_filter' is larger than 'filter_threshold', or None if no dataset or filter is given.
        The remaining arguments are passed to get_specification_for_estimation.
        """
        spec = get_specification_for_estimation(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, index)

    def get_data_as_dataset(self, submodel=-2):
        """Like get_all_data, but the returning value is a Dataset containing attributes that
        correspond to the data columns. Their names are coefficient names.
        Returns None if there are no data or no coefficient names for the submodel."""
        all_data = self.get_all_data(submodel)
        if all_data is None:
            return None
        names = self.get_coefficient_names(submodel)
        if names is None:
            return None
        dataset_data = {}
        for i in range(names.size):
            dataset_data[names[i]] = all_data[:, i].reshape(all_data.shape[0])
        # artificial ids 1..n
        dataset_data["id"] = arange(all_data.shape[0]) + 1
        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name='dataset', table_data=dataset_data)
        ds = Dataset(in_storage=storage, id_name="id", in_table_name='dataset')
        return ds

    def save_predicted_values_and_errors(self, specification, coefficients, dataset, outcome_variable, index=None, data_objects=None):
        """If the estimate configuration has 'save_predicted_values_and_errors' set, run the model with
        the given coefficients and add the attributes 'predicted_<alias>' and 'residuals_<alias>'
        (original minus predicted values) to the dataset. Both attributes are flushed.
        """
        if self.estimate_config.get('save_predicted_values_and_errors', False):
            logger.log_status('Computing predicted values and residuals.')
            original_values = dataset.get_attribute_by_index(outcome_variable, index)
            predicted_values = zeros(dataset.size(), dtype='float32')
            predicted_values[index] = self.run_after_estimation(specification, coefficients, dataset,
                                                                index=index, data_objects=data_objects)
            predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias()
            dataset.add_primary_attribute(name=predicted_attribute_name, data=predicted_values)
            dataset.flush_attribute(predicted_attribute_name)
            predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias()
            error_values = zeros(dataset.size(), dtype='float32')
            error_values[index] = (original_values - predicted_values[index]).astype(error_values.dtype)
            dataset.add_primary_attribute(name=predicted_error_attribute_name, data=error_values)
            dataset.flush_attribute(predicted_error_attribute_name)
            logger.log_status('Predicted values saved as %s (for the %s dataset)' % (predicted_attribute_name, dataset.get_dataset_name()))
            logger.log_status('Residuals saved as %s (for the %s dataset)' % (predicted_error_attribute_name, dataset.get_dataset_name()))

    def export_estimation_data(self, submodel=-2, file_name='./estimation_data_regression.txt', delimiter='\t'):
        """Write the outcome and the estimation data of 'submodel' into a delimited text file
        with a header line. The file name is '<root of file_name>_submodel_<submodel>.txt'.
        """
        import os
        from numpy import newaxis
        data = concatenate((self.outcome[submodel][..., newaxis], self.get_all_data(submodel=submodel)), axis=1)
        header = ['outcome'] + self.get_coefficient_names(submodel).tolist()
        file_name_root = os.path.splitext(file_name)[0]
        out_file = "%s_submodel_%s.txt" % (file_name_root, submodel)
        fh = open(out_file, 'w')
        try:
            fh.write(delimiter.join(header) + '\n')  # file header
            for row in data:
                fh.write(delimiter.join([str(x) for x in row]) + '\n')
        finally:
            # close (and thereby flush) the file even if writing fails
            fh.close()
        print('Data written into %s' % out_file)

    def run_after_estimation(self, *args, **kwargs):
        """Run the model after an estimation; here it simply calls 'run'."""
        return self.run(*args, **kwargs)

    def _get_status_total_pieces(self):
        return ChunkModel._get_status_total_pieces(self) * self.get_status_for_gui().get_total_number_of_pieces()

    def _get_status_current_piece(self):
        return ChunkModel._get_status_current_piece(self)*self.get_status_for_gui().get_total_number_of_pieces() + self.get_status_for_gui().get_current_piece()

    def _get_status_piece_description(self):
        return "%s %s" % (ChunkModel._get_status_piece_description(self), self.get_status_for_gui().get_current_piece_description())

    def get_specified_coefficients(self):
        """Return the specified coefficients set by the last call of run_chunk or estimate."""
        return self.specified_coefficients
class Estimator(GenericModelExplorer):
    """Runs a model system in estimation mode on data from a cache directory.

    The name of the estimated model is taken from the configuration entry
    'model_name' or, if not present, it is the first model in the entry
    'models' that is marked with "estimate".
    Methods such as get_agent_set(), save_results() or
    extract_coefficients_and_specification() are not defined here
    (presumably inherited from GenericModelExplorer).
    """

    def __init__(self, config=None, save_estimation_results=False):
        """
        'config' is a configuration (dictionary-like) that must contain a
        non-None entry 'cache_directory' and an entry
        'dataset_pool_configuration'; otherwise KeyError is raised.
        If 'save_estimation_results' is True, estimate() and reestimate()
        call save_results() when they are done.
        """
        # A missing configuration is reported the same way as a missing cache directory.
        if config is None or 'cache_directory' not in config or config['cache_directory'] is None:
            raise KeyError(
                "The cache directory must be specified in the "
                "given configuration, giving the filesystem path to the cache "
                "directory containing the data with which to estimate. Please "
                "check that your configuration contains the 'cache_directory' "
                "entry and that it is not None.")

        self.simulation_state = SimulationState(new_instance=True)
        self.simulation_state.set_cache_directory(config['cache_directory'])

        SessionConfiguration(
            new_instance=True,
            package_order=config['dataset_pool_configuration'].package_order,
            in_storage=AttributeCache())
        self.config = Resources(config)
        self.save_estimation_results = save_estimation_results
        self.debuglevel = self.config.get("debuglevel", 4)
        self.model_system = ModelSystem()
        self.agents_index_for_prediction = None

        self.model_name = None
        if "model_name" in config:
            self.model_name = config["model_name"]
        else:
            # take the first model that is marked to be estimated
            for model in self.config.get('models', []):
                if not isinstance(model, dict):
                    continue
                model_name = list(model.keys())[0]
                processes = model[model_name]
                if (processes == "estimate") or (
                        isinstance(processes, list) and ("estimate" in processes)):
                    self.model_name = model_name
                    break

        estimate_config_changes = self.config.get(
            'config_changes_for_estimation', {}).get('estimate_config', {})
        if len(estimate_config_changes) > 0:
            change = Resources({
                'models_configuration': {
                    self.model_name: {
                        'controller': {
                            'init': {
                                'arguments': {}
                            }
                        }
                    }
                }
            })
            estimate_config_str = self.config['models_configuration'].get(
                self.model_name, {}).get('controller', {}).get('init', {}).get(
                    'arguments', {}).get('estimate_config', '{}')
            estimate_config = Resources({})
            try:
                # NOTE(review): eval of a string taken from the configuration;
                # the configuration must come from a trusted source.
                evaluated = eval(estimate_config_str)
                # The string may evaluate to a plain dictionary (e.g. the
                # default '{}') which has no merge() method.
                if isinstance(evaluated, Resources):
                    estimate_config = evaluated
                else:
                    estimate_config = Resources(evaluated)
            except Exception:
                # best effort: if the string cannot be evaluated,
                # only the changes are used
                pass
            estimate_config.merge(estimate_config_changes)
            self.config.merge(change)
            self.config['models_configuration'][
                self.model_name]['controller']['init']['arguments'][
                    'estimate_config'] = 'Resources(%s)' % estimate_config

    def estimate(self, out_storage=None):
        """Run the model system with the estimation configuration and extract
        the resulting coefficients and specification. The results are saved
        into 'out_storage' if save_estimation_results is True.
        """
        self.model_system.run(self.config,
                              write_datasets_to_cache_at_end_of_year=False)
        self.extract_coefficients_and_specification()

        if self.save_estimation_results:
            self.save_results(out_storage=out_storage)

    def reestimate(self,
                   specification_module_name=None,
                   specification_dict=None,
                   out_storage=None,
                   type=None,
                   submodels=None):
        """specification_module_name is name of a module that contains a dictionary called
        'specification'. If it is not given, the argument specification_dict must be given which is a dictionary object.
        'type' is the name of model member, such as 'commercial', 'residential'. The specification dictionary
        is expected to have an entry of this name. If 'submodels' is given (list or a number),
        the restimation is done only for those submodels.
        Note that in such a case the other submodels are deleted from the specification dictionary in place.
        ValueError is raised if a given submodel is not in the specification.
        """
        if specification_module_name is not None:
            # Import (and reload, to pick up changes made since the last import)
            # the module directly instead of building code strings for exec/eval.
            import importlib
            try:
                reload_module = importlib.reload
            except AttributeError:
                reload_module = reload  # Python 2 builtin
            specification_module = reload_module(
                importlib.import_module(specification_module_name))
            specification_dict = specification_module.specification

        if type is not None:
            specification_dict = specification_dict[type]
        if submodels is not None:  #remove all submodels but the given ones from specification
            submodels_to_be_deleted = list(specification_dict.keys())
            if not isinstance(submodels, list):
                submodels = [submodels]
            for sm in submodels:
                if sm not in submodels_to_be_deleted:
                    raise ValueError("Submodel %s not in the specification." % sm)
                submodels_to_be_deleted.remove(sm)
            # the entry with variable definitions is always kept
            if "_definition_" in submodels_to_be_deleted:
                submodels_to_be_deleted.remove("_definition_")
            for sm in submodels_to_be_deleted:
                del specification_dict[sm]
        self.specification = EquationSpecification(
            specification_dict=specification_dict)
        new_namespace = self.model_system.run_year_namespace
        keys_coeff_spec = self.get_keys_for_coefficients_and_specification()
        new_namespace[keys_coeff_spec["specification"]] = self.specification
        self.coefficients, coeff_dict_dummy = self.model_system.do_process(
            new_namespace)
        ## update run_year_namespce since it's not been updated by do_process
        self.model_system.run_year_namespace = new_namespace
        self.model_system.run_year_namespace[
            keys_coeff_spec["coefficients"]] = self.coefficients

        ## extract_coefficients_and_specification() is not called here: it gets coeff and spec
        ## from run_year_namespce which is only updated in the _run_year method
        if self.save_estimation_results:
            self.save_results(out_storage=out_storage)

    def predict(self, predicted_choice_id_name, agents_index=None):
        """ Run prediction. Currently makes sense only for choice models.

        The model is run (process 'run') with the estimated coefficients for agents
        given by 'agents_index' (by default the agents used in the estimation).
        The predicted choices are stored in the attribute 'predicted_choice_id_name'
        of the agent set (-1 for agents that are not included in 'agents_index').
        The agents' current choices are not changed.
        Returns True if successful, otherwise False (errors are logged).
        """
        # Configuration in which the 'run' process uses the estimated coefficients.
        # NOTE(review): Resources(self.config) looks like a shallow copy, in which case
        # the nested entries modified below are shared with self.config - confirm.
        tmp_config = Resources(self.config)

        if self.agents_index_for_prediction is None:
            self.agents_index_for_prediction = self.get_agent_set_index().copy()

        if agents_index is None:
            agents_index = self.agents_index_for_prediction

        run_arguments = tmp_config['models_configuration'][self.model_name][
            'controller']['run']['arguments']
        run_arguments['coefficients'] = "coeff_est"
        run_arguments['agents_index'] = "agents_index"
        run_arguments['chunk_specification'] = "{'nchunks':1}"

        try:
            run_year_namespace = copy.copy(
                self.model_system.run_year_namespace)
        except Exception:
            logger.log_error("The estimate() method must be run first")
            return False

        try:
            agents = self.get_agent_set()
            choice_id_name = self.get_choice_set().get_id_name()[0]
            # save current locations of agents
            current_choices = agents.get_attribute(choice_id_name).copy()
            dummy_data = zeros(current_choices.size,
                               dtype=current_choices.dtype) - 1
            agents.modify_attribute(name=choice_id_name,
                                    data=dummy_data)  #reset all choices
            try:
                run_year_namespace["process"] = "run"
                run_year_namespace["coeff_est"] = self.coefficients
                run_year_namespace["agents_index"] = agents_index
                run_year_namespace["processmodel_config"] = tmp_config[
                    'models_configuration'][self.model_name]['controller']['run']
                new_choices = self.model_system.do_process(run_year_namespace)
            finally:
                # put the original choices back also if the run failed,
                # so that the agents are not left with the reset values
                agents.modify_attribute(name=choice_id_name,
                                        data=current_choices)
            dummy_data[agents_index] = new_choices
            if predicted_choice_id_name not in agents.get_known_attribute_names():
                agents.add_primary_attribute(name=predicted_choice_id_name,
                                             data=dummy_data)
            else:
                agents.modify_attribute(name=predicted_choice_id_name,
                                        data=dummy_data)
            logger.log_status("Predictions saved into attribute " +
                              predicted_choice_id_name)
            return True
        except Exception as e:
            logger.log_error("Error encountered in prediction: %s" % e)
            logger.log_stack_trace()

        return False
def match_parcels_to_constraints_and_templates(parcel_dataset,
                                               development_template_dataset,
                                               output_dir,
                                               log_scale=True,
                                               strict=True,
                                               output_points=False,
                                               parcel_index=None,
                                               template_index=None,
                                               consider_constraints_as_rules=True,
                                               template_opus_path="urbansim_parcel.development_template",
                                               dataset_pool=None,
                                               resources=None):
    """
    This function matches parcels to their constraints and templates and gives a summary about how many parcels have no match.
    It also creates a plot for each GLU and unit type of template ranges and densities.
    The figures (and optionally text files with the plotted points, see 'output_points')
    are stored into the directory 'output_dir' which is created if it does not exist.
    If 'log_scale' is True, the figures are drawn on the logarithmic scale.
    parcel_index - 1D array, indices of parcel_dataset (default is all parcels).
    template_index - index to templates that are available (default is all templates).
    If strict is True, parcels without templates are considered across GLU, otherwise only within each GLU.
    If the dataset 'development_constraint' cannot be obtained from 'dataset_pool', no matching
    is done and no figures are created.
    Returns ids of the considered parcels that have no matching template,
    or None if 'parcel_index' is given but empty.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    resources = Resources(resources)
    debug = resources.get("debug", 0)
    if not isinstance(debug, DebugPrinter):
        debug = DebugPrinter(debug)

    if parcel_index is not None and parcel_index.size <= 0:
        return None

    if parcel_index is not None:
        index1 = parcel_index
    else:
        index1 = arange(parcel_dataset.size())

    if template_index is not None:
        index2 = template_index
    else:
        index2 = arange(development_template_dataset.size())

    has_constraint_dataset = True
    try:
        constraints = dataset_pool.get_dataset("development_constraint")
        constraints.load_dataset_if_not_loaded()
    except Exception:
        # no constraints available (this includes dataset_pool being None)
        has_constraint_dataset = False

    # the values are not used below; the call is kept in case other code relies on the variable being computed
    parcel_dataset.compute_variables(['parcel.disaggregate(land_use_type.generic_land_use_type_id)'],
                                     dataset_pool=dataset_pool)
    if has_constraint_dataset:
        constraint_types = unique(constraints.get_attribute("constraint_type"))  #unit_per_acre, far etc
        development_template_dataset.compute_variables(
            ["%s.%s" % (template_opus_path, x) for x in constraint_types], dataset_pool)
        parcel_dataset.get_development_constraints(constraints, dataset_pool,
                                                   index=index1,
                                                   consider_constraints_as_rules=consider_constraints_as_rules)
        # NOTE(review): the original formatting was lost; this is assumed to belong to the
        # 'if' branch (the ids are only used when constraints are available).
        generic_land_use_type_ids = development_template_dataset.compute_variables(
            "urbansim_parcel.development_template.generic_land_use_type_id", dataset_pool=dataset_pool)
    parcel_ids = parcel_dataset.get_id_attribute()
    template_ids = development_template_dataset.get_id_attribute()

    # All masks below have one element per entry of index1.
    has_template = zeros(index1.size, dtype="int32")
    vacant_land = parcel_dataset.compute_variables(['urbansim_parcel.parcel.vacant_land_area'],
                                                   dataset_pool=dataset_pool)[index1]
    is_vacant = vacant_land > 0
    is_developable_parcel = zeros(index1.size, dtype="int32")
    # template id -> list of lists of positions (within index1) of vacant parcels
    # accepted by constraints for which the template does not fit
    parcels_to_template_acc_by_constr = {}
    density_types = development_template_dataset['density_type']
    # GLU -> mask of vacant parcels accepted by constraints (with and without a fitting template)
    parcels_acc_by_constr_wo_templ = {}
    parcels_acc_by_constr = {}
    logger.start_block("Combine parcels, templates and constraints")
    for i_template in index2:
        this_template_id = template_ids[i_template]
        # 'bool' instead of the alias 'bool8', which is not available in recent numpy versions
        fit_indicator = ones(index1.size, dtype="bool")
        parcels_to_template_acc_by_constr[this_template_id] = []
        this_templ_accepted_by_constraints = zeros(index1.size, dtype="int32")
        has_this_template = zeros(index1.size, dtype="int32")
        if not has_constraint_dataset:
            continue
        generic_land_use_type_id = generic_land_use_type_ids[i_template]
        if generic_land_use_type_id not in parcels_acc_by_constr_wo_templ:
            parcels_acc_by_constr_wo_templ[generic_land_use_type_id] = zeros(index1.size, dtype="int32")
        if generic_land_use_type_id not in parcels_acc_by_constr:
            parcels_acc_by_constr[generic_land_use_type_id] = zeros(index1.size, dtype="int32")
        units_proposed = parcel_dataset.compute_variables(
            ['psrc_parcel.parcel.units_proposed_for_template_%s' % this_template_id],
            dataset_pool=dataset_pool)[index1]
        is_size_fit = parcel_dataset.compute_variables(
            ['psrc_parcel.parcel.is_size_fit_for_template_%s' % this_template_id],
            dataset_pool=dataset_pool)[index1]
        for constraint_type, constraint in parcel_dataset.development_constraints[generic_land_use_type_id].items():
            if density_types[i_template] != constraint_type:
                continue
            template_attribute = development_template_dataset.get_attribute(constraint_type)[i_template]  #density converted to constraint variable name
            if template_attribute == 0:
                continue
            min_constraint = constraint[:, 0].copy()
            max_constraint = constraint[:, 1].copy()
            ## treat -1 as unconstrained
            w_unconstr = min_constraint == -1
            if w_unconstr.any():
                min_constraint[w_unconstr] = template_attribute
            w_unconstr = max_constraint == -1
            if w_unconstr.any():
                max_constraint[w_unconstr] = template_attribute
            this_accepted_by_constraints = logical_and(template_attribute >= min_constraint,
                                                       template_attribute <= max_constraint)
            fit_indicator = logical_and(fit_indicator,
                                        logical_and(logical_and(this_accepted_by_constraints, units_proposed > 0),
                                                    is_size_fit))
            is_developable_parcel = logical_or(is_developable_parcel, max_constraint > 0)
            this_templ_accepted_by_constraints = logical_or(
                this_templ_accepted_by_constraints,
                logical_and(is_developable_parcel,
                            logical_and(this_accepted_by_constraints, units_proposed > 0)))
            has_this_template = logical_or(has_this_template, fit_indicator)
        has_template = logical_or(has_template, has_this_template)
        # vacant parcels that pass the constraints for this template, but the template does not fit
        not_accepted = logical_and(this_templ_accepted_by_constraints,
                                   logical_and(logical_not(has_this_template), is_vacant))
        parcels_to_template_acc_by_constr[this_template_id].append(where(not_accepted)[0].tolist())
        parcels_acc_by_constr_wo_templ[generic_land_use_type_id] = logical_or(
            parcels_acc_by_constr_wo_templ[generic_land_use_type_id], not_accepted)
        parcels_acc_by_constr[generic_land_use_type_id] = logical_or(
            parcels_acc_by_constr[generic_land_use_type_id],
            logical_and(this_templ_accepted_by_constraints, is_vacant))
    logger.end_block()

    ### Print summary
    ##################
    unique_glu = list(parcels_acc_by_constr_wo_templ.keys())
    logger.log_status("\nGLU\tconsidered\tno template")
    sum1 = 0
    sum2 = 0
    parcels_wo_templ = logical_not(has_template)
    for glu in unique_glu:
        if strict:
            # only parcels that have no template at all (regardless of GLU)
            parcels_acc_by_constr_wo_templ[glu] = logical_and(parcels_acc_by_constr_wo_templ[glu], parcels_wo_templ)
        logger.log_status("%s\t%7i\t\t%7i" % (glu, parcels_acc_by_constr[glu].sum(),
                                               parcels_acc_by_constr_wo_templ[glu].sum()))
        sum1 = sum1 + parcels_acc_by_constr[glu].sum()
        sum2 = sum2 + parcels_acc_by_constr_wo_templ[glu].sum()
    logger.log_status("\nall\t%7i\t\t%7i" % (sum1, sum2))

    ### Create plots
    #################
    for glu in unique_glu:
        gidx = where(parcels_acc_by_constr_wo_templ[glu])[0]
        logger.start_block("Creating figures for GLU %s using %s parcels" % (glu, gidx.size))
        max_land_sqft = {'far': 0, 'units_per_acre': 0}
        min_land_sqft = {'far': 9999999, 'units_per_acre': 9999999}
        max_templ_attr = {'far': 0, 'units_per_acre': 0}
        min_templ_attr = {'far': 999999, 'units_per_acre': 9999999}
        xy = {'far': [], 'units_per_acre': []}
        # columns: vacant land area, template density, parcel id
        points = {'far': zeros((0, 3)), 'units_per_acre': zeros((0, 3))}
        npoints = {'far': 0, 'units_per_acre': 0}
        for i_template in index2:
            if glu != generic_land_use_type_ids[i_template]:
                continue
            this_template_id = template_ids[i_template]
            # The recorded positions refer to index1, thus the mask must have
            # the size of index1 (not of the whole parcel dataset).
            missed_to_match = zeros(index1.size, dtype='bool')
            recorded = unique(array(parcels_to_template_acc_by_constr[this_template_id]).flatten()).astype('int32')
            missed_to_match[recorded] = True
            missed_to_match = where(logical_and(missed_to_match, parcels_acc_by_constr_wo_templ[glu]))[0]
            for constraint_type, constraint in parcel_dataset.development_constraints[glu].items():
                if density_types[i_template] != constraint_type:
                    continue
                template_attribute = development_template_dataset.get_attribute(constraint_type)[i_template]  #density converted to constraint variable name
                if template_attribute == 0:
                    continue
                # horizontal line from the minimum to the maximum land size at the template density
                xy[constraint_type] = xy[constraint_type] + [
                    [development_template_dataset["land_sqft_min"][i_template],
                     development_template_dataset["land_sqft_max"][i_template]],
                    [template_attribute, template_attribute]]
                if missed_to_match.size > 0:
                    npoints[constraint_type] = npoints[constraint_type] + missed_to_match.size
                    if missed_to_match.size > 100:
                        # plot at most 100 parcels per template
                        thisidx = sample_noreplace(missed_to_match, 100)
                    else:
                        thisidx = missed_to_match
                    # convert positions within index1 into indices of the parcel dataset
                    parcel_idx = index1[thisidx]
                    land_area = parcel_dataset['vacant_land_area'][parcel_idx]
                    points[constraint_type] = concatenate(
                        (points[constraint_type],
                         concatenate((land_area[:, newaxis],
                                      template_attribute*ones((thisidx.size, 1)),
                                      parcel_ids[parcel_idx][:, newaxis]), axis=1)), axis=0)
                    max_land_sqft[constraint_type] = max(max_land_sqft[constraint_type], land_area.max())
                    # the lower bound must be derived from the minimum (it was the maximum before)
                    min_land_sqft[constraint_type] = min(min_land_sqft[constraint_type], land_area.min())
                    # NOTE(review): the original formatting was lost; the two updates below
                    # are assumed to belong to this 'if' block. It only affects the axis ranges.
                    max_templ_attr[constraint_type] = max(max_templ_attr[constraint_type], template_attribute)
                    min_templ_attr[constraint_type] = min(min_templ_attr[constraint_type], template_attribute)

        for unit_type in ['far', 'units_per_acre']:
            if points[unit_type].size == 0:
                continue
            log_suffix = ''
            if log_scale:
                log_suffix = '_log'
            _save_template_match_figure(
                array(xy[unit_type]), points[unit_type][:, 0:2],
                (min_land_sqft[unit_type]-100, max_land_sqft[unit_type]+100),
                (min_templ_attr[unit_type]-0.05, max_templ_attr[unit_type]+0.05),
                log_scale,
                'GLU: %s, units: %s, missing: %s' % (glu, unit_type, npoints[unit_type]),
                os.path.join(output_dir, 'match_templates%s_%s_%s.pdf' % (log_suffix, glu, unit_type)))
            if output_points:
                write_table_to_text_file(os.path.join(output_dir, 'points_%s_%s.txt' % (glu, unit_type)),
                                         points[unit_type], delimiter=', ')
        logger.end_block()
    logger.log_status('Resulting figures stored into %s' % output_dir)
    return parcel_ids[index1][parcels_wo_templ]


def _save_template_match_figure(template_lines, dots, x_limits, y_limits, log_scale, title, file_name):
    """Draw template ranges as lines and parcels as points and save the figure into 'file_name'.

    template_lines - 2D array whose rows are alternating x and y coordinates of the lines.
    dots - 2D array with the x and y coordinates of the points in its first two columns.
    x_limits, y_limits - tuples (min, max); the axes cover at least the range of the points.
    If 'log_scale' is True, all values are log-transformed and the tick labels
    are converted back to the original scale.
    """
    # imported here, so that matplotlib is only required if a figure is created
    import matplotlib.ticker as ticker
    import matplotlib.pyplot as plt

    def myexp(x, pos):
        return '%i' % (round(exp(x)))

    def myexp2(x, pos):
        return '%.2f' % (round(exp(x), 2))

    minx, maxx = x_limits
    miny, maxy = y_limits
    if log_scale:
        template_lines = log(template_lines)
        dots = log(dots)
        minx = log(minx)
        maxx = log(maxx)
        miny = log(miny)
        maxy = log(maxy)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    lines = ax.plot(*template_lines)  # template lines
    po = ax.plot(dots[:, 0], dots[:, 1])  # parcel points
    if log_scale:
        ax.xaxis.set_major_formatter(ticker.FuncFormatter(myexp))
        ax.yaxis.set_major_formatter(ticker.FuncFormatter(myexp2))
    plt.setp(lines, color='b', linewidth=1)
    plt.setp(po, marker='o', linestyle='None', linewidth=0)
    ax.axis([min(dots[:, 0].min(), minx), max(dots[:, 0].max(), maxx),
             min(dots[:, 1].min(), miny), max(dots[:, 1].max(), maxy)])
    plt.title(title)
    plt.xlabel('land sqft range')
    plt.ylabel('density')
    plt.savefig(file_name)
    plt.close()