def write(self, resources=None, out_storage=None, out_table_name=None):
    """Write the specification into the given storage as a table, one row per
    coefficient, with submodel/equation ids and coefficient/variable names."""
    local_resources = Resources(resources)
    local_resources.merge_with_defaults({
        "field_submodel_id": self.field_submodel_id,
        "field_equation_id": self.field_equation_id,
        "field_coefficient_name": self.field_coefficient_name,
        "field_variable_name": self.field_variable_name,
        "field_fixed_value": self.field_fixed_value,
        "out_table_name": out_table_name
    })
    if out_storage is not None:
        self.out_storage = out_storage
    if not isinstance(self.out_storage, Storage):
        logger.log_warning(
            "out_storage has to be of type Storage. No EquationSpecifications written."
        )
        return
    submodel_ids = self.get_submodels()
    if submodel_ids.size == 0:
        # set submodel_id to -2 when there are no submodels (or only one)
        submodel_ids = resize(array([-2], dtype="int32"),
                              len(self.get_coefficient_names()))
    equation_ids = self.get_equations()
    if equation_ids.size == 0:
        equation_ids = resize(array([-2], dtype="int32"), submodel_ids.size)
    values = {
        local_resources["field_submodel_id"]: submodel_ids,
        local_resources["field_equation_id"]: equation_ids,
        local_resources["field_coefficient_name"]: self.get_coefficient_names(),
        local_resources["field_variable_name"]: self.get_long_variable_names()
    }
    if self.fixed_values.size > 0:
        values[local_resources["field_fixed_value"]] = self.fixed_values
    for field in self.other_fields.keys():
        values[field] = self.other_fields[field]
    types = {
        local_resources["field_submodel_id"]: 'integer',
        local_resources["field_equation_id"]: 'integer',
        local_resources["field_coefficient_name"]: 'text',
        local_resources["field_variable_name"]: 'text'
    }
    local_resources.merge({
        "values": values,
        'valuetypes': types,
        "drop_table_flag": 1
    })
    self.out_storage.write_table(
        table_name=local_resources['out_table_name'],
        table_data=local_resources['values'])
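# A minimal stand-alone sketch (pure numpy, values illustrative) of the sentinel
# logic used in write() above: when the specification defines no submodels or
# equations, the id columns are filled with -2 so every output row still has a
# value. numpy.resize tiles the sentinel to the required length.
from numpy import array, resize

n_coefficients = 4                          # e.g. len(self.get_coefficient_names())
submodel_ids = array([], dtype="int32")     # no submodels defined
if submodel_ids.size == 0:
    submodel_ids = resize(array([-2], dtype="int32"), n_coefficients)
# submodel_ids is now array([-2, -2, -2, -2], dtype=int32)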
def get_resources(self, data_dictionary, dataset):
    """Create resources for computing a variable. """
    resources = Resources()
    for key in data_dictionary.keys():
        if key in self.datasets:
            data = data_dictionary[key]
            if self.id_names[key] not in data_dictionary[key].keys() and \
                    not isinstance(self.id_names[key], list):
                # add an id array if the table does not carry one
                data[self.id_names[key]] = arange(
                    1, len(data_dictionary[key][data_dictionary[key].keys()[0]]) + 1)
            if key == "land_cover":
                land_cover_storage = StorageFactory().get_storage('dict_storage')
                land_cover_table_name = 'land_cover'
                land_cover_storage.write_table(
                    table_name=land_cover_table_name,
                    table_data=data,
                )
                lc = LandCoverDataset(
                    in_storage=land_cover_storage,
                    in_table_name=land_cover_table_name,
                )
                # add relative_x and relative_y
                lc.get_id_attribute()
                n = int(ceil(sqrt(lc.size())))
                if "relative_x" not in data.keys():
                    x = (indices((n, n)) + 1)[1].ravel()
                    lc.add_attribute(x[0:lc.size()], "relative_x", metadata=1)
                if "relative_y" not in data.keys():
                    y = (indices((n, n)) + 1)[0].ravel()
                    lc.add_attribute(y[0:lc.size()], "relative_y", metadata=1)
                resources.merge({key: lc})
            if key == "gridcell":
                gridcell_storage = StorageFactory().get_storage('dict_storage')
                gridcell_table_name = 'gridcell'
                gridcell_storage.write_table(
                    table_name=gridcell_table_name,
                    table_data=data,
                )
                gridcell_dataset = GridcellDataset(
                    in_storage=gridcell_storage,
                    in_table_name=gridcell_table_name,
                )
                resources.merge({key: gridcell_dataset})
        else:
            resources.merge({key: data_dictionary[key]})
    if dataset not in self.interactions:
        resources.merge({"dataset": resources[dataset]})
    resources.merge({"check_variables": '*', "debug": 4})
    return resources
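# Stand-alone sketch of the relative_x/relative_y construction above: the cells
# are laid out on the smallest n x n grid that can hold all of them, and each
# cell's 1-based column/row index becomes its coordinate.
from math import ceil, sqrt
from numpy import indices

size = 7                                        # e.g. lc.size()
n = int(ceil(sqrt(size)))                       # 3: a 3x3 grid is the smallest square holding 7 cells
x = (indices((n, n)) + 1)[1].ravel()[0:size]    # columns: [1 2 3 1 2 3 1]
y = (indices((n, n)) + 1)[0].ravel()[0:size]    # rows:    [1 1 1 2 2 2 3]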
def write(self, resources=None, out_storage=None, out_table_name=None):
    """Write the coefficients into the given storage as a table, one row per
    coefficient, with submodel ids, names, estimates and standard errors."""
    local_resources = Resources(resources)
    local_resources.merge_with_defaults({
        "field_submodel_id": self.field_submodel_id,
        "field_coefficient_name": self.field_coefficient_name,
        "field_estimate": self.field_estimate,
        "field_standard_error": self.field_standard_error,
        "other_fields": self.other_fields,
        "out_table_name": out_table_name
    })
    if out_storage is not None:
        self.out_storage = out_storage
    if not isinstance(self.out_storage, Storage):
        logger.log_warning(
            "out_storage has to be of type Storage. No coefficients written."
        )
        return
    submodels = self.get_submodels()
    if submodels.size <= 0:
        submodels = resize(array([-2], dtype=int32), self.size())
    values = {
        local_resources["field_submodel_id"]: submodels,
        local_resources["field_coefficient_name"]: self.get_names(),
        local_resources["field_estimate"]: self.get_values(),
        local_resources["field_standard_error"]: self.get_standard_errors()
    }
    for measure in self.other_measures.keys():
        values[measure] = self.other_measures[measure]
    types = {
        local_resources["field_submodel_id"]: 'integer',
        local_resources["field_coefficient_name"]: 'text',
        local_resources["field_estimate"]: 'double',
        local_resources["field_standard_error"]: 'double'
    }
    attrtypes = {
        local_resources["field_submodel_id"]: AttributeType.PRIMARY,
        local_resources["field_coefficient_name"]: AttributeType.PRIMARY,
        local_resources["field_estimate"]: AttributeType.PRIMARY,
        local_resources["field_standard_error"]: AttributeType.PRIMARY
    }
    for measure in self.other_measures.keys():
        types[measure] = 'double'
        attrtypes[measure] = AttributeType.PRIMARY
    local_resources.merge({
        "values": values,
        'valuetypes': types,
        "drop_table_flag": 1,
        "attrtype": attrtypes
    })
    self.out_storage.write_table(
        table_name=local_resources['out_table_name'],
        table_data=local_resources['values'])
def run(self, data=None, coefficients=None, resources=None):
    local_resources = Resources()
    if resources:
        local_resources.merge(resources)
    last_result = self.compute_utilities(data=data,
                                         coefficients=coefficients,
                                         resources=local_resources)
    this_result = self.compute_probabilities(resources=local_resources)
    if this_result is not None:
        last_result = this_result
    this_result = self.compute_choices(resources=local_resources)
    if this_result is not None:
        last_result = this_result
    return last_result
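# The run() methods in this file share one small control-flow pattern: run the
# utility/probability/choice stages in order, where a stage returning None means
# "nothing computed at this level", and hand back the last non-None result. A
# generic stand-alone sketch of that pattern:
def run_stages(stages, initial=None):
    last_result = initial
    for stage in stages:
        this_result = stage()
        if this_result is not None:
            last_result = this_result
    return last_result

# run_stages([lambda: [0.2, 0.8], lambda: None]) == [0.2, 0.8]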
def preprocess_projects(self, agent_set, agents_index=None, data_objects=None):
    """Split projects that don't find enough choices into smaller ones (of average size).
    """
    resources = Resources(data_objects)
    resources.merge({"debug": self.debug})
    self.choice_set.compute_variables(
        [self.developable_maximum_unit_full_name,
         self.developable_minimum_unit_full_name],
        resources=resources)
    max_capacity = self.choice_set.get_attribute(self.developable_maximum_unit_short_name)
    min_capacity = self.choice_set.get_attribute(self.developable_minimum_unit_short_name)
    self.set_choice_set_size()
    nchoices = self.get_choice_set_size()
    project_average_size = agent_set.get_attribute(agent_set.get_attribute_name()).mean()
    add_projects = 0
    remove_projects = 0
    if agents_index is None:
        agents_index = arange(agent_set.size())
    # order agents by size (largest first)
    ordered_indices = argsort(-1 * agent_set.get_attribute_by_index(
        agent_set.get_attribute_name(), agents_index))
    improvement_values = []
    projects_ids = agent_set.get_id_attribute()[agents_index].tolist()
    # how many projects fit in each developable location
    project_sizes = agent_set.get_attribute_by_index(agent_set.get_attribute_name(),
                                                     agents_index)
    for iagent in ordered_indices:
        project_size = project_sizes[iagent]
        capacity = logical_and(project_size > min_capacity,
                               (max_capacity / project_size) > 0)
        if where(capacity)[0].size < nchoices:  # not enough choices found
            nsplitted = int(project_size / project_average_size)
            add_projects += nsplitted
            remove_projects += 1
            projects_ids.remove(agent_set.get_id_attribute()[agents_index[iagent]])
            improvement_values = improvement_values + \
                nsplitted * [agent_set.get_attribute_by_index("improvement_value",
                                                              agents_index[iagent])]
        else:
            break  # we can break here, since the projects are sorted by size
    if remove_projects > 0:
        agent_set.remove_elements(agents_index[ordered_indices[0:remove_projects]])
        agents_index = agent_set.get_id_index(projects_ids)
    if add_projects > 0:
        max_id = agent_set.get_attribute(agent_set.get_id_name()[0]).max()
        ids = arange(max_id + 1, max_id + 1 + add_projects)
        agent_set.add_elements(data={
            "project_id": ids,
            self.location_set.get_id_name()[0]: zeros((add_projects,)),
            "improvement_value": array(improvement_values),
            agent_set.get_attribute_name(): project_average_size * ones((add_projects,))
        }, require_all_attributes=False)
        agents_index = agent_set.get_id_index(projects_ids + ids.tolist())
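# Illustration of the splitting rule in preprocess_projects() above (numbers are
# made up): a project too large to find enough feasible locations is replaced by
# int(size / average_size) clones of average size.
project_size = 950.0
project_average_size = 200.0
nsplitted = int(project_size / project_average_size)   # 4 average-sized projects
# the remainder (950 - 4 * 200 = 150 units) is dropped by the int() truncation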
def _compute_vacancy_variables(self, location_set, dev_model_configs, resources): compute_resources = Resources(resources) compute_resources.merge({"debug": self.debug}) self.units_variable = {} self.variable_for_vacancy = {} for project_type in dev_model_configs: self.units_variable[project_type] = dev_model_configs[project_type]["units"] self.variable_for_vacancy[project_type] = compute_resources.get( "%s_vacant_variable" % project_type, "urbansim.%s.vacant_%s" % (location_set.get_dataset_name(), self.units_variable[project_type]), ) location_set.compute_variables([self.variable_for_vacancy[project_type]], resources=compute_resources)
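# The vacancy variable name above is resolved with a "configuration wins, naming
# convention otherwise" rule. The same lookup with a plain dict (the helper name
# is illustrative):
def vacancy_variable_name(config, project_type, dataset_name, units):
    # an explicit "<project_type>_vacant_variable" entry overrides the default
    return config.get("%s_vacant_variable" % project_type,
                      "urbansim.%s.vacant_%s" % (dataset_name, units))

# vacancy_variable_name({}, "residential", "gridcell", "residential_units")
# returns 'urbansim.gridcell.vacant_residential_units'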
def run(self, data=None, coefficients=None, resources=None):
    local_resources = Resources()
    if resources:
        local_resources.merge(resources)
    last_result = self.compute_utilities(data, coefficients, local_resources)
    this_result = self.compute_probabilities(local_resources)
    if this_result is not None:
        last_result = this_result
    this_result = self.compute_choices(local_resources)
    if this_result is not None:
        last_result = this_result
    return last_result
def run(self, data=None, coefficients=None, resources=None):
    local_resources = Resources()
    if resources:
        local_resources.merge(resources)
    last_result = self.compute_utilities(data, coefficients, local_resources)
    # self.debug.print_debug("utilities: %s" % last_result, 3)  # added 7 jul 09
    this_result = self.compute_probabilities(local_resources)
    # self.debug.print_debug("probabilities: %s" % this_result, 3)  # added 7 jul 09
    if this_result is not None:
        last_result = this_result
    this_result = self.compute_choices(local_resources)  # determines choices based on probabilities
    # self.debug.print_debug("choices: %s" % this_result, 3)  # added 7 jul 09
    if this_result is not None:
        last_result = this_result
    return last_result
def _compute_vacancy_and_total_units_variables(self, location_set, project_types, resources=None): compute_resources = Resources(resources) compute_resources.merge({"debug":self.debug}) self.variable_for_vacancy = {} self.variable_for_total_units = {} for ptype in project_types: self.variable_for_vacancy[ptype] = compute_resources.get( "%s_vacant_variable" % ptype, "urbansim_zone.%s.vacant_%s" % (location_set.get_dataset_name(), self.project_specific_units[ptype])) self.variable_for_total_units[ptype] = compute_resources.get( "%s_total_units_variable" % ptype, "%s.aggregate(urbansim_zone.building.total_%s)" % (location_set.get_dataset_name(), self.project_specific_units[ptype])) location_set.compute_variables([self.variable_for_vacancy[ptype], self.variable_for_total_units[ptype]], dataset_pool=self.dataset_pool, resources = compute_resources)
def _compute_vacancy_variables(self, location_set, dev_model_configs, resources): compute_resources = Resources(resources) compute_resources.merge({"debug": self.debug}) self.units_variable = {} self.variable_for_vacancy = {} for project_type in dev_model_configs: self.units_variable[project_type] = dev_model_configs[ project_type]['units'] self.variable_for_vacancy[project_type] = compute_resources.get( "%s_vacant_variable" % project_type, "urbansim.%s.vacant_%s" % (location_set.get_dataset_name(), self.units_variable[project_type])) location_set.compute_variables( [self.variable_for_vacancy[project_type]], resources=compute_resources)
def __init__(self, config=None, save_estimation_results=False): if 'cache_directory' not in config or config['cache_directory'] is None: raise KeyError("The cache directory must be specified in the " "given configuration, giving the filesystem path to the cache " "directory containing the data with which to estimate. Please " "check that your configuration contains the 'cache_directory' " "entry and that it is not None.") self.simulation_state = SimulationState(new_instance=True, start_time=config.get('base_year', 0)) self.simulation_state.set_cache_directory(config['cache_directory']) SessionConfiguration(new_instance=True, package_order=config['dataset_pool_configuration'].package_order, in_storage=AttributeCache()) self.config = Resources(config) self.save_estimation_results = save_estimation_results self.debuglevel = self.config.get("debuglevel", 4) self.model_system = ModelSystem() self.agents_index_for_prediction = None models = self.config.get('models',[]) self.model_name = None if "model_name" in config.keys(): self.model_name = config["model_name"] else: for model in models: if isinstance(model, dict): model_name = model.keys()[0] if (model[model_name] == "estimate") or (isinstance(model[model_name], list) and ("estimate" in model[model_name])): self.model_name = model_name break estimate_config_changes = self.config.get('config_changes_for_estimation', {}).get('estimate_config', {}) if len(estimate_config_changes) > 0: change = Resources({'models_configuration': {self.model_name: {'controller': {'init': {'arguments': {}}}}}}) estimate_config_str = self.config['models_configuration'].get(self.model_name, {}).get('controller', {}).get('init', {}).get('arguments', {}).get('estimate_config', '{}') estimate_config = Resources({}) try: estimate_config = eval(estimate_config_str) except: pass estimate_config.merge(estimate_config_changes) self.config.merge(change) self.config['models_configuration'][self.model_name]['controller']['init']['arguments']['estimate_config'] = 'Resources(%s)' % estimate_config
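# Sketch of the estimate_config round-trip at the end of __init__ above: the
# model controller stores estimate_config as a source-code string, so overrides
# are applied by eval-ing the string, merging the changes, and re-serializing.
# Plain dicts stand in for Resources here; values are illustrative.
estimate_config_str = "{'estimation_size_agents': 1.0}"     # as stored in the controller
estimate_config_changes = {'estimation_size_agents': 0.5}

estimate_config = eval(estimate_config_str)                 # back to a live object
estimate_config.update(estimate_config_changes)             # Resources.merge plays this role above
estimate_config_str = 'Resources(%s)' % estimate_config
# estimate_config_str == "Resources({'estimation_size_agents': 0.5})"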
def __init__(self, config=None, save_estimation_results=False): if 'cache_directory' not in config or config['cache_directory'] is None: raise KeyError("The cache directory must be specified in the " "given configuration, giving the filesystem path to the cache " "directory containing the data with which to estimate. Please " "check that your configuration contains the 'cache_directory' " "entry and that it is not None.") self.simulation_state = SimulationState(new_instance=True) self.simulation_state.set_cache_directory(config['cache_directory']) SessionConfiguration(new_instance=True, package_order=config['dataset_pool_configuration'].package_order, in_storage=AttributeCache()) self.config = Resources(config) self.save_estimation_results = save_estimation_results self.debuglevel = self.config.get("debuglevel", 4) self.model_system = ModelSystem() self.agents_index_for_prediction = None models = self.config.get('models',[]) self.model_name = None if "model_name" in config.keys(): self.model_name = config["model_name"] else: for model in models: if isinstance(model, dict): model_name = model.keys()[0] if (model[model_name] == "estimate") or (isinstance(model[model_name], list) and ("estimate" in model[model_name])): self.model_name = model_name break estimate_config_changes = self.config.get('config_changes_for_estimation', {}).get('estimate_config', {}) if len(estimate_config_changes) > 0: change = Resources({'models_configuration': {self.model_name: {'controller': {'init': {'arguments': {}}}}}}) estimate_config_str = self.config['models_configuration'].get(self.model_name, {}).get('controller', {}).get('init', {}).get('arguments', {}).get('estimate_config', '{}') estimate_config = Resources({}) try: estimate_config = eval(estimate_config_str) except: pass estimate_config.merge(estimate_config_changes) self.config.merge(change) self.config['models_configuration'][self.model_name]['controller']['init']['arguments']['estimate_config'] = 'Resources(%s)' % estimate_config
def run(self, data=None, coefficients=None, resources=None):
    local_resources = Resources()
    if resources:
        local_resources.merge(resources)
    last_result = self.compute_utilities(data, coefficients, local_resources)
    # self.debug.print_debug("utilities: %s" % last_result, 3)  # added 7 jul 09
    this_result = self.compute_probabilities(local_resources)
    # self.debug.print_debug("probabilities: %s" % this_result, 3)  # added 7 jul 09
    if this_result is not None:
        last_result = this_result
    this_result = self.compute_choices(local_resources)  # determines choices based on probabilities
    # self.debug.print_debug("choices: %s" % this_result, 3)  # added 7 jul 09
    if this_result is not None:
        last_result = this_result
    return last_result
def write(self, resources=None, out_storage=None, out_table_name=None):
    """Write the specification into the given storage as a table, one row per
    coefficient, with submodel/equation ids and coefficient/variable names."""
    local_resources = Resources(resources)
    local_resources.merge_with_defaults({
        "field_submodel_id": self.field_submodel_id,
        "field_equation_id": self.field_equation_id,
        "field_coefficient_name": self.field_coefficient_name,
        "field_variable_name": self.field_variable_name,
        "field_fixed_value": self.field_fixed_value,
        "out_table_name": out_table_name})
    if out_storage is not None:
        self.out_storage = out_storage
    if not isinstance(self.out_storage, Storage):
        logger.log_warning("out_storage has to be of type Storage. No EquationSpecifications written.")
        return
    submodel_ids = self.get_submodels()
    if submodel_ids.size == 0:
        # set submodel_id to -2 when there are no submodels (or only one)
        submodel_ids = resize(array([-2], dtype="int32"), len(self.get_coefficient_names()))
    equation_ids = self.get_equations()
    if equation_ids.size == 0:
        equation_ids = resize(array([-2], dtype="int32"), submodel_ids.size)
    values = {local_resources["field_submodel_id"]: submodel_ids,
              local_resources["field_equation_id"]: equation_ids,
              local_resources["field_coefficient_name"]: self.get_coefficient_names(),
              local_resources["field_variable_name"]: self.get_long_variable_names()}
    if self.fixed_values.size > 0:
        values[local_resources["field_fixed_value"]] = self.fixed_values
    for field in self.other_fields.keys():
        values[field] = self.other_fields[field]
    types = {local_resources["field_submodel_id"]: 'integer',
             local_resources["field_equation_id"]: 'integer',
             local_resources["field_coefficient_name"]: 'text',
             local_resources["field_variable_name"]: 'text'}
    local_resources.merge({"values": values,
                           'valuetypes': types,
                           "drop_table_flag": 1})
    self.out_storage.write_table(table_name=local_resources['out_table_name'],
                                 table_data=local_resources['values'])
def write(self, resources=None, out_storage=None, out_table_name=None):
    """Write the coefficients into the given storage as a table, one row per
    coefficient, with submodel ids, names, estimates and standard errors."""
    local_resources = Resources(resources)
    local_resources.merge_with_defaults({
        "field_submodel_id": self.field_submodel_id,
        "field_coefficient_name": self.field_coefficient_name,
        "field_estimate": self.field_estimate,
        "field_standard_error": self.field_standard_error,
        "other_fields": self.other_fields,
        "out_table_name": out_table_name})
    if out_storage is not None:
        self.out_storage = out_storage
    if not isinstance(self.out_storage, Storage):
        logger.log_warning("out_storage has to be of type Storage. No coefficients written.")
        return
    submodels = self.get_submodels()
    if submodels.size <= 0:
        submodels = resize(array([-2], dtype=int32), self.size())
    values = {local_resources["field_submodel_id"]: submodels,
              local_resources["field_coefficient_name"]: self.get_names(),
              local_resources["field_estimate"]: self.get_values(),
              local_resources["field_standard_error"]: self.get_standard_errors()}
    for measure in self.other_measures.keys():
        values[measure] = self.other_measures[measure]
    types = {local_resources["field_submodel_id"]: 'integer',
             local_resources["field_coefficient_name"]: 'text',
             local_resources["field_estimate"]: 'double',
             local_resources["field_standard_error"]: 'double'}
    attrtypes = {local_resources["field_submodel_id"]: AttributeType.PRIMARY,
                 local_resources["field_coefficient_name"]: AttributeType.PRIMARY,
                 local_resources["field_estimate"]: AttributeType.PRIMARY,
                 local_resources["field_standard_error"]: AttributeType.PRIMARY}
    for measure in self.other_measures.keys():
        types[measure] = 'double'
        attrtypes[measure] = AttributeType.PRIMARY
    local_resources.merge({"values": values,
                           'valuetypes': types,
                           "drop_table_flag": 1,
                           "attrtype": attrtypes})
    self.out_storage.write_table(table_name=local_resources['out_table_name'],
                                 table_data=local_resources['values'])
def _compute_vacancy_and_total_units_variables(self, location_set, project_types, resources=None): compute_resources = Resources(resources) compute_resources.merge({"debug": self.debug}) self.variable_for_vacancy = {} self.variable_for_total_units = {} for ptype in project_types: self.variable_for_vacancy[ptype] = compute_resources.get( "%s_vacant_variable" % ptype, "urbansim_zone.%s.vacant_%s" % (location_set.get_dataset_name(), self.project_specific_units[ptype])) self.variable_for_total_units[ptype] = compute_resources.get( "%s_total_units_variable" % ptype, "%s.aggregate(urbansim_zone.building.total_%s)" % (location_set.get_dataset_name(), self.project_specific_units[ptype])) location_set.compute_variables([ self.variable_for_vacancy[ptype], self.variable_for_total_units[ptype] ], dataset_pool=self.dataset_pool, resources=compute_resources)
class RegressionModel(ChunkModel):

    model_name = "Regression Model"
    model_short_name = "RM"

    def __init__(self, regression_procedure="opus_core.linear_regression",
                 submodel_string=None, run_config=None, estimate_config=None,
                 debuglevel=0, dataset_pool=None):
        self.debug = DebugPrinter(debuglevel)
        self.dataset_pool = self.create_dataset_pool(dataset_pool)
        self.regression = RegressionModelFactory().get_model(name=regression_procedure)
        if self.regression is None:
            raise StandardError, "No regression procedure given."
        self.submodel_string = submodel_string
        self.run_config = run_config
        if self.run_config is None:
            self.run_config = Resources()
        if not isinstance(self.run_config, Resources) and isinstance(self.run_config, dict):
            self.run_config = Resources(self.run_config)
        self.estimate_config = estimate_config
        if self.estimate_config is None:
            self.estimate_config = Resources()
        if not isinstance(self.estimate_config, Resources) and isinstance(self.estimate_config, dict):
            self.estimate_config = Resources(self.estimate_config)
        self.data = {}
        self.coefficient_names = {}
        ChunkModel.__init__(self)
        self.get_status_for_gui().initialize_pieces(
            3, pieces_description=array(['initialization', 'computing variables', 'submodel: 1']))

    def run(self, specification, coefficients, dataset, index=None,
            chunk_specification=None, data_objects=None, run_config=None,
            initial_values=None, procedure=None, debuglevel=0):
        """'specification' is of type EquationSpecification,
        'coefficients' is of type Coefficients,
        'dataset' is of type Dataset,
        'index' are indices of individuals in dataset for which the model runs.
            If it is None, the whole dataset is considered.
        'chunk_specification' determines the number of chunks in which the
            simulation is processed.
        'data_objects' is a dictionary where each key is the name of a data object
            ('zone', ...) and its value is an object of class Dataset.
        'run_config' is of type Resources; it gives additional arguments for the run.
        If 'procedure' is given, it overwrites the regression_procedure of the constructor.
        'initial_values' is an array of the initial values of the results. It will be
            overwritten by the results for those elements that are handled by the model
            (defined by submodels in the specification). By default the results are
            initialized with 0.
        'debuglevel' overwrites the constructor 'debuglevel'.
        """
        self.debug.flag = debuglevel
        if run_config is None:
            run_config = Resources()
        if not isinstance(run_config, Resources) and isinstance(run_config, dict):
            run_config = Resources(run_config)
        self.run_config = run_config.merge_with_defaults(self.run_config)
        self.run_config.merge({"debug": self.debug})
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        if procedure is not None:
            self.regression = RegressionModelFactory().get_model(name=procedure)
        if initial_values is None:
            self.initial_values = zeros((dataset.size(),), dtype=float32)
        else:
            self.initial_values = zeros((dataset.size(),), dtype=initial_values.dtype)
            self.initial_values[index] = initial_values
        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())
        result = ChunkModel.run(self, chunk_specification, dataset, index, float32,
                                specification=specification, coefficients=coefficients)
        return result

    def run_chunk(self, index, dataset, specification, coefficients):
        self.specified_coefficients = SpecifiedCoefficients().create(coefficients,
                                                                     specification, neqs=1)
        compute_resources = Resources({"debug": self.debug})
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels,
                                                                leave_pieces=2)
        self.map_agents_to_submodels(submodels, self.submodel_string, dataset, index,
                                     dataset_pool=self.dataset_pool,
                                     resources=compute_resources)
        variables = self.specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables, dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        data = {}
        coef = {}
        outcome = self.initial_values[index].copy()
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,
                                                               submodel)
            self.coefficient_names[submodel] = \
                coef[submodel].get_coefficient_names_without_constant()[0, :]
            self.debug.print_debug("Compute regression for submodel " + str(submodel), 4)
            self.increment_current_status_piece()
            self.data[submodel] = dataset.create_regression_data(
                coef[submodel], index=index[self.observations_mapping[submodel]])
            nan_index = where(isnan(self.data[submodel]))[1]
            inf_index = where(isinf(self.data[submodel]))[1]
            if nan_index.size > 0:
                nan_var_index = unique(nan_index)
                raise ValueError, "NaN (Not a Number) is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % coef[submodel].get_variable_names()[nan_var_index]
            if inf_index.size > 0:
                inf_var_index = unique(inf_index)
                raise ValueError, "Inf is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % coef[submodel].get_variable_names()[inf_var_index]
            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0):
                # observations for this submodel available
                outcome[self.observations_mapping[submodel]] = \
                    self.regression.run(self.data[submodel],
                                        coef[submodel].get_coefficient_values()[0, :],
                                        resources=self.run_config).astype(outcome.dtype)
        return outcome

    def correct_infinite_values(self, dataset, outcome_attribute_name, maxvalue=1e+38,
                                clip_all_larger_values=False):
        """Check if the model resulted in infinite values. If yes, print a warning
        and clip the values to maxvalue. If clip_all_larger_values is True, all
        values larger than maxvalue are clipped to maxvalue.
        """
        infidx = where(dataset.get_attribute(outcome_attribute_name) == inf)[0]
        if infidx.size > 0:
            logger.log_warning("Infinite values in %s. Clipped to %s." %
                               (outcome_attribute_name, maxvalue))
            dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, infidx)
        if clip_all_larger_values:
            idx = where(dataset.get_attribute(outcome_attribute_name) > maxvalue)[0]
            if idx.size > 0:
                logger.log_warning("Values in %s larger than %s. Clipped to %s." %
                                   (outcome_attribute_name, maxvalue, maxvalue))
                dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, idx)

    def estimate(self, specification, dataset, outcome_attribute, index=None,
                 procedure=None, data_objects=None, estimate_config=None, debuglevel=0):
        """'specification' is of type EquationSpecification,
        'dataset' is of type Dataset,
        'outcome_attribute' - string that determines the dependent variable,
        'index' are indices of individuals in dataset for which the model runs.
            If it is None, the whole dataset is considered.
        'procedure' - name of the estimation procedure. If it is None, there should
            be an entry "estimation" in 'estimate_config' that determines the
            procedure. The class must have a method 'run' that takes as arguments
            'data', 'regression_procedure' and 'resources'. It returns a dictionary
            with entries 'estimators', 'standard_errors' and 't_values' (all 1D
            numpy arrays).
        'data_objects' is a dictionary where each key is the name of a data object
            ('zone', ...) and its value is an object of class Dataset.
        'estimate_config' is of type Resources; it gives additional arguments for
            the estimation procedure.
        'debuglevel' overwrites the class 'debuglevel'.
        """
        #import wingdbstub
        self.debug.flag = debuglevel
        if estimate_config is None:
            estimate_config = Resources()
        if not isinstance(estimate_config, Resources) and isinstance(estimate_config, dict):
            estimate_config = Resources(estimate_config)
        self.estimate_config = estimate_config.merge_with_defaults(self.estimate_config)
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.procedure = procedure
        if self.procedure is None:
            self.procedure = self.estimate_config.get("estimation", None)
        if self.procedure is not None:
            self.procedure = ModelComponentCreator().get_model_component(self.procedure)
        else:
            logger.log_warning("No estimation procedure given, or problems with loading the corresponding module.")
        compute_resources = Resources({"debug": self.debug})
        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())
        if not isinstance(index, ndarray):
            index = array(index)
        estimation_size_agents = self.estimate_config.get("estimation_size_agents", None)
        # should be a proportion of the agent_set
        if estimation_size_agents is None:
            estimation_size_agents = 1.0
        else:
            estimation_size_agents = max(min(estimation_size_agents, 1.0), 0.0)  # between 0 and 1
        if estimation_size_agents < 1.0:
            self.debug.print_debug("Sampling agents for estimation ...", 3)
            estimation_idx = sample_noreplace(arange(index.size),
                                              int(index.size * estimation_size_agents))
        else:
            estimation_idx = arange(index.size)
        estimation_idx = index[estimation_idx]
        self.debug.print_debug("Number of observations for estimation: " +
                               str(estimation_idx.size), 2)
        if estimation_idx.size <= 0:
            self.debug.print_debug("Nothing to be done.", 2)
            return (None, None)
        coefficients = create_coefficient_from_specification(specification)
        specified_coefficients = SpecifiedCoefficients().create(coefficients,
                                                                specification, neqs=1)
        submodels = specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels,
                                                                leave_pieces=2)
        self.map_agents_to_submodels(
            submodels, self.submodel_string, dataset, estimation_idx,
            dataset_pool=self.dataset_pool, resources=compute_resources,
            submodel_size_max=self.estimate_config.get('submodel_size_max', None))
        variables = specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables, dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        coef = {}
        estimated_coef = {}
        self.outcome = {}
        dataset.compute_variables([outcome_attribute], dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        regression_resources = Resources(estimate_config)
        regression_resources.merge({"debug": self.debug})
        outcome_variable_name = VariableName(outcome_attribute)
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(specified_coefficients,
                                                               submodel)
            self.increment_current_status_piece()
            logger.log_status("Estimate regression for submodel " + str(submodel),
                              tags=["estimate"], verbosity_level=2)
            logger.log_status("Number of observations: " +
                              str(self.observations_mapping[submodel].size),
                              tags=["estimate"], verbosity_level=2)
            self.data[submodel] = dataset.create_regression_data_for_estimation(
                coef[submodel], index=estimation_idx[self.observations_mapping[submodel]])
            self.coefficient_names[submodel] = \
                coef[submodel].get_coefficient_names_without_constant()[0, :]
            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0) and \
                    (self.procedure is not None):
                # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(
                    outcome_variable_name.get_alias(),
                    estimation_idx[self.observations_mapping[submodel]])
                regression_resources.merge({"outcome": self.outcome[submodel]})
                regression_resources.merge({
                    "coefficient_names": self.coefficient_names[submodel].tolist(),
                    "constant_position": coef[submodel].get_constants_positions()})
                estimated_coef[submodel] = self.procedure.run(
                    self.data[submodel], self.regression,
                    resources=regression_resources)
                if "estimators" in estimated_coef[submodel].keys():
                    coef[submodel].set_coefficient_values(estimated_coef[submodel]["estimators"])
                if "standard_errors" in estimated_coef[submodel].keys():
                    coef[submodel].set_standard_errors(estimated_coef[submodel]["standard_errors"])
                if "other_measures" in estimated_coef[submodel].keys():
                    for measure in estimated_coef[submodel]["other_measures"].keys():
                        coef[submodel].set_measure(
                            measure, estimated_coef[submodel]["other_measures"][measure])
                if "other_info" in estimated_coef[submodel].keys():
                    for info in estimated_coef[submodel]["other_info"]:
                        coef[submodel].set_other_info(
                            info, estimated_coef[submodel]["other_info"][info])
        coefficients.fill_coefficients(coef)
        self.save_predicted_values_and_errors(specification, coefficients, dataset,
                                              outcome_variable_name, index=index,
                                              data_objects=data_objects)
        return (coefficients, estimated_coef)

    def prepare_for_run(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        spec, coef = prepare_specification_and_coefficients(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter],
                                                      dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, coef, index)

    def prepare_for_estimate(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        spec = get_specification_for_estimation(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter],
                                                      dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, index)

    def get_data_as_dataset(self, submodel=-2):
        """Like get_all_data, but the return value is a Dataset containing attributes
        that correspond to the data columns. Their names are coefficient names."""
        all_data = self.get_all_data(submodel)
        if all_data is None:
            return None
        names = self.get_coefficient_names(submodel)
        if names is None:
            return None
        dataset_data = {}
        for i in range(names.size):
            dataset_data[names[i]] = all_data[:, i].reshape(all_data.shape[0])
        dataset_data["id"] = arange(all_data.shape[0]) + 1
        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name='dataset', table_data=dataset_data)
        ds = Dataset(in_storage=storage, id_name="id", in_table_name='dataset')
        return ds

    def save_predicted_values_and_errors(self, specification, coefficients, dataset,
                                         outcome_variable, index=None, data_objects=None):
        if self.estimate_config.get('save_predicted_values_and_errors', False):
            logger.log_status('Computing predicted values and residuals.')
            original_values = dataset.get_attribute_by_index(outcome_variable, index)
            predicted_values = zeros(dataset.size(), dtype='float32')
            predicted_values[index] = self.run_after_estimation(
                specification, coefficients, dataset, index=index,
                data_objects=data_objects)
            predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias()
            dataset.add_primary_attribute(name=predicted_attribute_name,
                                          data=predicted_values)
            dataset.flush_attribute(predicted_attribute_name)
            predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias()
            error_values = zeros(dataset.size(), dtype='float32')
            error_values[index] = (original_values -
                                   predicted_values[index]).astype(error_values.dtype)
            dataset.add_primary_attribute(name=predicted_error_attribute_name,
                                          data=error_values)
            dataset.flush_attribute(predicted_error_attribute_name)
            logger.log_status('Predicted values saved as %s (for the %s dataset)' %
                              (predicted_attribute_name, dataset.get_dataset_name()))
            logger.log_status('Residuals saved as %s (for the %s dataset)' %
                              (predicted_error_attribute_name, dataset.get_dataset_name()))

    def export_estimation_data(self, submodel=-2,
                               file_name='./estimation_data_regression.txt',
                               delimiter='\t'):
        import os
        from numpy import newaxis
        data = concatenate((self.outcome[submodel][..., newaxis],
                            self.get_all_data(submodel=submodel)), axis=1)
        header = ['outcome'] + self.get_coefficient_names(submodel).tolist()
        nrows = data.shape[0]
        file_name_root, file_name_ext = os.path.splitext(file_name)
        out_file = "%s_submodel_%s.txt" % (file_name_root, submodel)
        fh = open(out_file, 'w')
        fh.write(delimiter.join(header) + '\n')  # file header
        for row in range(nrows):
            line = [str(x) for x in data[row, ]]
            fh.write(delimiter.join(line) + '\n')
        fh.flush()
        fh.close()
        print 'Data written into %s' % out_file

    def run_after_estimation(self, *args, **kwargs):
        return self.run(*args, **kwargs)

    def _get_status_total_pieces(self):
        return ChunkModel._get_status_total_pieces(self) * \
            self.get_status_for_gui().get_total_number_of_pieces()

    def _get_status_current_piece(self):
        return ChunkModel._get_status_current_piece(self) * \
            self.get_status_for_gui().get_total_number_of_pieces() + \
            self.get_status_for_gui().get_current_piece()

    def _get_status_piece_description(self):
        return "%s %s" % (ChunkModel._get_status_piece_description(self),
                          self.get_status_for_gui().get_current_piece_description())

    def get_specified_coefficients(self):
        return self.specified_coefficients
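# A numpy-only sketch of the per-submodel step inside run_chunk() above,
# assuming a linear regression procedure: the rows of the submodel's data matrix
# are combined with its coefficient vector and written back into the slots of
# the outcome array selected by observations_mapping (all values illustrative).
from numpy import array, zeros, dot, float32

outcome = zeros(5, dtype=float32)                   # initial_values for the chunk
observations_mapping = array([1, 3, 4])             # rows handled by this submodel
data = array([[1.0, 2.0], [1.0, 0.5], [1.0, 3.0]])  # constant term + one variable
coefficient_values = array([0.1, 2.0])
outcome[observations_mapping] = dot(data, coefficient_values).astype(outcome.dtype)
# outcome is now [0., 4.1, 0., 1.1, 6.1]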
def get_resources(self, data_dictionary, dataset): """Create resources for computing a variable. """ resources = Resources() for key in data_dictionary.keys(): if key in self.datasets: data = data_dictionary[key] storage = StorageFactory().get_storage('dict_storage') if self.id_names[key] not in data_dictionary[key].keys( ) and not isinstance(self.id_names[key], list): data[self.id_names[key]] = arange( 1, len(data_dictionary[key][data_dictionary[key].keys() [0]]) + 1) # add id array id_name = self.id_names[key] storage.write_table(table_name='data', table_data=data) if key == "gridcell": gc = GridcellDataset(in_storage=storage, in_table_name='data') # add relative_x and relative_y gc.get_id_attribute() n = int(ceil(sqrt(gc.size()))) if "relative_x" not in data.keys(): x = (indices((n, n)) + 1)[1].ravel() gc.add_attribute(x[0:gc.size()], "relative_x", metadata=1) if "relative_y" not in data.keys(): y = (indices((n, n)) + 1)[0].ravel() gc.add_attribute(y[0:gc.size()], "relative_y", metadata=1) resources.merge({key: gc}) elif key == "household": resources.merge({ key: HouseholdDataset(in_storage=storage, in_table_name='data') }) elif key == "development_project": resources.merge({ key: DevelopmentProjectDataset(in_storage=storage, in_table_name='data') }) elif key == "development_event": resources.merge({ key: DevelopmentEventDataset(in_storage=storage, in_table_name='data') }) elif key == "neighborhood": resources.merge({ key: NeighborhoodDataset(in_storage=storage, in_table_name='data') }) elif key == "job": resources.merge({ key: JobDataset(in_storage=storage, in_table_name='data') }) elif key == "zone": resources.merge({ key: ZoneDataset(in_storage=storage, in_table_name='data') }) elif key == "travel_data": resources.merge({ key: TravelDataDataset(in_storage=storage, in_table_name='data') }) elif key == "faz": resources.merge({ key: FazDataset(in_storage=storage, in_table_name='data') }) elif key == "fazdistrict": resources.merge({ key: FazdistrictDataset(in_storage=storage, in_table_name='data') }) elif key == "race": resources.merge({ key: RaceDataset(in_storage=storage, in_table_name='data') }) elif key == "county": resources.merge({ key: CountyDataset(in_storage=storage, in_table_name='data') }) elif key == "large_area": resources.merge({ key: LargeAreaDataset(in_storage=storage, in_table_name='data') }) elif key == "development_group": resources.merge({ key: DevelopmentGroupDataset(in_storage=storage, in_table_name='data') }) elif key == "employment_sector_group": resources.merge({ key: EmploymentSectorGroupDataset(in_storage=storage, in_table_name='data') }) elif key == "plan_type_group": resources.merge({ key: PlanTypeGroupDataset(in_storage=storage, in_table_name='data') }) elif key == "building": resources.merge({ key: BuildingDataset(in_storage=storage, in_table_name='data') }) else: resources.merge({key: data_dictionary[key]}) if dataset in self.interactions: if dataset == "household_x_gridcell": resources.merge({ "dataset": HouseholdXGridcellDataset(dataset1=resources["household"], dataset2=resources["gridcell"]) }) if dataset == "job_x_gridcell": resources.merge({ "dataset": JobXGridcellDataset(dataset1=resources["job"], dataset2=resources["gridcell"]) }) if dataset == "household_x_zone": resources.merge({ "dataset": HouseholdXZoneDataset(dataset1=resources["household"], dataset2=resources["zone"]) }) if dataset == "household_x_neighborhood": resources.merge({ "dataset": HouseholdXNeighborhoodDataset( dataset1=resources["household"], dataset2=resources["neighborhood"]) }) 
if dataset == "development_project_x_gridcell": resources.merge({ "dataset": DevelopmentProjectXGridcellDataset( dataset1=resources["development_project"], dataset2=resources["gridcell"]) }) else: resources.merge({"dataset": resources[dataset]}) resources.merge({"check_variables": '*', "debug": 4}) return resources
def run(self, building_set,
        # building_use_table,
        building_use_classification_table, vacancy_table, history_table, year,
        location_set, resources=None):
    building_classes = building_use_classification_table.get_attribute("name")
    unit_attributes = building_use_classification_table.get_attribute('units')
    building_id_name = building_set.get_id_name()[0]
    location_id_name = location_set.get_id_name()[0]
    new_buildings = {building_id_name: array([], dtype='int32'),
                     "building_use_id": array([], dtype=int8),
                     "year_built": array([], dtype='int32'),
                     # "building_sqft": array([], dtype='int32'),
                     # "residential_units": array([], dtype='int32'),
                     "unit_price": array([], dtype=float32),
                     location_id_name: array([], dtype='int32')}
    for attribute in unit_attributes:
        new_buildings[attribute] = array([], dtype='int32')
    max_id = building_set.get_id_attribute().max()
    building_set_size_orig = building_set.size()
    for itype in range(building_use_classification_table.size()):  # iterate over building types
        building_class = building_classes[itype]
        # type_code = building_types_table.get_id_attribute()[itype]
        vacancy_attribute = 'target_total_%s_vacancy' % building_class
        if vacancy_attribute not in vacancy_table.get_known_attribute_names():
            logger.log_warning("No target vacancy for building class '%s'. Transition model for this building class skipped." % building_class)
            continue
        vacancy_table.get_attribute(vacancy_attribute)  # ensures that the attribute is loaded
        target_vacancy_rate = eval("vacancy_table.get_data_element_by_id( year ).%s" % vacancy_attribute)
        compute_resources = Resources(resources)
        compute_resources.merge({"debug": self.debug})
        units_attribute = unit_attributes[itype]
        vacant_units_attribute = 'vacant_' + units_attribute
        # determine current-year vacancy rates
        building_set.compute_variables("urbansim_parcel.building." + vacant_units_attribute,
                                       resources=compute_resources)
        vacant_units_sum = building_set.get_attribute(vacant_units_attribute).sum()
        units_sum = float(building_set.get_attribute(units_attribute).sum())
        vacant_rate = self.safe_divide(vacant_units_sum, units_sum)
        should_develop_units = max(0, (target_vacancy_rate * units_sum - vacant_units_sum) /
                                      (1 - target_vacancy_rate))
        logger.log_status(building_class + ": vacant units: %d, should be vacant: %f, sum units: %d"
                          % (vacant_units_sum, target_vacancy_rate * units_sum, units_sum))
        if not should_develop_units:
            logger.log_note(("Will not build any " + building_class + " units, because the current vacancy of %d units\n"
                             + "is more than the %d units desired for the vacancy rate of %f.")
                            % (vacant_units_sum, target_vacancy_rate * units_sum, target_vacancy_rate))
            continue
        # average_buildings_value = None
        # if (type+"_improvement_value") in location_set.get_known_attribute_names():
        #     average_buildings_value = self.safe_divide(
        #         location_set.get_attribute(type+"_improvement_value").sum(), units_sum)
        # create buildings
        history_values = history_table.get_attribute(units_attribute)
        index_non_zeros_values = where(history_values > 0)[0]
        history_values_without_zeros = history_values[index_non_zeros_values]
        history_type = history_table.get_attribute("building_use_id")
        history_type_without_zeros = history_type[index_non_zeros_values]
        history_price = history_table.get_attribute("unit_price")
        history_price_without_zeros = history_price[index_non_zeros_values]
        # TODO: what happens if history has only zeroes?
        mean_size = history_values_without_zeros.mean()
        idx = array([], dtype="int32")  # Ensure that there are some development projects to choose from.
        # TODO: should the 'int' in the following line be 'ceil'?
        num_of_projects_to_select = max(10, int(should_develop_units / mean_size))
        while True:
            idx = concatenate((idx, randint(0, history_values_without_zeros.size,
                                            size=num_of_projects_to_select)))
            csum = history_values_without_zeros[idx].cumsum()
            idx = idx[where(csum <= should_develop_units)]
            if csum[-1] >= should_develop_units:
                break
        nbuildings = idx.size
        for attribute in unit_attributes:
            # if attribute == units_attribute:
            #     new_unit_values = history_values_without_zeros[idx]
            # else:
            #     new_unit_values = zeros(nbuildings)
            # to accommodate mixed-use buildings, allow non-units_attribute values to be non-zero
            new_unit_values = history_table.get_attribute(attribute)[index_non_zeros_values[idx]]
            new_buildings[attribute] = concatenate((new_buildings[attribute], new_unit_values))
        new_max_id = max_id + nbuildings
        new_buildings[building_id_name] = concatenate((new_buildings[building_id_name],
                                                       arange(max_id + 1, new_max_id + 1)))
        max_id = new_max_id  # advance the id counter so the next building class gets fresh ids
        new_buildings["building_use_id"] = concatenate((new_buildings["building_use_id"],
                                                        history_type_without_zeros[idx]))
        new_buildings["year_built"] = concatenate((new_buildings["year_built"],
                                                   year * ones(nbuildings, dtype="int32")))
        new_buildings["unit_price"] = concatenate((new_buildings["unit_price"],
                                                   history_price_without_zeros[idx]))
        new_buildings[location_id_name] = concatenate((new_buildings[location_id_name],
                                                       zeros(nbuildings, dtype="int32")))
        logger.log_status("Creating %s %s of %s %s buildings." % (
            history_values_without_zeros[idx].sum(), units_attribute, nbuildings, building_class))
    building_set.add_elements(new_buildings, require_all_attributes=False)
    difference = building_set.size() - building_set_size_orig
    index = arange(difference) + building_set_size_orig
    return index
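# Where should_develop_units in the model above comes from: if x new, all-vacant
# units are built, the vacancy rate becomes (vacant + x) / (units + x). Setting
# that equal to the target rate r and solving for x gives
# x = (r * units - vacant) / (1 - r). A worked example with made-up numbers:
target_vacancy_rate = 0.05
units_sum = 10000.0
vacant_units_sum = 300.0        # current vacancy 3%, below the 5% target
should_develop_units = max(0,
    (target_vacancy_rate * units_sum - vacant_units_sum) / (1 - target_vacancy_rate))
# ~210.5 units; check: (300 + 210.5) / (10000 + 210.5) ~= 0.05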
def run(self, building_set, new_building_copy_attrs, building_type_table,
        building_type_classification_table, vacancy_table, history_table, year,
        location_set, resources=None):
    building_classes = building_type_classification_table.get_attribute("name")
    unit_attributes = building_type_classification_table.get_attribute('units')
    building_id_name = building_set.get_id_name()[0]
    location_id_name = location_set.get_id_name()[0]
    calc_attributes = [building_id_name, location_id_name, "year_built"]
    new_buildings = {}
    for attribute in new_building_copy_attrs:
        new_buildings[attribute] = array([], dtype=building_set.get_data_type(attribute))
    for attribute in calc_attributes:
        new_buildings[attribute] = array([], dtype=building_set.get_data_type(attribute))
    # for convenience, make a map of building_type_id => (building_type) class_id;
    # these names are hard-wired elsewhere
    building_type_id_to_class_id = {}
    building_type_ids = building_type_table.get_attribute("building_type_id")
    for idx in range(building_type_table.size()):
        building_type_id_to_class_id[building_type_ids[idx]] = \
            building_type_table.get_attribute("class_id")[idx]
    logger.log_status("building_type_id_to_class_id = " + str(building_type_id_to_class_id))
    # and make a column for the history table of the use classes
    history_type_classes = zeros((history_table.size()), dtype=int8)
    history_types = history_table.get_attribute("building_type_id")
    for idx in range(history_table.size()):
        history_type_classes[idx] = building_type_id_to_class_id[history_types[idx]]
    logger.log_status("history_types=" + str(history_types))
    logger.log_status("history_type_classes=" + str(history_type_classes))
    max_id = building_set.get_id_attribute().max()
    new_building_id_start = max_id + 1
    new_building_id_end = max_id + 1
    building_set_size_orig = building_set.size()
    for itype in range(building_type_classification_table.size()):  # iterate over building types
        building_class = building_classes[itype]
        building_class_id = building_type_classification_table.get_attribute("class_id")[itype]
        vacancy_attribute = 'target_total_%s_vacancy' % building_class.lower()
        if vacancy_attribute not in vacancy_table.get_known_attribute_names():
            logger.log_warning("No target vacancy for building class '%s' (e.g. no '%s' in target_vacancies). Transition model for this building class skipped."
                               % (building_class, vacancy_attribute))
            continue
        vacancy_table.get_attribute(vacancy_attribute)  # ensures that the attribute is loaded
        target_vacancy_rate = eval("vacancy_table.get_data_element_by_id( year ).%s" % vacancy_attribute)
        logger.log_status("Target vacancy rate for building_class %s is %f" %
                          (building_class, target_vacancy_rate))
        compute_resources = Resources(resources)
        compute_resources.merge({"debug": self.debug})
        units_attribute = unit_attributes[itype]
        occupied_sqft_attribute = 'occupied_sqft_of_typeclass_%s' % building_class.lower()
        total_sqft_attribute = 'where(sanfrancisco.building.building_typeclass_name==\'%s\',sanfrancisco.building.building_sqft,0)' % building_class.lower()
        # determine current-year vacancy rates
        building_set.compute_variables(("sanfrancisco.building."
+ occupied_sqft_attribute, total_sqft_attribute), resources = compute_resources) occupied_sqft_sum = building_set.get_attribute(occupied_sqft_attribute).sum() total_sqft_sum = float( building_set.get_attribute(total_sqft_attribute).sum() ) occupancy_rate = self.safe_divide(occupied_sqft_sum, total_sqft_sum) # cap it at 1.0 if occupancy_rate > 1.0: occupancy_rate = 1.0 vacancy_rate = 1.0 - occupancy_rate vacant_sqft_sum = vacancy_rate * total_sqft_sum should_develop_sqft = (target_vacancy_rate*total_sqft_sum) - vacant_sqft_sum logger.log_status("%s: vacancy rate: %4.3f occupancy rate: %4.3f" % (building_class, vacancy_rate, occupancy_rate)) logger.log_status("%s: vacant: %d, should be vacant: %f, sum units: %d" % (building_class, vacant_sqft_sum, target_vacancy_rate*total_sqft_sum, total_sqft_sum)) if should_develop_sqft <= 0: logger.log_note(("Will not build any %s units, because the current vacancy of %d sqft\n" + "is more than the %d sqft desired for the vacancy rate of %f.") % (building_class, vacant_sqft_sum, target_vacancy_rate*total_sqft_sum, target_vacancy_rate)) continue #create buildings # find sample set of qualifying buildings in the events history, # e.g. where the building_type is in the correct class, and a positive # number of units or sqft (or whatever) were present history_sqft = history_table.get_attribute('building_sqft') index_sampleset = where( (history_sqft > 0) & (history_type_classes==building_class_id))[0] # Ensure that there are some development projects to choose from. logger.log_status("should_develop_sqft=" + str(should_develop_sqft)) if index_sampleset.shape[0] == 0: logger.log_warning("Cannot create new buildings for building use class %s; no buildings in the event history table from which to sample." % building_class) continue history_sqft_sampleset = history_sqft[index_sampleset] logger.log_status("history_sqft_sampleset = " + str(history_sqft_sampleset)) mean_size = history_sqft_sampleset.mean() idx = array( [] ,dtype="int32") #TODO: should the 'int' in the following line be 'ceil'? num_of_projects_to_select = max( 10, int( should_develop_sqft / mean_size ) ) while True: idx = concatenate( ( idx, randint( 0, history_sqft_sampleset.size, size=num_of_projects_to_select) ) ) csum = history_sqft_sampleset[idx].cumsum() idx = idx[where( csum <= should_develop_sqft )] if csum[-1] >= should_develop_sqft: break logger.log_status("idx = " + str(idx)) nbuildings = idx.size if nbuildings == 0: continue new_building_id_end = new_building_id_start + nbuildings # copy_attributes for attribute in new_building_copy_attrs: attr_values = history_table.get_attribute(attribute)[index_sampleset[idx]] new_buildings[attribute] = concatenate((new_buildings[attribute], attr_values)) # calc_attributes new_buildings[building_id_name] =concatenate((new_buildings[building_id_name], arange(new_building_id_start, new_building_id_end))) new_buildings[location_id_name] = concatenate((new_buildings[location_id_name], zeros(nbuildings))) new_buildings["year_built"] = concatenate((new_buildings["year_built"], year*ones(nbuildings))) logger.log_status("Creating %s sqft of %s %s buildings." % (history_sqft_sampleset[idx].sum(), nbuildings, building_class)) new_building_id_start = new_building_id_end + 1 logger.log_status(new_buildings) building_set.add_elements(new_buildings, require_all_attributes=False) difference = building_set.size() - building_set_size_orig index = arange(difference) + building_set_size_orig return index
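# Stand-alone sketch of the sampling loop shared by both transition models
# above: draw candidate projects from the history with replacement until the
# running total reaches the target, then keep only the draws whose cumulative
# sum stays under it (history sizes and target are made up).
from numpy import array, concatenate, where
from numpy.random import randint

history_sizes = array([100, 250, 400, 80])          # sizes of historical projects
target = 900
idx = array([], dtype="int32")
num_to_select = max(10, int(target / history_sizes.mean()))
while True:
    idx = concatenate((idx, randint(0, history_sizes.size, size=num_to_select)))
    csum = history_sizes[idx].cumsum()
    idx = idx[where(csum <= target)]                # keep draws under the target
    if csum[-1] >= target:                          # enough drawn in total; stop
        break
# history_sizes[idx].sum() is now a total <= target built from historical projects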
def preprocess_projects(self, agent_set, agents_index=None, data_objects=None):
    """Split projects that don't find enough choices into smaller ones (of average size).
    """
    resources = Resources(data_objects)
    resources.merge({"debug": self.debug})
    self.choice_set.compute_variables([
        self.developable_maximum_unit_full_name,
        self.developable_minimum_unit_full_name
    ], resources=resources)
    max_capacity = self.choice_set.get_attribute(
        self.developable_maximum_unit_short_name)
    min_capacity = self.choice_set.get_attribute(
        self.developable_minimum_unit_short_name)
    self.set_choice_set_size()
    nchoices = self.get_choice_set_size()
    project_average_size = agent_set.get_attribute(
        agent_set.get_attribute_name()).mean()
    add_projects = 0
    remove_projects = 0
    if agents_index is None:
        agents_index = arange(agent_set.size())
    # order agents by size (largest first)
    ordered_indices = argsort(-1 * agent_set.get_attribute_by_index(
        agent_set.get_attribute_name(), agents_index))
    improvement_values = []
    projects_ids = agent_set.get_id_attribute()[agents_index].tolist()
    # how many projects fit in each developable location
    project_sizes = agent_set.get_attribute_by_index(
        agent_set.get_attribute_name(), agents_index)
    for iagent in ordered_indices:
        project_size = project_sizes[iagent]
        capacity = logical_and(project_size > min_capacity,
                               (max_capacity / project_size) > 0)
        if where(capacity)[0].size < nchoices:  # not enough choices found
            nsplitted = int(project_size / project_average_size)
            add_projects += nsplitted
            remove_projects += 1
            projects_ids.remove(
                agent_set.get_id_attribute()[agents_index[iagent]])
            improvement_values = improvement_values + \
                nsplitted*[agent_set.get_attribute_by_index("improvement_value",
                                                            agents_index[iagent])]
        else:
            break  # we can break here, since the projects are sorted by size
    if remove_projects > 0:
        agent_set.remove_elements(
            agents_index[ordered_indices[0:remove_projects]])
        agents_index = agent_set.get_id_index(projects_ids)
    if add_projects > 0:
        max_id = agent_set.get_attribute(agent_set.get_id_name()[0]).max()
        ids = arange(max_id + 1, max_id + 1 + add_projects)
        agent_set.add_elements(data={
            "project_id": ids,
            self.location_set.get_id_name()[0]: zeros((add_projects, )),
            "improvement_value": array(improvement_values),
            agent_set.get_attribute_name():
            project_average_size * ones((add_projects, ))
        }, require_all_attributes=False)
        agents_index = agent_set.get_id_index(projects_ids + ids.tolist())
class Estimator(GenericModelExplorer):
    def __init__(self, config=None, save_estimation_results=False):
        if 'cache_directory' not in config or config['cache_directory'] is None:
            raise KeyError("The cache directory must be specified in the "
                "given configuration, giving the filesystem path to the cache "
                "directory containing the data with which to estimate. Please "
                "check that your configuration contains the 'cache_directory' "
                "entry and that it is not None.")
        self.simulation_state = SimulationState(new_instance=True)
        self.simulation_state.set_cache_directory(config['cache_directory'])
        SessionConfiguration(new_instance=True,
                             package_order=config['dataset_pool_configuration'].package_order,
                             in_storage=AttributeCache())
        self.config = Resources(config)
        self.save_estimation_results = save_estimation_results
        self.debuglevel = self.config.get("debuglevel", 4)
        self.model_system = ModelSystem()
        self.agents_index_for_prediction = None
        models = self.config.get('models', [])
        self.model_name = None
        if "model_name" in config.keys():
            self.model_name = config["model_name"]
        else:
            for model in models:
                if isinstance(model, dict):
                    model_name = model.keys()[0]
                    if (model[model_name] == "estimate") or \
                            (isinstance(model[model_name], list) and
                             ("estimate" in model[model_name])):
                        self.model_name = model_name
                        break
        estimate_config_changes = self.config.get(
            'config_changes_for_estimation', {}).get('estimate_config', {})
        if len(estimate_config_changes) > 0:
            change = Resources({'models_configuration':
                                {self.model_name: {'controller': {'init': {'arguments': {}}}}}})
            estimate_config_str = self.config['models_configuration'].get(
                self.model_name, {}).get('controller', {}).get('init', {}).get(
                'arguments', {}).get('estimate_config', '{}')
            estimate_config = Resources({})
            try:
                estimate_config = eval(estimate_config_str)
            except:
                pass
            estimate_config.merge(estimate_config_changes)
            self.config.merge(change)
            self.config['models_configuration'][self.model_name]['controller']['init']['arguments']['estimate_config'] = 'Resources(%s)' % estimate_config

    def estimate(self, out_storage=None):
        self.model_system.run(self.config, write_datasets_to_cache_at_end_of_year=False)
        self.extract_coefficients_and_specification()
        if self.save_estimation_results:
            self.save_results(out_storage=out_storage)

    def reestimate(self, specification_module_name=None, specification_dict=None,
                   out_storage=None, type=None, submodels=None):
        """specification_module_name is the name of a module that contains a
        dictionary called 'specification'. If it is not given, the argument
        specification_dict must be given, which is a dictionary object.
        'type' is the name of a model member, such as 'commercial' or 'residential'.
        The specification dictionary is expected to have an entry of this name.
        If 'submodels' is given (a list or a number), the reestimation is done only
        for those submodels.
        """
        if specification_module_name is not None:
            exec("import " + specification_module_name)
            eval("reload (" + specification_module_name + ")")
            exec("specification_dict =" + specification_module_name + ".specification")
        if type is not None:
            specification_dict = specification_dict[type]
        if submodels is not None:  # remove all submodels but the given ones from specification
            submodels_to_be_deleted = specification_dict.keys()
            if not isinstance(submodels, list):
                submodels = [submodels]
            for sm in submodels:
                if sm not in submodels_to_be_deleted:
                    raise ValueError, "Submodel %s not in the specification."
% sm submodels_to_be_deleted.remove(sm) if "_definition_" in submodels_to_be_deleted: submodels_to_be_deleted.remove("_definition_") for sm in submodels_to_be_deleted: del specification_dict[sm] self.specification = EquationSpecification(specification_dict=specification_dict) new_namespace = self.model_system.run_year_namespace keys_coeff_spec = self.get_keys_for_coefficients_and_specification() new_namespace[keys_coeff_spec["specification"]] = self.specification self.coefficients, coeff_dict_dummy = self.model_system.do_process(new_namespace) ## update run_year_namespce since it's not been updated by do_process self.model_system.run_year_namespace = new_namespace self.model_system.run_year_namespace[keys_coeff_spec["coefficients"]] = self.coefficients ## this gets coeff and spec from run_year_namespce and is only updated in _run_year method #self.extract_coefficients_and_specification() if self.save_estimation_results: self.save_results(out_storage=out_storage) def predict(self, predicted_choice_id_name, agents_index=None): """ Run prediction. Currently makes sense only for choice models.""" # Create temporary configuration where all words 'estimate' are replaced by 'run' tmp_config = Resources(self.config) if self.agents_index_for_prediction is None: self.agents_index_for_prediction = self.get_agent_set_index().copy() if agents_index is None: agents_index = self.agents_index_for_prediction tmp_config['models_configuration'][self.model_name]['controller']['run']['arguments']['coefficients'] = "coeff_est" tmp_config['models_configuration'][self.model_name]['controller']['run']['arguments']['agents_index'] = "agents_index" tmp_config['models_configuration'][self.model_name]['controller']['run']['arguments']['chunk_specification'] = "{'nchunks':1}" ### save specification and coefficients to cache (no matter the save_estimation_results flag) ### so that the prepare_for_run method could load specification and coefficients from there #output_configuration = self.config['output_configuration'] #del self.config['output_configuration'] #self.save_results() #self.config['output_configuration'] = output_configuration #self.model_system.run_year_namespace["coefficients"] = self.coefficients #del tmp_config['models_configuration'][self.model_name]['controller']['prepare_for_run'] try: run_year_namespace = copy.copy(self.model_system.run_year_namespace) except: logger.log_error("The estimate() method must be run first") return False try: agents = self.get_agent_set() choice_id_name = self.get_choice_set().get_id_name()[0] # save current locations of agents current_choices = agents.get_attribute(choice_id_name).copy() dummy_data = zeros(current_choices.size, dtype=current_choices.dtype)-1 agents.modify_attribute(name=choice_id_name, data=dummy_data) #reset all choices run_year_namespace["process"] = "run" run_year_namespace["coeff_est"] = self.coefficients run_year_namespace["agents_index"] = agents_index run_year_namespace["processmodel_config"] = tmp_config['models_configuration'][self.model_name]['controller']['run'] new_choices = self.model_system.do_process(run_year_namespace) #self.model_system.run(tmp_config, write_datasets_to_cache_at_end_of_year=False) #new_choices = agents.get_attribute(choice_id_name).copy() agents.modify_attribute(name=choice_id_name, data=current_choices) dummy_data[agents_index] = new_choices if predicted_choice_id_name not in agents.get_known_attribute_names(): agents.add_primary_attribute(name=predicted_choice_id_name, data=dummy_data) else: 
agents.modify_attribute(name=predicted_choice_id_name, data=dummy_data) logger.log_status("Predictions saved into attribute " + predicted_choice_id_name) return True except Exception, e: logger.log_error("Error encountered in prediction: %s" % e) logger.log_stack_trace() return False
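# A usage sketch for the Estimator class above, assuming the opus_core import
# paths below exist in this codebase; the cache path and model name are made
# up for illustration, and a real run needs a full project configuration.
from opus_core.resources import Resources
from opus_core.configurations.dataset_pool_configuration import DatasetPoolConfiguration

config = Resources({
    'cache_directory': '/urbansim_cache/run_42',   # hypothetical cache path
    'dataset_pool_configuration': DatasetPoolConfiguration(
        package_order=['urbansim', 'opus_core']),
    'models': [{'real_estate_price_model': 'estimate'}],  # hypothetical model name
    'models_configuration': {},  # normally taken from the project configuration
})
estimator = Estimator(config, save_estimation_results=False)
estimator.estimate()
# For choice models, the fitted model can then be applied back to the agents:
#estimator.predict('predicted_choice_id')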
class RegressionModel(ChunkModel):

    model_name = "Regression Model"
    model_short_name = "RM"

    def __init__(self, regression_procedure="opus_core.linear_regression",
                 submodel_string=None, run_config=None, estimate_config=None,
                 debuglevel=0, dataset_pool=None):
        self.debug = DebugPrinter(debuglevel)
        self.dataset_pool = self.create_dataset_pool(dataset_pool)
        self.regression = RegressionModelFactory().get_model(name=regression_procedure)
        if self.regression == None:
            raise StandardError, "No regression procedure given."
        self.submodel_string = submodel_string
        self.run_config = run_config
        if self.run_config == None:
            self.run_config = Resources()
        if not isinstance(self.run_config, Resources) and isinstance(self.run_config, dict):
            self.run_config = Resources(self.run_config)
        self.estimate_config = estimate_config
        if self.estimate_config == None:
            self.estimate_config = Resources()
        if not isinstance(self.estimate_config, Resources) and isinstance(self.estimate_config, dict):
            self.estimate_config = Resources(self.estimate_config)
        self.data = {}
        self.coefficient_names = {}
        ChunkModel.__init__(self)
        self.get_status_for_gui().initialize_pieces(3,
            pieces_description = array(['initialization', 'computing variables', 'submodel: 1']))

    def run(self, specification, coefficients, dataset, index=None, chunk_specification=None,
            data_objects=None, run_config=None, initial_values=None, procedure=None, debuglevel=0):
        """'specification' is of type EquationSpecification,
        'coefficients' is of type Coefficients,
        'dataset' is of type Dataset,
        'index' are indices of individuals in dataset for which the model runs. If it is None, the whole dataset is considered.
        'chunk_specification' determines the number of chunks in which the simulation is processed.
        'data_objects' is a dictionary where each key is the name of a data object ('zone', ...) and its value is an object of class Dataset.
        'run_config' is of type Resources, it gives additional arguments for the run.
        If 'procedure' is given, it overwrites the regression_procedure of the constructor.
        'initial_values' is an array of the initial values of the results. It will be overwritten
        by the results for those elements that are handled by the model (defined by submodels in the specification).
        By default the results are initialized with 0.
        'debuglevel' overwrites the constructor 'debuglevel'.
""" self.debug.flag = debuglevel if run_config == None: run_config = Resources() if not isinstance(run_config,Resources) and isinstance(run_config, dict): run_config = Resources(run_config) self.run_config = run_config.merge_with_defaults(self.run_config) self.run_config.merge({"debug":self.debug}) if data_objects is not None: self.dataset_pool.add_datasets_if_not_included(data_objects) self.dataset_pool.replace_dataset(dataset.get_dataset_name(), dataset) if procedure is not None: self.regression = RegressionModelFactory().get_model(name=procedure) if initial_values is None: self.initial_values = zeros((dataset.size(),), dtype=float32) else: self.initial_values = zeros((dataset.size(),), dtype=initial_values.dtype) self.initial_values[index] = initial_values if dataset.size()<=0: # no data loaded yet dataset.get_id_attribute() if index == None: index = arange(dataset.size()) result = ChunkModel.run(self, chunk_specification, dataset, index, float32, specification=specification, coefficients=coefficients) return result def run_chunk (self, index, dataset, specification, coefficients): self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1) compute_resources = Resources({"debug":self.debug}) submodels = self.specified_coefficients.get_submodels() self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2) self.map_agents_to_submodels(submodels, self.submodel_string, dataset, index, dataset_pool=self.dataset_pool, resources = compute_resources) variables = self.specified_coefficients.get_full_variable_names_without_constants() self.debug.print_debug("Compute variables ...",4) self.increment_current_status_piece() dataset.compute_variables(variables, dataset_pool = self.dataset_pool, resources = compute_resources) data = {} coef = {} outcome=self.initial_values[index].copy() for submodel in submodels: coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,submodel) self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0,:] self.debug.print_debug("Compute regression for submodel " +str(submodel),4) self.increment_current_status_piece() self.data[submodel] = dataset.create_regression_data(coef[submodel], index = index[self.observations_mapping[submodel]]) nan_index = where(isnan(self.data[submodel]))[1] inf_index = where(isinf(self.data[submodel]))[1] vnames = asarray(coef[submodel].get_variable_names()) if nan_index.size > 0: nan_var_index = unique(nan_index) self.data[submodel] = nan_to_num(self.data[submodel]) logger.log_warning("NaN(Not A Number) is returned from variable %s; it is replaced with %s." % (vnames[nan_var_index], nan_to_num(nan))) #raise ValueError, "NaN(Not A Number) is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % vnames[nan_var_index] if inf_index.size > 0: inf_var_index = unique(inf_index) self.data[submodel] = nan_to_num(self.data[submodel]) logger.log_warning("Inf is returned from variable %s; it is replaced with %s." % (vnames[inf_var_index], nan_to_num(inf))) #raise ValueError, "Inf is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." 
            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0): # observations for this submodel available
                outcome[self.observations_mapping[submodel]] = \
                    self.regression.run(self.data[submodel], coef[submodel].get_coefficient_values()[0,:],
                                        resources=self.run_config).astype(outcome.dtype)
        return outcome

    def correct_infinite_values(self, dataset, outcome_attribute_name, maxvalue=1e+38, clip_all_larger_values=False):
        """Check if the model resulted in infinite values. If yes,
        print a warning and clip the values to maxvalue.
        If clip_all_larger_values is True, all values larger than maxvalue are clipped to maxvalue.
        """
        infidx = where(dataset.get_attribute(outcome_attribute_name) == inf)[0]
        if infidx.size > 0:
            logger.log_warning("Infinite values in %s. Clipped to %s." % (outcome_attribute_name, maxvalue))
            dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, infidx)
        if clip_all_larger_values:
            idx = where(dataset.get_attribute(outcome_attribute_name) > maxvalue)[0]
            if idx.size > 0:
                logger.log_warning("Values in %s larger than %s. Clipped to %s." % (outcome_attribute_name, maxvalue, maxvalue))
                dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, idx)

    def estimate(self, specification, dataset, outcome_attribute, index=None, procedure=None,
                 data_objects=None, estimate_config=None, debuglevel=0):
        """'specification' is of type EquationSpecification,
        'dataset' is of type Dataset,
        'outcome_attribute' - string that determines the dependent variable,
        'index' are indices of individuals in dataset for which the model runs. If it is None, the whole dataset is considered.
        'procedure' - name of the estimation procedure. If it is None, there should be an entry "estimation" in
        'estimate_config' that determines the procedure. The class must have a method 'run' that takes as arguments
        'data', 'regression_procedure' and 'resources'. It returns a dictionary with entries 'estimators',
        'standard_errors' and 't_values' (all 1D numpy arrays).
        'data_objects' is a dictionary where each key is the name of a data object ('zone', ...) and its value is an object of class Dataset.
        'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure.
        'debuglevel' overwrites the class 'debuglevel'.
""" #import wingdbstub self.debug.flag = debuglevel if estimate_config == None: estimate_config = Resources() if not isinstance(estimate_config,Resources) and isinstance(estimate_config, dict): estimate_config = Resources(estimate_config) self.estimate_config = estimate_config.merge_with_defaults(self.estimate_config) if data_objects is not None: self.dataset_pool.add_datasets_if_not_included(data_objects) self.procedure=procedure if self.procedure == None: self.procedure = self.estimate_config.get("estimation", None) if self.procedure is not None: self.procedure = ModelComponentCreator().get_model_component(self.procedure) else: logger.log_warning("No estimation procedure given, or problems with loading the corresponding module.") compute_resources = Resources({"debug":self.debug}) if dataset.size()<=0: # no data loaded yet dataset.get_id_attribute() if index == None: index = arange(dataset.size()) if not isinstance(index,ndarray): index=array(index) estimation_size_agents = self.estimate_config.get("estimation_size_agents", None) # should be a proportion of the agent_set if estimation_size_agents == None: estimation_size_agents = 1.0 else: estimation_size_agents = max(min(estimation_size_agents,1.0),0.0) # between 0 and 1 if estimation_size_agents < 1.0: self.debug.print_debug("Sampling agents for estimation ...",3) estimation_idx = sample_noreplace(arange(index.size), int(index.size*estimation_size_agents)) else: estimation_idx = arange(index.size) estimation_idx = index[estimation_idx] self.debug.print_debug("Number of observations for estimation: " + str(estimation_idx.size),2) if estimation_idx.size <= 0: self.debug.print_debug("Nothing to be done.",2) return (None, None) coefficients = create_coefficient_from_specification(specification) self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1) submodels = self.specified_coefficients.get_submodels() self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2) self.map_agents_to_submodels(submodels, self.submodel_string, dataset, estimation_idx, dataset_pool=self.dataset_pool, resources = compute_resources, submodel_size_max=self.estimate_config.get('submodel_size_max', None)) variables = self.specified_coefficients.get_full_variable_names_without_constants() self.debug.print_debug("Compute variables ...",4) self.increment_current_status_piece() dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources = compute_resources) coef = {} estimated_coef={} self.outcome = {} dataset.compute_variables([outcome_attribute], dataset_pool=self.dataset_pool, resources=compute_resources) regression_resources=Resources(estimate_config) regression_resources.merge({"debug":self.debug}) outcome_variable_name = VariableName(outcome_attribute) for submodel in submodels: coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,submodel) self.increment_current_status_piece() logger.log_status("Estimate regression for submodel " +str(submodel), tags=["estimate"], verbosity_level=2) #logger.log_status("Number of observations: " +str(self.observations_mapping[submodel].size), #tags=["estimate"], verbosity_level=2) self.data[submodel] = dataset.create_regression_data_for_estimation(coef[submodel], index = estimation_idx[self.observations_mapping[submodel]]) self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0,:] if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0) and (self.procedure is 
                    not None): # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(outcome_variable_name.get_alias(),
                                                    estimation_idx[self.observations_mapping[submodel]])
                regression_resources.merge({"outcome": self.outcome[submodel]})
                regression_resources.merge({"coefficient_names": self.coefficient_names[submodel].tolist(),
                                            "constant_position": coef[submodel].get_constants_positions()})
                regression_resources.merge({"submodel": submodel})
                estimated_coef[submodel] = self.procedure.run(self.data[submodel], self.regression,
                                                              resources=regression_resources)
                if "estimators" in estimated_coef[submodel].keys():
                    coef[submodel].set_coefficient_values(estimated_coef[submodel]["estimators"])
                if "standard_errors" in estimated_coef[submodel].keys():
                    coef[submodel].set_standard_errors(estimated_coef[submodel]["standard_errors"])
                if "other_measures" in estimated_coef[submodel].keys():
                    for measure in estimated_coef[submodel]["other_measures"].keys():
                        coef[submodel].set_measure(measure, estimated_coef[submodel]["other_measures"][measure])
                if "other_info" in estimated_coef[submodel].keys():
                    for info in estimated_coef[submodel]["other_info"]:
                        coef[submodel].set_other_info(info, estimated_coef[submodel]["other_info"][info])
        coefficients.fill_coefficients(coef)
        self.specified_coefficients.coefficients = coefficients
        self.save_predicted_values_and_errors(specification, coefficients, dataset, outcome_variable_name,
                                              index=index, data_objects=data_objects)
        return (coefficients, estimated_coef)

    def prepare_for_run(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        spec, coef = prepare_specification_and_coefficients(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, coef, index)

    def prepare_for_estimate(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        spec = get_specification_for_estimation(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, index)

    def get_data_as_dataset(self, submodel=-2):
        """Like get_all_data, but the returned value is a Dataset containing attributes
        that correspond to the data columns.
        Their names are coefficient names."""
        all_data = self.get_all_data(submodel)
        if all_data is None:
            return None
        names = self.get_coefficient_names(submodel)
        if names is None:
            return None
        dataset_data = {}
        for i in range(names.size):
            dataset_data[names[i]] = all_data[:, i].reshape(all_data.shape[0])
        dataset_data["id"] = arange(all_data.shape[0]) + 1
        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name='dataset', table_data=dataset_data)
        ds = Dataset(in_storage=storage, id_name="id", in_table_name='dataset')
        return ds

    def save_predicted_values_and_errors(self, specification, coefficients, dataset, outcome_variable,
                                         index=None, data_objects=None):
        if self.estimate_config.get('save_predicted_values_and_errors', False):
            logger.log_status('Computing predicted values and residuals.')
            original_values = dataset.get_attribute_by_index(outcome_variable, index)
            predicted_values = zeros(dataset.size(), dtype='float32')
            predicted_values[index] = self.run_after_estimation(specification, coefficients, dataset,
                                                                index=index, data_objects=data_objects)
            predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias()
            dataset.add_primary_attribute(name=predicted_attribute_name, data=predicted_values)
            dataset.flush_attribute(predicted_attribute_name)
            predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias()
            error_values = zeros(dataset.size(), dtype='float32')
            error_values[index] = (original_values - predicted_values[index]).astype(error_values.dtype)
            dataset.add_primary_attribute(name=predicted_error_attribute_name, data=error_values)
            dataset.flush_attribute(predicted_error_attribute_name)
            logger.log_status('Predicted values saved as %s (for the %s dataset)' %
                              (predicted_attribute_name, dataset.get_dataset_name()))
            logger.log_status('Residuals saved as %s (for the %s dataset)' %
                              (predicted_error_attribute_name, dataset.get_dataset_name()))

    def export_estimation_data(self, submodel=-2, file_name='./estimation_data_regression.txt', delimiter='\t'):
        import os
        from numpy import newaxis
        data = concatenate((self.outcome[submodel][..., newaxis], self.get_all_data(submodel=submodel)), axis=1)
        header = ['outcome'] + self.get_coefficient_names(submodel).tolist()
        nrows = data.shape[0]
        file_name_root, file_name_ext = os.path.splitext(file_name)
        out_file = "%s_submodel_%s.txt" % (file_name_root, submodel)
        fh = open(out_file, 'w')
        fh.write(delimiter.join(header) + '\n') # file header
        for row in range(nrows):
            line = [str(x) for x in data[row, ]]
            fh.write(delimiter.join(line) + '\n')
        fh.flush()
        fh.close()
        print 'Data written into %s' % out_file

    def run_after_estimation(self, *args, **kwargs):
        return self.run(*args, **kwargs)

    def _get_status_total_pieces(self):
        return ChunkModel._get_status_total_pieces(self) * self.get_status_for_gui().get_total_number_of_pieces()

    def _get_status_current_piece(self):
        return ChunkModel._get_status_current_piece(self) * self.get_status_for_gui().get_total_number_of_pieces() + \
               self.get_status_for_gui().get_current_piece()

    def _get_status_piece_description(self):
        return "%s %s" % (ChunkModel._get_status_piece_description(self),
                          self.get_status_for_gui().get_current_piece_description())

    def get_specified_coefficients(self):
        return self.specified_coefficients
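# The NaN/Inf guard in run_chunk above boils down to a small numpy idiom:
# find the offending columns with isnan/isinf, then let nan_to_num substitute
# finite values. A self-contained sketch with made-up data:
from numpy import array, isnan, isinf, nan_to_num, unique, where, nan, inf

data = array([[1.0, nan, 3.0],
              [4.0, 5.0, inf]])
nan_columns = unique(where(isnan(data))[1])   # -> [1]: column 1 holds a NaN
inf_columns = unique(where(isinf(data))[1])   # -> [2]: column 2 holds an Inf
data = nan_to_num(data)                       # NaN -> 0.0, Inf -> large finite float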
def run( self, building_set, building_types_table, vacancy_table, year, location_set, building_categories=None, dataset_pool=None, resources=None ): building_types = building_types_table.get_attribute("name") building_id_name = building_set.get_id_name()[0] location_id_name = location_set.get_id_name()[0] new_buildings = {building_id_name: array([], dtype=building_set.get_data_type(building_id_name)), "building_type_id":array([], dtype=building_set.get_data_type("building_type_id", int8)), "year_built": array([], dtype=building_set.get_data_type("year_built", int32)), "sqft": array([], dtype=building_set.get_data_type("sqft", int32)), "residential_units": array([], dtype=building_set.get_data_type("residential_units", int32)), "improvement_value": array([], dtype= building_set.get_data_type("improvement_value", float32)), "land_value": array([], dtype= building_set.get_data_type("land_value", float32)), location_id_name: array([], dtype=building_set.get_data_type(location_id_name, int32))} max_id = building_set.get_id_attribute().max() buildings_set_size_orig = building_set.size() for itype in range(building_types_table.size()): # iterate over building types type = building_types[itype] type_code = building_types_table.get_id_attribute()[itype] is_residential = building_types_table.get_attribute("is_residential")[itype] vacancy_attribute = 'target_total_%s_vacancy' % type if vacancy_attribute not in vacancy_table.get_known_attribute_names(): logger.log_warning("No target vacancy for building type '%s'. Transition model for this building type skipped." % type) continue vacancy_table.get_attribute(vacancy_attribute) # ensures that the attribute is loaded target_vacancy_rate = eval("vacancy_table.get_data_element_by_id( year ).%s" % vacancy_attribute) compute_resources = Resources(resources) compute_resources.merge({"debug":self.debug}) units_attribute = building_types_table.get_attribute('units')[itype] # determine current-year vacancy rates if is_residential: default_vacancy_variable = "urbansim.%s.vacant_%s_units_from_buildings" % ( location_set.get_dataset_name(), type) else: default_vacancy_variable = "urbansim.%s.vacant_%s_sqft_from_buildings" % ( location_set.get_dataset_name(), type) variable_for_vacancy = compute_resources.get( "%s_vacant_variable" % type, default_vacancy_variable) location_set.compute_variables([variable_for_vacancy, "urbansim.%s.buildings_%s_space" % ( location_set.get_dataset_name(),type)], dataset_pool=dataset_pool, resources = compute_resources) vacant_units_sum = location_set.get_attribute(variable_for_vacancy).sum() units_sum = float( location_set.get_attribute("buildings_%s_space" % type).sum() ) vacant_rate = self.safe_divide(vacant_units_sum, units_sum) should_develop_units = int(round(max( 0, ( target_vacancy_rate * units_sum - vacant_units_sum ) / ( 1 - target_vacancy_rate ) ))) logger.log_status(type + ": vacant units: %d, should be vacant: %f, sum units: %d" % (vacant_units_sum, target_vacancy_rate * units_sum, units_sum)) if not should_develop_units: logger.log_note(("Will not build any " + type + " units, because the current vacancy of %d units\n" + "is more than the %d units desired for the vacancy rate of %f.") % (vacant_units_sum, target_vacancy_rate * units_sum, target_vacancy_rate)) continue improvement_value = building_set.compute_variables("urbansim.%s.%s_improvement_value" % ( building_set.get_dataset_name(), type), dataset_pool=dataset_pool, resources=compute_resources) average_improvement_value = improvement_value.sum()/ units_sum #create 
            is_building_type = building_set.compute_variables("urbansim.building.is_building_type_%s" % type,
                                                              dataset_pool=dataset_pool,
                                                              resources=compute_resources)
            units_of_this_type = building_set.compute_variables(units_attribute, dataset_pool=dataset_pool,
                                                                resources=compute_resources)
            units_of_this_type = units_of_this_type * is_building_type
            units_without_zeros_idx = where(units_of_this_type > 0)[0]
            history_values_without_zeros = units_of_this_type[units_without_zeros_idx]
            history_improvement_values_without_zeros = where(improvement_value[units_without_zeros_idx] > 0,
                                                             improvement_value[units_without_zeros_idx],
                                                             average_improvement_value)
            mean_size = history_values_without_zeros.mean()
            idx = array([], dtype="int32")
            # Ensure that there are some development projects to choose from.
            num_of_projects_to_select = max(10, int(should_develop_units / mean_size))
            while True:
                idx = concatenate((idx, randint(0, history_values_without_zeros.size,
                                                size=num_of_projects_to_select)))
                csum = history_values_without_zeros[idx].cumsum()
                idx = idx[where(csum <= should_develop_units)]
                if csum[-1] >= should_develop_units:
                    break
            nbuildings = idx.size
            new_buildings["building_type_id"] = concatenate((new_buildings["building_type_id"],
                                                             type_code * ones(nbuildings)))
            new_buildings["year_built"] = concatenate((new_buildings["year_built"], year * ones(nbuildings)))
            new_max_id = max_id + nbuildings
            new_buildings[building_id_name] = concatenate((new_buildings[building_id_name],
                                                           arange(max_id + 1, new_max_id + 1)))
            max_id = new_max_id
            new_buildings["improvement_value"] = concatenate((new_buildings["improvement_value"],
                                                              history_improvement_values_without_zeros[idx]))

            if is_residential:
                target_size_attribute = "residential_units"
                zero_attribute = "sqft"
            else:
                target_size_attribute = "sqft"
                zero_attribute = "residential_units"
            new_buildings[target_size_attribute] = concatenate((new_buildings[target_size_attribute],
                                                                history_values_without_zeros[idx]))
            new_buildings[zero_attribute] = concatenate((new_buildings[zero_attribute], zeros(nbuildings)))
            new_buildings[location_id_name] = concatenate((new_buildings[location_id_name], zeros(nbuildings)))
            new_buildings["land_value"] = concatenate((new_buildings["land_value"], zeros(nbuildings)))
            logger.log_status("Creating %s %s of %s %s buildings." % (history_values_without_zeros[idx].sum(),
                                                                      target_size_attribute, nbuildings, type))

        building_set.add_elements(new_buildings, require_all_attributes=False)
        if building_categories: # should be a dictionary of categories for each building type
            building_set.resources['building_categories'] = building_categories
        # add submodel attribute
        category_variables = map(lambda type: "urbansim.%s.size_category_%s" %
                                 (building_set.get_dataset_name(), type), building_types)
        for category_var in category_variables:
            var = VariableName(category_var)
            if var.get_alias() in building_set.get_known_attribute_names():
                building_set.delete_one_attribute(var)
            building_set.compute_variables(var, dataset_pool=dataset_pool, resources=compute_resources)
            building_set.add_primary_attribute(building_set.get_attribute(var), var.get_alias())
        difference = building_set.size() - buildings_set_size_orig
        return difference
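# The should_develop_units formula above comes from solving
# (vacant + d) / (units + d) == target for d, which gives
# d = (target * units - vacant) / (1 - target). A worked sketch (the helper
# is illustrative, not part of the model):
def units_to_develop(units_sum, vacant_units_sum, target_vacancy_rate):
    return int(round(max(0.0,
        (target_vacancy_rate * units_sum - vacant_units_sum)
        / (1 - target_vacancy_rate))))

# 1000 existing units, 30 vacant, 5% target vacancy -> build 21 units;
# afterwards (30 + 21) / (1000 + 21) ~= 0.05, i.e. the target rate.
print units_to_develop(1000, 30, 0.05)  # prints 21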
def run(self, building_set, new_building_copy_attrs, building_type_table, building_type_classification_table, vacancy_table, history_table, year, location_set, resources=None): building_classes = building_type_classification_table.get_attribute( "name") unit_attributes = building_type_classification_table.get_attribute( 'units') building_id_name = building_set.get_id_name()[0] location_id_name = location_set.get_id_name()[0] calc_attributes = [building_id_name, location_id_name, "year_built"] new_buildings = {} for attribute in new_building_copy_attrs: new_buildings[attribute] = array( [], dtype=building_set.get_data_type(attribute)) for attribute in calc_attributes: new_buildings[attribute] = array( [], dtype=building_set.get_data_type(attribute)) # for convenience, make a map of building_type_id => (building_type)class_id # these names are hard-wired elsewhere building_type_id_to_class_id = {} building_type_ids = building_type_table.get_attribute( "building_type_id") for idx in range(building_type_table.size()): building_type_id_to_class_id[building_type_ids[idx]] = \ building_type_table.get_attribute("class_id")[idx] logger.log_status("building_type_id_to_class_id = " + str(building_type_id_to_class_id)) # and make an column for the history table of the use classes history_type_classes = zeros((history_table.size()), dtype=int8) history_types = history_table.get_attribute("building_type_id") for idx in range(history_table.size()): history_type_classes[idx] = building_type_id_to_class_id[ history_types[idx]] logger.log_status("history_types=" + str(history_types)) logger.log_status("history_type_classes=" + str(history_type_classes)) max_id = building_set.get_id_attribute().max() new_building_id_start = max_id + 1 new_building_id_end = max_id + 1 building_set_size_orig = building_set.size() for itype in range(building_type_classification_table.size() ): # iterate over building types building_class = building_classes[itype] building_class_id = building_type_classification_table.get_attribute( "class_id")[itype] vacancy_attribute = 'target_total_%s_vacancy' % building_class.lower( ) if vacancy_attribute not in vacancy_table.get_known_attribute_names( ): logger.log_warning( "No target vacancy for building class '%s' (e.g. no '%s' in target_vacancies). Transition model for this building class skipped." % (building_class, vacancy_attribute)) continue vacancy_table.get_attribute( vacancy_attribute) # ensures that the attribute is loaded target_vacancy_rate = eval( "vacancy_table.get_data_element_by_id( year ).%s" % vacancy_attribute) logger.log_status( "Target vacancy rate for building_class %s is %f" % (building_class, target_vacancy_rate)) compute_resources = Resources(resources) compute_resources.merge({"debug": self.debug}) units_attribute = unit_attributes[itype] occupied_sqft_attribute = 'occupied_sqft_of_typeclass_%s' % building_class.lower( ) total_sqft_attribute = 'where(sanfrancisco.building.building_typeclass_name==\'%s\',sanfrancisco.building.building_sqft,0)' % building_class.lower( ) # determine current-year vacancy rates building_set.compute_variables( ("sanfrancisco.building." 
+ occupied_sqft_attribute, total_sqft_attribute), resources=compute_resources) occupied_sqft_sum = building_set.get_attribute( occupied_sqft_attribute).sum() total_sqft_sum = float( building_set.get_attribute(total_sqft_attribute).sum()) occupancy_rate = self.safe_divide(occupied_sqft_sum, total_sqft_sum) # cap it at 1.0 if occupancy_rate > 1.0: occupancy_rate = 1.0 vacancy_rate = 1.0 - occupancy_rate vacant_sqft_sum = vacancy_rate * total_sqft_sum should_develop_sqft = (target_vacancy_rate * total_sqft_sum) - vacant_sqft_sum logger.log_status( "%s: vacancy rate: %4.3f occupancy rate: %4.3f" % (building_class, vacancy_rate, occupancy_rate)) logger.log_status( "%s: vacant: %d, should be vacant: %f, sum units: %d" % (building_class, vacant_sqft_sum, target_vacancy_rate * total_sqft_sum, total_sqft_sum)) if should_develop_sqft <= 0: logger.log_note(( "Will not build any %s units, because the current vacancy of %d sqft\n" + "is more than the %d sqft desired for the vacancy rate of %f." ) % (building_class, vacant_sqft_sum, target_vacancy_rate * total_sqft_sum, target_vacancy_rate)) continue #create buildings # find sample set of qualifying buildings in the events history, # e.g. where the building_type is in the correct class, and a positive # number of units or sqft (or whatever) were present history_sqft = history_table.get_attribute('building_sqft') index_sampleset = where((history_sqft > 0) & ( history_type_classes == building_class_id))[0] # Ensure that there are some development projects to choose from. logger.log_status("should_develop_sqft=" + str(should_develop_sqft)) if index_sampleset.shape[0] == 0: logger.log_warning( "Cannot create new buildings for building use class %s; no buildings in the event history table from which to sample." % building_class) continue history_sqft_sampleset = history_sqft[index_sampleset] logger.log_status("history_sqft_sampleset = " + str(history_sqft_sampleset)) mean_size = history_sqft_sampleset.mean() idx = array([], dtype="int32") #TODO: should the 'int' in the following line be 'ceil'? num_of_projects_to_select = max( 10, int(should_develop_sqft / mean_size)) while True: idx = concatenate((idx, randint(0, history_sqft_sampleset.size, size=num_of_projects_to_select))) csum = history_sqft_sampleset[idx].cumsum() idx = idx[where(csum <= should_develop_sqft)] if csum[-1] >= should_develop_sqft: break logger.log_status("idx = " + str(idx)) nbuildings = idx.size if nbuildings == 0: continue new_building_id_end = new_building_id_start + nbuildings # copy_attributes for attribute in new_building_copy_attrs: attr_values = history_table.get_attribute(attribute)[ index_sampleset[idx]] new_buildings[attribute] = concatenate( (new_buildings[attribute], attr_values)) # calc_attributes new_buildings[building_id_name] = concatenate( (new_buildings[building_id_name], arange(new_building_id_start, new_building_id_end))) new_buildings[location_id_name] = concatenate( (new_buildings[location_id_name], zeros(nbuildings))) new_buildings["year_built"] = concatenate( (new_buildings["year_built"], year * ones(nbuildings))) logger.log_status("Creating %s sqft of %s %s buildings." % (history_sqft_sampleset[idx].sum(), nbuildings, building_class)) new_building_id_start = new_building_id_end + 1 logger.log_status(new_buildings) building_set.add_elements(new_buildings, require_all_attributes=False) difference = building_set.size() - building_set_size_orig index = arange(difference) + building_set_size_orig return index
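# Both transition models draw projects from the development event history with
# the same loop: sample a batch of random indices, keep the prefix whose
# cumulative size stays within the target, and stop once the cumulative sum
# has reached it. (The TODO above about 'int' vs. 'ceil' only affects the
# batch size; the max(10, ...) floor keeps the loop terminating either way.)
# A reduced, reproducible sketch with made-up history sizes:
from numpy import array, concatenate, where
from numpy.random import randint, seed

seed(1)                                      # fixed seed for reproducibility only
history = array([100, 250, 400, 80, 150])    # hypothetical project sizes (sqft)
target = 600
idx = array([], dtype="int32")
while True:
    idx = concatenate((idx, randint(0, history.size, size=10)))
    csum = history[idx].cumsum()
    idx = idx[where(csum <= target)]         # keep the prefix within the target
    if csum[-1] >= target:                   # enough candidates were drawn
        break
# history[idx] now sums to at most 600 sqft of sampled projects.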
    def run(self,
            building_set,
            # building_use_table,
            building_use_classification_table,
            vacancy_table,
            history_table,
            year,
            location_set,
            resources=None):
        building_classes = building_use_classification_table.get_attribute("name")
        unit_attributes = building_use_classification_table.get_attribute('units')
        building_id_name = building_set.get_id_name()[0]
        location_id_name = location_set.get_id_name()[0]
        new_buildings = {
            building_id_name: array([], dtype='int32'),
            "building_use_id": array([], dtype=int8),
            "year_built": array([], dtype='int32'),
            # "building_sqft": array([], dtype='int32'),
            # "residential_units": array([], dtype='int32'),
            "unit_price": array([], dtype=float32),
            location_id_name: array([], dtype='int32')
        }
        for attribute in unit_attributes:
            new_buildings[attribute] = array([], dtype='int32')
        max_id = building_set.get_id_attribute().max()
        building_set_size_orig = building_set.size()

        for itype in range(building_use_classification_table.size()): # iterate over building types
            building_class = building_classes[itype]
            # type_code = building_types_table.get_id_attribute()[itype]
            vacancy_attribute = 'target_total_%s_vacancy' % building_class
            if vacancy_attribute not in vacancy_table.get_known_attribute_names():
                logger.log_warning("No target vacancy for building class '%s'. Transition model for this building class skipped."
                                   % building_class)
                continue
            vacancy_table.get_attribute(vacancy_attribute) # ensures that the attribute is loaded
            target_vacancy_rate = eval("vacancy_table.get_data_element_by_id( year ).%s" % vacancy_attribute)

            compute_resources = Resources(resources)
            compute_resources.merge({"debug": self.debug})
            units_attribute = unit_attributes[itype]
            vacant_units_attribute = 'vacant_' + units_attribute

            # determine current-year vacancy rates
            building_set.compute_variables("urbansim_parcel.building." + vacant_units_attribute,
                                           resources=compute_resources)

            vacant_units_sum = building_set.get_attribute(vacant_units_attribute).sum()
            units_sum = float(building_set.get_attribute(units_attribute).sum())
            vacant_rate = self.safe_divide(vacant_units_sum, units_sum)

            should_develop_units = max(0, (target_vacancy_rate * units_sum - vacant_units_sum) /
                                          (1 - target_vacancy_rate))
            logger.log_status(building_class + ": vacant units: %d, should be vacant: %f, sum units: %d"
                              % (vacant_units_sum, target_vacancy_rate * units_sum, units_sum))

            if not should_develop_units:
                logger.log_note(("Will not build any " + building_class + " units, because the current vacancy of %d units\n"
                                 + "is more than the %d units desired for the vacancy rate of %f.")
                                % (vacant_units_sum, target_vacancy_rate * units_sum, target_vacancy_rate))
                continue

            # average_buildings_value = None
            # if (type+"_improvement_value") in location_set.get_known_attribute_names():
            #     average_buildings_value = self.safe_divide(
            #         location_set.get_attribute(type+"_improvement_value").sum(), units_sum)

            # create buildings
            history_values = history_table.get_attribute(units_attribute)
            index_non_zeros_values = where(history_values > 0)[0]
            history_values_without_zeros = history_values[index_non_zeros_values]
            history_type = history_table.get_attribute("building_use_id")
            history_type_without_zeros = history_type[index_non_zeros_values]
            history_price = history_table.get_attribute("unit_price")
            history_price_without_zeros = history_price[index_non_zeros_values]
            #TODO: what happens if history has only zeroes?
            mean_size = history_values_without_zeros.mean()
            idx = array([], dtype="int32")
            # Ensure that there are some development projects to choose from.
#TODO: should the 'int' in the following line be 'ceil'? num_of_projects_to_select = max( 10, int(should_develop_units / mean_size)) while True: idx = concatenate((idx, randint(0, history_values_without_zeros.size, size=num_of_projects_to_select))) csum = history_values_without_zeros[idx].cumsum() idx = idx[where(csum <= should_develop_units)] if csum[-1] >= should_develop_units: break nbuildings = idx.size for attribute in unit_attributes: #if attribute == units_attribute: #new_unit_values = history_values_without_zeros[idx] #else: #new_unit_values = zeros(nbuildings) #to accomodate mixed use buildings, allow non units_attribute to be non-zero new_unit_values = history_table.get_attribute(attribute)[ index_non_zeros_values[idx]] new_buildings[attribute] = concatenate( (new_buildings[attribute], new_unit_values)) new_max_id = max_id + nbuildings new_buildings[building_id_name] = concatenate( (new_buildings[building_id_name], arange(max_id + 1, new_max_id + 1))) new_buildings["building_use_id"] = concatenate( (new_buildings["building_use_id"], history_type_without_zeros[idx])) new_buildings["year_built"] = concatenate( (new_buildings["year_built"], year * ones(nbuildings, dtype="int32"))) new_buildings["unit_price"] = concatenate( (new_buildings["unit_price"], history_price_without_zeros[idx])) new_buildings[location_id_name] = concatenate( (new_buildings[location_id_name], zeros(nbuildings, dtype="int32"))) logger.log_status("Creating %s %s of %s %s buildings." % (history_values_without_zeros[idx].sum(), units_attribute, nbuildings, building_class)) building_set.add_elements(new_buildings, require_all_attributes=False) difference = building_set.size() - building_set_size_orig index = arange(difference) + building_set_size_orig return index
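# All three run() variants accumulate new_buildings the same way: start from
# empty typed arrays and concatenate one batch per building type, so a single
# add_elements() call can append everything at once. A minimal sketch with
# hypothetical ids and years:
from numpy import array, concatenate, arange, ones

new_buildings = {"building_id": array([], dtype="int32"),
                 "year_built": array([], dtype="int32")}
max_id = 17                                  # current highest building id (made up)
for nbuildings in (3, 2):                    # two hypothetical building types
    ids = arange(max_id + 1, max_id + 1 + nbuildings, dtype="int32")
    new_buildings["building_id"] = concatenate((new_buildings["building_id"], ids))
    new_buildings["year_built"] = concatenate((new_buildings["year_built"],
                                               2005 * ones(nbuildings, dtype="int32")))
    max_id = ids[-1]
# new_buildings["building_id"] -> [18 19 20 21 22], ready for add_elements().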
def get_resources(self, data_dictionary, dataset): """Create resources for computing a variable. """ resources=Resources() for key in data_dictionary.keys(): if key in self.datasets: data = data_dictionary[key] storage = StorageFactory().get_storage('dict_storage') if self.id_names[key] not in data_dictionary[key].keys() and not isinstance(self.id_names[key], list): data[self.id_names[key]] = arange(1, len(data_dictionary[key][data_dictionary[key].keys()[0]])+1) # add id array id_name = self.id_names[key] storage.write_table(table_name = 'data', table_data = data) if key == "gridcell": gc = GridcellDataset(in_storage=storage, in_table_name='data') # add relative_x and relative_y gc.get_id_attribute() n = int(ceil(sqrt(gc.size()))) if "relative_x" not in data.keys(): x = (indices((n,n))+1)[1].ravel() gc.add_attribute(x[0:gc.size()], "relative_x", metadata=1) if "relative_y" not in data.keys(): y = (indices((n,n))+1)[0].ravel() gc.add_attribute(y[0:gc.size()], "relative_y", metadata=1) resources.merge({key: gc}) elif key == "household": resources.merge({key: HouseholdDataset(in_storage=storage, in_table_name='data')}) elif key == "development_project": resources.merge({key: DevelopmentProjectDataset(in_storage=storage, in_table_name='data')}) elif key == "development_event": resources.merge({key: DevelopmentEventDataset(in_storage=storage, in_table_name='data')}) elif key == "neighborhood": resources.merge({key: NeighborhoodDataset(in_storage=storage, in_table_name='data')}) elif key == "job": resources.merge({key: JobDataset(in_storage=storage, in_table_name='data')}) elif key == "zone": resources.merge({key: ZoneDataset(in_storage=storage, in_table_name='data')}) elif key == "travel_data": resources.merge({key: TravelDataDataset(in_storage=storage, in_table_name='data')}) elif key == "faz": resources.merge({key: FazDataset(in_storage=storage, in_table_name='data')}) elif key == "fazdistrict": resources.merge({key: FazdistrictDataset(in_storage=storage, in_table_name='data')}) elif key == "race": resources.merge({key: RaceDataset(in_storage=storage, in_table_name='data')}) elif key == "county": resources.merge({key: CountyDataset(in_storage=storage, in_table_name='data')}) elif key == "large_area": resources.merge({key: LargeAreaDataset(in_storage=storage, in_table_name='data')}) elif key == "development_group": resources.merge({key: DevelopmentGroupDataset(in_storage=storage, in_table_name='data')}) elif key == "employment_sector_group": resources.merge({key: EmploymentSectorGroupDataset(in_storage=storage, in_table_name='data')}) elif key == "plan_type_group": resources.merge({key: PlanTypeGroupDataset(in_storage=storage, in_table_name='data')}) elif key == "building": resources.merge({key: BuildingDataset(in_storage=storage, in_table_name='data')}) else: resources.merge({key:data_dictionary[key]}) if dataset in self.interactions: if dataset == "household_x_gridcell": resources.merge({"dataset": HouseholdXGridcellDataset(dataset1=resources["household"], dataset2=resources["gridcell"])}) if dataset == "job_x_gridcell": resources.merge({"dataset": JobXGridcellDataset(dataset1=resources["job"], dataset2=resources["gridcell"])}) if dataset == "household_x_zone": resources.merge({"dataset": HouseholdXZoneDataset(dataset1=resources["household"], dataset2=resources["zone"])}) if dataset == "household_x_neighborhood": resources.merge({"dataset": HouseholdXNeighborhoodDataset(dataset1=resources["household"], dataset2=resources["neighborhood"])}) if dataset == "development_project_x_gridcell": 
resources.merge({"dataset": DevelopmentProjectXGridcellDataset(dataset1=resources["development_project"], dataset2=resources["gridcell"])}) else: resources.merge({"dataset": resources[dataset]}) resources.merge({"check_variables":'*', "debug":4}) return resources
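# The dict_storage round trip that both get_resources variants rely on, shown
# on its own: write an in-memory table, then build a dataset from it. The
# import paths are the usual opus_core locations (an assumption here); the
# table contents are made up.
from numpy import arange, array
from opus_core.storage_factory import StorageFactory
from opus_core.datasets.dataset import Dataset

storage = StorageFactory().get_storage('dict_storage')
storage.write_table(table_name='data',
                    table_data={'id': arange(1, 4),            # ids 1, 2, 3
                                'size': array([10, 20, 30])})
ds = Dataset(in_storage=storage, id_name='id', in_table_name='data')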
def estimate(self, specification, dataset, outcome_attribute, index=None, procedure=None,
             data_objects=None, estimate_config=None, debuglevel=0):
    """'specification' is of type EquationSpecification,
    'dataset' is of type Dataset,
    'outcome_attribute' - string that determines the dependent variable,
    'index' are indices of individuals in dataset for which the model runs. If it is None, the whole dataset is considered.
    'procedure' - name of the estimation procedure. If it is None, there should be an entry "estimation" in 'estimate_config'
        that determines the procedure. The class must have a method 'run' that takes as arguments 'data',
        'regression_procedure' and 'resources'. It returns a dictionary with entries 'estimators',
        'standard_errors' and 't_values' (all 1D numpy arrays).
    'data_objects' is a dictionary where each key is the name of a data object ('zone', ...) and its value
        is an object of class Dataset.
    'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure.
    'debuglevel' overwrites the class 'debuglevel'.
    """
    self.debug.flag = debuglevel
    if estimate_config is None:
        estimate_config = Resources()
    if not isinstance(estimate_config, Resources) and isinstance(estimate_config, dict):
        estimate_config = Resources(estimate_config)
    self.estimate_config = estimate_config.merge_with_defaults(self.estimate_config)
    if data_objects is not None:
        self.dataset_pool.add_datasets_if_not_included(data_objects)
    self.procedure = procedure
    if self.procedure is None:
        self.procedure = self.estimate_config.get("estimation", None)
    if self.procedure is not None:
        self.procedure = ModelComponentCreator().get_model_component(self.procedure)
    else:
        logger.log_warning("No estimation procedure given, or problems with loading the corresponding module.")

    compute_resources = Resources({"debug": self.debug})
    if dataset.size() <= 0:  # no data loaded yet
        dataset.get_id_attribute()
    if index is None:
        index = arange(dataset.size())
    if not isinstance(index, ndarray):
        index = array(index)

    estimation_size_agents = self.estimate_config.get("estimation_size_agents", None)  # should be a proportion of the agent_set
    if estimation_size_agents is None:
        estimation_size_agents = 1.0
    else:
        estimation_size_agents = max(min(estimation_size_agents, 1.0), 0.0)  # between 0 and 1

    if estimation_size_agents < 1.0:
        self.debug.print_debug("Sampling agents for estimation ...", 3)
        estimation_idx = sample_noreplace(arange(index.size),
                                          int(index.size * estimation_size_agents))
    else:
        estimation_idx = arange(index.size)

    estimation_idx = index[estimation_idx]
    self.debug.print_debug("Number of observations for estimation: " + str(estimation_idx.size), 2)
    if estimation_idx.size <= 0:
        self.debug.print_debug("Nothing to be done.", 2)
        return (None, None)

    coefficients = create_coefficient_from_specification(specification)
    specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1)
    submodels = specified_coefficients.get_submodels()
    self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2)
    self.map_agents_to_submodels(submodels, self.submodel_string, dataset, estimation_idx,
                                 dataset_pool=self.dataset_pool, resources=compute_resources,
                                 submodel_size_max=self.estimate_config.get('submodel_size_max', None))
    variables = specified_coefficients.get_full_variable_names_without_constants()
    self.debug.print_debug("Compute variables ...", 4)
    self.increment_current_status_piece()
    dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources=compute_resources)

    coef = {}
    estimated_coef = {}
    self.outcome = {}
    dataset.compute_variables([outcome_attribute], dataset_pool=self.dataset_pool, resources=compute_resources)
    regression_resources = Resources(estimate_config)
    regression_resources.merge({"debug": self.debug})
    outcome_variable_name = VariableName(outcome_attribute)
    for submodel in submodels:
        coef[submodel] = SpecifiedCoefficientsFor1Submodel(specified_coefficients, submodel)
        self.increment_current_status_piece()
        logger.log_status("Estimate regression for submodel " + str(submodel),
                          tags=["estimate"], verbosity_level=2)
        logger.log_status("Number of observations: " + str(self.observations_mapping[submodel].size),
                          tags=["estimate"], verbosity_level=2)
        self.data[submodel] = dataset.create_regression_data_for_estimation(
            coef[submodel], index=estimation_idx[self.observations_mapping[submodel]])
        self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0, :]
        if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0) and (self.procedure is not None):
            # observations for this submodel available
            self.outcome[submodel] = dataset.get_attribute_by_index(
                outcome_variable_name.get_alias(),
                estimation_idx[self.observations_mapping[submodel]])
            regression_resources.merge({"outcome": self.outcome[submodel]})
            regression_resources.merge({"coefficient_names": self.coefficient_names[submodel].tolist(),
                                        "constant_position": coef[submodel].get_constants_positions()})
            estimated_coef[submodel] = self.procedure.run(self.data[submodel], self.regression,
                                                          resources=regression_resources)
            if "estimators" in estimated_coef[submodel].keys():
                coef[submodel].set_coefficient_values(estimated_coef[submodel]["estimators"])
            if "standard_errors" in estimated_coef[submodel].keys():
                coef[submodel].set_standard_errors(estimated_coef[submodel]["standard_errors"])
            if "other_measures" in estimated_coef[submodel].keys():
                for measure in estimated_coef[submodel]["other_measures"].keys():
                    coef[submodel].set_measure(measure,
                                               estimated_coef[submodel]["other_measures"][measure])
            if "other_info" in estimated_coef[submodel].keys():
                for info in estimated_coef[submodel]["other_info"]:
                    coef[submodel].set_other_info(info, estimated_coef[submodel]["other_info"][info])
    coefficients.fill_coefficients(coef)
    self.save_predicted_values_and_errors(specification, coefficients, dataset, outcome_variable_name,
                                          index=index, data_objects=data_objects)
    return (coefficients, estimated_coef)
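
# --- Illustrative usage sketch (not part of the original source). A minimal
# driver for estimate() above, assuming `model` is an instance of the
# regression-model class this method belongs to and `specification`/`dataset`
# were built elsewhere. The outcome attribute and procedure name are made-up
# examples of the kind of values estimate() expects.
def _example_estimate(model, specification, dataset):
    coefficients, estimated_coef = model.estimate(
        specification,
        dataset,
        outcome_attribute="ln_price",                      # hypothetical dependent variable
        procedure="opus_core.estimate_linear_regression",  # example procedure name
        estimate_config={"estimation_size_agents": 0.5},   # estimate on a 50% sample
    )
    # estimated_coef maps each submodel to the dictionary returned by the
    # procedure ('estimators', 'standard_errors', ...); both results are None
    # if no observations were available.
    return coefficients, estimated_coef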
def load_dataset(self, resources=None, attributes=None, in_storage=None,
                 in_table_name=None, lowercase=None, **kwargs):
    # set defaults
    attributes_default = '*'
    lower_default = 1  # if 1, use lowercase for attribute names

    # merge arguments with dictionaries and add missing entries
    local_resources = Resources(self.resources)
    if resources is not None:
        local_resources.merge_if_not_None(resources)
    local_resources.merge_if_not_None({"attributes": attributes,
                                       "in_storage": in_storage,
                                       "in_table_name": in_table_name,
                                       "lowercase": lowercase})
    local_resources.merge_with_defaults({"attributes": attributes_default,
                                         "lowercase": lower_default,
                                         })

    # check obligatory entries
    local_resources.check_obligatory_keys(["in_storage", "in_table_name"])

    # prepare for loading
    in_storage = local_resources["in_storage"]
    if not self._is_hidden_id():
        local_resources.merge({"id_name": self._id_names})
    table_name = local_resources['in_table_name']
    column_names = local_resources['attributes']
    chunked_attributes = self.chunk_columns(storage=in_storage,
                                            table_name=table_name,
                                            column_names=column_names,
                                            nchunks=1)
    # flatten list, keeping only columns that exist in the table
    column_names = [name for name in chunked_attributes[0]
                    if name in in_storage.get_column_names(table_name)]
    data = in_storage.load_table(table_name=table_name, column_names=column_names)
    self.df = pd.DataFrame(data)
    self.df.set_index(self._id_names, inplace=True)

    data_computed = {}
    if table_name + ".computed" in in_storage.get_table_names():
        column_names_computed = [name for name in column_names
                                 if name in in_storage.get_column_names(table_name + ".computed")]
        data_computed = in_storage.load_table(table_name=table_name + ".computed",
                                              column_names=column_names_computed)
        dfcomp = pd.DataFrame(data_computed)
        dfcomp.set_index(self._id_names, inplace=True)
        # pandas concat takes an iterable of objects; join the computed
        # columns side by side on the shared index
        self.df = concat([self.df, dfcomp], axis=1)

    for attr in data:
        if not ((attr in self._id_names) and self.attribute_boxes.has_key(attr)):
            # do not store id_name every time
            self.attribute_boxes[attr] = AttributeBox(self, [],
                                                      variable_name=self.create_and_check_qualified_variable_name(attr),
                                                      type=AttributeType.PRIMARY,
                                                      is_in_memory=True,
                                                      header=None,
                                                      version=0)
    for attr in data_computed:
        if not ((attr in self._id_names) and self.attribute_boxes.has_key(attr)):
            # do not store id_name every time
            self.attribute_boxes[attr] = AttributeBox(self, [],
                                                      variable_name=self.create_and_check_qualified_variable_name(attr),
                                                      type=AttributeType.COMPUTED,
                                                      is_in_memory=True,
                                                      header=None,
                                                      version=0)
    self.n = self.df.shape[0]
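
# --- Illustrative usage sketch (not part of the original source). It loads a
# table through load_dataset() above from an in-memory dict storage, the same
# storage type used elsewhere in this module. It assumes `dataset` is an
# instance of the class defining load_dataset(), with self.resources and
# self._id_names (here ['household_id']) already set by its constructor; the
# table and column names are made up.
def _example_load_dataset(dataset):
    from numpy import array
    storage = StorageFactory().get_storage('dict_storage')
    storage.write_table(
        table_name='households',
        table_data={'household_id': array([1, 2, 3]),
                    'income': array([40000, 55000, 72000])})
    dataset.load_dataset(in_storage=storage, in_table_name='households')
    # dataset.df is now a pandas DataFrame indexed by household_id and
    # dataset.n holds the row count (3)
    return dataset.df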