def run(self, specification, coefficients, dataset, index=None, chunk_specification=None, data_objects=None, run_config=None, initial_values=None, procedure=None, debuglevel=0): """'specification' is of type EquationSpecification, 'coefficients' is of type Coefficients, 'dataset' is of type Dataset, 'index' are indices of individuals in dataset for which the model runs. If it is None, the whole dataset is considered. 'chunk_specification' determines number of chunks in which the simulation is processed. 'data_objects' is a dictionary where each key is the name of an data object ('zone', ...) and its value is an object of class Dataset. 'run_config' is of type Resources, it gives additional arguments for the run. If 'procedure' is given, it overwrites the regression_procedure of the constructor. 'initial_values' is an array of the initial values of the results. It will be overwritten by the results for those elements that are handled by the model (defined by submodels in the specification). By default the results are initialized with 0. 'debuglevel' overwrites the constructor 'debuglevel'. """ self.debug.flag = debuglevel if run_config == None: run_config = Resources() if not isinstance(run_config, Resources) and isinstance( run_config, dict): run_config = Resources(run_config) self.run_config = run_config.merge_with_defaults(self.run_config) self.run_config.merge({"debug": self.debug}) if data_objects is not None: self.dataset_pool.add_datasets_if_not_included(data_objects) self.dataset_pool.replace_dataset(dataset.get_dataset_name(), dataset) if procedure is not None: self.regression = RegressionModelFactory().get_model( name=procedure) if initial_values is None: self.initial_values = zeros((dataset.size(), ), dtype=float32) else: self.initial_values = zeros((dataset.size(), ), dtype=initial_values.dtype) self.initial_values[index] = initial_values if dataset.size() <= 0: # no data loaded yet dataset.get_id_attribute() if index == None: index = arange(dataset.size()) result = ChunkModel.run(self, chunk_specification, dataset, index, float32, specification=specification, coefficients=coefficients) return result
def __init__(self, regression_procedure="opus_core.linear_regression", submodel_string=None, run_config=None, estimate_config=None, debuglevel=0, dataset_pool=None): self.debug = DebugPrinter(debuglevel) self.dataset_pool = self.create_dataset_pool(dataset_pool) self.regression = RegressionModelFactory().get_model(name=regression_procedure) if self.regression == None: raise StandardError, "No regression procedure given." self.submodel_string = submodel_string self.run_config = run_config if self.run_config == None: self.run_config = Resources() if not isinstance(self.run_config,Resources) and isinstance(self.run_config, dict): self.run_config = Resources(self.run_config) self.estimate_config = estimate_config if self.estimate_config == None: self.estimate_config = Resources() if not isinstance(self.estimate_config,Resources) and isinstance(self.estimate_config, dict): self.estimate_config = Resources(self.estimate_config) self.data = {} self.coefficient_names = {} ChunkModel.__init__(self) self.get_status_for_gui().initialize_pieces(3, pieces_description = array(['initialization', 'computing variables', 'submodel: 1']))
def __init__(self, regression_procedure="opus_core.linear_regression", submodel_string=None, run_config=None, estimate_config=None, debuglevel=0, dataset_pool=None): self.debug = DebugPrinter(debuglevel) self.dataset_pool = self.create_dataset_pool(dataset_pool) self.regression = RegressionModelFactory().get_model( name=regression_procedure) if self.regression == None: raise StandardError, "No regression procedure given." self.submodel_string = submodel_string self.run_config = run_config if self.run_config == None: self.run_config = Resources() if not isinstance(self.run_config, Resources) and isinstance( self.run_config, dict): self.run_config = Resources(self.run_config) self.estimate_config = estimate_config if self.estimate_config == None: self.estimate_config = Resources() if not isinstance(self.estimate_config, Resources) and isinstance( self.estimate_config, dict): self.estimate_config = Resources(self.estimate_config) self.data = {} self.coefficient_names = {} ChunkModel.__init__(self) self.get_status_for_gui().initialize_pieces(3, pieces_description=array([ 'initialization', 'computing variables', 'submodel: 1' ]))
def run(self, specification, coefficients, dataset, index=None, chunk_specification=None, data_objects=None, run_config=None, initial_values=None, procedure=None, debuglevel=0): """'specification' is of type EquationSpecification, 'coefficients' is of type Coefficients, 'dataset' is of type Dataset, 'index' are indices of individuals in dataset for which the model runs. If it is None, the whole dataset is considered. 'chunk_specification' determines number of chunks in which the simulation is processed. 'data_objects' is a dictionary where each key is the name of an data object ('zone', ...) and its value is an object of class Dataset. 'run_config' is of type Resources, it gives additional arguments for the run. If 'procedure' is given, it overwrites the regression_procedure of the constructor. 'initial_values' is an array of the initial values of the results. It will be overwritten by the results for those elements that are handled by the model (defined by submodels in the specification). By default the results are initialized with 0. 'debuglevel' overwrites the constructor 'debuglevel'. """ self.debug.flag = debuglevel if run_config == None: run_config = Resources() if not isinstance(run_config,Resources) and isinstance(run_config, dict): run_config = Resources(run_config) self.run_config = run_config.merge_with_defaults(self.run_config) self.run_config.merge({"debug":self.debug}) if data_objects is not None: self.dataset_pool.add_datasets_if_not_included(data_objects) self.dataset_name = dataset.get_dataset_name() self.dataset_pool.replace_dataset(self.dataset_name, dataset) if procedure is not None: self.regression = RegressionModelFactory().get_model(name=procedure) if initial_values is None: self.initial_values = zeros((dataset.size(),), dtype=float32) else: self.initial_values = zeros((dataset.size(),), dtype=initial_values.dtype) self.initial_values[index] = initial_values if dataset.size()<=0: # no data loaded yet dataset.get_id_attribute() if index == None: index = arange(dataset.size()) result = ChunkModel.run(self, chunk_specification, dataset, index, float32, specification=specification, coefficients=coefficients) return result
class RegressionModel(ChunkModel): model_name = "Regression Model" model_short_name = "RM" def __init__(self, regression_procedure="opus_core.linear_regression", submodel_string=None, run_config=None, estimate_config=None, debuglevel=0, dataset_pool=None): self.debug = DebugPrinter(debuglevel) self.dataset_pool = self.create_dataset_pool(dataset_pool) self.regression = RegressionModelFactory().get_model( name=regression_procedure) if self.regression == None: raise StandardError, "No regression procedure given." self.submodel_string = submodel_string self.run_config = run_config if self.run_config == None: self.run_config = Resources() if not isinstance(self.run_config, Resources) and isinstance( self.run_config, dict): self.run_config = Resources(self.run_config) self.estimate_config = estimate_config if self.estimate_config == None: self.estimate_config = Resources() if not isinstance(self.estimate_config, Resources) and isinstance( self.estimate_config, dict): self.estimate_config = Resources(self.estimate_config) self.data = {} self.coefficient_names = {} ChunkModel.__init__(self) self.get_status_for_gui().initialize_pieces(3, pieces_description=array([ 'initialization', 'computing variables', 'submodel: 1' ])) def run(self, specification, coefficients, dataset, index=None, chunk_specification=None, data_objects=None, run_config=None, initial_values=None, procedure=None, debuglevel=0): """'specification' is of type EquationSpecification, 'coefficients' is of type Coefficients, 'dataset' is of type Dataset, 'index' are indices of individuals in dataset for which the model runs. If it is None, the whole dataset is considered. 'chunk_specification' determines number of chunks in which the simulation is processed. 'data_objects' is a dictionary where each key is the name of an data object ('zone', ...) and its value is an object of class Dataset. 'run_config' is of type Resources, it gives additional arguments for the run. If 'procedure' is given, it overwrites the regression_procedure of the constructor. 'initial_values' is an array of the initial values of the results. It will be overwritten by the results for those elements that are handled by the model (defined by submodels in the specification). By default the results are initialized with 0. 'debuglevel' overwrites the constructor 'debuglevel'. """ self.debug.flag = debuglevel if run_config == None: run_config = Resources() if not isinstance(run_config, Resources) and isinstance( run_config, dict): run_config = Resources(run_config) self.run_config = run_config.merge_with_defaults(self.run_config) self.run_config.merge({"debug": self.debug}) if data_objects is not None: self.dataset_pool.add_datasets_if_not_included(data_objects) if procedure is not None: self.regression = RegressionModelFactory().get_model( name=procedure) if initial_values is None: self.initial_values = zeros((dataset.size(), ), dtype=float32) else: self.initial_values = zeros((dataset.size(), ), dtype=initial_values.dtype) self.initial_values[index] = initial_values if dataset.size() <= 0: # no data loaded yet dataset.get_id_attribute() if index == None: index = arange(dataset.size()) result = ChunkModel.run(self, chunk_specification, dataset, index, float32, specification=specification, coefficients=coefficients) return result def run_chunk(self, index, dataset, specification, coefficients): self.specified_coefficients = SpecifiedCoefficients().create( coefficients, specification, neqs=1) compute_resources = Resources({"debug": self.debug}) submodels = self.specified_coefficients.get_submodels() self.get_status_for_gui().update_pieces_using_submodels( submodels=submodels, leave_pieces=2) self.map_agents_to_submodels(submodels, self.submodel_string, dataset, index, dataset_pool=self.dataset_pool, resources=compute_resources) variables = self.specified_coefficients.get_full_variable_names_without_constants( ) self.debug.print_debug("Compute variables ...", 4) self.increment_current_status_piece() dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources=compute_resources) data = {} coef = {} outcome = self.initial_values[index].copy() for submodel in submodels: coef[submodel] = SpecifiedCoefficientsFor1Submodel( self.specified_coefficients, submodel) self.coefficient_names[submodel] = coef[ submodel].get_coefficient_names_without_constant()[0, :] self.debug.print_debug( "Compute regression for submodel " + str(submodel), 4) self.increment_current_status_piece() self.data[submodel] = dataset.create_regression_data( coef[submodel], index=index[self.observations_mapping[submodel]]) nan_index = where(isnan(self.data[submodel]))[1] inf_index = where(isinf(self.data[submodel]))[1] if nan_index.size > 0: nan_var_index = unique(nan_index) raise ValueError, "NaN(Not A Number) is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % coef[ submodel].get_variable_names()[nan_var_index] if inf_index.size > 0: inf_var_index = unique(inf_index) raise ValueError, "Inf is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % coef[ submodel].get_variable_names()[inf_var_index] if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0): # observations for this submodel available outcome[self.observations_mapping[submodel]] = \ self.regression.run(self.data[submodel], coef[submodel].get_coefficient_values()[0,:], resources=self.run_config).astype(outcome.dtype) return outcome def correct_infinite_values(self, dataset, outcome_attribute_name, maxvalue=1e+38, clip_all_larger_values=False): """Check if the model resulted in infinite values. If yes, print warning and clip the values to maxvalue. If clip_all_larger_values is True, all values larger than maxvalue are clip to maxvalue. """ infidx = where(dataset.get_attribute(outcome_attribute_name) == inf)[0] if infidx.size > 0: logger.log_warning("Infinite values in %s. Clipped to %s." % (outcome_attribute_name, maxvalue)) dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, infidx) if clip_all_larger_values: idx = where( dataset.get_attribute(outcome_attribute_name) > maxvalue)[0] if idx.size > 0: logger.log_warning( "Values in %s larger than %s. Clipped to %s." % (outcome_attribute_name, maxvalue, maxvalue)) dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, idx) def estimate(self, specification, dataset, outcome_attribute, index=None, procedure=None, data_objects=None, estimate_config=None, debuglevel=0): """'specification' is of type EquationSpecification, 'dataset' is of type Dataset, 'outcome_attribute' - string that determines the dependent variable, 'index' are indices of individuals in dataset for which the model runs. If it is None, the whole dataset is considered. 'procedure' - name of the estimation procedure. If it is None, there should be an entry "estimation" in 'estimate_config' that determines the procedure. The class must have a method 'run' that takes as arguments 'data', 'regression_procedure' and 'resources'. It returns a dictionary with entries 'estimators', 'standard_errors' and 't_values' (all 1D numpy arrays). 'data_objects' is a dictionary where each key is the name of an data object ('zone', ...) and its value is an object of class Dataset. 'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure. 'debuglevel' overwrites the class 'debuglevel'. """ #import wingdbstub self.debug.flag = debuglevel if estimate_config == None: estimate_config = Resources() if not isinstance(estimate_config, Resources) and isinstance( estimate_config, dict): estimate_config = Resources(estimate_config) self.estimate_config = estimate_config.merge_with_defaults( self.estimate_config) if data_objects is not None: self.dataset_pool.add_datasets_if_not_included(data_objects) self.procedure = procedure if self.procedure == None: self.procedure = self.estimate_config.get("estimation", None) if self.procedure is not None: self.procedure = ModelComponentCreator().get_model_component( self.procedure) else: logger.log_warning( "No estimation procedure given, or problems with loading the corresponding module." ) compute_resources = Resources({"debug": self.debug}) if dataset.size() <= 0: # no data loaded yet dataset.get_id_attribute() if index == None: index = arange(dataset.size()) if not isinstance(index, ndarray): index = array(index) estimation_size_agents = self.estimate_config.get( "estimation_size_agents", None) # should be a proportion of the agent_set if estimation_size_agents == None: estimation_size_agents = 1.0 else: estimation_size_agents = max(min(estimation_size_agents, 1.0), 0.0) # between 0 and 1 if estimation_size_agents < 1.0: self.debug.print_debug("Sampling agents for estimation ...", 3) estimation_idx = sample_noreplace( arange(index.size), int(index.size * estimation_size_agents)) else: estimation_idx = arange(index.size) estimation_idx = index[estimation_idx] self.debug.print_debug( "Number of observations for estimation: " + str(estimation_idx.size), 2) if estimation_idx.size <= 0: self.debug.print_debug("Nothing to be done.", 2) return (None, None) coefficients = create_coefficient_from_specification(specification) specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1) submodels = specified_coefficients.get_submodels() self.get_status_for_gui().update_pieces_using_submodels( submodels=submodels, leave_pieces=2) self.map_agents_to_submodels( submodels, self.submodel_string, dataset, estimation_idx, dataset_pool=self.dataset_pool, resources=compute_resources, submodel_size_max=self.estimate_config.get('submodel_size_max', None)) variables = specified_coefficients.get_full_variable_names_without_constants( ) self.debug.print_debug("Compute variables ...", 4) self.increment_current_status_piece() dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources=compute_resources) coef = {} estimated_coef = {} self.outcome = {} dataset.compute_variables([outcome_attribute], dataset_pool=self.dataset_pool, resources=compute_resources) regression_resources = Resources(estimate_config) regression_resources.merge({"debug": self.debug}) outcome_variable_name = VariableName(outcome_attribute) for submodel in submodels: coef[submodel] = SpecifiedCoefficientsFor1Submodel( specified_coefficients, submodel) self.increment_current_status_piece() logger.log_status("Estimate regression for submodel " + str(submodel), tags=["estimate"], verbosity_level=2) logger.log_status("Number of observations: " + str(self.observations_mapping[submodel].size), tags=["estimate"], verbosity_level=2) self.data[ submodel] = dataset.create_regression_data_for_estimation( coef[submodel], index=estimation_idx[self.observations_mapping[submodel]]) self.coefficient_names[submodel] = coef[ submodel].get_coefficient_names_without_constant()[0, :] if (self.data[submodel].shape[0] > 0 ) and (self.data[submodel].size > 0) and ( self.procedure is not None): # observations for this submodel available self.outcome[submodel] = dataset.get_attribute_by_index( outcome_variable_name.get_alias(), estimation_idx[self.observations_mapping[submodel]]) regression_resources.merge({"outcome": self.outcome[submodel]}) regression_resources.merge({ "coefficient_names": self.coefficient_names[submodel].tolist(), "constant_position": coef[submodel].get_constants_positions() }) estimated_coef[submodel] = self.procedure.run( self.data[submodel], self.regression, resources=regression_resources) if "estimators" in estimated_coef[submodel].keys(): coef[submodel].set_coefficient_values( estimated_coef[submodel]["estimators"]) if "standard_errors" in estimated_coef[submodel].keys(): coef[submodel].set_standard_errors( estimated_coef[submodel]["standard_errors"]) if "other_measures" in estimated_coef[submodel].keys(): for measure in estimated_coef[submodel][ "other_measures"].keys(): coef[submodel].set_measure( measure, estimated_coef[submodel]["other_measures"] [measure]) if "other_info" in estimated_coef[submodel].keys(): for info in estimated_coef[submodel]["other_info"]: coef[submodel].set_other_info( info, estimated_coef[submodel]["other_info"][info]) coefficients.fill_coefficients(coef) self.save_predicted_values_and_errors(specification, coefficients, dataset, outcome_variable_name, index=index, data_objects=data_objects) return (coefficients, estimated_coef) def prepare_for_run(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs): spec, coef = prepare_specification_and_coefficients(**kwargs) if (dataset is not None) and (dataset_filter is not None): filter_values = dataset.compute_variables( [dataset_filter], dataset_pool=self.dataset_pool) index = where(filter_values > filter_threshold)[0] else: index = None return (spec, coef, index) def prepare_for_estimate(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs): spec = get_specification_for_estimation(**kwargs) if (dataset is not None) and (dataset_filter is not None): filter_values = dataset.compute_variables( [dataset_filter], dataset_pool=self.dataset_pool) index = where(filter_values > filter_threshold)[0] else: index = None return (spec, index) def get_data_as_dataset(self, submodel=-2): """Like get_all_data, but the retuning value is a Dataset containing attributes that correspond to the data columns. Their names are coefficient names.""" all_data = self.get_all_data(submodel) if all_data is None: return None names = self.get_coefficient_names(submodel) if names is None: return None dataset_data = {} for i in range(names.size): dataset_data[names[i]] = all_data[:, i].reshape(all_data.shape[0]) dataset_data["id"] = arange(all_data.shape[0]) + 1 storage = StorageFactory().get_storage('dict_storage') storage.write_table(table_name='dataset', table_data=dataset_data) ds = Dataset(in_storage=storage, id_name="id", in_table_name='dataset') return ds def save_predicted_values_and_errors(self, specification, coefficients, dataset, outcome_variable, index=None, data_objects=None): if self.estimate_config.get('save_predicted_values_and_errors', False): logger.log_status('Computing predicted values and residuals.') original_values = dataset.get_attribute_by_index( outcome_variable, index) predicted_values = zeros(dataset.size(), dtype='float32') predicted_values[index] = self.run_after_estimation( specification, coefficients, dataset, index=index, data_objects=data_objects) predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias( ) dataset.add_primary_attribute(name=predicted_attribute_name, data=predicted_values) dataset.flush_attribute(predicted_attribute_name) predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias( ) error_values = zeros(dataset.size(), dtype='float32') error_values[index] = (original_values - predicted_values[index]).astype( error_values.dtype) dataset.add_primary_attribute(name=predicted_error_attribute_name, data=error_values) dataset.flush_attribute(predicted_error_attribute_name) logger.log_status( 'Predicted values saved as %s (for the %s dataset)' % (predicted_attribute_name, dataset.get_dataset_name())) logger.log_status( 'Residuals saved as %s (for the %s dataset)' % (predicted_error_attribute_name, dataset.get_dataset_name())) def export_estimation_data(self, submodel=-2, file_name='./estimation_data_regression.txt', delimiter='\t'): import os from numpy import newaxis data = concatenate((self.outcome[submodel][..., newaxis], self.get_all_data(submodel=submodel)), axis=1) header = ['outcome'] + self.get_coefficient_names(submodel).tolist() nrows = data.shape[0] file_name_root, file_name_ext = os.path.splitext(file_name) out_file = "%s_submodel_%s.txt" % (file_name_root, submodel) fh = open(out_file, 'w') fh.write(delimiter.join(header) + '\n') #file header for row in range(nrows): line = [str(x) for x in data[row, ]] fh.write(delimiter.join(line) + '\n') fh.flush() fh.close print 'Data written into %s' % out_file def run_after_estimation(self, *args, **kwargs): return self.run(*args, **kwargs) def _get_status_total_pieces(self): return ChunkModel._get_status_total_pieces( self) * self.get_status_for_gui().get_total_number_of_pieces() def _get_status_current_piece(self): return ChunkModel._get_status_current_piece( self) * self.get_status_for_gui().get_total_number_of_pieces( ) + self.get_status_for_gui().get_current_piece() def _get_status_piece_description(self): return "%s %s" % (ChunkModel._get_status_piece_description( self), self.get_status_for_gui().get_current_piece_description()) def get_specified_coefficients(self): return self.specified_coefficients
class RegressionModel(ChunkModel): model_name = "Regression Model" model_short_name = "RM" def __init__(self, regression_procedure="opus_core.linear_regression", submodel_string=None, run_config=None, estimate_config=None, debuglevel=0, dataset_pool=None): self.debug = DebugPrinter(debuglevel) self.dataset_pool = self.create_dataset_pool(dataset_pool) self.regression = RegressionModelFactory().get_model(name=regression_procedure) if self.regression == None: raise StandardError, "No regression procedure given." self.submodel_string = submodel_string self.run_config = run_config if self.run_config == None: self.run_config = Resources() if not isinstance(self.run_config,Resources) and isinstance(self.run_config, dict): self.run_config = Resources(self.run_config) self.estimate_config = estimate_config if self.estimate_config == None: self.estimate_config = Resources() if not isinstance(self.estimate_config,Resources) and isinstance(self.estimate_config, dict): self.estimate_config = Resources(self.estimate_config) self.data = {} self.coefficient_names = {} ChunkModel.__init__(self) self.get_status_for_gui().initialize_pieces(3, pieces_description = array(['initialization', 'computing variables', 'submodel: 1'])) def run(self, specification, coefficients, dataset, index=None, chunk_specification=None, data_objects=None, run_config=None, initial_values=None, procedure=None, debuglevel=0): """'specification' is of type EquationSpecification, 'coefficients' is of type Coefficients, 'dataset' is of type Dataset, 'index' are indices of individuals in dataset for which the model runs. If it is None, the whole dataset is considered. 'chunk_specification' determines number of chunks in which the simulation is processed. 'data_objects' is a dictionary where each key is the name of an data object ('zone', ...) and its value is an object of class Dataset. 'run_config' is of type Resources, it gives additional arguments for the run. If 'procedure' is given, it overwrites the regression_procedure of the constructor. 'initial_values' is an array of the initial values of the results. It will be overwritten by the results for those elements that are handled by the model (defined by submodels in the specification). By default the results are initialized with 0. 'debuglevel' overwrites the constructor 'debuglevel'. """ self.debug.flag = debuglevel if run_config == None: run_config = Resources() if not isinstance(run_config,Resources) and isinstance(run_config, dict): run_config = Resources(run_config) self.run_config = run_config.merge_with_defaults(self.run_config) self.run_config.merge({"debug":self.debug}) if data_objects is not None: self.dataset_pool.add_datasets_if_not_included(data_objects) self.dataset_pool.replace_dataset(dataset.get_dataset_name(), dataset) if procedure is not None: self.regression = RegressionModelFactory().get_model(name=procedure) if initial_values is None: self.initial_values = zeros((dataset.size(),), dtype=float32) else: self.initial_values = zeros((dataset.size(),), dtype=initial_values.dtype) self.initial_values[index] = initial_values if dataset.size()<=0: # no data loaded yet dataset.get_id_attribute() if index == None: index = arange(dataset.size()) result = ChunkModel.run(self, chunk_specification, dataset, index, float32, specification=specification, coefficients=coefficients) return result def run_chunk (self, index, dataset, specification, coefficients): self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1) compute_resources = Resources({"debug":self.debug}) submodels = self.specified_coefficients.get_submodels() self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2) self.map_agents_to_submodels(submodels, self.submodel_string, dataset, index, dataset_pool=self.dataset_pool, resources = compute_resources) variables = self.specified_coefficients.get_full_variable_names_without_constants() self.debug.print_debug("Compute variables ...",4) self.increment_current_status_piece() dataset.compute_variables(variables, dataset_pool = self.dataset_pool, resources = compute_resources) data = {} coef = {} outcome=self.initial_values[index].copy() for submodel in submodels: coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,submodel) self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0,:] self.debug.print_debug("Compute regression for submodel " +str(submodel),4) self.increment_current_status_piece() self.data[submodel] = dataset.create_regression_data(coef[submodel], index = index[self.observations_mapping[submodel]]) nan_index = where(isnan(self.data[submodel]))[1] inf_index = where(isinf(self.data[submodel]))[1] vnames = asarray(coef[submodel].get_variable_names()) if nan_index.size > 0: nan_var_index = unique(nan_index) self.data[submodel] = nan_to_num(self.data[submodel]) logger.log_warning("NaN(Not A Number) is returned from variable %s; it is replaced with %s." % (vnames[nan_var_index], nan_to_num(nan))) #raise ValueError, "NaN(Not A Number) is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % vnames[nan_var_index] if inf_index.size > 0: inf_var_index = unique(inf_index) self.data[submodel] = nan_to_num(self.data[submodel]) logger.log_warning("Inf is returned from variable %s; it is replaced with %s." % (vnames[inf_var_index], nan_to_num(inf))) #raise ValueError, "Inf is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % vnames[inf_var_index] if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0): # observations for this submodel available outcome[self.observations_mapping[submodel]] = \ self.regression.run(self.data[submodel], coef[submodel].get_coefficient_values()[0,:], resources=self.run_config).astype(outcome.dtype) return outcome def correct_infinite_values(self, dataset, outcome_attribute_name, maxvalue=1e+38, clip_all_larger_values=False): """Check if the model resulted in infinite values. If yes, print warning and clip the values to maxvalue. If clip_all_larger_values is True, all values larger than maxvalue are clip to maxvalue. """ infidx = where(dataset.get_attribute(outcome_attribute_name) == inf)[0] if infidx.size > 0: logger.log_warning("Infinite values in %s. Clipped to %s." % (outcome_attribute_name, maxvalue)) dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, infidx) if clip_all_larger_values: idx = where(dataset.get_attribute(outcome_attribute_name) > maxvalue)[0] if idx.size > 0: logger.log_warning("Values in %s larger than %s. Clipped to %s." % (outcome_attribute_name, maxvalue, maxvalue)) dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, idx) def estimate(self, specification, dataset, outcome_attribute, index = None, procedure=None, data_objects=None, estimate_config=None, debuglevel=0): """'specification' is of type EquationSpecification, 'dataset' is of type Dataset, 'outcome_attribute' - string that determines the dependent variable, 'index' are indices of individuals in dataset for which the model runs. If it is None, the whole dataset is considered. 'procedure' - name of the estimation procedure. If it is None, there should be an entry "estimation" in 'estimate_config' that determines the procedure. The class must have a method 'run' that takes as arguments 'data', 'regression_procedure' and 'resources'. It returns a dictionary with entries 'estimators', 'standard_errors' and 't_values' (all 1D numpy arrays). 'data_objects' is a dictionary where each key is the name of an data object ('zone', ...) and its value is an object of class Dataset. 'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure. 'debuglevel' overwrites the class 'debuglevel'. """ #import wingdbstub self.debug.flag = debuglevel if estimate_config == None: estimate_config = Resources() if not isinstance(estimate_config,Resources) and isinstance(estimate_config, dict): estimate_config = Resources(estimate_config) self.estimate_config = estimate_config.merge_with_defaults(self.estimate_config) if data_objects is not None: self.dataset_pool.add_datasets_if_not_included(data_objects) self.procedure=procedure if self.procedure == None: self.procedure = self.estimate_config.get("estimation", None) if self.procedure is not None: self.procedure = ModelComponentCreator().get_model_component(self.procedure) else: logger.log_warning("No estimation procedure given, or problems with loading the corresponding module.") compute_resources = Resources({"debug":self.debug}) if dataset.size()<=0: # no data loaded yet dataset.get_id_attribute() if index == None: index = arange(dataset.size()) if not isinstance(index,ndarray): index=array(index) estimation_size_agents = self.estimate_config.get("estimation_size_agents", None) # should be a proportion of the agent_set if estimation_size_agents == None: estimation_size_agents = 1.0 else: estimation_size_agents = max(min(estimation_size_agents,1.0),0.0) # between 0 and 1 if estimation_size_agents < 1.0: self.debug.print_debug("Sampling agents for estimation ...",3) estimation_idx = sample_noreplace(arange(index.size), int(index.size*estimation_size_agents)) else: estimation_idx = arange(index.size) estimation_idx = index[estimation_idx] self.debug.print_debug("Number of observations for estimation: " + str(estimation_idx.size),2) if estimation_idx.size <= 0: self.debug.print_debug("Nothing to be done.",2) return (None, None) coefficients = create_coefficient_from_specification(specification) self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1) submodels = self.specified_coefficients.get_submodels() self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2) self.map_agents_to_submodels(submodels, self.submodel_string, dataset, estimation_idx, dataset_pool=self.dataset_pool, resources = compute_resources, submodel_size_max=self.estimate_config.get('submodel_size_max', None)) variables = self.specified_coefficients.get_full_variable_names_without_constants() self.debug.print_debug("Compute variables ...",4) self.increment_current_status_piece() dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources = compute_resources) coef = {} estimated_coef={} self.outcome = {} dataset.compute_variables([outcome_attribute], dataset_pool=self.dataset_pool, resources=compute_resources) regression_resources=Resources(estimate_config) regression_resources.merge({"debug":self.debug}) outcome_variable_name = VariableName(outcome_attribute) for submodel in submodels: coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,submodel) self.increment_current_status_piece() logger.log_status("Estimate regression for submodel " +str(submodel), tags=["estimate"], verbosity_level=2) #logger.log_status("Number of observations: " +str(self.observations_mapping[submodel].size), #tags=["estimate"], verbosity_level=2) self.data[submodel] = dataset.create_regression_data_for_estimation(coef[submodel], index = estimation_idx[self.observations_mapping[submodel]]) self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0,:] if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0) and (self.procedure is not None): # observations for this submodel available self.outcome[submodel] = dataset.get_attribute_by_index(outcome_variable_name.get_alias(), estimation_idx[self.observations_mapping[submodel]]) regression_resources.merge({"outcome": self.outcome[submodel]}) regression_resources.merge({"coefficient_names":self.coefficient_names[submodel].tolist(), "constant_position": coef[submodel].get_constants_positions()}) regression_resources.merge({"submodel": submodel}) estimated_coef[submodel] = self.procedure.run(self.data[submodel], self.regression, resources=regression_resources) if "estimators" in estimated_coef[submodel].keys(): coef[submodel].set_coefficient_values(estimated_coef[submodel]["estimators"]) if "standard_errors" in estimated_coef[submodel].keys(): coef[submodel].set_standard_errors(estimated_coef[submodel]["standard_errors"]) if "other_measures" in estimated_coef[submodel].keys(): for measure in estimated_coef[submodel]["other_measures"].keys(): coef[submodel].set_measure(measure, estimated_coef[submodel]["other_measures"][measure]) if "other_info" in estimated_coef[submodel].keys(): for info in estimated_coef[submodel]["other_info"]: coef[submodel].set_other_info(info, estimated_coef[submodel]["other_info"][info]) coefficients.fill_coefficients(coef) self.specified_coefficients.coefficients = coefficients self.save_predicted_values_and_errors(specification, coefficients, dataset, outcome_variable_name, index=index, data_objects=data_objects) return (coefficients, estimated_coef) def prepare_for_run(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs): spec, coef = prepare_specification_and_coefficients(**kwargs) if (dataset is not None) and (dataset_filter is not None): filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool) index = where(filter_values > filter_threshold)[0] else: index = None return (spec, coef, index) def prepare_for_estimate(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs): spec = get_specification_for_estimation(**kwargs) if (dataset is not None) and (dataset_filter is not None): filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool) index = where(filter_values > filter_threshold)[0] else: index = None return (spec, index) def get_data_as_dataset(self, submodel=-2): """Like get_all_data, but the retuning value is a Dataset containing attributes that correspond to the data columns. Their names are coefficient names.""" all_data = self.get_all_data(submodel) if all_data is None: return None names = self.get_coefficient_names(submodel) if names is None: return None dataset_data = {} for i in range(names.size): dataset_data[names[i]] = all_data[:, i].reshape(all_data.shape[0]) dataset_data["id"] = arange(all_data.shape[0])+1 storage = StorageFactory().get_storage('dict_storage') storage.write_table(table_name='dataset', table_data=dataset_data) ds = Dataset(in_storage=storage, id_name="id", in_table_name='dataset') return ds def save_predicted_values_and_errors(self, specification, coefficients, dataset, outcome_variable, index=None, data_objects=None): if self.estimate_config.get('save_predicted_values_and_errors', False): logger.log_status('Computing predicted values and residuals.') original_values = dataset.get_attribute_by_index(outcome_variable, index) predicted_values = zeros(dataset.size(), dtype='float32') predicted_values[index] = self.run_after_estimation(specification, coefficients, dataset, index=index, data_objects=data_objects) predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias() dataset.add_primary_attribute(name=predicted_attribute_name, data=predicted_values) dataset.flush_attribute(predicted_attribute_name) predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias() error_values = zeros(dataset.size(), dtype='float32') error_values[index] = (original_values - predicted_values[index]).astype(error_values.dtype) dataset.add_primary_attribute(name=predicted_error_attribute_name, data = error_values) dataset.flush_attribute(predicted_error_attribute_name) logger.log_status('Predicted values saved as %s (for the %s dataset)' % (predicted_attribute_name, dataset.get_dataset_name())) logger.log_status('Residuals saved as %s (for the %s dataset)' % (predicted_error_attribute_name, dataset.get_dataset_name())) def export_estimation_data(self, submodel=-2, file_name='./estimation_data_regression.txt', delimiter = '\t'): import os from numpy import newaxis data = concatenate((self.outcome[submodel][...,newaxis], self.get_all_data(submodel=submodel)), axis=1) header = ['outcome'] + self.get_coefficient_names(submodel).tolist() nrows = data.shape[0] file_name_root, file_name_ext = os.path.splitext(file_name) out_file = "%s_submodel_%s.txt" % (file_name_root, submodel) fh = open(out_file,'w') fh.write(delimiter.join(header) + '\n') #file header for row in range(nrows): line = [str(x) for x in data[row,]] fh.write(delimiter.join(line) + '\n') fh.flush() fh.close print 'Data written into %s' % out_file def run_after_estimation(self, *args, **kwargs): return self.run(*args, **kwargs) def _get_status_total_pieces(self): return ChunkModel._get_status_total_pieces(self) * self.get_status_for_gui().get_total_number_of_pieces() def _get_status_current_piece(self): return ChunkModel._get_status_current_piece(self)*self.get_status_for_gui().get_total_number_of_pieces() + self.get_status_for_gui().get_current_piece() def _get_status_piece_description(self): return "%s %s" % (ChunkModel._get_status_piece_description(self), self.get_status_for_gui().get_current_piece_description()) def get_specified_coefficients(self): return self.specified_coefficients