Example #1
    def generate_posterior_distribution(
            self,
            year,
            quantity_of_interest,
            cache_directory=None,
            values=None,
            ids=None,
            procedure="opus_core.bm_normal_posterior",
            use_bias_and_variance_from=None,
            transformed_back=True,
            transformation_pair=(None, None),
            **kwargs):
        if cache_directory is not None:
            self.cache_set = array([cache_directory])
            #self.set_cache_attributes(cache_directory)
        else:
            if values is None or ids is None:
                raise StandardError("Either cache_directory or values and ids must be given.")

        self.set_posterior(
            year,
            quantity_of_interest,
            values=values,
            ids=ids,
            use_bias_and_variance_from=use_bias_and_variance_from,
            transformation_pair=transformation_pair)
        procedure_class = ModelComponentCreator().get_model_component(
            procedure)
        self.simulated_values = procedure_class.run(self, **kwargs)
        if transformed_back and (self.transformation_pair_for_prediction[0]
                                 is not None):  # need to transform back
            self.simulated_values = try_transformation(
                self.simulated_values,
                self.transformation_pair_for_prediction[1])
        return self.simulated_values
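A minimal call sketch for this method. The receiver `bm` stands for whatever object defines it (a BayesianMelding-style class); the cache path, variable name, and observed values below are hypothetical:

from numpy import array

# Either point the method at a simulation cache ...
posterior = bm.generate_posterior_distribution(
    2005, "zone.number_of_jobs",            # hypothetical year and quantity of interest
    cache_directory="/tmp/run_1.cache")     # hypothetical cache location
# ... or pass observed values with their ids directly (omitting both raises StandardError):
posterior = bm.generate_posterior_distribution(
    2005, "zone.number_of_jobs",
    values=array([120.0, 80.0]), ids=array([1, 2]))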
Example #2
    def generate_posterior_distribution(
            self,
            year,
            quantity_of_interest,
            procedure="opus_core.bm_normal_posterior",
            use_bias_and_variance_from=None,
            transformed_back=True,
            aggregate_to=None,
            intermediates=[],
            **kwargs):
        """
        'quantity_of_interest' is the name of the variable for which we want the posterior distribution.
        If there are multiple known outputs, the argument 'use_bias_and_variance_from' determines
        from which one the bias and variance are taken. If it is None, the first known output is used.
        """
        self.set_posterior(year, quantity_of_interest,
                           use_bias_and_variance_from)
        procedure_class = ModelComponentCreator().get_model_component(
            procedure)
        self.simulated_values = procedure_class.run(self, **kwargs)
        if transformed_back and (self.transformation_pair_for_prediction[0]
                                 is not None):  # need to transform back
            self.simulated_values = try_transformation(
                self.simulated_values,
                self.transformation_pair_for_prediction[1])
        if aggregate_to is not None:
            self.simulated_values = self.aggregate(
                self.simulated_values,
                aggregate_from=VariableName(
                    quantity_of_interest).get_dataset_name(),
                aggregate_to=aggregate_to,
                intermediates=intermediates)

        return self.simulated_values
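A sketch of the aggregation path of this variant, assuming the quantity of interest lives on a zone-level dataset and that 'faz' and 'large_area' are available geographies (hypothetical names):

posterior = bm.generate_posterior_distribution(
    2005, quantity_of_interest="zone.number_of_jobs",
    use_bias_and_variance_from="zone.number_of_households",  # which known output calibrates bias/variance
    aggregate_to="large_area",        # sum the zone-level draws up to large areas
    intermediates=["faz"])            # intermediate geographies passed to self.aggregate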
Example #3
    def generate_posterior_distribution(self, year, quantity_of_interest, cache_directory=None, values=None, ids=None, procedure="opus_core.bm_normal_posterior",
                                        use_bias_and_variance_from=None, transformed_back=True, transformation_pair=(None, None), **kwargs):
        if cache_directory is not None:
            self.cache_set = array([cache_directory])
            #self.set_cache_attributes(cache_directory)
        else:
            if values is None or ids is None:
                raise StandardError("Either cache_directory or values and ids must be given.")

        self.set_posterior(year, quantity_of_interest, values=values, ids=ids, use_bias_and_variance_from=use_bias_and_variance_from, transformation_pair=transformation_pair)
        procedure_class = ModelComponentCreator().get_model_component(procedure)
        self.simulated_values = procedure_class.run(self, **kwargs)
        if transformed_back and (self.transformation_pair_for_prediction[0] is not None): # need to transform back
            self.simulated_values = try_transformation(self.simulated_values,
                                                       self.transformation_pair_for_prediction[1])
        return self.simulated_values
Example #4
    def compute_weights(self, procedure="opus_core.bm_normal_weights", **kwargs):
        """Launches the run method of the given 'procedure', which should return the actual BM weights.
        The method passes self as the first argument into the run method.
        If 'procedure' is None, equal weights are returned.
        """
        self.compute_y()
        self.estimate_mu()
        self.estimate_bias()
        self.estimate_variance()
        if procedure is not None:
            procedure_class = ModelComponentCreator().get_model_component(procedure)
            self.weights, self.weight_components = procedure_class.run(self, **kwargs)
        else:
            self.weights = 1.0 / self.number_of_runs * ones(self.number_of_runs)
        write_to_text_file(os.path.join(self.output_directory, self.weights_file_name),
                           self.weights)
        return self.weights
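For the procedure=None branch, the weights are simply uniform over the runs; a quick check of that fallback (number_of_runs chosen arbitrarily here):

from numpy import ones, allclose

number_of_runs = 4
weights = 1.0 / number_of_runs * ones(number_of_runs)
assert allclose(weights, [0.25, 0.25, 0.25, 0.25])
assert allclose(weights.sum(), 1.0)   # BM weights always sum to one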
Example #5
    def generate_posterior_distribution(self, year, quantity_of_interest, procedure="opus_core.bm_normal_posterior",
                                        use_bias_and_variance_from=None, transformed_back=True, aggregate_to=None,
                                        intermediates=[], **kwargs):
        """
        'quantity_of_interest' is the name of the variable for which we want the posterior distribution.
        If there are multiple known outputs, the argument 'use_bias_and_variance_from' determines
        from which one the bias and variance are taken. If it is None, the first known output is used.
        """
        self.set_posterior(year, quantity_of_interest, use_bias_and_variance_from)
        procedure_class = ModelComponentCreator().get_model_component(procedure)
        self.simulated_values = procedure_class.run(self, **kwargs)
        if transformed_back and (self.transformation_pair_for_prediction[0] is not None): # need to transform back
            self.simulated_values = try_transformation(self.simulated_values,
                                                       self.transformation_pair_for_prediction[1])
        if aggregate_to is not None:
            self.simulated_values = self.aggregate(self.simulated_values,
                                                   aggregate_from=VariableName(quantity_of_interest).get_dataset_name(),
                                                   aggregate_to=aggregate_to, intermediates=intermediates)

        return self.simulated_values
Example #6
    def generate_posterior_distribution(self, year, quantity_of_interest, cache_directory=None, values=None, ids=None, 
                                        procedure="opus_core.bm_normal_posterior", use_bias_and_variance_from=None, 
                                        transformed_back=True, aggregate_to=None,
                                        intermediates=[], propagation_factor=1, no_propagation=True, additive_propagation=False, 
                                        omit_bias=False, **kwargs):

        if (values is None or ids is None) and (self.cache_set is None):
            raise StandardError("values and ids must be given if the BM object is initialized without cache_file_location.")
            
        self.set_posterior(year, quantity_of_interest, values=values, ids=ids, use_bias_and_variance_from=use_bias_and_variance_from, 
                           propagation_factor=propagation_factor, 
                           no_propagation=no_propagation, additive_propagation=additive_propagation, omit_bias=omit_bias)
        procedure_class = ModelComponentCreator().get_model_component(procedure)
        self.simulated_values = procedure_class.run(self, **kwargs)
        if transformed_back and (self.transformation_pair_for_prediction[0] is not None): # need to transform back
            self.simulated_values = try_transformation(self.simulated_values,
                                                       self.transformation_pair_for_prediction[1])
        self.simulated_values_ids = self.m_ids
        if aggregate_to is not None:
            (self.simulated_values, self.simulated_values_ids) = self.aggregate(self.simulated_values, 
                                                   aggregate_from=VariableName(quantity_of_interest).get_dataset_name(),
                                                   aggregate_to=aggregate_to, intermediates=intermediates)
        return self.simulated_values
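A hedged call sketch for the propagation knobs of this variant; the argument semantics below are inferred from the parameter names and should be treated as assumptions:

posterior = bm.generate_posterior_distribution(
    2010, "zone.number_of_jobs",
    values=observed_values, ids=zone_ids,  # required here when the BM object was initialized without a cache
    no_propagation=False,                  # switch variance propagation over time on ...
    propagation_factor=2,                  # ... scaled by this factor
    additive_propagation=True,             # propagate additively rather than multiplicatively
    omit_bias=False)                       # keep the bias correction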
class RegressionModel(ChunkModel):

    model_name = "Regression Model"
    model_short_name = "RM"

    def __init__(self,
                 regression_procedure="opus_core.linear_regression",
                 submodel_string=None,
                 run_config=None,
                 estimate_config=None,
                 debuglevel=0,
                 dataset_pool=None):

        self.debug = DebugPrinter(debuglevel)

        self.dataset_pool = self.create_dataset_pool(dataset_pool)

        self.regression = RegressionModelFactory().get_model(
            name=regression_procedure)
        if self.regression is None:
            raise StandardError("No regression procedure given.")

        self.submodel_string = submodel_string

        self.run_config = run_config
        if self.run_config is None:
            self.run_config = Resources()
        if not isinstance(self.run_config, Resources) and isinstance(
                self.run_config, dict):
            self.run_config = Resources(self.run_config)

        self.estimate_config = estimate_config
        if self.estimate_config is None:
            self.estimate_config = Resources()
        if not isinstance(self.estimate_config, Resources) and isinstance(
                self.estimate_config, dict):
            self.estimate_config = Resources(self.estimate_config)

        self.data = {}
        self.coefficient_names = {}
        ChunkModel.__init__(self)
        self.get_status_for_gui().initialize_pieces(3,
                                                    pieces_description=array([
                                                        'initialization',
                                                        'computing variables',
                                                        'submodel: 1'
                                                    ]))

    def run(self,
            specification,
            coefficients,
            dataset,
            index=None,
            chunk_specification=None,
            data_objects=None,
            run_config=None,
            initial_values=None,
            procedure=None,
            debuglevel=0):
        """'specification' is of type EquationSpecification,
            'coefficients' is of type Coefficients,
            'dataset' is of type Dataset,
            'index' are indices of individuals in dataset for which
                        the model runs. If it is None, the whole dataset is considered.
            'chunk_specification' determines  number of chunks in which the simulation is processed.
            'data_objects' is a dictionary where each key is the name of an data object
            ('zone', ...) and its value is an object of class  Dataset.
           'run_config' is of type Resources, it gives additional arguments for the run.
           If 'procedure' is given, it overwrites the regression_procedure of the constructor.
           'initial_values' is an array of the initial values of the results. It will be overwritten
           by the results for those elements that are handled by the model (defined by submodels in the specification).
           By default the results are initialized with 0.
            'debuglevel' overwrites the constructor 'debuglevel'.
        """
        self.debug.flag = debuglevel
        if run_config is None:
            run_config = Resources()
        if not isinstance(run_config, Resources) and isinstance(
                run_config, dict):
            run_config = Resources(run_config)
        self.run_config = run_config.merge_with_defaults(self.run_config)
        self.run_config.merge({"debug": self.debug})
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        if procedure is not None:
            self.regression = RegressionModelFactory().get_model(
                name=procedure)
        if initial_values is None:
            self.initial_values = zeros((dataset.size(), ), dtype=float32)
        else:
            self.initial_values = zeros((dataset.size(), ),
                                        dtype=initial_values.dtype)
            self.initial_values[index] = initial_values

        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())

        result = ChunkModel.run(self,
                                chunk_specification,
                                dataset,
                                index,
                                float32,
                                specification=specification,
                                coefficients=coefficients)
        return result

    def run_chunk(self, index, dataset, specification, coefficients):
        self.specified_coefficients = SpecifiedCoefficients().create(
            coefficients, specification, neqs=1)
        compute_resources = Resources({"debug": self.debug})
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(
            submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels,
                                     self.submodel_string,
                                     dataset,
                                     index,
                                     dataset_pool=self.dataset_pool,
                                     resources=compute_resources)
        variables = self.specified_coefficients.get_full_variable_names_without_constants(
        )
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables,
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        data = {}
        coef = {}
        outcome = self.initial_values[index].copy()
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(
                self.specified_coefficients, submodel)
            self.coefficient_names[submodel] = coef[
                submodel].get_coefficient_names_without_constant()[0, :]
            self.debug.print_debug(
                "Compute regression for submodel " + str(submodel), 4)
            self.increment_current_status_piece()
            self.data[submodel] = dataset.create_regression_data(
                coef[submodel],
                index=index[self.observations_mapping[submodel]])
            nan_index = where(isnan(self.data[submodel]))[1]
            inf_index = where(isinf(self.data[submodel]))[1]
            if nan_index.size > 0:
                nan_var_index = unique(nan_index)
                raise ValueError("NaN (Not a Number) is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." %
                                 coef[submodel].get_variable_names()[nan_var_index])
            if inf_index.size > 0:
                inf_var_index = unique(inf_index)
                raise ValueError("Inf is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." %
                                 coef[submodel].get_variable_names()[inf_var_index])

            if (self.data[submodel].shape[0] >
                    0) and (self.data[submodel].size >
                            0):  # observations for this submodel available
                outcome[self.observations_mapping[submodel]] = \
                    self.regression.run(self.data[submodel], coef[submodel].get_coefficient_values()[0,:],
                        resources=self.run_config).astype(outcome.dtype)
        return outcome

    def correct_infinite_values(self,
                                dataset,
                                outcome_attribute_name,
                                maxvalue=1e+38,
                                clip_all_larger_values=False):
        """Check if the model resulted in infinite values. If yes,
        print warning and clip the values to maxvalue. 
        If clip_all_larger_values is True, all values larger than maxvalue are clip to maxvalue.
        """
        infidx = where(dataset.get_attribute(outcome_attribute_name) == inf)[0]

        if infidx.size > 0:
            logger.log_warning("Infinite values in %s. Clipped to %s." %
                               (outcome_attribute_name, maxvalue))
            dataset.set_values_of_one_attribute(outcome_attribute_name,
                                                maxvalue, infidx)
        if clip_all_larger_values:
            idx = where(
                dataset.get_attribute(outcome_attribute_name) > maxvalue)[0]
            if idx.size > 0:
                logger.log_warning(
                    "Values in %s larger than %s. Clipped to %s." %
                    (outcome_attribute_name, maxvalue, maxvalue))
                dataset.set_values_of_one_attribute(outcome_attribute_name,
                                                    maxvalue, idx)

    def estimate(self,
                 specification,
                 dataset,
                 outcome_attribute,
                 index=None,
                 procedure=None,
                 data_objects=None,
                 estimate_config=None,
                 debuglevel=0):
        """'specification' is of type EquationSpecification,
            'dataset' is of type Dataset,
            'outcome_attribute' - string that determines the dependent variable,
            'index' are indices of individuals in dataset for which
                    the model runs. If it is None, the whole dataset is considered.
            'procedure' - name of the estimation procedure. If it is None,
                there should be an entry "estimation" in 'estimate_config' that determines the procedure. The class
                must have a method 'run' that takes as arguments 'data', 'regression_procedure' and 'resources'.
                It returns a dictionary with entries 'estimators', 'standard_errors' and 't_values' (all 1D numpy arrays).
            'data_objects' is a dictionary where each key is the name of an data object
                    ('zone', ...) and its value is an object of class  Dataset.
            'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure.
            'debuglevel' overwrites the class 'debuglevel'.
        """
        #import wingdbstub
        self.debug.flag = debuglevel
        if estimate_config is None:
            estimate_config = Resources()
        if not isinstance(estimate_config, Resources) and isinstance(
                estimate_config, dict):
            estimate_config = Resources(estimate_config)
        self.estimate_config = estimate_config.merge_with_defaults(
            self.estimate_config)
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.procedure = procedure
        if self.procedure is None:
            self.procedure = self.estimate_config.get("estimation", None)
        if self.procedure is not None:
            self.procedure = ModelComponentCreator().get_model_component(
                self.procedure)
        else:
            logger.log_warning(
                "No estimation procedure given, or problems with loading the corresponding module."
            )

        compute_resources = Resources({"debug": self.debug})
        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())
        if not isinstance(index, ndarray):
            index = array(index)

        estimation_size_agents = self.estimate_config.get(
            "estimation_size_agents",
            None)  # should be a proportion of the agent_set
        if estimation_size_agents is None:
            estimation_size_agents = 1.0
        else:
            estimation_size_agents = max(min(estimation_size_agents, 1.0),
                                         0.0)  # between 0 and 1

        if estimation_size_agents < 1.0:
            self.debug.print_debug("Sampling agents for estimation ...", 3)
            estimation_idx = sample_noreplace(
                arange(index.size), int(index.size * estimation_size_agents))
        else:
            estimation_idx = arange(index.size)

        estimation_idx = index[estimation_idx]
        self.debug.print_debug(
            "Number of observations for estimation: " +
            str(estimation_idx.size), 2)
        if estimation_idx.size <= 0:
            self.debug.print_debug("Nothing to be done.", 2)
            return (None, None)

        coefficients = create_coefficient_from_specification(specification)
        specified_coefficients = SpecifiedCoefficients().create(coefficients,
                                                                specification,
                                                                neqs=1)
        submodels = specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(
            submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(
            submodels,
            self.submodel_string,
            dataset,
            estimation_idx,
            dataset_pool=self.dataset_pool,
            resources=compute_resources,
            submodel_size_max=self.estimate_config.get('submodel_size_max',
                                                       None))
        variables = specified_coefficients.get_full_variable_names_without_constants(
        )
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables,
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)

        coef = {}
        estimated_coef = {}
        self.outcome = {}
        dataset.compute_variables([outcome_attribute],
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        regression_resources = Resources(estimate_config)
        regression_resources.merge({"debug": self.debug})
        outcome_variable_name = VariableName(outcome_attribute)
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(
                specified_coefficients, submodel)
            self.increment_current_status_piece()
            logger.log_status("Estimate regression for submodel " +
                              str(submodel),
                              tags=["estimate"],
                              verbosity_level=2)
            logger.log_status("Number of observations: " +
                              str(self.observations_mapping[submodel].size),
                              tags=["estimate"],
                              verbosity_level=2)
            self.data[
                submodel] = dataset.create_regression_data_for_estimation(
                    coef[submodel],
                    index=estimation_idx[self.observations_mapping[submodel]])
            self.coefficient_names[submodel] = coef[
                submodel].get_coefficient_names_without_constant()[0, :]
            if (self.data[submodel].shape[0] > 0
                ) and (self.data[submodel].size > 0) and (
                    self.procedure
                    is not None):  # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(
                    outcome_variable_name.get_alias(),
                    estimation_idx[self.observations_mapping[submodel]])
                regression_resources.merge({"outcome": self.outcome[submodel]})
                regression_resources.merge({
                    "coefficient_names":
                    self.coefficient_names[submodel].tolist(),
                    "constant_position":
                    coef[submodel].get_constants_positions()
                })
                estimated_coef[submodel] = self.procedure.run(
                    self.data[submodel],
                    self.regression,
                    resources=regression_resources)
                if "estimators" in estimated_coef[submodel].keys():
                    coef[submodel].set_coefficient_values(
                        estimated_coef[submodel]["estimators"])
                if "standard_errors" in estimated_coef[submodel].keys():
                    coef[submodel].set_standard_errors(
                        estimated_coef[submodel]["standard_errors"])
                if "other_measures" in estimated_coef[submodel].keys():
                    for measure in estimated_coef[submodel][
                            "other_measures"].keys():
                        coef[submodel].set_measure(
                            measure, estimated_coef[submodel]["other_measures"]
                            [measure])
                if "other_info" in estimated_coef[submodel].keys():
                    for info in estimated_coef[submodel]["other_info"]:
                        coef[submodel].set_other_info(
                            info, estimated_coef[submodel]["other_info"][info])
        coefficients.fill_coefficients(coef)

        self.save_predicted_values_and_errors(specification,
                                              coefficients,
                                              dataset,
                                              outcome_variable_name,
                                              index=index,
                                              data_objects=data_objects)

        return (coefficients, estimated_coef)

    def prepare_for_run(self,
                        dataset=None,
                        dataset_filter=None,
                        filter_threshold=0,
                        **kwargs):
        spec, coef = prepare_specification_and_coefficients(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables(
                [dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, coef, index)

    def prepare_for_estimate(self,
                             dataset=None,
                             dataset_filter=None,
                             filter_threshold=0,
                             **kwargs):
        spec = get_specification_for_estimation(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables(
                [dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, index)

    def get_data_as_dataset(self, submodel=-2):
        """Like get_all_data, but the retuning value is a Dataset containing attributes that
        correspond to the data columns. Their names are coefficient names."""
        all_data = self.get_all_data(submodel)
        if all_data is None:
            return None
        names = self.get_coefficient_names(submodel)
        if names is None:
            return None
        dataset_data = {}
        for i in range(names.size):
            dataset_data[names[i]] = all_data[:, i].reshape(all_data.shape[0])
        dataset_data["id"] = arange(all_data.shape[0]) + 1
        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name='dataset', table_data=dataset_data)
        ds = Dataset(in_storage=storage, id_name="id", in_table_name='dataset')
        return ds
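    # Usage sketch (hypothetical names): after estimate(), the data matrix of one
    # submodel can be inspected as an Opus Dataset whose attributes are the
    # coefficient names:
    #     ds = model.get_data_as_dataset(submodel=1)
    #     ds.get_attribute("ln_income")   # column that multiplies the 'ln_income' coefficient
    #     ds.size()                       # number of observations in that submodel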

    def save_predicted_values_and_errors(self,
                                         specification,
                                         coefficients,
                                         dataset,
                                         outcome_variable,
                                         index=None,
                                         data_objects=None):
        if self.estimate_config.get('save_predicted_values_and_errors', False):
            logger.log_status('Computing predicted values and residuals.')
            original_values = dataset.get_attribute_by_index(
                outcome_variable, index)
            predicted_values = zeros(dataset.size(), dtype='float32')
            predicted_values[index] = self.run_after_estimation(
                specification,
                coefficients,
                dataset,
                index=index,
                data_objects=data_objects)
            predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias(
            )
            dataset.add_primary_attribute(name=predicted_attribute_name,
                                          data=predicted_values)
            dataset.flush_attribute(predicted_attribute_name)
            predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias(
            )
            error_values = zeros(dataset.size(), dtype='float32')
            error_values[index] = (original_values -
                                   predicted_values[index]).astype(
                                       error_values.dtype)
            dataset.add_primary_attribute(name=predicted_error_attribute_name,
                                          data=error_values)
            dataset.flush_attribute(predicted_error_attribute_name)
            logger.log_status(
                'Predicted values saved as %s (for the %s dataset)' %
                (predicted_attribute_name, dataset.get_dataset_name()))
            logger.log_status(
                'Residuals saved as %s (for the %s dataset)' %
                (predicted_error_attribute_name, dataset.get_dataset_name()))

    def export_estimation_data(self,
                               submodel=-2,
                               file_name='./estimation_data_regression.txt',
                               delimiter='\t'):
        import os
        from numpy import newaxis
        data = concatenate((self.outcome[submodel][..., newaxis],
                            self.get_all_data(submodel=submodel)),
                           axis=1)
        header = ['outcome'] + self.get_coefficient_names(submodel).tolist()
        nrows = data.shape[0]
        file_name_root, file_name_ext = os.path.splitext(file_name)
        out_file = "%s_submodel_%s.txt" % (file_name_root, submodel)
        fh = open(out_file, 'w')
        fh.write(delimiter.join(header) + '\n')  #file header
        for row in range(nrows):
            line = [str(x) for x in data[row, ]]
            fh.write(delimiter.join(line) + '\n')
        fh.flush()
        fh.close()
        print 'Data written into %s' % out_file

    def run_after_estimation(self, *args, **kwargs):
        return self.run(*args, **kwargs)

    def _get_status_total_pieces(self):
        return ChunkModel._get_status_total_pieces(
            self) * self.get_status_for_gui().get_total_number_of_pieces()

    def _get_status_current_piece(self):
        return ChunkModel._get_status_current_piece(
            self) * self.get_status_for_gui().get_total_number_of_pieces(
            ) + self.get_status_for_gui().get_current_piece()

    def _get_status_piece_description(self):
        return "%s %s" % (ChunkModel._get_status_piece_description(
            self), self.get_status_for_gui().get_current_piece_description())

    def get_specified_coefficients(self):
        return self.specified_coefficients
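A minimal end-to-end sketch for RegressionModel. The dataset `households`, the specification kwargs, and the procedure name "opus_core.estimate_linear_regression" are assumptions for illustration, not part of the class:

model = RegressionModel(regression_procedure="opus_core.linear_regression",
                        submodel_string="building_type_id")    # hypothetical submodel attribute
spec, index = model.prepare_for_estimate(dataset=households,
                                         specification_dict=my_specification)  # hypothetical kwargs
coefficients, estimation_results = model.estimate(
    spec, households, outcome_attribute="ln_income",
    index=index,
    procedure="opus_core.estimate_linear_regression",          # assumed estimation module
    estimate_config={"estimation_size_agents": 0.5})           # estimate on a 50% sample of agents
predicted = model.run(spec, coefficients, households)          # simulate with the estimated coefficients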
class RegressionModel(ChunkModel):

    model_name = "Regression Model"
    model_short_name = "RM"

    def __init__(self, regression_procedure="opus_core.linear_regression",
                  submodel_string=None,
                  run_config=None, estimate_config=None, debuglevel=0, dataset_pool=None):
 
        self.debug = DebugPrinter(debuglevel)

        self.dataset_pool = self.create_dataset_pool(dataset_pool)

        self.regression = RegressionModelFactory().get_model(name=regression_procedure)
        if self.regression is None:
            raise StandardError("No regression procedure given.")

        self.submodel_string = submodel_string

        self.run_config = run_config
        if self.run_config is None:
            self.run_config = Resources()
        if not isinstance(self.run_config,Resources) and isinstance(self.run_config, dict):
            self.run_config = Resources(self.run_config)

        self.estimate_config = estimate_config
        if self.estimate_config is None:
            self.estimate_config = Resources()
        if not isinstance(self.estimate_config,Resources) and isinstance(self.estimate_config, dict):
            self.estimate_config = Resources(self.estimate_config)
            
        self.data = {}
        self.coefficient_names = {}
        ChunkModel.__init__(self)
        self.get_status_for_gui().initialize_pieces(3, pieces_description = array(['initialization', 'computing variables', 'submodel: 1']))

    def run(self, specification, coefficients, dataset, index=None, chunk_specification=None,
            data_objects=None, run_config=None, initial_values=None, procedure=None, debuglevel=0):
        """'specification' is of type EquationSpecification,
            'coefficients' is of type Coefficients,
            'dataset' is of type Dataset,
            'index' are indices of individuals in dataset for which
                        the model runs. If it is None, the whole dataset is considered.
            'chunk_specification' determines  number of chunks in which the simulation is processed.
            'data_objects' is a dictionary where each key is the name of an data object
            ('zone', ...) and its value is an object of class  Dataset.
           'run_config' is of type Resources, it gives additional arguments for the run.
           If 'procedure' is given, it overwrites the regression_procedure of the constructor.
           'initial_values' is an array of the initial values of the results. It will be overwritten
           by the results for those elements that are handled by the model (defined by submodels in the specification).
           By default the results are initialized with 0.
            'debuglevel' overwrites the constructor 'debuglevel'.
        """
        self.debug.flag = debuglevel
        if run_config is None:
            run_config = Resources()
        if not isinstance(run_config,Resources) and isinstance(run_config, dict):
            run_config = Resources(run_config)
        self.run_config = run_config.merge_with_defaults(self.run_config)
        self.run_config.merge({"debug":self.debug})
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.dataset_pool.replace_dataset(dataset.get_dataset_name(), dataset)
        if procedure is not None: 
            self.regression = RegressionModelFactory().get_model(name=procedure)
        if initial_values is None:
            self.initial_values = zeros((dataset.size(),), dtype=float32)
        else:
            self.initial_values = zeros((dataset.size(),), dtype=initial_values.dtype)
            self.initial_values[index] = initial_values
            
        if dataset.size()<=0: # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())
            
        result = ChunkModel.run(self, chunk_specification, dataset, index, float32,
                                 specification=specification, coefficients=coefficients)
        return result

    def run_chunk (self, index, dataset, specification, coefficients):
        self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1)
        compute_resources = Resources({"debug":self.debug})
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels, self.submodel_string, dataset, index,
                                      dataset_pool=self.dataset_pool, resources = compute_resources)
        variables = self.specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...",4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables, dataset_pool = self.dataset_pool, resources = compute_resources)
        data = {}
        coef = {}
        outcome=self.initial_values[index].copy()
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,submodel)
            self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0,:]
            self.debug.print_debug("Compute regression for submodel " +str(submodel),4)
            self.increment_current_status_piece()
            self.data[submodel] = dataset.create_regression_data(coef[submodel],
                                                                index = index[self.observations_mapping[submodel]])
            nan_index = where(isnan(self.data[submodel]))[1]
            inf_index = where(isinf(self.data[submodel]))[1]
            vnames = asarray(coef[submodel].get_variable_names())
            if nan_index.size > 0:
                nan_var_index = unique(nan_index)
                self.data[submodel] = nan_to_num(self.data[submodel])
                logger.log_warning("NaN(Not A Number) is returned from variable %s; it is replaced with %s." % (vnames[nan_var_index], nan_to_num(nan)))
                #raise ValueError, "NaN(Not A Number) is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % vnames[nan_var_index]
            if inf_index.size > 0:
                inf_var_index = unique(inf_index)
                self.data[submodel] = nan_to_num(self.data[submodel])
                logger.log_warning("Inf is returned from variable %s; it is replaced with %s." % (vnames[inf_var_index], nan_to_num(inf)))
                #raise ValueError, "Inf is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % vnames[inf_var_index]
            
            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0): # observations for this submodel available
                outcome[self.observations_mapping[submodel]] = \
                    self.regression.run(self.data[submodel], coef[submodel].get_coefficient_values()[0,:],
                        resources=self.run_config).astype(outcome.dtype)
        return outcome
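    # Note on the NaN/Inf handling above: unlike the earlier variant, which raises
    # ValueError, this version repairs the data with numpy's nan_to_num and only
    # logs a warning. nan_to_num replaces NaN with 0.0 and +inf with the largest
    # finite float, e.g.
    #     nan_to_num(array([nan, inf, 1.0]))  ->  [0.0, 1.79769313e+308, 1.0]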

    def correct_infinite_values(self, dataset, outcome_attribute_name, maxvalue=1e+38, clip_all_larger_values=False):
        """Check if the model resulted in infinite values. If yes,
        print warning and clip the values to maxvalue. 
        If clip_all_larger_values is True, all values larger than maxvalue are clip to maxvalue.
        """
        infidx = where(dataset.get_attribute(outcome_attribute_name) == inf)[0]

        if infidx.size > 0:
            logger.log_warning("Infinite values in %s. Clipped to %s." % (outcome_attribute_name, maxvalue))
            dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, infidx)
        if clip_all_larger_values:
            idx = where(dataset.get_attribute(outcome_attribute_name) > maxvalue)[0]
            if idx.size > 0:
                logger.log_warning("Values in %s larger than %s. Clipped to %s." % (outcome_attribute_name, maxvalue, maxvalue))
                dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, idx)
            
    def estimate(self, specification, dataset, outcome_attribute, index = None, procedure=None, data_objects=None,
                        estimate_config=None,  debuglevel=0):
        """'specification' is of type EquationSpecification,
            'dataset' is of type Dataset,
            'outcome_attribute' - string that determines the dependent variable,
            'index' are indices of individuals in dataset for which
                    the model runs. If it is None, the whole dataset is considered.
            'procedure' - name of the estimation procedure. If it is None,
                there should be an entry "estimation" in 'estimate_config' that determines the procedure. The class
                must have a method 'run' that takes as arguments 'data', 'regression_procedure' and 'resources'.
                It returns a dictionary with entries 'estimators', 'standard_errors' and 't_values' (all 1D numpy arrays).
            'data_objects' is a dictionary where each key is the name of an data object
                    ('zone', ...) and its value is an object of class  Dataset.
            'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure.
            'debuglevel' overwrites the class 'debuglevel'.
        """
        #import wingdbstub
        self.debug.flag = debuglevel
        if estimate_config is None:
            estimate_config = Resources()
        if not isinstance(estimate_config,Resources) and isinstance(estimate_config, dict):
            estimate_config = Resources(estimate_config)
        self.estimate_config = estimate_config.merge_with_defaults(self.estimate_config)
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.procedure=procedure
        if self.procedure is None:
            self.procedure = self.estimate_config.get("estimation", None)
        if self.procedure is not None:
            self.procedure = ModelComponentCreator().get_model_component(self.procedure)
        else:
            logger.log_warning("No estimation procedure given, or problems with loading the corresponding module.")

        compute_resources = Resources({"debug":self.debug})
        if dataset.size()<=0: # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())
        if not isinstance(index,ndarray):
            index=array(index)

        estimation_size_agents = self.estimate_config.get("estimation_size_agents", None) # should be a proportion of the agent_set
        if estimation_size_agents is None:
            estimation_size_agents = 1.0
        else:
            estimation_size_agents = max(min(estimation_size_agents,1.0),0.0) # between 0 and 1

        if estimation_size_agents < 1.0:
            self.debug.print_debug("Sampling agents for estimation ...",3)
            estimation_idx = sample_noreplace(arange(index.size),
                                                         int(index.size*estimation_size_agents))
        else:
            estimation_idx = arange(index.size)

        estimation_idx = index[estimation_idx]
        self.debug.print_debug("Number of observations for estimation: " + str(estimation_idx.size),2)
        if estimation_idx.size <= 0:
            self.debug.print_debug("Nothing to be done.",2)
            return (None, None)

        coefficients = create_coefficient_from_specification(specification)
        self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1)
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels, self.submodel_string, dataset, estimation_idx,
                                      dataset_pool=self.dataset_pool, resources = compute_resources,
                                      submodel_size_max=self.estimate_config.get('submodel_size_max', None))
        variables = self.specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...",4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources = compute_resources)

        coef = {}
        estimated_coef={}
        self.outcome = {}
        dataset.compute_variables([outcome_attribute], dataset_pool=self.dataset_pool, resources=compute_resources)
        regression_resources=Resources(estimate_config)
        regression_resources.merge({"debug":self.debug})
        outcome_variable_name = VariableName(outcome_attribute)
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,submodel)
            self.increment_current_status_piece()
            logger.log_status("Estimate regression for submodel " +str(submodel),
                               tags=["estimate"], verbosity_level=2)
            #logger.log_status("Number of observations: " +str(self.observations_mapping[submodel].size),
                               #tags=["estimate"], verbosity_level=2)
            self.data[submodel] = dataset.create_regression_data_for_estimation(coef[submodel],
                                                            index = estimation_idx[self.observations_mapping[submodel]])
            self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0,:]
            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0) and (self.procedure is not None): # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(outcome_variable_name.get_alias(), estimation_idx[self.observations_mapping[submodel]])   
                regression_resources.merge({"outcome":  self.outcome[submodel]})
                regression_resources.merge({"coefficient_names":self.coefficient_names[submodel].tolist(),
                            "constant_position": coef[submodel].get_constants_positions()})
                regression_resources.merge({"submodel": submodel})
                estimated_coef[submodel] = self.procedure.run(self.data[submodel], self.regression,
                                                        resources=regression_resources)
                if "estimators" in estimated_coef[submodel].keys():
                    coef[submodel].set_coefficient_values(estimated_coef[submodel]["estimators"])
                if "standard_errors" in estimated_coef[submodel].keys():
                    coef[submodel].set_standard_errors(estimated_coef[submodel]["standard_errors"])
                if "other_measures" in estimated_coef[submodel].keys():
                    for measure in estimated_coef[submodel]["other_measures"].keys():
                        coef[submodel].set_measure(measure,
                              estimated_coef[submodel]["other_measures"][measure])
                if "other_info" in estimated_coef[submodel].keys():
                    for info in estimated_coef[submodel]["other_info"]:
                        coef[submodel].set_other_info(info,
                              estimated_coef[submodel]["other_info"][info])
        coefficients.fill_coefficients(coef)
        self.specified_coefficients.coefficients = coefficients
        self.save_predicted_values_and_errors(specification, coefficients, dataset, outcome_variable_name, index=index, data_objects=data_objects)
            
        return (coefficients, estimated_coef)

    def prepare_for_run(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        spec, coef = prepare_specification_and_coefficients(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, coef, index)

    def prepare_for_estimate(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        spec = get_specification_for_estimation(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, index)
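    # Hedged illustration (not in the original source) of the filter/index
    # pattern used by prepare_for_run and prepare_for_estimate above; the
    # values and the threshold of 0 are made up:
    #
    #     from numpy import array, where
    #     filter_values = array([0.0, 2.5, 1.0, 0.0])
    #     index = where(filter_values > 0)[0]   # -> array([1, 2])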
    
    def get_data_as_dataset(self, submodel=-2):
        """Like get_all_data, but the retuning value is a Dataset containing attributes that
        correspond to the data columns. Their names are coefficient names."""
        all_data = self.get_all_data(submodel)
        if all_data is None:
            return None
        names = self.get_coefficient_names(submodel)
        if names is None:
            return None
        dataset_data = {}
        for i in range(names.size):
            dataset_data[names[i]] = all_data[:, i].reshape(all_data.shape[0])
        dataset_data["id"] = arange(all_data.shape[0])+1
        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name='dataset', table_data=dataset_data)
        ds = Dataset(in_storage=storage, id_name="id", in_table_name='dataset')
        return ds
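    # Hedged, self-contained sketch of the dict_storage pattern used in
    # get_data_as_dataset above; the import paths and the table name 'demo'
    # are assumptions for illustration:
    #
    #     from numpy import arange, array
    #     from opus_core.storage_factory import StorageFactory
    #     from opus_core.datasets.dataset import Dataset
    #     storage = StorageFactory().get_storage('dict_storage')
    #     storage.write_table(table_name='demo',
    #                         table_data={'id': arange(3) + 1,
    #                                     'x': array([0.1, 0.2, 0.3])})
    #     demo = Dataset(in_storage=storage, id_name='id', in_table_name='demo')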

    def save_predicted_values_and_errors(self, specification, coefficients, dataset, outcome_variable, index=None, data_objects=None):
        if self.estimate_config.get('save_predicted_values_and_errors', False):
            logger.log_status('Computing predicted values and residuals.')
            original_values = dataset.get_attribute_by_index(outcome_variable, index)
            predicted_values = zeros(dataset.size(), dtype='float32')
            predicted_values[index] = self.run_after_estimation(specification, coefficients, dataset, index=index, data_objects=data_objects)
            predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias()
            dataset.add_primary_attribute(name=predicted_attribute_name, data=predicted_values)
            dataset.flush_attribute(predicted_attribute_name)
            predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias()
            error_values = zeros(dataset.size(), dtype='float32')
            error_values[index] = (original_values - predicted_values[index]).astype(error_values.dtype)
            dataset.add_primary_attribute(name=predicted_error_attribute_name, data = error_values)
            dataset.flush_attribute(predicted_error_attribute_name)
            logger.log_status('Predicted values saved as %s (for the %s dataset)' % (predicted_attribute_name, dataset.get_dataset_name()))
            logger.log_status('Residuals saved as %s (for the %s dataset)' % (predicted_error_attribute_name, dataset.get_dataset_name()))
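    # Hedged illustration (made-up numbers) of the index scatter-assignment
    # used above to place predictions back into a full-size attribute array:
    #
    #     from numpy import zeros, array
    #     predicted = zeros(5, dtype='float32')
    #     predicted[array([1, 3])] = array([2.0, 4.0])
    #     # -> [0., 2., 0., 4., 0.]; unindexed entries keep the fill value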
        
    def export_estimation_data(self, submodel=-2, file_name='./estimation_data_regression.txt', delimiter = '\t'):
        import os
        from numpy import newaxis, concatenate
        data = concatenate((self.outcome[submodel][...,newaxis], self.get_all_data(submodel=submodel)), axis=1)
        header = ['outcome'] + self.get_coefficient_names(submodel).tolist()
        nrows = data.shape[0]
        file_name_root, file_name_ext = os.path.splitext(file_name)
        out_file = "%s_submodel_%s.txt" % (file_name_root, submodel)
        fh = open(out_file, 'w')
        fh.write(delimiter.join(header) + '\n')   # file header
        for row in range(nrows):
            line = [str(x) for x in data[row, :]]
            fh.write(delimiter.join(line) + '\n')
        fh.flush()
        fh.close()
        print 'Data written into %s' % out_file
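        # Hedged alternative (not in the original): numpy.savetxt can write the
        # same table in one call; 'header' and 'comments' are savetxt keywords
        # available since numpy 1.7:
        #
        #     from numpy import savetxt
        #     savetxt(out_file, data, delimiter=delimiter, fmt='%s',
        #             header=delimiter.join(header), comments='')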
        
    def run_after_estimation(self, *args, **kwargs):
        return self.run(*args, **kwargs)
            
    def _get_status_total_pieces(self):
        return ChunkModel._get_status_total_pieces(self) * self.get_status_for_gui().get_total_number_of_pieces()
    
    def _get_status_current_piece(self):
        return ChunkModel._get_status_current_piece(self)*self.get_status_for_gui().get_total_number_of_pieces() + self.get_status_for_gui().get_current_piece()
        
    def _get_status_piece_description(self):
        return "%s %s" % (ChunkModel._get_status_piece_description(self), self.get_status_for_gui().get_current_piece_description())
    
    def get_specified_coefficients(self):
        return self.specified_coefficients

    def estimate(self, specification, dataset, outcome_attribute, index=None, procedure=None, data_objects=None,
                 estimate_config=None, debuglevel=0):
        """'specification' is of type EquationSpecification,
            'dataset' is of type Dataset,
            'outcome_attribute' - string that determines the dependent variable,
            'index' are indices of individuals in dataset for which
                    the model runs. If it is None, the whole dataset is considered.
            'procedure' - name of the estimation procedure. If it is None,
                there should be an entry "estimation" in 'estimate_config' that determines the procedure. The class
                must have a method 'run' that takes as arguments 'data', 'regression_procedure' and 'resources'.
                It returns a dictionary with entries 'estimators', 'standard_errors' and 't_values' (all 1D numpy arrays).
            'data_objects' is a dictionary where each key is the name of an data object
                    ('zone', ...) and its value is an object of class  Dataset.
            'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure.
            'debuglevel' overwrites the class 'debuglevel'.
        """
        self.debug.flag = debuglevel
        if estimate_config is None:
            estimate_config = Resources()
        if not isinstance(estimate_config, Resources) and isinstance(estimate_config, dict):
            estimate_config = Resources(estimate_config)
        self.estimate_config = estimate_config.merge_with_defaults(self.estimate_config)
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.procedure = procedure
        if self.procedure is None:
            self.procedure = self.estimate_config.get("estimation", None)
        if self.procedure is not None:
            self.procedure = ModelComponentCreator().get_model_component(self.procedure)
        else:
            logger.log_warning("No estimation procedure given, or problems with loading the corresponding module.")

        compute_resources = Resources({"debug": self.debug})
        if dataset.size() <= 0: # no data loaded yet
            dataset.get_id_attribute()
        if index is None:
            index = arange(dataset.size())
        if not isinstance(index, ndarray):
            index = array(index)

        estimation_size_agents = self.estimate_config.get("estimation_size_agents", None) # should be a proportion of the agent_set
        if estimation_size_agents is None:
            estimation_size_agents = 1.0
        else:
            estimation_size_agents = max(min(estimation_size_agents, 1.0), 0.0) # clip to [0, 1]

        if estimation_size_agents < 1.0:
            self.debug.print_debug("Sampling agents for estimation ...",3)
            estimation_idx = sample_noreplace(arange(index.size),
                                              int(index.size * estimation_size_agents))
        else:
            estimation_idx = arange(index.size)

        estimation_idx = index[estimation_idx]
        self.debug.print_debug("Number of observations for estimation: " + str(estimation_idx.size),2)
        if estimation_idx.size <= 0:
            self.debug.print_debug("Nothing to be done.",2)
            return (None, None)

        coefficients = create_coefficient_from_specification(specification)
        self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1)
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels, self.submodel_string, dataset, estimation_idx,
                                      dataset_pool=self.dataset_pool, resources = compute_resources,
                                      submodel_size_max=self.estimate_config.get('submodel_size_max', None))
        variables = self.specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...",4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources = compute_resources)

        coef = {}
        estimated_coef={}
        self.outcome = {}
        dataset.compute_variables([outcome_attribute], dataset_pool=self.dataset_pool, resources=compute_resources)
        regression_resources=Resources(estimate_config)
        regression_resources.merge({"debug":self.debug})
        outcome_variable_name = VariableName(outcome_attribute)
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,submodel)
            self.increment_current_status_piece()
            logger.log_status("Estimate regression for submodel " +str(submodel),
                               tags=["estimate"], verbosity_level=2)
            #logger.log_status("Number of observations: " +str(self.observations_mapping[submodel].size),
                               #tags=["estimate"], verbosity_level=2)
            self.data[submodel] = dataset.create_regression_data_for_estimation(coef[submodel],
                                                            index = estimation_idx[self.observations_mapping[submodel]])
            self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0,:]
            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0) and (self.procedure is not None): # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(outcome_variable_name.get_alias(), estimation_idx[self.observations_mapping[submodel]])   
                regression_resources.merge({"outcome":  self.outcome[submodel]})
                regression_resources.merge({"coefficient_names":self.coefficient_names[submodel].tolist(),
                            "constant_position": coef[submodel].get_constants_positions()})
                regression_resources.merge({"submodel": submodel})
                estimated_coef[submodel] = self.procedure.run(self.data[submodel], self.regression,
                                                        resources=regression_resources)
                if "estimators" in estimated_coef[submodel].keys():
                    coef[submodel].set_coefficient_values(estimated_coef[submodel]["estimators"])
                if "standard_errors" in estimated_coef[submodel].keys():
                    coef[submodel].set_standard_errors(estimated_coef[submodel]["standard_errors"])
                if "other_measures" in estimated_coef[submodel].keys():
                    for measure in estimated_coef[submodel]["other_measures"].keys():
                        coef[submodel].set_measure(measure,
                              estimated_coef[submodel]["other_measures"][measure])
                if "other_info" in estimated_coef[submodel].keys():
                    for info in estimated_coef[submodel]["other_info"]:
                        coef[submodel].set_other_info(info,
                              estimated_coef[submodel]["other_info"][info])
        coefficients.fill_coefficients(coef)
        self.specified_coefficients.coefficients = coefficients
        self.save_predicted_values_and_errors(specification, coefficients, dataset, outcome_variable_name, index=index, data_objects=data_objects)
            
        return (coefficients, estimated_coef)
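
    # Hedged sketch (not from the original source) of an estimation procedure
    # satisfying the contract described in the docstring of estimate() above:
    # a 'run' method taking 'data', a regression procedure and 'resources',
    # and returning 'estimators', 'standard_errors' and 't_values'. The class
    # name and the use of ordinary least squares are assumptions made for
    # illustration; the real opus_core procedures differ:
    #
    #     from numpy import asarray, sqrt, diag, dot
    #     from numpy.linalg import lstsq, inv
    #
    #     class MinimalOLSProcedure(object):
    #         def run(self, data, regression_procedure=None, resources=None):
    #             y = asarray(resources["outcome"])
    #             X = asarray(data)                 # observations x variables
    #             beta, rss, rank, sv = lstsq(X, y)
    #             n, k = X.shape
    #             sigma2 = rss[0] / (n - k) if rss.size > 0 else 0.0
    #             se = sqrt(diag(sigma2 * inv(dot(X.T, X))))
    #             return {"estimators": beta,
    #                     "standard_errors": se,
    #                     "t_values": beta / se}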