def run_multiprocess(self, resources):
        resources = Resources(resources)
        profiler_name = resources.get("profile_filename", None)
        if resources['cache_directory'] is not None:
            cache_directory = resources['cache_directory']
        else:
            cache_directory = SimulationState().get_cache_directory()

        ### TODO: Get rid of this! There is absolutely no good reason to be
        ###       changing the Configuration!
        resources['cache_directory'] = cache_directory

        log_file = os.path.join(cache_directory, 'run_multiprocess.log')
        logger.enable_file_logging(log_file)

        start_year = resources["years"][0]
        end_year = resources["years"][-1]
        nyears = end_year - start_year + 1
        root_seed = resources.get("seed", NO_SEED)
        if resources.get('_seed_dictionary_', None) is not None:
            # This is added by the RunManager to ensure reproducibility including restarted runs
            seed_dict = resources.get('_seed_dictionary_')
            seed_array = array(
                map(lambda year: seed_dict[year],
                    range(start_year, end_year + 1)))
        else:
            seed(root_seed)
            seed_array = randint(1, 2**30, nyears)
        logger.log_status("Running simulation for years %d thru %d" %
                          (start_year, end_year))
        logger.log_status("Simulation root seed: %s" % root_seed)

        for iyear, year in enumerate(range(start_year, end_year + 1)):
            success = self._run_each_year_as_separate_process(
                iyear,
                year,
                seed=seed_array[iyear],
                resources=resources,
                profiler_name=profiler_name,
                log_file=log_file)
            if not success:
                break

        self._notify_stopped()
        if profiler_name is not None:  # insert original value
            resources["profile_filename"] = profiler_name
        logger.log_status("Done running simulation for years %d thru %d" %
                          (start_year, end_year))
Example #2
 def _compute_vacancy_and_total_units_variables(self, location_set, project_types, resources=None):
     compute_resources = Resources(resources)
     compute_resources.merge({"debug":self.debug})
     self.variable_for_vacancy = {}
     self.variable_for_total_units = {}
     for ptype in project_types:
         self.variable_for_vacancy[ptype] = compute_resources.get(
                                 "%s_vacant_variable" % ptype,
                                 "urbansim_zone.%s.vacant_%s" % (location_set.get_dataset_name(),
                                                                  self.project_specific_units[ptype]))
         self.variable_for_total_units[ptype] = compute_resources.get(
                                 "%s_total_units_variable" % ptype,
                                 "%s.aggregate(urbansim_zone.building.total_%s)" % (location_set.get_dataset_name(), 
                                                          self.project_specific_units[ptype]))
         location_set.compute_variables([self.variable_for_vacancy[ptype], self.variable_for_total_units[ptype]], 
                                        dataset_pool=self.dataset_pool, resources = compute_resources)
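A small sketch of how the default variable expressions above are composed, using plain string formatting; the dataset name "zone" and the units mapping are made-up illustration values.

project_specific_units = {"commercial": "commercial_job_spaces",
                          "residential": "residential_units"}
dataset_name = "zone"  # stand-in for location_set.get_dataset_name()

for ptype, units in project_specific_units.items():
    # Same pattern as the defaults passed to compute_resources.get(...) above.
    vacant_expr = "urbansim_zone.%s.vacant_%s" % (dataset_name, units)
    total_expr = "%s.aggregate(urbansim_zone.building.total_%s)" % (dataset_name, units)
    print("%s -> %s | %s" % (ptype, vacant_expr, total_expr))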
class RunSimulationFromMysql:
    def prepare_for_simulation(self, run_configuration, cache_directory=None):
        self.config = Resources(run_configuration)
        self.simulation_state = SimulationState(
            new_instance=True, base_cache_dir=cache_directory, start_time=self.config.get("base_year", 0)
        )

        ### TODO: Get rid of this! There is no good reason to be changing the
        ###       Configuration.
        if self.config["cache_directory"] is None:
            self.config["cache_directory"] = self.simulation_state.get_cache_directory()

        SessionConfiguration(
            new_instance=True,
            package_order=self.config["dataset_pool_configuration"].package_order,
            in_storage=AttributeCache(),
        )

        ForkProcess().fork_new_process(
            self.config["creating_baseyear_cache_configuration"].cache_scenario_database, self.config
        )

        # Create output database (normally done by run manager)
        if "estimation_database_configuration" in self.config:
            db_server = DatabaseServer(self.config["estimation_database_configuration"])
            if not db_server.has_database(self.config["estimation_database_configuration"].database_name):
                db_server.create_database(self.config["estimation_database_configuration"].database_name)

    def run_simulation(self, simulation_instance=None):
        logger.start_block("Simulation on database %s" % self.config["scenario_database_configuration"].database_name)
        try:
            if simulation_instance is None:
                simulation_instance = ModelSystem()
            simulation_instance.run(self.config)
            # simulation_instance.run_multiprocess(self.config, is_run_subset=True)
        finally:
            logger.end_block()
        logger.log_status("Data cache in %s" % self.simulation_state.get_cache_directory())

    def cleanup(self, remove_cache, remove_output_database=False):
        """Remove all outputs of this simulation."""
        self.simulation_state.remove_singleton(delete_cache=remove_cache)
        # Remove SessionConfiguration singleton, if it exists
        Singleton().remove_singleton_for_class(SessionConfiguration)

        cache_dir = self.config["cache_directory"]
        if os.path.exists(cache_dir):
            rmtree(cache_dir)
        if remove_output_database and ("estimation_database_configuration" in self.config):
            db_server = DatabaseServer(self.config["estimation_database_configuration"])
            db_server.drop_database(self.config["estimation_database_configuration"].database_name)

    def prepare_and_run(self, run_configuration, simulation_instance=None, remove_cache=True):
        self.prepare_for_simulation(run_configuration)
        self.run_simulation(simulation_instance)
        self.cleanup(remove_cache)
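A hedged usage sketch for RunSimulationFromMysql as defined above; the run_configuration placeholder stands in for a full Opus configuration dict, which would normally be imported from the project's configuration package.

# Hypothetical driver script.
run_configuration = {}  # placeholder only; prepare_for_simulation expects a complete configuration

runner = RunSimulationFromMysql()
runner.prepare_and_run(run_configuration,
                       simulation_instance=None,  # defaults to ModelSystem() inside run_simulation
                       remove_cache=True)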
Example #4
class RunSimulation(object):
    def prepare_for_simulation(self, config, cache_directory=None):
        self.config = Resources(config)
        base_cache_dir = self.config[
            'creating_baseyear_cache_configuration'].cache_directory_root

        self.simulation_state = SimulationState(new_instance=True,
                                                base_cache_dir=base_cache_dir,
                                                start_time=self.config.get(
                                                    'base_year', 0))

        ### TODO: Get rid of this! There is no good reason to be changing the
        ###       Configuration.
        if self.config['cache_directory'] is None:
            self.config[
                'cache_directory'] = self.simulation_state.get_cache_directory(
                )

        SessionConfiguration(
            new_instance=True,
            package_order=self.config['dataset_pool_configuration'].
            package_order,
            in_storage=AttributeCache())

        if config['creating_baseyear_cache_configuration'].cache_from_database:
            ForkProcess().fork_new_process(
                self.config['creating_baseyear_cache_configuration'].
                cache_scenario_database, self.config)
        else:
            CacheFltData().run(self.config)

    def run_simulation(self, simulation_instance=None):
        if simulation_instance is None:
            simulation_instance = ModelSystem()
        simulation_instance.run(self.config)
        #simulation_instance.run_multiprocess(self.config, is_run_subset=True)
        logger.log_status("Data cache in %s" %
                          self.simulation_state.get_cache_directory())

    def cleanup(self, remove_cache=True):
        """Remove all outputs of this simulation."""
        self.simulation_state.remove_singleton(delete_cache=remove_cache)
        SessionConfiguration().remove_singleton()
        if remove_cache:
            cache_dir = self.config['cache_directory']
            if os.path.exists(cache_dir):
                rmtree(cache_dir)

    def prepare_and_run(self,
                        run_configuration,
                        simulation_instance=None,
                        remove_cache=True):
        self.prepare_for_simulation(run_configuration)
        self.run_simulation(simulation_instance)
        self.cleanup(remove_cache)
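All of these examples revolve around Resources behaving like a dict with get-with-default and merge semantics; the stand-in below (the class name ResourcesSketch is an assumption) illustrates just that surface, not the full opus_core class, which also offers merge_if_not_None and check_obligatory_keys.

class ResourcesSketch(dict):
    """Rough stand-in for opus_core's Resources: a dict built from another
    mapping (or None) that supports in-place merging."""
    def __init__(self, data=None):
        dict.__init__(self, data or {})

    def merge(self, other):
        self.update(other or {})

resources = ResourcesSketch({"cache_directory": None, "years": (2000, 2005)})
resources.merge({"debug": 1})
print(resources.get("profile_filename", None))  # None: key absent, default returned
print(resources["years"])                       # (2000, 2005)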
    def run_multiprocess(self, resources):
        resources = Resources(resources)
        profiler_name = resources.get("profile_filename", None)
        if resources['cache_directory'] is not None:
            cache_directory = resources['cache_directory']
        else:
            cache_directory = SimulationState().get_cache_directory()

        ### TODO: Get rid of this! There is absolutely no good reason to be
        ###       changing the Configuration!
        resources['cache_directory'] = cache_directory

        log_file = os.path.join(cache_directory, 'run_multiprocess.log')
        logger.enable_file_logging(log_file)

        start_year = resources["years"][0]
        end_year = resources["years"][-1]
        nyears = end_year - start_year + 1
        root_seed = resources.get("seed", NO_SEED)
        if resources.get('_seed_dictionary_', None) is not None:
            # This is added by the RunManager to ensure reproducibility including restarted runs 
            seed_dict = resources.get('_seed_dictionary_')
            seed_array = array(map(lambda year : seed_dict[year], range(start_year, end_year+1)))
        else:
            seed(root_seed)
            seed_array = randint(1,2**30, nyears)
        logger.log_status("Running simulation for years %d thru %d" % (start_year, end_year))
        logger.log_status("Simulation root seed: %s" % root_seed)

        for iyear, year in enumerate(range(start_year, end_year+1)):
            success = self._run_each_year_as_separate_process(iyear, year, 
                                                                 seed=seed_array[iyear],
                                                                 resources=resources,
                                                                 profiler_name=profiler_name,
                                                                 log_file=log_file)
            if not success:
                break

        self._notify_stopped()
        if profiler_name is not None: # insert original value
            resources["profile_filename"] = profiler_name
        logger.log_status("Done running simulation for years %d thru %d" % (start_year, end_year))
class RunSimulationFromMysql:
    def prepare_for_simulation(self, run_configuration, cache_directory=None):
        self.config = Resources(run_configuration)
        self.simulation_state = SimulationState(new_instance=True, base_cache_dir=cache_directory, 
                                                start_time=self.config.get('base_year', 0))

        ### TODO: Get rid of this! There is no good reason to be changing the
        ###       Configuration.
        if self.config['cache_directory'] is None:
            self.config['cache_directory'] = self.simulation_state.get_cache_directory()

        SessionConfiguration(new_instance=True,
                             package_order=self.config['dataset_pool_configuration'].package_order,
                             in_storage=AttributeCache())
        
        ForkProcess().fork_new_process(self.config['creating_baseyear_cache_configuration'].cache_scenario_database, self.config)
        
        # Create output database (normally done by run manager)
        if 'estimation_database_configuration' in self.config:
            db_server = DatabaseServer(self.config['estimation_database_configuration'])
            if not db_server.has_database(self.config['estimation_database_configuration'].database_name):
                db_server.create_database(self.config['estimation_database_configuration'].database_name)
                   
    def run_simulation(self, simulation_instance=None):
        logger.start_block('Simulation on database %s' 
            % self.config['scenario_database_configuration'].database_name)
        try:
            if simulation_instance is None:
                simulation_instance = ModelSystem()
            simulation_instance.run(self.config)
            #simulation_instance.run_multiprocess(self.config, is_run_subset=True)
        finally:
            logger.end_block()
        logger.log_status("Data cache in %s" % self.simulation_state.get_cache_directory())
        
    def cleanup(self, remove_cache, remove_output_database=False):
        """Remove all outputs of this simulation."""
        self.simulation_state.remove_singleton(delete_cache=remove_cache)
        # Remove SessionConfiguration singleton, if it exists
        Singleton().remove_singleton_for_class(SessionConfiguration)
        
        cache_dir = self.config['cache_directory']
        if os.path.exists(cache_dir):
            rmtree(cache_dir)
        if remove_output_database and ('estimation_database_configuration' in self.config):
            db_server = DatabaseServer(self.config['estimation_database_configuration'])
            db_server.drop_database(self.config['estimation_database_configuration'].database_name)

    def prepare_and_run(self, run_configuration, simulation_instance=None, remove_cache=True):
        self.prepare_for_simulation(run_configuration)
        self.run_simulation(simulation_instance)
        self.cleanup(remove_cache)
        
 def _compute_vacancy_variables(self, location_set, dev_model_configs, resources):
     compute_resources = Resources(resources)
     compute_resources.merge({"debug": self.debug})
     self.units_variable = {}
     self.variable_for_vacancy = {}
     for project_type in dev_model_configs:
         self.units_variable[project_type] = dev_model_configs[project_type]["units"]
         self.variable_for_vacancy[project_type] = compute_resources.get(
             "%s_vacant_variable" % project_type,
             "urbansim.%s.vacant_%s" % (location_set.get_dataset_name(), self.units_variable[project_type]),
         )
         location_set.compute_variables([self.variable_for_vacancy[project_type]], resources=compute_resources)
Example #8
    def run(self, data, coefficients, resources=None):
        """
        Like linear_utilities, but it additionally runs linear utilities on modified data:
        for each variable, utilities are computed with that variable set to its 5% and 95%
        quantiles while all other variables are held at their medians. The last row of the
        resulting file is the difference between these two utilities.
        The file name can be passed in resources via the entry 'utilities_diagnose_file'.
        """
        if data.ndim < 3:
            raise StandardError, "Argument 'data' must be a 3D numpy array."

        if not isinstance(resources, Resources):
            resources= Resources(resources)
        nobs, neqs, nvar = data.shape
        medians = zeros(nvar, dtype=float32)
        quant = zeros((2,nvar), dtype=float32)
        data_with_medians = array(data[0,:,:])
        for ivar in range(nvar): # compute median and quantiles for each variable
            medians[ivar], quant[0,ivar], quant[1,ivar] = quantile(data[:,:,ivar].ravel(), array([0.5, 0.05, 0.95]))
            data_with_medians[:,ivar] = medians[ivar]


        file_name = resources.get("utilities_diagnose_file", "util")
        if resources.get("submodel", None) is not None:
            file_name = "%s_submodel_%s" % (file_name, resources.get("submodel", 1))
        diagnose_utilities = zeros((3, nvar), dtype=float32)
        argcor = ()
        for ivar in range(nvar): # iterate over variables
            for iquant in [0,1]: # 0 for 5% quantile, 1 for 95% quantile
                mod_data = array(data_with_medians).reshape(1,neqs, nvar) # copy original data
                mod_data[0,:,ivar] = quant[iquant, ivar]
                utility = linear_utilities.run(self, mod_data, coefficients, resources)
                diagnose_utilities[iquant, ivar] = utility[0,0]
            argcor = argcor + (data[:,:,ivar].ravel(),)
        diagnose_utilities[2,:] = diagnose_utilities[1,:] - diagnose_utilities[0,:]
        coef_names = resources.get("coefficient_names", map(lambda x: 'x%s' % x, arange(nvar)+1))
        #write_to_text_file(file_name, coef_names, delimiter=' ')
        #write_table_to_text_file( file_name, diagnose_utilities, mode='ab')
        logger.log_status("Diagnosed utilities written into %s." % file_name)
        return linear_utilities.run(self, data, coefficients, resources)
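A compact numpy sketch of the diagnostic idea above, assuming a plain 3D data array and a single coefficient vector; numpy.percentile stands in for the opus_core quantile helper, and all numbers are illustrative only.

import numpy as np

np.random.seed(0)
nobs, neqs, nvar = 100, 4, 3
data = np.random.randn(nobs, neqs, nvar)
coefficients = np.array([0.5, -1.0, 2.0])

def linear_utility(block):
    # utility of one (neqs x nvar) block under a single coefficient vector
    return block.dot(coefficients)

flat = data.reshape(-1, nvar)
medians = np.percentile(flat, 50, axis=0)
q05 = np.percentile(flat, 5, axis=0)
q95 = np.percentile(flat, 95, axis=0)

diagnose = np.zeros((3, nvar))
for ivar in range(nvar):
    for irow, quant in enumerate((q05, q95)):
        mod = np.tile(medians, (neqs, 1))  # every variable at its median ...
        mod[:, ivar] = quant[ivar]         # ... except one, set to its quantile
        diagnose[irow, ivar] = linear_utility(mod)[0]
diagnose[2, :] = diagnose[1, :] - diagnose[0, :]  # utility swing of each variable
print(diagnose)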
 def _compute_vacancy_and_total_units_variables(self,
                                                location_set,
                                                project_types,
                                                resources=None):
     compute_resources = Resources(resources)
     compute_resources.merge({"debug": self.debug})
     self.variable_for_vacancy = {}
     self.variable_for_total_units = {}
     for ptype in project_types:
         self.variable_for_vacancy[ptype] = compute_resources.get(
             "%s_vacant_variable" % ptype, "urbansim_zone.%s.vacant_%s" %
             (location_set.get_dataset_name(),
              self.project_specific_units[ptype]))
         self.variable_for_total_units[ptype] = compute_resources.get(
             "%s_total_units_variable" % ptype,
             "%s.aggregate(urbansim_zone.building.total_%s)" %
             (location_set.get_dataset_name(),
              self.project_specific_units[ptype]))
         location_set.compute_variables([
             self.variable_for_vacancy[ptype],
             self.variable_for_total_units[ptype]
         ],
                                        dataset_pool=self.dataset_pool,
                                        resources=compute_resources)
 def _compute_vacancy_variables(self, location_set, dev_model_configs,
                                resources):
     compute_resources = Resources(resources)
     compute_resources.merge({"debug": self.debug})
     self.units_variable = {}
     self.variable_for_vacancy = {}
     for project_type in dev_model_configs:
         self.units_variable[project_type] = dev_model_configs[
             project_type]['units']
         self.variable_for_vacancy[project_type] = compute_resources.get(
             "%s_vacant_variable" % project_type,
             "urbansim.%s.vacant_%s" % (location_set.get_dataset_name(),
                                        self.units_variable[project_type]))
         location_set.compute_variables(
             [self.variable_for_vacancy[project_type]],
             resources=compute_resources)
Example #11
class RunSimulation(object):
    def prepare_for_simulation(self, config, cache_directory=None):
        self.config = Resources(config)
        base_cache_dir = self.config['creating_baseyear_cache_configuration'].cache_directory_root
        
        self.simulation_state = SimulationState(new_instance=True, base_cache_dir=base_cache_dir,
                                                start_time=self.config.get('base_year', 0))

        ### TODO: Get rid of this! There is no good reason to be changing the 
        ###       Configuration.
        if self.config['cache_directory'] is None:
            self.config['cache_directory'] = self.simulation_state.get_cache_directory()

        SessionConfiguration(new_instance=True,
                             package_order=self.config['dataset_pool_configuration'].package_order,
                             in_storage=AttributeCache())
        
        if config['creating_baseyear_cache_configuration'].cache_from_database:
            ForkProcess().fork_new_process(self.config['creating_baseyear_cache_configuration'].cache_scenario_database, self.config)
        else:
            CacheFltData().run(self.config)

    def run_simulation(self, simulation_instance=None):
        if simulation_instance is None:
            simulation_instance = ModelSystem()
        simulation_instance.run(self.config)
        #simulation_instance.run_multiprocess(self.config, is_run_subset=True)
        logger.log_status("Data cache in %s" % self.simulation_state.get_cache_directory())
        
    def cleanup(self, remove_cache=True):
        """Remove all outputs of this simulation."""    
        self.simulation_state.remove_singleton(delete_cache=remove_cache)
        SessionConfiguration().remove_singleton()
        if remove_cache:
            cache_dir = self.config['cache_directory']
            if os.path.exists(cache_dir):
                rmtree(cache_dir)

    def prepare_and_run(self, run_configuration, simulation_instance=None, remove_cache=True):
        self.prepare_for_simulation(run_configuration)
        self.run_simulation(simulation_instance)
        self.cleanup(remove_cache)
    def run(self, 
            config = None, ### TODO: Get rid of this parameter!
            unroll_gridcells = None, ### TODO: Get rid of this parameter!
            cache_directory = None, 
            base_year = None,
            creating_baseyear_cache_configuration = None,
            debuglevel = None,
            ):
        """
        Copy large baseyear datasets from MySQL into cache.
        """
        
        config = Resources(config)
        
        if unroll_gridcells is None:
            unroll_gridcells = config['creating_baseyear_cache_configuration'].unroll_gridcells
            
        if cache_directory is None:
            cache_directory = config['cache_directory']
            
        if base_year is None:
            base_year = config['base_year']
            
        if creating_baseyear_cache_configuration is None:
            creating_baseyear_cache_configuration = copy.deepcopy(config['creating_baseyear_cache_configuration'])
        
        if debuglevel is None:
            debuglevel = config.get('debuglevel', 3)

        CoreCacheScenarioDatabase().run(config)
        
        self.prepare_data_before_baseyear(
            cache_directory,
            base_year,
            creating_baseyear_cache_configuration
            )
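The argument handling above follows a simple pattern: an explicitly passed value wins, otherwise the value is read from the configuration. A minimal generic sketch follows, with the helper name resolve_option being an assumption.

def resolve_option(value, config, key, default=None):
    # Prefer the explicitly passed value; otherwise fall back to the
    # configuration entry, and finally to a default.
    if value is not None:
        return value
    return config.get(key, default)

config = {"cache_directory": "/tmp/opus_cache", "base_year": 2000}
print(resolve_option(None, config, "cache_directory"))  # -> /tmp/opus_cache
print(resolve_option(2005, config, "base_year"))         # -> 2005
print(resolve_option(None, config, "debuglevel", 3))     # -> 3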
    def run(
        self,
        config=None,  ### TODO: Get rid of this parameter!
        unroll_gridcells=None,  ### TODO: Get rid of this parameter!
        cache_directory=None,
        base_year=None,
        creating_baseyear_cache_configuration=None,
        debuglevel=None,
    ):
        """
        Copy large baseyear datasets from MySQL into cache.
        """

        config = Resources(config)

        if unroll_gridcells is None:
            unroll_gridcells = config[
                'creating_baseyear_cache_configuration'].unroll_gridcells

        if cache_directory is None:
            cache_directory = config['cache_directory']

        if base_year is None:
            base_year = config['base_year']

        if creating_baseyear_cache_configuration is None:
            creating_baseyear_cache_configuration = copy.deepcopy(
                config['creating_baseyear_cache_configuration'])

        if debuglevel is None:
            debuglevel = config.get('debuglevel', 3)

        CoreCacheScenarioDatabase().run(config)

        self.prepare_data_before_baseyear(
            cache_directory, base_year, creating_baseyear_cache_configuration)
class Estimator(GenericModelExplorer):
    def __init__(self, config=None, save_estimation_results=False):
        if 'cache_directory' not in config or config['cache_directory'] is None:
            raise KeyError("The cache directory must be specified in the "
                "given configuration, giving the filesystem path to the cache "
                "directory containing the data with which to estimate. Please "
                "check that your configuration contains the 'cache_directory' "
                "entry and that it is not None.")

        self.simulation_state = SimulationState(new_instance=True)
        self.simulation_state.set_cache_directory(config['cache_directory'])

        SessionConfiguration(new_instance=True,
                             package_order=config['dataset_pool_configuration'].package_order,
                             in_storage=AttributeCache())
        self.config = Resources(config)
        self.save_estimation_results = save_estimation_results
        self.debuglevel = self.config.get("debuglevel", 4)
        self.model_system = ModelSystem()
        self.agents_index_for_prediction = None
        
        models = self.config.get('models',[])

        self.model_name = None
        if "model_name" in config.keys():
            self.model_name = config["model_name"]
        else:
            for model in models:
                if isinstance(model, dict):
                    model_name = model.keys()[0]
                    if (model[model_name] == "estimate") or (isinstance(model[model_name], list)
                        and ("estimate" in model[model_name])):
                            self.model_name = model_name
                            break
        estimate_config_changes = self.config.get('config_changes_for_estimation', {}).get('estimate_config', {})
        if len(estimate_config_changes) > 0:
            change = Resources({'models_configuration': {self.model_name: {'controller': {'init': {'arguments': {}}}}}})
            estimate_config_str = self.config['models_configuration'].get(self.model_name, {}).get('controller', {}).get('init', {}).get('arguments', {}).get('estimate_config', '{}')
            estimate_config = Resources({})
            try:
                estimate_config = eval(estimate_config_str)
            except:
                pass
 
            estimate_config.merge(estimate_config_changes)
            self.config.merge(change)
            self.config['models_configuration'][self.model_name]['controller']['init']['arguments']['estimate_config'] = 'Resources(%s)' % estimate_config

            
       
    def estimate(self, out_storage=None):
        self.model_system.run(self.config, write_datasets_to_cache_at_end_of_year=False)
        self.extract_coefficients_and_specification()

        if self.save_estimation_results:
            self.save_results(out_storage=out_storage)

    def reestimate(self, specification_module_name=None, specification_dict=None, out_storage=None, type=None, submodels=None):
        """specification_module_name is name of a module that contains a dictionary called
        'specification'. If it is not given, the argument specification_dict must be given which is a dictionary object.
        'type' is the name of model member, such as 'commercial', 'residential'. The specification dictionary
        is expected to have an entry of this name. If 'submodels' is given (list or a number),
        the restimation is done only for those submodels.
        """
        if specification_module_name is not None:
            exec("import " + specification_module_name)
            eval("reload (" + specification_module_name + ")")
            exec("specification_dict =" + specification_module_name + ".specification")
            
        if type is not None:
            specification_dict = specification_dict[type]
        if submodels is not None: #remove all submodels but the given ones from specification
            submodels_to_be_deleted = specification_dict.keys()
            if not isinstance(submodels, list):
                submodels = [submodels]
            for sm in submodels:
                if sm not in submodels_to_be_deleted:
                    raise ValueError, "Submodel %s not in the specification." % sm
                submodels_to_be_deleted.remove(sm)
                if "_definition_" in submodels_to_be_deleted:
                    submodels_to_be_deleted.remove("_definition_")
            for sm in submodels_to_be_deleted:
                del specification_dict[sm]
        self.specification = EquationSpecification(specification_dict=specification_dict)
        new_namespace = self.model_system.run_year_namespace
        keys_coeff_spec = self.get_keys_for_coefficients_and_specification()
        new_namespace[keys_coeff_spec["specification"]] = self.specification
        self.coefficients, coeff_dict_dummy = self.model_system.do_process(new_namespace)
        ## update run_year_namespace since it has not been updated by do_process
        self.model_system.run_year_namespace = new_namespace
        self.model_system.run_year_namespace[keys_coeff_spec["coefficients"]] = self.coefficients
        
        ## this gets coeff and spec from run_year_namespace and is only updated in the _run_year method
        #self.extract_coefficients_and_specification()  
        if self.save_estimation_results:
            self.save_results(out_storage=out_storage)

    def predict(self, predicted_choice_id_name, agents_index=None):
        """ Run prediction. Currently makes sense only for choice models."""
        # Create temporary configuration where all words 'estimate' are replaced by 'run'
        tmp_config = Resources(self.config)
        
        if self.agents_index_for_prediction is None:
            self.agents_index_for_prediction = self.get_agent_set_index().copy()
            
        if agents_index is None:
            agents_index = self.agents_index_for_prediction
        
        tmp_config['models_configuration'][self.model_name]['controller']['run']['arguments']['coefficients'] = "coeff_est"
        tmp_config['models_configuration'][self.model_name]['controller']['run']['arguments']['agents_index'] = "agents_index"
        tmp_config['models_configuration'][self.model_name]['controller']['run']['arguments']['chunk_specification'] = "{'nchunks':1}"

        ### save specification and coefficients to the cache (regardless of the save_estimation_results flag)
        ### so that the prepare_for_run method can load specification and coefficients from there
        #output_configuration = self.config['output_configuration']
        #del self.config['output_configuration']
        #self.save_results()
        
        #self.config['output_configuration'] = output_configuration
        
        #self.model_system.run_year_namespace["coefficients"] = self.coefficients
        #del tmp_config['models_configuration'][self.model_name]['controller']['prepare_for_run']
        
        try:
            run_year_namespace = copy.copy(self.model_system.run_year_namespace)
        except:
            logger.log_error("The estimate() method must be run first")
            return False
        
        try:
            agents = self.get_agent_set()
            choice_id_name = self.get_choice_set().get_id_name()[0]
            # save current locations of agents
            current_choices = agents.get_attribute(choice_id_name).copy()
            dummy_data = zeros(current_choices.size, dtype=current_choices.dtype)-1
            agents.modify_attribute(name=choice_id_name, data=dummy_data) #reset all choices
            
            run_year_namespace["process"] = "run"
            run_year_namespace["coeff_est"] = self.coefficients
            run_year_namespace["agents_index"] = agents_index
            run_year_namespace["processmodel_config"] = tmp_config['models_configuration'][self.model_name]['controller']['run']
            new_choices = self.model_system.do_process(run_year_namespace)
            
            #self.model_system.run(tmp_config, write_datasets_to_cache_at_end_of_year=False)
            #new_choices = agents.get_attribute(choice_id_name).copy()
            agents.modify_attribute(name=choice_id_name, data=current_choices)
            dummy_data[agents_index] = new_choices
            if predicted_choice_id_name not in agents.get_known_attribute_names():
                agents.add_primary_attribute(name=predicted_choice_id_name, data=dummy_data)
            else:
                agents.modify_attribute(name=predicted_choice_id_name, data=dummy_data)
            logger.log_status("Predictions saved into attribute " + predicted_choice_id_name)
            return True
        except Exception, e:
            logger.log_error("Error encountered in prediction: %s" % e)
            logger.log_stack_trace()
        
        return False
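A hedged usage sketch for the Estimator above; the empty run_configuration is a placeholder (a real one must provide at least 'cache_directory', 'dataset_pool_configuration' and a 'models' list with one model set to 'estimate'), and the attribute name 'predicted_building_id' is purely illustrative.

# Hypothetical driver script; the configuration would come from the project's configs.
run_configuration = {}  # placeholder; see the KeyError raised in __init__ for required entries

estimator = Estimator(run_configuration, save_estimation_results=False)
estimator.estimate()
# After estimation, predicted choices can be written into a new agent attribute:
estimator.predict(predicted_choice_id_name="predicted_building_id")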
Example #15
class InteractionDataset(Dataset):
    """Class serves as a holder of interaction variables."""

    def __init__(self, resources=None, dataset1=None, dataset2=None, index1=None, index2=None, dataset_name=None,
                  debug=None):
        """ Argument 'resources' is of type Resources. It is merged with arguments. It should contain:
                dataset1 - agent class
                dataset2 - class of the choice dataset
            Optional:
                index1 - 1D array, indices of dataset1
                index2 - If 2D array: row i contains indices of individuals of dataset2 that belong to
                        i-th individual of dataset1[index1].
                        If 1D array: indices of individuals of dataset2 for all individuals of dataset1[index1].
                dataset_name - subdirectory in which implementation of the interaction variables is placed (default "")
            dataset1.resources and dataset2.resources should contain key 'dataset_name' (see Dataset.get_dataset_name()).
        """
        self.resources = Resources(resources)
        self.resources.merge_if_not_None({
                "dataset1":dataset1, "dataset2":dataset2,
                "index1":index1, "index2":index2,
                "dataset_name":dataset_name, "debug":debug})
        self.attribute_boxes = {}
        self.attribute_names = []
        self.debug = self.resources.get("debug",  0)
        if not isinstance(self.debug, DebugPrinter):
            self.debug = DebugPrinter(self.debug)
        self.resources.check_obligatory_keys(["dataset1", "dataset2"])
        self.dataset1 = self.resources["dataset1"]
        self.dataset2 = self.resources["dataset2"]
        self.index1 = self.resources.get("index1", None)
        self.index2 = self.resources.get("index2", None)
        self.dataset_name = self.resources.get("dataset_name", None)
        if self.dataset_name == None:
            self.dataset_name = self.dataset1.get_dataset_name() + '_x_' + self.dataset2.get_dataset_name()
        self._primary_attribute_names=[]
        self.index1_mapping = {}
        if self.index1 <> None:
            self.index1_mapping = do_id_mapping_dict_from_array(self.index1)
        self._id_names = None # for compatibility with Dataset
        self.variable_factory = VariableFactory()
        self._aliases = {} # for compatibility with Dataset

    def _ensure_id_attribute_is_loaded(self):
        pass
    
    def get_attribute(self, name):
        """ Return an array of the (by the argument name) given attribute. """
        if not isinstance(name, VariableName):
            attr_name = VariableName(name)
        else:
            attr_name = name
        alias = attr_name.get_alias()
        dataset_name = attr_name.get_dataset_name()
        if not (alias in self.get_attribute_names()):
            if dataset_name == self.get_dataset(1).dataset_name:
                index = self.get_2d_index_of_dataset1()
                return self.get_dataset(1).get_attribute_by_index(attr_name, index)
            if dataset_name == self.get_dataset(2).dataset_name:
                index = self.get_2d_index()
                return self.get_dataset(2).get_attribute_by_index(attr_name, index)
            
            if alias in self.get_dataset(1).get_known_attribute_names():
                index = self.get_2d_index_of_dataset1()
                return self.get_dataset(1).get_attribute_by_index(attr_name, index)
            if alias in self.get_dataset(2).get_known_attribute_names():
                index = self.get_2d_index()
                return self.get_dataset(2).get_attribute_by_index(attr_name, index)
            self._raise_error(NameError, "Variable %s not found!" % alias)
        return self.attribute_boxes[alias].get_data()

    def get_attribute_of_dataset(self, name, dataset_number=1):
        """ Return values of attribute given by 'name' belonging to the given dataset, 
        possibly filtred by the corresponding indes. It is a 1d array of size 
        reduced_n or reduced_m.
        """
        index = self.get_index(dataset_number)
        if index <> None:
            return self.get_dataset(dataset_number).get_attribute_by_index(name, index)
        return self.get_dataset(dataset_number).get_attribute(name)
        
    def get_id_attribute_of_dataset(self, dataset_number=1):
        """Like 'get_attribute_of_dataset' where name is the id_name of the given dataset.
        """
        index = self.get_index(dataset_number)
        if index <> None:
            return self.get_dataset(dataset_number).get_id_attribute()[index]
        return self.get_dataset(dataset_number).get_id_attribute()

    def add_primary_attribute(self, data, name):
        """ Add values given in argument 'data' to the dataset as an attribute 'name'. 
        'data' should be an array of the same size as the dataset.
        If this attribute already exists, its values are overwritten.
        The attribute is marked as a primary attribute.
        """
        if not isinstance(data, ndarray):
            data=array(data)
        if data.shape[0] <> self.size()[0][0] or data.shape[1] <> self.size()[0][1]:
            logger.log_warning("In add_primary_attribute: Mismatch in sizes of the argument 'data' and the InteractionDataset object.")
        self.add_attribute(data, name, metadata=AttributeType.PRIMARY)
        
    def _compute_if_needed(self, name, dataset_pool, resources=None, quiet=False, version=None):
        """ Compute variable given by the argument 'name' only if this variable
        has not been computed before.
        Check first if this variable belongs to dataset1 or dataset2.
        dataset_pool holds available datasets.
        """
        if not isinstance(name, VariableName):
            variable_name = VariableName(name)
        else:
            variable_name = name
        short_name = variable_name.get_alias()
        if (short_name in self.get_attribute_names()) and (self.are_dependent_variables_up_to_date(
                            variable_name, version=version)):
            return version #nothing to be done
        dataset_name = variable_name.get_dataset_name()
        if dataset_name == self.get_dataset_name():
            new_version = self._compute_one_variable(variable_name, dataset_pool, resources)
        else:
            owner_dataset, index = self.get_owner_dataset_and_index(dataset_name)
            if owner_dataset is None:
                self._raise_error(StandardError, "Cannot find variable '%s'\nin either dataset or in the interaction set." %
                                variable_name.get_expression())
            owner_dataset.compute_variables([variable_name], dataset_pool, resources=resources, quiet=True)
            new_version = self.add_attribute(data = owner_dataset.get_attribute_by_index(variable_name, index),
                name = variable_name, metadata = AttributeType.COMPUTED)
            attribute_box = owner_dataset._get_attribute_box(variable_name)
            variable = attribute_box.get_variable_instance()
            my_attribute_box = self._get_attribute_box(variable_name)
            my_attribute_box.set_variable_instance(variable)
        return new_version

    def get_owner_dataset_and_index(self, dataset_name):
        if dataset_name == self.dataset1.get_dataset_name():
            return (self.dataset1, self.get_2d_index_of_dataset1())
        elif dataset_name == self.dataset2.get_dataset_name():
            return (self.dataset2, self.get_2d_index())
        return (None, None)

    def are_dependent_variables_up_to_date(self, variable_name, version):
        """ Return True if the version of this variable correspond to versions of all
        dependent variables, otherwise False. That is, if any of the dependent variable
        must be recomputed, the method returns False.
        """
        short_name = variable_name.get_alias()
        if short_name in self.get_primary_attribute_names():
            return self.is_version(short_name, version)

        dataset_name = variable_name.get_dataset_name()
        owner_name = variable_name.get_dataset_name()
        if owner_name == self.dataset1.get_dataset_name():
            owner_dataset = self.dataset1
        elif owner_name == self.dataset2.get_dataset_name():
            owner_dataset = self.dataset2
        else:
            owner_dataset = self

        if not(dataset_name == owner_dataset.get_dataset_name()):
                self._raise_mismatch_dataset_name_error(variable_name)
        if owner_dataset is self:
            attribute_box = owner_dataset._get_attribute_box(variable_name)
            if attribute_box is None:
                return False
            variable = attribute_box.get_variable_instance()
            res = variable.are_dependent_variables_up_to_date(version)
            return not(False in res)
        return owner_dataset.are_dependent_variables_up_to_date(variable_name, version)

    def _prepare_dataset_pool_for_variable(self, dataset_pool=None, resources=None):
        dataset_pool, compute_resources = Dataset._prepare_dataset_pool_for_variable(self, dataset_pool, resources)
        dataset1_name = "dataset1"
        dataset2_name = "dataset2"
        dataset1 = self.get_dataset(1)
        dataset2 = self.get_dataset(2)
        if dataset1 <> None:
            dataset1_name=dataset1.get_dataset_name()
        if dataset2 <> None:
            dataset2_name=dataset2.get_dataset_name()
        dataset_pool.add_datasets_if_not_included({dataset1_name: dataset1, dataset2_name: dataset2})
        return dataset_pool, compute_resources

    def get_n(self):
        """Return size of dataset 1.
        """
        return self.dataset1.size()

    def get_m(self):
        """Return size of dataset 2.
        """
        return self.dataset2.size()

    def get_reduced_n(self):
        if self.index1 == None:
            return self.get_n()
        if isinstance(self.index1, ndarray):
            return self.index1.shape[0]
        return self.get_n()

    def get_reduced_m(self):
        if self.index2 == None:
            return self.get_m()
        if isinstance(self.index2, ndarray):
            if self.index2.ndim == 1:
                return self.index2.shape[0]
            else:
                return self.index2.shape[1]
        return self.get_m()

    def size(self):
        return [(self.get_reduced_n(), self.get_reduced_m()), (self.get_n(), self.get_m())]

    def get_dataset(self, nr):
        if (nr == 1):
            return self.dataset1
        if (nr == 2):
            return self.dataset2
        return None

    def get_dataset_named(self, name):
        if name==self.dataset1.get_dataset_name():
            return self.dataset1
        if name==self.dataset2.get_dataset_name():
            return self.dataset2
        raise ValueError, 'trying to get an interaction set component named %s but it does not exist' % name

    def get_index(self, nr):
        if (nr == 1):
            return self.index1
        if (nr == 2):
            return self.index2
        return None

    def attribute_sum(self, name):
        """Return the sum of values of the given attribute.
        """
        return (ma.ravel(self.get_attribute(name))).sum()

    def attribute_average(self, name):
        """Return the value of the given attribute averaged over the dataset.
        """
        return ma.average(ma.ravel(self.get_attribute(name)))

    def summary(self, names, resources=None):
        """Print a marginal summary of the attributes given in the list 'names'.
        """
        print "Summary\t\tsum\t\taverage"
        print "------------------------------------------------"
        if not isinstance(names,list):
            names = [names]
        for item in names:
            if not (item.get_alias() in self.get_attribute_names()):
                self.compute_variables([item], resources=resources)

            print item + "\t" + str(self.attribute_sum(item.alias))\
                     + "\t" + str(round(self.attribute_average(item.get_alias(),5)))

    def get_2d_dataset_attribute(self, name):
        """ Return a 2D array of the attribute given by 'name'. It is assumed
        to be an attribute of dataset2.
        The method serves the purpose of preparing 1D arrays for computing
        interaction operations (between dataset1 and dataset2) by transferring them to the corresponding 2D array.
        The resulting array is of size n x m, where m is either the attribute length of dataset2,
        or, if index2 is a 1D array, its length, or, if index2 is a 2D array,
        the number of columns. n is size of dataset1 or of index1 if given.
        If index2 is None, all values of the given attribute are repeated n times.
        """
        dataset = self.get_dataset(2)
        index = self.get_2d_index()
        return dataset.get_attribute_by_index(name, index)

    def get_2d_index(self):
        n = self.get_reduced_n()
        m = self.get_reduced_m()
        if self.index2 == None:
            index = indices((n,m))[1]
        elif isinstance(self.index2, ndarray):
            if self.index2.ndim == 1: # one-dim array
                index = repeat(reshape(self.index2,(1,self.index2.shape[0])), n, 0)
            else:
                index = self.index2
        else:
            self._raise_error(StandardError, "'index2' has incompatible type. It should be a numpy array or None.")
        if (index.shape[0] <> n) or (index.shape[1] <> m):
            self._raise_error(StandardError, "'index2' has wrong dimensions.")
        return index

    def get_2d_index_of_dataset1(self):
        n = self.get_reduced_n()
        m = self.get_reduced_m()
        index = self.get_index(1)
        if index == None:
            index = arange(n)
        return repeat(reshape(index, (index.size,1)), m, 1)

    def create_logit_data(self, coefficients, index=None):
        """It creates a data array corresponding to specified coefficients
        (=coefficients connected to a specification) as one variable per column.
        'coefficients' is of type "SpecifiedCoefficientsFor1Submodel".
        If 'index' is not None, it is considered as index (1D array) of dataset1 determining
        which individuals should be considered.
        Return a 3D array (nobservations|len(index) x nequations x nvariables).
        """
        shape = coefficients.getshape()
        neqs, nvar = shape[0:2]
        other_dims = ()
        if len(shape) > 2:
            other_dims = shape[2:]
        nparenteqs = coefficients.parent.nequations()
        if (neqs <> self.get_reduced_m()) and (nparenteqs <> self.get_reduced_m()):
            self._raise_error(StandardError, "create_logit_data: Mismatch in number of equations and size of dataset2.")

        if index <> None:
            nobs = index.size
        else:
            nobs = self.get_reduced_n()
            index = arange(nobs)

        variables = coefficients.get_full_variable_names()
        mapping = coefficients.get_coefficient_mapping()
        # Fill the x array from data array
        data_shape = tuple([nobs,neqs,nvar] + list(other_dims))
        try:
            x = zeros(data_shape, dtype=float32)
        except:    # in case it fails due to memory allocation error
            logger.log_warning("Not enough memory. Deleting not used attributes.",
                                tags=["memory", "logit"])
            var_names = map(lambda x: x.get_alias(), variables)
            self.dataset1.unload_not_used_attributes(var_names)
            self.dataset2.unload_not_used_attributes(var_names)
            collect()
            x = zeros(data_shape, dtype=float32)
        if (len(variables) <= 0) or (nobs <= 0):
            return x
        for ivar in range(nvar): # Iterate over variables
            if variables[ivar].is_constant_or_reserved_name():
                c = where(mapping[:,ivar] < 0, 0.0, 1)
                x[:,:,ivar] = c
            else:
                data = ma.filled(self.get_attribute(variables[ivar]),0.0)[index,]
                if neqs < nparenteqs:
                    data = take(data, coefficients.get_equations_index(), axis=1)
                if x.ndim > 3:
                    data = resize(data, tuple(list(x.shape[0:2]) + list(other_dims)))
                x[:,:,ivar] = data
        return x

    def create_logit_data_from_beta_alt(self, coefficients, index=None):
        """It creates a data array corresponding to specified coefficients
        (=coefficients connected to a specification) as one coefficient per column. (Thus there can be multiple columns
        of one variable.)
        'coefficients' is of type "SpecifiedCoefficientsFor1Submodel".
        If 'index' is not None, it is considered as index (1D array) of dataset1 determining
        which individuals should be considered.
        It puts zeros on spots where the corresponding coefficient is zero. It is meant to be used for preparing data
        for estimation.
        Return a 3D array (nobservations|len(index) x nequations x ncoefficients).
        """
        shape = coefficients.getshape()
        neqs, nvar = shape[0:2]
        other_dims = ()
        if len(shape) > 2:
            other_dims = shape[2:]
        nparenteqs = coefficients.parent.nequations()
        if (neqs <> self.get_reduced_m()) and (nparenteqs <> self.get_reduced_m()):
            self._raise_error(StandardError, "create_logit_data: Mismatch in number of equations and size of dataset2.")

        mapping = coefficients.get_coefmap_alt()
        ncoef = mapping.size
        if index <> None:
            nobs = index.size
        else:
            nobs = self.get_reduced_n()
            index = arange(nobs)

        variables = coefficients.get_variable_names_from_alt()

        # Fill the x array from data array
        data_shape = tuple([nobs,neqs,ncoef] + list(other_dims))
        try:
            x = zeros(data_shape, dtype=float32)
        except:    # in case it fails due to memory allocation error
            logger.log_warning("Not enough memory. Deleting not used attributes.",
                                tags=["memory", "logit"])
            self.dataset1.unload_not_used_attributes(unique(variables))
            self.dataset2.unload_not_used_attributes(unique(variables))
            collect()
            x = zeros(data_shape, dtype=float32)

        if (len(variables) <= 0) or (nobs <= 0):
            return x

        coefvalues = coefficients.get_beta_alt()
        for ivar in range(len(variables)): # Iterate over variables
            if coefficients.is_variable_constant_or_reserved_name(variables[ivar]):
                c = where(coefvalues[:,ivar] == 0, 0.0, 1)
                x[:,:,ivar] = c
            else:
                data = ma.filled(self.get_attribute(variables[ivar]),0.0)[index,]
                if neqs < nparenteqs:
                    data = take(data, coefficients.get_equations_index(), axis=1)
                if x.ndim > 3:
                    data = reshape(data, tuple(list(x.shape[0:2]) + len(other_dims)*[1]))
                    for iodim in range(len(other_dims)):
                        data = repeat(data, other_dims[iodim], axis=2+iodim)
                x[:,:,ivar] = data
                w = where(coefvalues[:,ivar] == 0)
                if x.ndim > 3:
                    x[:,w[0], ivar, w[1:]] = 0.0
                else:
                    x[:,w,ivar] = 0.0
        return x

    def modify_logit_data_for_estimation(self, data, choice, constants_positions=array([], dtype='int32')):
        """Modify the variable columns for alternative specific constants. It is set to one
        for choices where the actual choice have been made, otherwise zeros.
        'data' is a 3D array (output of create_logit_data).
        'choice' is a 1D array containing indices of the actual choices (within the sampled choice set)
            for each agent that was included in the data array.
        'constants_positions' is an array with indices of the alternative specific constants
            within the data array.
        """
        nobs, neqs, nvar = data.shape
        if where(choice<0)[0].size > 0:
            self._raise_error(StandardError, "There are no choices for some agents. Check argument 'choice'.")
        if constants_positions.size > 0:
            for const in constants_positions:
                data[:,:,const] = 0
                data[arange(nobs), choice, const] = 1
        return data

    def get_attribute_by_choice(self, name, choices, resources=None):
        """  'name' is an attribute of dataset2, 'choices' is 1D array - choices[i] represents a choice
        (index of attribute 'name' among the values index2[i,]) for individual i of dataset1[index1].
        If name == None, indices belonging to dataset2 are returned.
        The method returns 1D array - the actual values of the choices.
        """
        if choices.size <> self.get_n():
            self._raise_error(StandardError, "get_attribute_by_choice: Argument 'choices' must be the same size as dataset1")
        resources.merge_with_defaults(self.resources)
        if name == None:
            twoDattr = self.get_2d_index()
        else:
            twoDattr = self.get_2d_dataset_attribute(name)
        return take_choices(twoDattr, choices)

    def is_same_as(self, name1, name2):
        """Test equality of 2 variables. 'name1' is an attribute of dataset1, 'name2' is an attribute of 'dataset2'.
        Return a 2D array.
        """
        self.load_datasets()
        attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1))
        return attr1 == self.get_2d_dataset_attribute(name2)

    def is_less_or_equal(self, name1, name2):
        """Test if attribute 'name1' (attr. of dataset1) is <= than attr. 'name2' (attr. 'dataset2').
        Return a 2D array.
        """
        self.load_datasets()
        attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1))
        return attr1 <= self.get_2d_dataset_attribute(name2)

    def is_greater_or_equal(self, name1, name2):
        """est if attribute 'name1' (attr. of dataset1) is >= than attr. 'name2' (attr. 'dataset2').
        Return a 2D array.
        """
        self.load_datasets()
        attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1))
        return attr1 >= self.get_2d_dataset_attribute(name2)

    def multiply(self, name1, name2):
        """Multiply 2 variables. 'name1' is an attribute of dataset1, 'name2' is an attribute of 'dataset2'.
        Return a 2D array.
        """
        self.load_datasets()
        attr1 = reshape(self.get_attribute_of_dataset(name1),(self.get_reduced_n(), 1))
        return attr1 * self.get_2d_dataset_attribute(name2)

    def divide(self, name1, name2):
        """ Divide variable 'name1' (attribute of dataset1) by variable 'name2' (attribute of 'dataset2').
        Return a masked 2D array.
        """
        self.load_datasets()
        attr2 = reshape(self.get_attribute_of_dataset(name2),(self.get_reduced_n(), 1))
        return self.get_2d_dataset_attribute(name1) / ma.masked_where(attr2 == 0.0, attr2.astype(float32))

    def match_agent_attribute_to_choice(self, name, dataset_pool=None):
        """ Return a tuple where the first element is a 2D array of the attribute 'name_{postfix}'. 
        It is assumed to be an attribute
        of dataset1 (possibly computed). {postfix} is created either by values of the attribute
        'name' of dataset2 (if it has any such attribute), or by the id values of dataset2.
        The second value of the resulting tuple is a list of dependent variables.
        """
        if 'name' in self.get_dataset(2).get_known_attribute_names():
            name_postfix = self.get_attribute_of_dataset('name', 2)
        else:
            name_postfix = self.get_id_attribute_of_dataset(2)
        name_postfix_alt = self.get_id_attribute_of_dataset(2)
        
        dependencies = []
        for i in range(self.get_reduced_m()):
            full_name = VariableName("%s_%s" % (name, name_postfix[i]))
            if full_name.get_dataset_name() is None:
                full_name = VariableName("%s.%s" % (self.get_dataset(1).get_dataset_name(), full_name.get_expression()))
            try:
                self.get_dataset(1).compute_variables(full_name, dataset_pool=dataset_pool)
            except:
                full_name = VariableName("%s_%s" % (name, name_postfix_alt[i]))
                if full_name.get_dataset_name() is None:
                    full_name = VariableName("%s.%s" % (self.get_dataset(1).get_dataset_name(), full_name.get_expression()))
                self.get_dataset(1).compute_variables(full_name, dataset_pool=dataset_pool)
            
            dependencies.append(full_name.get_expression())
            if i == 0:
                result = self.get_attribute(full_name)
            else:
                result[:,i] = self.get_attribute_of_dataset(full_name, 1)
        return result, dependencies
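
    # A commented sketch (hypothetical attribute names, not taken from the original code) of
    # the naming scheme used by match_agent_attribute_to_choice: for name='cost' and dataset2
    # 'name' values ['sf', 'mf'], the dataset1 variables 'cost_sf' and 'cost_mf' are computed
    # and stacked column-wise, so column i corresponds to the i-th member of dataset2.
    #
    #   name = 'cost'
    #   name_postfix = ['sf', 'mf']
    #   full_names = ["%s_%s" % (name, postfix) for postfix in name_postfix]
    #   # full_names == ['cost_sf', 'cost_mf']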
            
    def load_datasets(self):
        if self.dataset1.size() <= 0:
            self.dataset1.get_id_attribute()
        if self.dataset2.size() <= 0:
            self.dataset2.get_id_attribute()

    def get_index1_idx(self, ids):
        ids = asarray(ids)
        try:
            return array(map(lambda x: self.index1_mapping[x], ids))
        except:
            return None

    def get_dependent_datasets(self, variables):
        """Return a list of dataset names that the given variables depend on."""
        result = []
        for variable in variables:
            try:
                result = result + self.get_dataset(1).get_dependent_datasets(variables=[variable], quiet=True)
            except:
                try:
                    result = result + self.get_dataset(2).get_dependent_datasets(variables=[variable], quiet=True)
                except:
                    result = result + get_dependency_datasets(variables=[variable])
        result = get_distinct_list(result)
        for i in [1,2]: # remove dependencies on datasets of this interaction, since it is implicitly given
            dataset_name = self.get_dataset(i).get_dataset_name()
            if dataset_name in result:
                result.remove(dataset_name)
        return result

    def _raise_error(self, error, msg):
        raise error("In interaction set '%s': %s" % (self.name(), msg))

    def name(self):
        return "%s -> %s" % (self.dataset1.get_dataset_name(),
                                            self.dataset2.get_dataset_name())

    def get_mask(self, index):
        """index is an array of size reduced_n. The method returns array of 1's and 0's
        (of size reduced_n x reduced_m) where 0's are on rows determined by index.
        """
        mask = ones((self.get_reduced_n(), self.get_reduced_m()), dtype="int32")
        for i in index:
            mask[i,:] = 0
        return mask

    def interact_attribute_with_condition(self, attribute, condition, filled_value=0.0, do_logical_not=False):
        """Creates a 2D array (reduced_n x reduced_m) with values of 'attribute' on spots where values of the 'condition'
        attribute are > 0. All other spots have 'filled_value'. 'attribute' is an attribute name of
        the second dataset; 'condition' is an attribute name of the first dataset.
        If 'do_logical_not' is True, the condition is negated.
        """
        cond_values = self.get_attribute_of_dataset(condition)
        if do_logical_not:
            cond_values = logical_not(cond_values)
        index = where(cond_values > 0)[0]
        mask = self.get_mask(index)
        return ma.filled(ma.masked_array(self.get_2d_dataset_attribute(attribute), mask=mask), filled_value)
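
    # A commented sketch (toy numpy arrays, not taken from the original code) of the masking
    # used by get_mask() and interact_attribute_with_condition(): rows whose condition is > 0
    # keep the attribute values, all other rows are replaced by 'filled_value'.
    #
    #   from numpy import array, ones, where
    #   import numpy.ma as ma
    #   cond_values = array([1, 0, 3])                # one value per member of dataset1
    #   attr_2d = array([[7.0, 8.0], [9.0, 10.0], [11.0, 12.0]])
    #   mask = ones(attr_2d.shape, dtype="int32")
    #   mask[where(cond_values > 0)[0], :] = 0        # 0 = keep the row, 1 = mask it out
    #   ma.filled(ma.masked_array(attr_2d, mask=mask), 0.0)
    #   # -> [[7., 8.], [0., 0.], [11., 12.]]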

    def create_and_check_qualified_variable_name(self, name):
        """Convert name to a VariableName if it isn't already, and add dataset_name to
        the VariableName if it is missing.  If it already has a dataset_name, make sure
        it is the same as the name of this dataset.
        """
        if isinstance(name, VariableName):
            vname = name
        else:
            vname = VariableName(name)
        if vname.get_dataset_name() is None:
            vname.set_dataset_name(self.get_dataset_name())
        else:
            self._check_dataset_name(vname)
            
        return vname
    
    def get_flatten_dataset(self):
        """Creates a new dataset that is a 1D version of this dataset. All attributes are flattened.
        Id name is a combination of the two id attributes.
        """
        storage = StorageFactory().get_storage('dict_storage')
            
        table_name = '%s_flatten' % self.get_dataset_name()
        data = {}
        for attr in self.get_known_attribute_names():
            data[attr] = self.get_attribute(attr).ravel()
            
        ids = []
        for i in [1,2]:
            id_name = self.get_dataset(i).get_id_name()[0]
            ids.append(id_name)
            if id_name not in data.keys():
                data[id_name] = self.get_attribute(id_name).ravel()
            
        storage.write_table(
                    table_name=table_name,
                    table_data=data
                )
        dataset = Dataset(in_storage=storage, id_name=ids,
                          dataset_name=table_name, in_table_name=table_name)
        return dataset
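
    # A commented sketch (toy numpy array, not taken from the original code) of the flattening
    # done by get_flatten_dataset(): each reduced_n x reduced_m attribute is unrolled row by
    # row with ravel(), giving one element per (dataset1, dataset2) pair.
    #
    #   from numpy import array
    #   attr_2d = array([[1, 2, 3], [4, 5, 6]])       # 2 agents x 3 alternatives
    #   attr_2d.ravel()                               # -> [1, 2, 3, 4, 5, 6]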
    
    def _check_dataset_name(self, vname):
        """check that name is the name of this dataset or one of its components"""
        name = vname.get_dataset_name()
        dataset_names = set([self.get_dataset_name()] + list(self.get_dataset(i).get_dataset_name() for i in [1,2]))
        if name not in dataset_names:
            raise ValueError, "When checking dataset name of '%s': different dataset names for variable and dataset or a component: '%s' <> '%s'" % (vname.get_expression(), name, dataset_names)

    def add_mnl_bias_correction_term(self, probability, sampled_index, bias_attribute_name='__mnl_bias_correction_term'):
        """Compute and add an MNL bias correction term introduced by sampling. 
        'probability' is a probability array of the whole choice set. 
        'sampled_index' is an index of elements within the 'probability' array determining the sampled set of alternatives.
        The computed term is added to the interaction set as an additional attribute,
        using the name given in 'bias_attribute_name'.
        This method is mainly to be used by Sampler classes.
        """
        lnprob = ln(probability)
        ln1minusprob = ln(1-probability)
        bias_term = ln1minusprob.sum() - \
                    take(ln1minusprob, sampled_index).sum(axis=1).reshape((self.get_reduced_n(),1)) + \
                    take(lnprob, sampled_index).sum(axis=1).reshape((self.get_reduced_n(),1)) - \
                    take(lnprob, sampled_index)       
        self.add_attribute(bias_term, bias_attribute_name)
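
The bias correction term above is just a rearrangement of sums of log-probabilities: for a sampled
alternative j it equals the sum of ln(1 - p_k) over the alternatives that were not sampled plus the
sum of ln(p_k) over the other sampled alternatives. A minimal standalone sketch (plain numpy, with a
toy probability vector and sampled indices that are not taken from the original code) mirroring that
arithmetic for a single agent:

from numpy import array, log, take

prob = array([0.1, 0.2, 0.3, 0.4])        # probabilities over the full choice set
sampled_index = array([[1, 3]])           # one agent, two sampled alternatives

lnprob = log(prob)
ln1minusprob = log(1 - prob)
bias_term = ln1minusprob.sum() \
            - take(ln1minusprob, sampled_index).sum(axis=1).reshape((1, 1)) \
            + take(lnprob, sampled_index).sum(axis=1).reshape((1, 1)) \
            - take(lnprob, sampled_index)
# bias_term[0, 0] == ln(0.9) + ln(0.7) + ln(0.4)   (correction for alternative 1)
# bias_term[0, 1] == ln(0.9) + ln(0.7) + ln(0.2)   (correction for alternative 3)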
def create_from_parcel_and_development_template(parcel_dataset,
                                                development_template_dataset,
                                                parcel_index=None,
                                                template_index=None,
                                                filter_attribute=None,
                                                consider_constraints_as_rules=True,
                                                template_opus_path="urbansim_parcel.development_template",
                                                proposed_units_variable="urbansim_parcel.development_project_proposal.units_proposed",
                                                dataset_pool=None,
                                                resources=None):
    """create development project proposals from parcel and development_template_dataset,
    parcel_index - 1D array, indices of parcel_dataset. Status of the proposals is set to 'tentative'.
    template_index - index to templates that are available to create proposals;
    filter_attribute - variable that is used to filter proposals;
    
    If a development constraint table exists, create proposal dataset include only proposals that are allowed by constraints,
    otherwise, create a proposal dataset with Cartesian product of parcels x templates 
    """

    resources = Resources(resources)
    debug = resources.get("debug",  0)
    if not isinstance(debug, DebugPrinter):
        debug = DebugPrinter(debug)

    if parcel_index is not None and parcel_index.size <= 0:
        logger.log_warning("parcel index for creating development proposals is of size 0. No proposals will be created.")
        return None
        
    storage = StorageFactory().get_storage('dict_storage')
    current_year = SimulationState().get_current_time()
    
    def _get_data(parcel_ids, template_ids):
        return {
                "proposal_id": arange(1, parcel_ids.size+1, 1),
                "parcel_id" : parcel_ids,
                "template_id": template_ids,
                "start_year": array(parcel_ids.size*[current_year]),
                "status_id": resize(array([DevelopmentProjectProposalDataset.id_tentative], dtype="int16"), 
                    parcel_ids.size)
                }
        
    def _create_project_proposals(parcel_ids, template_ids):
        storage.write_table(table_name='development_project_proposals',
            table_data = _get_data(parcel_ids, template_ids)
            )
        development_project_proposals = DevelopmentProjectProposalDataset(resources=Resources(resources),
                                                                          dataset1 = parcel_dataset,
                                                                          dataset2 = development_template_dataset,
                                                                          index1 = parcel_index,
                                                                          index2 = template_index,
                                                                          in_storage=storage,
                                                                          in_table_name='development_project_proposals',
                                                                          )
        return development_project_proposals
    
    def _compute_filter(proposals):
        if filter_attribute is not None:
            proposals.compute_variables(filter_attribute, dataset_pool=dataset_pool,
                                                          resources=Resources(resources))
            filter_index = where(proposals.get_attribute(filter_attribute) > 0)[0]
            return filter_index
        return None
    
    def _subset_by_filter(proposals):
        filter_index = _compute_filter(proposals)
        if filter_index is not None:
            proposals.subset_by_index(filter_index, flush_attributes_if_not_loaded=False)
        return proposals


    if parcel_index is not None:
        index1 = parcel_index
    else:
        index1 = arange(parcel_dataset.size())

    if template_index is not None:
        index2 = template_index
    else:
        index2 = arange(development_template_dataset.size())

    has_constraint_dataset = True
    try:
        constraints = dataset_pool.get_dataset("development_constraint") 
        constraints.load_dataset_if_not_loaded()
    except:
        has_constraint_dataset = False

    if has_constraint_dataset:
        constraint_types = unique(constraints.get_attribute("constraint_type"))  #unit_per_acre, far etc
        development_template_dataset.compute_variables(map(lambda x: "%s.%s" % (template_opus_path, x), constraint_types), dataset_pool)
            
        parcel_dataset.get_development_constraints(constraints, dataset_pool, 
                                                   index=index1, 
                                                   consider_constraints_as_rules=consider_constraints_as_rules)
        generic_land_use_type_ids = development_template_dataset.compute_variables("urbansim_parcel.development_template.generic_land_use_type_id",
                                                       dataset_pool=dataset_pool)
    parcel_ids = parcel_dataset.get_id_attribute()
    template_ids = development_template_dataset.get_id_attribute()
    
    proposal_parcel_ids = array([],dtype="int32")
    proposal_template_ids = array([],dtype="int32")
    logger.start_block("Combine parcels, templates and constraints")
    for i_template in index2:
        this_template_id = template_ids[i_template]
        fit_indicator = ones(index1.size, dtype="bool8")
        if has_constraint_dataset:
            generic_land_use_type_id = generic_land_use_type_ids[i_template]
            for constraint_type, constraint in parcel_dataset.development_constraints[generic_land_use_type_id].iteritems():
                template_attribute = development_template_dataset.get_attribute(constraint_type)[i_template]  #density converted to constraint variable name
                if template_attribute == 0:
                    continue
                min_constraint = constraint[:, 0].copy()
                max_constraint = constraint[:, 1].copy()
                ## treat -1 as unconstrained
                w_unconstr = min_constraint == -1
                if w_unconstr.any():
                    min_constraint[w_unconstr] = template_attribute
                
                w_unconstr = max_constraint == -1
                if w_unconstr.any():
                    max_constraint[w_unconstr] = template_attribute

                fit_indicator = logical_and(fit_indicator, 
                                            logical_and(template_attribute >= min_constraint,
                                                        template_attribute <= max_constraint))
                

                if constraint_type == "units_per_acre":
                    res_units_capacity = parcel_dataset.get_attribute("parcel_sqft")[index1] * max_constraint / 43560.0 
                    debug.print_debug("template_id %s (GLU ID %s) max total residential capacity %s, %s of them fit constraints " % (this_template_id, generic_land_use_type_id, res_units_capacity.sum(), (res_units_capacity * fit_indicator).sum() ), 12)
                else:
                    non_res_capacity = parcel_dataset.get_attribute("parcel_sqft")[index1] * max_constraint
                    debug.print_debug("template_id %s (GLU ID %s) max total non residential capacity %s, %s of them fit constraints " % (this_template_id, generic_land_use_type_id, non_res_capacity.sum(), (non_res_capacity * fit_indicator).sum() ), 12)
                
        proposal_parcel_ids = concatenate((proposal_parcel_ids, parcel_ids[index1[fit_indicator]]))
        proposal_template_ids = concatenate( (proposal_template_ids, resize(array([this_template_id]), fit_indicator.sum())))
        
    logger.end_block()
    proposals = _create_project_proposals(proposal_parcel_ids, proposal_template_ids)
    proposals = _subset_by_filter(proposals)

    # eliminate proposals with zero units_proposed
    units_proposed = proposals.compute_variables([proposed_units_variable], dataset_pool = dataset_pool)
    where_up_greater_zero = where(units_proposed > 0)[0]
    if where_up_greater_zero.size > 0:
        proposals.subset_by_index(where_up_greater_zero, flush_attributes_if_not_loaded=False)
    
    logger.log_status("proposal set created with %s proposals." % proposals.size())
    #proposals.flush_dataset_if_low_memory_mode()
    return proposals
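
The constraint screening above reduces to element-wise bounds checks. A minimal sketch (plain numpy,
hypothetical values not taken from the original code) of how one template attribute is tested against
per-parcel min/max constraints, with -1 treated as unconstrained:

from numpy import array, logical_and

template_attribute = 12.0                         # e.g. units_per_acre of one template
constraint = array([[ 5.0, 20.0],                 # per-parcel [min, max] constraints
                    [-1.0, 10.0],                 # -1 means no minimum
                    [15.0, -1.0]])                # -1 means no maximum

min_constraint = constraint[:, 0].copy()
max_constraint = constraint[:, 1].copy()
min_constraint[min_constraint == -1] = template_attribute
max_constraint[max_constraint == -1] = template_attribute

fit_indicator = logical_and(template_attribute >= min_constraint,
                            template_attribute <= max_constraint)
# fit_indicator == [True, False, False]: only the first parcel admits this template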
    def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
            include_chosen_choice=None, with_replacement=True, resources=None, dataset_pool=None):
        """
        
        
        this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            sc = SessionConfiguration()
            try:
                dataset_pool=sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool(sc.package_order)

        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                "index1":index1, "index2": index2,
                "sample_size": sample_size, "weight": weight,
                "with_replacement": with_replacement,
                "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        choice = local_resources["dataset2"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())
            
        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)        

        agent_category_definition = local_resources.get("agent_category_definition", [])
        choice_category_definition = local_resources.get("choice_category_definition", [])
        agent_filter_attribute = local_resources.get("agent_filter_attribute", None)
        category_inflating_factor = local_resources.get("category_inflating_factor", 10)

        frequency, unique_agent_category_id, unique_choice_category_id, agent_category_id, choice_category_id = \
                get_category_and_frequency(agent, agent_category_definition,
                                           choice, choice_category_definition,
                                           agent_filter_attribute, category_inflating_factor,
                                           dataset_pool=dataset_pool)
         
        include_chosen_choice = local_resources.get("include_chosen_choice",  False)
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)
        
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1
        local_resources.merge_with_defaults({'with_replacement': with_replacement})
        with_replacement = local_resources.get("with_replacement")
        
        sampled_index = empty((index1.size, J), dtype="int32")
        sampling_prob = empty((index1.size, J), dtype="float64")
        
        _digitize, _where,  _normalize = digitize, where, normalize
        _ncumsum, _rand, _searchsorted = ncumsum, rand, searchsorted   #speed hack
        for i in range(unique_agent_category_id.size):
            category_id = unique_agent_category_id[i]
            agents_in_this_category = _where(agent_category_id[index1] == category_id)[0]
            num_agents = agents_in_this_category.size
            if num_agents == 0: continue
            #import pdb; pdb.set_trace()
            
            ## divide frequency by the mean frequency to avoid overflow
            weights = frequency[i, _digitize(choice_category_id[index2], unique_choice_category_id)-1]  / frequency[i, :].mean()
            prob = _normalize(weights)
            index = _searchsorted(_ncumsum(prob), _rand(num_agents * J)).reshape(-1, J)

            if not with_replacement:
                raise NotImplementedError, "Sample without replacement is not implemented for this sampler yet."
                #    nz = nonzero(prob)[0].size
                #    if J < nz:
                #        ## number of non-zero weights is less than the number of alternatives, sample with replacement
                #        logger.log_warning("There are only %s non-zero weights, fewer than the %s alternatives requested. " % (nz, J) + 
                #                           "Sample with replacement instead.")
                #        continue
                #    i=0; max_iterations=200
                #    while True:
                #        index = sort(index, axis=1)
                #        where_repeats = nonzero( logical_not(diff(index, axis=1)) ) 
                #        num_repeats = where_repeats[0].size
                #        if num_repeats == 0: break
                #        index[where_repeats] = _searchsorted(_rand(num_repeats), prob)
                #        i += 1
                #        if i > max_iterations:
                #            logger.log_warning("weight_sampler_by_category is unable to sample %i alternatives without replacement in %i iterations; " % \
                #                               (J, max_iterations) + 
                #                               "give up sampling without replacement; results may contain duplicates."
                #                              )
                #            break

            sampled_index[agents_in_this_category, :] = index
            sampling_prob[agents_in_this_category, :] = prob[index] 

        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack((chosen_choice_index[:,newaxis],sampled_index))
            is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1
            
            sampling_prob_for_chosen_choices = take(prob, chosen_choice_index_to_index2[:, newaxis])
            ## if the chosen choice is unplaced, its sampling prob is 0
            sampling_prob_for_chosen_choices[where(chosen_choice_index==UNPLACED_ID)[0],] = 0.0
            sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])
            
        #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]
        
        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32") 
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)
        
        return interaction_dataset
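
The category-based sampler above draws alternatives with the standard inverse-CDF trick used in the
loop: normalize the weights to probabilities, take the cumulative sum, and look up uniform random
numbers with searchsorted. A minimal standalone sketch (plain numpy, toy weights, not taken from the
original code):

from numpy import array, cumsum, searchsorted
from numpy.random import rand

weights = array([2.0, 1.0, 1.0, 0.0])             # sampling weights over 4 alternatives
prob = weights / weights.sum()                    # -> [0.5, 0.25, 0.25, 0.0]

num_agents, J = 3, 2                              # sample 2 alternatives for 3 agents
index = searchsorted(cumsum(prob), rand(num_agents * J)).reshape(-1, J)
# 'index' holds positions drawn with replacement proportionally to 'prob';
# the zero-weight alternative (position 3) is never drawn.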
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            sample_size=10,
            weight=None,
            include_chosen_choice=False,
            with_replacement=False,
            resources=None,
            dataset_pool=None):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None({
            "dataset1":
            dataset1,
            "dataset2":
            dataset2,
            "index1":
            index1,
            "index2":
            index2,
            "sample_size":
            sample_size,
            "weight":
            weight,
            "with_replacement":
            with_replacement,
            "include_chosen_choice":
            include_chosen_choice
        })

        local_resources.check_obligatory_keys(
            ['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1

        with_replacement = local_resources.get("with_replacement")

        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight = choice.get_attribute(weight)
                rank_of_weight = 1
            else:
                varname = VariableName(weight)
                if varname.get_dataset_name() == choice.get_dataset_name():
                    weight = choice.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 1
                elif varname.get_interaction_set_names() is not None:
                    ## weights can be an interaction variable
                    interaction_dataset = InteractionDataset(local_resources)
                    weight = interaction_dataset.compute_variables(
                        weight, dataset_pool=dataset_pool)
                    rank_of_weight = 2
                    assert (len(weight.shape) >= rank_of_weight)
                else:
                    err_msg = ("weight is neither a known attribute name "
                               "nor a simple variable from the choice dataset "
                               "nor an interaction variable: '%s'" % weight)
                    logger.log_error(err_msg)
                    raise ValueError, err_msg
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unkown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size != index2.size) and (weight.shape[rank_of_weight - 1]
                                             != index2.size):
            if weight.shape[rank_of_weight - 1] == choice.size():
                if rank_of_weight == 1:
                    weight = take(weight, index2)
                if rank_of_weight == 2:
                    weight = take(weight, index2, axis=1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError, err_msg

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        if rank_of_weight == 1:  # if weight_array is 1d, then each agent shares the same weight for choices
            replace = with_replacement  # sampling with/without replacement
            non_zero_counts = nonzerocounts(weight)
            if non_zero_counts < J:
                logger.log_warning(
                    "weight array doesn't have enough non-zero counts; sampling with replacement"
                )
                replace = True
            if non_zero_counts > 0:
                sampled_index = prob2dsample(
                    index2,
                    sample_size=(index1.size, J),
                    prob_array=prob,
                    exclude_index=chosen_choice_index_to_index2,
                    replace=replace,
                    return_index=True)
            else:
                # all alternatives have a zero weight
                sampled_index = zeros((index1.size, 0), dtype=DTYPE)
            #return index2[sampled_index]

        if rank_of_weight == 2:
            sampled_index = zeros((index1.size, J), dtype=DTYPE) - 1

            for i in range(index1.size):
                replace = with_replacement  # sampling with/without replacement
                i_prob = prob[i, :]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning(
                        "weight array doesn't have enough non-zero counts; sampling with replacement"
                    )
                    replace = True

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i, :] = probsample_noreplace(
                    index2,
                    sample_size=J,
                    prob_array=i_prob,
                    exclude_index=chosen_choice_index_to_index2[i],
                    return_index=True)
        sampling_prob = take(prob, sampled_index)
        sampled_index_within_prob = sampled_index.copy()
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack(
                (chosen_choice_index[:, newaxis], sampled_index))
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1
            #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            sampling_prob_for_chosen_choices = take(
                prob, chosen_choice_index_to_index2[:, newaxis])
            ## if the chosen choice equals UNPLACED_ID, the sampling prob is 0
            sampling_prob_for_chosen_choices[where(
                chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
            sampling_prob = column_stack(
                [sampling_prob_for_chosen_choices, sampling_prob])

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        if local_resources.get("include_mnl_bias_correction_term", False):
            if include_chosen_choice:
                sampled_index_within_prob = column_stack(
                    (chosen_choice_index_to_index2[:, newaxis],
                     sampled_index_within_prob))
            interaction_dataset.add_mnl_bias_correction_term(
                prob, sampled_index_within_prob)

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
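
When include_chosen_choice is set, both samplers prepend the agent's observed choice as column 0 and
stack a matching column of sampling probabilities, zeroed for agents whose chosen choice is unplaced.
A minimal sketch of that bookkeeping (plain numpy, toy arrays, with UNPLACED_ID assumed to be -1; not
taken from the original code):

from numpy import array, column_stack, newaxis, where, zeros

UNPLACED_ID = -1                                   # assumed sentinel for "no observed choice"
chosen_choice_index = array([4, UNPLACED_ID, 2])   # observed choice index per agent
sampled_index = array([[1, 3], [0, 3], [1, 4]])    # J sampled alternatives per agent
sampling_prob = array([[0.2, 0.4], [0.1, 0.4], [0.2, 0.3]])
prob_of_chosen = array([0.3, 0.0, 0.25])           # sampling prob of each agent's chosen choice

sampled_index = column_stack((chosen_choice_index[:, newaxis], sampled_index))
is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

prob_of_chosen[where(chosen_choice_index == UNPLACED_ID)[0]] = 0.0
sampling_prob = column_stack([prob_of_chosen[:, newaxis], sampling_prob])
# column 0 now holds the chosen alternative and its probability (0 where unplaced)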
class RegressionModel(ChunkModel):

    model_name = "Regression Model"
    model_short_name = "RM"

    def __init__(self,
                 regression_procedure="opus_core.linear_regression",
                 submodel_string=None,
                 run_config=None,
                 estimate_config=None,
                 debuglevel=0,
                 dataset_pool=None):

        self.debug = DebugPrinter(debuglevel)

        self.dataset_pool = self.create_dataset_pool(dataset_pool)

        self.regression = RegressionModelFactory().get_model(
            name=regression_procedure)
        if self.regression == None:
            raise StandardError, "No regression procedure given."

        self.submodel_string = submodel_string

        self.run_config = run_config
        if self.run_config == None:
            self.run_config = Resources()
        if not isinstance(self.run_config, Resources) and isinstance(
                self.run_config, dict):
            self.run_config = Resources(self.run_config)

        self.estimate_config = estimate_config
        if self.estimate_config == None:
            self.estimate_config = Resources()
        if not isinstance(self.estimate_config, Resources) and isinstance(
                self.estimate_config, dict):
            self.estimate_config = Resources(self.estimate_config)

        self.data = {}
        self.coefficient_names = {}
        ChunkModel.__init__(self)
        self.get_status_for_gui().initialize_pieces(3,
                                                    pieces_description=array([
                                                        'initialization',
                                                        'computing variables',
                                                        'submodel: 1'
                                                    ]))

    def run(self,
            specification,
            coefficients,
            dataset,
            index=None,
            chunk_specification=None,
            data_objects=None,
            run_config=None,
            initial_values=None,
            procedure=None,
            debuglevel=0):
        """'specification' is of type EquationSpecification,
            'coefficients' is of type Coefficients,
            'dataset' is of type Dataset,
            'index' are indices of individuals in dataset for which
                        the model runs. If it is None, the whole dataset is considered.
            'chunk_specification' determines the number of chunks in which the simulation is processed.
            'data_objects' is a dictionary where each key is the name of a data object
            ('zone', ...) and its value is an object of class Dataset.
           'run_config' is of type Resources, it gives additional arguments for the run.
           If 'procedure' is given, it overwrites the regression_procedure of the constructor.
           'initial_values' is an array of the initial values of the results. It will be overwritten
           by the results for those elements that are handled by the model (defined by submodels in the specification).
           By default the results are initialized with 0.
            'debuglevel' overwrites the constructor 'debuglevel'.
        """
        self.debug.flag = debuglevel
        if run_config == None:
            run_config = Resources()
        if not isinstance(run_config, Resources) and isinstance(
                run_config, dict):
            run_config = Resources(run_config)
        self.run_config = run_config.merge_with_defaults(self.run_config)
        self.run_config.merge({"debug": self.debug})
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        if procedure is not None:
            self.regression = RegressionModelFactory().get_model(
                name=procedure)
        if initial_values is None:
            self.initial_values = zeros((dataset.size(), ), dtype=float32)
        else:
            self.initial_values = zeros((dataset.size(), ),
                                        dtype=initial_values.dtype)
            self.initial_values[index] = initial_values

        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index == None:
            index = arange(dataset.size())

        result = ChunkModel.run(self,
                                chunk_specification,
                                dataset,
                                index,
                                float32,
                                specification=specification,
                                coefficients=coefficients)
        return result

    def run_chunk(self, index, dataset, specification, coefficients):
        self.specified_coefficients = SpecifiedCoefficients().create(
            coefficients, specification, neqs=1)
        compute_resources = Resources({"debug": self.debug})
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(
            submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels,
                                     self.submodel_string,
                                     dataset,
                                     index,
                                     dataset_pool=self.dataset_pool,
                                     resources=compute_resources)
        variables = self.specified_coefficients.get_full_variable_names_without_constants(
        )
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables,
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        data = {}
        coef = {}
        outcome = self.initial_values[index].copy()
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(
                self.specified_coefficients, submodel)
            self.coefficient_names[submodel] = coef[
                submodel].get_coefficient_names_without_constant()[0, :]
            self.debug.print_debug(
                "Compute regression for submodel " + str(submodel), 4)
            self.increment_current_status_piece()
            self.data[submodel] = dataset.create_regression_data(
                coef[submodel],
                index=index[self.observations_mapping[submodel]])
            nan_index = where(isnan(self.data[submodel]))[1]
            inf_index = where(isinf(self.data[submodel]))[1]
            if nan_index.size > 0:
                nan_var_index = unique(nan_index)
                raise ValueError, "NaN(Not A Number) is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % coef[
                    submodel].get_variable_names()[nan_var_index]
            if inf_index.size > 0:
                inf_var_index = unique(inf_index)
                raise ValueError, "Inf is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % coef[
                    submodel].get_variable_names()[inf_var_index]

            if (self.data[submodel].shape[0] >
                    0) and (self.data[submodel].size >
                            0):  # observations for this submodel available
                outcome[self.observations_mapping[submodel]] = \
                    self.regression.run(self.data[submodel], coef[submodel].get_coefficient_values()[0,:],
                        resources=self.run_config).astype(outcome.dtype)
        return outcome
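
    # A commented sketch (toy numpy arrays, not taken from the original code) of roughly what
    # the per-submodel regression call above computes under a plain linear specification
    # (ignoring the constant term; not the actual opus_core procedure): a data matrix of shape
    # (n_observations, n_coefficients) times the coefficient vector.
    #
    #   from numpy import array, dot, isnan
    #   data = array([[1.0, 2.0], [3.0, 4.0]])      # one row per observation
    #   coefficient_values = array([0.5, -1.0])
    #   assert not isnan(data).any()                # mirrors the NaN check above
    #   outcome = dot(data, coefficient_values)     # -> [-1.5, -2.5]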

    def correct_infinite_values(self,
                                dataset,
                                outcome_attribute_name,
                                maxvalue=1e+38,
                                clip_all_larger_values=False):
        """Check if the model resulted in infinite values. If yes,
        print warning and clip the values to maxvalue. 
        If clip_all_larger_values is True, all values larger than maxvalue are clip to maxvalue.
        """
        infidx = where(dataset.get_attribute(outcome_attribute_name) == inf)[0]

        if infidx.size > 0:
            logger.log_warning("Infinite values in %s. Clipped to %s." %
                               (outcome_attribute_name, maxvalue))
            dataset.set_values_of_one_attribute(outcome_attribute_name,
                                                maxvalue, infidx)
        if clip_all_larger_values:
            idx = where(
                dataset.get_attribute(outcome_attribute_name) > maxvalue)[0]
            if idx.size > 0:
                logger.log_warning(
                    "Values in %s larger than %s. Clipped to %s." %
                    (outcome_attribute_name, maxvalue, maxvalue))
                dataset.set_values_of_one_attribute(outcome_attribute_name,
                                                    maxvalue, idx)

    def estimate(self,
                 specification,
                 dataset,
                 outcome_attribute,
                 index=None,
                 procedure=None,
                 data_objects=None,
                 estimate_config=None,
                 debuglevel=0):
        """'specification' is of type EquationSpecification,
            'dataset' is of type Dataset,
            'outcome_attribute' - string that determines the dependent variable,
            'index' are indices of individuals in dataset for which
                    the model runs. If it is None, the whole dataset is considered.
            'procedure' - name of the estimation procedure. If it is None,
                there should be an entry "estimation" in 'estimate_config' that determines the procedure. The class
                must have a method 'run' that takes as arguments 'data', 'regression_procedure' and 'resources'.
                It returns a dictionary with entries 'estimators', 'standard_errors' and 't_values' (all 1D numpy arrays).
            'data_objects' is a dictionary where each key is the name of a data object
                    ('zone', ...) and its value is an object of class Dataset.
            'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure.
            'debuglevel' overwrites the class 'debuglevel'.
        """
        #import wingdbstub
        self.debug.flag = debuglevel
        if estimate_config == None:
            estimate_config = Resources()
        if not isinstance(estimate_config, Resources) and isinstance(
                estimate_config, dict):
            estimate_config = Resources(estimate_config)
        self.estimate_config = estimate_config.merge_with_defaults(
            self.estimate_config)
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.procedure = procedure
        if self.procedure == None:
            self.procedure = self.estimate_config.get("estimation", None)
        if self.procedure is not None:
            self.procedure = ModelComponentCreator().get_model_component(
                self.procedure)
        else:
            logger.log_warning(
                "No estimation procedure given, or problems with loading the corresponding module."
            )

        compute_resources = Resources({"debug": self.debug})
        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index == None:
            index = arange(dataset.size())
        if not isinstance(index, ndarray):
            index = array(index)

        estimation_size_agents = self.estimate_config.get(
            "estimation_size_agents",
            None)  # should be a proportion of the agent_set
        if estimation_size_agents == None:
            estimation_size_agents = 1.0
        else:
            estimation_size_agents = max(min(estimation_size_agents, 1.0),
                                         0.0)  # between 0 and 1

        if estimation_size_agents < 1.0:
            self.debug.print_debug("Sampling agents for estimation ...", 3)
            estimation_idx = sample_noreplace(
                arange(index.size), int(index.size * estimation_size_agents))
        else:
            estimation_idx = arange(index.size)

        estimation_idx = index[estimation_idx]
        self.debug.print_debug(
            "Number of observations for estimation: " +
            str(estimation_idx.size), 2)
        if estimation_idx.size <= 0:
            self.debug.print_debug("Nothing to be done.", 2)
            return (None, None)

        coefficients = create_coefficient_from_specification(specification)
        specified_coefficients = SpecifiedCoefficients().create(coefficients,
                                                                specification,
                                                                neqs=1)
        submodels = specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(
            submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(
            submodels,
            self.submodel_string,
            dataset,
            estimation_idx,
            dataset_pool=self.dataset_pool,
            resources=compute_resources,
            submodel_size_max=self.estimate_config.get('submodel_size_max',
                                                       None))
        variables = specified_coefficients.get_full_variable_names_without_constants(
        )
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables,
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)

        coef = {}
        estimated_coef = {}
        self.outcome = {}
        dataset.compute_variables([outcome_attribute],
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        regression_resources = Resources(estimate_config)
        regression_resources.merge({"debug": self.debug})
        outcome_variable_name = VariableName(outcome_attribute)
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(
                specified_coefficients, submodel)
            self.increment_current_status_piece()
            logger.log_status("Estimate regression for submodel " +
                              str(submodel),
                              tags=["estimate"],
                              verbosity_level=2)
            logger.log_status("Number of observations: " +
                              str(self.observations_mapping[submodel].size),
                              tags=["estimate"],
                              verbosity_level=2)
            self.data[
                submodel] = dataset.create_regression_data_for_estimation(
                    coef[submodel],
                    index=estimation_idx[self.observations_mapping[submodel]])
            self.coefficient_names[submodel] = coef[
                submodel].get_coefficient_names_without_constant()[0, :]
            if (self.data[submodel].shape[0] > 0
                ) and (self.data[submodel].size > 0) and (
                    self.procedure
                    is not None):  # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(
                    outcome_variable_name.get_alias(),
                    estimation_idx[self.observations_mapping[submodel]])
                regression_resources.merge({"outcome": self.outcome[submodel]})
                regression_resources.merge({
                    "coefficient_names":
                    self.coefficient_names[submodel].tolist(),
                    "constant_position":
                    coef[submodel].get_constants_positions()
                })
                estimated_coef[submodel] = self.procedure.run(
                    self.data[submodel],
                    self.regression,
                    resources=regression_resources)
                if "estimators" in estimated_coef[submodel].keys():
                    coef[submodel].set_coefficient_values(
                        estimated_coef[submodel]["estimators"])
                if "standard_errors" in estimated_coef[submodel].keys():
                    coef[submodel].set_standard_errors(
                        estimated_coef[submodel]["standard_errors"])
                if "other_measures" in estimated_coef[submodel].keys():
                    for measure in estimated_coef[submodel][
                            "other_measures"].keys():
                        coef[submodel].set_measure(
                            measure, estimated_coef[submodel]["other_measures"]
                            [measure])
                if "other_info" in estimated_coef[submodel].keys():
                    for info in estimated_coef[submodel]["other_info"]:
                        coef[submodel].set_other_info(
                            info, estimated_coef[submodel]["other_info"][info])
        coefficients.fill_coefficients(coef)

        self.save_predicted_values_and_errors(specification,
                                              coefficients,
                                              dataset,
                                              outcome_variable_name,
                                              index=index,
                                              data_objects=data_objects)

        return (coefficients, estimated_coef)

    def prepare_for_run(self,
                        dataset=None,
                        dataset_filter=None,
                        filter_threshold=0,
                        **kwargs):
        spec, coef = prepare_specification_and_coefficients(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables(
                [dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, coef, index)

    def prepare_for_estimate(self,
                             dataset=None,
                             dataset_filter=None,
                             filter_threshold=0,
                             **kwargs):
        spec = get_specification_for_estimation(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables(
                [dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, index)

    def get_data_as_dataset(self, submodel=-2):
        """Like get_all_data, but the retuning value is a Dataset containing attributes that
        correspond to the data columns. Their names are coefficient names."""
        all_data = self.get_all_data(submodel)
        if all_data is None:
            return None
        names = self.get_coefficient_names(submodel)
        if names is None:
            return None
        dataset_data = {}
        for i in range(names.size):
            dataset_data[names[i]] = all_data[:, i].reshape(all_data.shape[0])
        dataset_data["id"] = arange(all_data.shape[0]) + 1
        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name='dataset', table_data=dataset_data)
        ds = Dataset(in_storage=storage, id_name="id", in_table_name='dataset')
        return ds

    def save_predicted_values_and_errors(self,
                                         specification,
                                         coefficients,
                                         dataset,
                                         outcome_variable,
                                         index=None,
                                         data_objects=None):
        if self.estimate_config.get('save_predicted_values_and_errors', False):
            logger.log_status('Computing predicted values and residuals.')
            original_values = dataset.get_attribute_by_index(
                outcome_variable, index)
            predicted_values = zeros(dataset.size(), dtype='float32')
            predicted_values[index] = self.run_after_estimation(
                specification,
                coefficients,
                dataset,
                index=index,
                data_objects=data_objects)
            predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias(
            )
            dataset.add_primary_attribute(name=predicted_attribute_name,
                                          data=predicted_values)
            dataset.flush_attribute(predicted_attribute_name)
            predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias(
            )
            error_values = zeros(dataset.size(), dtype='float32')
            error_values[index] = (original_values -
                                   predicted_values[index]).astype(
                                       error_values.dtype)
            dataset.add_primary_attribute(name=predicted_error_attribute_name,
                                          data=error_values)
            dataset.flush_attribute(predicted_error_attribute_name)
            logger.log_status(
                'Predicted values saved as %s (for the %s dataset)' %
                (predicted_attribute_name, dataset.get_dataset_name()))
            logger.log_status(
                'Residuals saved as %s (for the %s dataset)' %
                (predicted_error_attribute_name, dataset.get_dataset_name()))

    def export_estimation_data(self,
                               submodel=-2,
                               file_name='./estimation_data_regression.txt',
                               delimiter='\t'):
        import os
        from numpy import newaxis
        data = concatenate((self.outcome[submodel][..., newaxis],
                            self.get_all_data(submodel=submodel)),
                           axis=1)
        header = ['outcome'] + self.get_coefficient_names(submodel).tolist()
        nrows = data.shape[0]
        file_name_root, file_name_ext = os.path.splitext(file_name)
        out_file = "%s_submodel_%s.txt" % (file_name_root, submodel)
        fh = open(out_file, 'w')
        fh.write(delimiter.join(header) + '\n')  #file header
        for row in range(nrows):
            line = [str(x) for x in data[row, ]]
            fh.write(delimiter.join(line) + '\n')
        fh.flush()
        fh.close()
        print 'Data written into %s' % out_file

    def run_after_estimation(self, *args, **kwargs):
        return self.run(*args, **kwargs)

    def _get_status_total_pieces(self):
        return ChunkModel._get_status_total_pieces(
            self) * self.get_status_for_gui().get_total_number_of_pieces()

    def _get_status_current_piece(self):
        return ChunkModel._get_status_current_piece(
            self) * self.get_status_for_gui().get_total_number_of_pieces(
            ) + self.get_status_for_gui().get_current_piece()

    def _get_status_piece_description(self):
        return "%s %s" % (ChunkModel._get_status_piece_description(
            self), self.get_status_for_gui().get_current_piece_description())

    def get_specified_coefficients(self):
        return self.specified_coefficients
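
The estimation procedure plugged into estimate() only needs to return a dictionary with 'estimators',
'standard_errors' and 't_values' (see the estimate() docstring above). A minimal ordinary-least-squares
sketch of that contract (plain numpy, toy data; not the actual opus_core estimation module):

from numpy import array, diag, dot, sqrt
from numpy.linalg import inv

def toy_ols(data, outcome):
    """Return the kind of result dictionary an estimation procedure is expected to produce."""
    xtx_inv = inv(dot(data.T, data))
    estimators = dot(xtx_inv, dot(data.T, outcome))
    residuals = outcome - dot(data, estimators)
    dof = data.shape[0] - data.shape[1]
    sigma2 = dot(residuals, residuals) / dof
    standard_errors = sqrt(diag(sigma2 * xtx_inv))
    return {"estimators": estimators,
            "standard_errors": standard_errors,
            "t_values": estimators / standard_errors}

data = array([[1.0, 2.0], [1.0, 3.0], [1.0, 5.0], [1.0, 7.0]])   # constant + one variable
outcome = array([3.0, 5.5, 8.5, 13.5])
result = toy_ols(data, outcome)   # estimators come out close to [-1.0, 2.0]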
def create_from_parcel_and_development_template(
        parcel_dataset,
        development_template_dataset,
        parcel_index=None,
        template_index=None,
        filter_attribute=None,
        consider_constraints_as_rules=True,
        template_opus_path="urbansim_parcel.development_template",
        proposed_units_variable="urbansim_parcel.development_project_proposal.units_proposed",
        dataset_pool=None,
        resources=None):
    """create development project proposals from parcel and development_template_dataset,
    parcel_index - 1D array, indices of parcel_dataset. Status of the proposals is set to 'tentative'.
    template_index - index to templates that are available to create proposals;
    filter_attribute - variable that is used to filter proposals;
    
    If a development constraint table exists, create proposal dataset include only proposals that are allowed by constraints,
    otherwise, create a proposal dataset with Cartesian product of parcels x templates 
    """

    resources = Resources(resources)
    debug = resources.get("debug", 0)
    if not isinstance(debug, DebugPrinter):
        debug = DebugPrinter(debug)

    if parcel_index is not None and parcel_index.size <= 0:
        logger.log_warning(
            "parcel index for creating development proposals is of size 0. No proposals will be created."
        )
        return None

    storage = StorageFactory().get_storage('dict_storage')
    current_year = SimulationState().get_current_time()

    def _get_data(parcel_ids, template_ids):
        return {
            "proposal_id": arange(1, parcel_ids.size + 1, 1),
            "parcel_id": parcel_ids,
            "template_id": template_ids,
            "start_year": array(parcel_ids.size * [current_year]),
            "status_id": resize(array([DevelopmentProjectProposalDataset.id_tentative],
                                      dtype="int16"), parcel_ids.size)
        }

    def _create_project_proposals(parcel_ids, template_ids):
        storage.write_table(table_name='development_project_proposals',
                            table_data=_get_data(parcel_ids, template_ids))
        development_project_proposals = DevelopmentProjectProposalDataset(
            resources=Resources(resources),
            dataset1=parcel_dataset,
            dataset2=development_template_dataset,
            index1=parcel_index,
            index2=template_index,
            in_storage=storage,
            in_table_name='development_project_proposals',
        )
        return development_project_proposals

    def _compute_filter(proposals):
        if filter_attribute is not None:
            proposals.compute_variables(filter_attribute,
                                        dataset_pool=dataset_pool,
                                        resources=Resources(resources))
            filter_index = where(
                proposals.get_attribute(filter_attribute) > 0)[0]
            return filter_index
        return None

    def _subset_by_filter(proposals):
        filter_index = _compute_filter(proposals)
        if filter_index is not None:
            proposals.subset_by_index(filter_index,
                                      flush_attributes_if_not_loaded=False)
        return proposals

    if parcel_index is not None:
        index1 = parcel_index
    else:
        index1 = arange(parcel_dataset.size())

    if template_index is not None:
        index2 = template_index
    else:
        index2 = arange(development_template_dataset.size())

    has_constraint_dataset = True
    try:
        constraints = dataset_pool.get_dataset("development_constraint")
        constraints.load_dataset_if_not_loaded()
    except:
        has_constraint_dataset = False

    if has_constraint_dataset:
        constraint_types = unique(constraints.get_attribute(
            "constraint_type"))  #unit_per_acre, far etc
        development_template_dataset.compute_variables(
            map(lambda x: "%s.%s" % (template_opus_path, x), constraint_types),
            dataset_pool)

        parcel_dataset.get_development_constraints(
            constraints,
            dataset_pool,
            index=index1,
            consider_constraints_as_rules=consider_constraints_as_rules)
        generic_land_use_type_ids = development_template_dataset.compute_variables(
            "urbansim_parcel.development_template.generic_land_use_type_id",
            dataset_pool=dataset_pool)
    parcel_ids = parcel_dataset.get_id_attribute()
    template_ids = development_template_dataset.get_id_attribute()

    proposal_parcel_ids = array([], dtype="int32")
    proposal_template_ids = array([], dtype="int32")
    logger.start_block("Combine parcels, templates and constraints")
    for i_template in index2:
        this_template_id = template_ids[i_template]
        fit_indicator = ones(index1.size, dtype="bool8")
        if has_constraint_dataset:
            generic_land_use_type_id = generic_land_use_type_ids[i_template]
            for constraint_type, constraint in parcel_dataset.development_constraints[
                    generic_land_use_type_id].iteritems():
                template_attribute = development_template_dataset.get_attribute(
                    constraint_type
                )[i_template]  #density converted to constraint variable name
                if template_attribute == 0:
                    continue
                min_constraint = constraint[:, 0].copy()
                max_constraint = constraint[:, 1].copy()
                ## treat -1 as unconstrained
                w_unconstr = min_constraint == -1
                if w_unconstr.any():
                    min_constraint[w_unconstr] = template_attribute

                w_unconstr = max_constraint == -1
                if w_unconstr.any():
                    max_constraint[w_unconstr] = template_attribute

                fit_indicator = logical_and(
                    fit_indicator,
                    logical_and(template_attribute >= min_constraint,
                                template_attribute <= max_constraint))

                if constraint_type == "units_per_acre":
                    res_units_capacity = parcel_dataset.get_attribute(
                        "parcel_sqft")[index1] * max_constraint / 43560.0
                    debug.print_debug(
                        "template_id %s (GLU ID %s) max total residential capacity %s, %s of them fit constraints "
                        % (this_template_id, generic_land_use_type_id,
                           res_units_capacity.sum(),
                           (res_units_capacity * fit_indicator).sum()), 12)
                else:
                    non_res_capacity = parcel_dataset.get_attribute(
                        "parcel_sqft")[index1] * max_constraint
                    debug.print_debug(
                        "template_id %s (GLU ID %s) max total non residential capacity %s, %s of them fit constraints "
                        % (this_template_id, generic_land_use_type_id,
                           non_res_capacity.sum(),
                           (non_res_capacity * fit_indicator).sum()), 12)

        proposal_parcel_ids = concatenate(
            (proposal_parcel_ids, parcel_ids[index1[fit_indicator]]))
        proposal_template_ids = concatenate((proposal_template_ids,
                                             resize(array([this_template_id]),
                                                    fit_indicator.sum())))

    logger.end_block()
    proposals = _create_project_proposals(proposal_parcel_ids,
                                          proposal_template_ids)
    proposals = _subset_by_filter(proposals)

    # eliminate proposals with zero units_proposed
    units_proposed = proposals.compute_variables([proposed_units_variable],
                                                 dataset_pool=dataset_pool)
    where_up_greater_zero = where(units_proposed > 0)[0]
    if where_up_greater_zero.size > 0:
        proposals.subset_by_index(where_up_greater_zero,
                                  flush_attributes_if_not_loaded=False)

    logger.log_status("proposal set created with %s proposals." %
                      proposals.size())
    #proposals.flush_dataset_if_low_memory_mode()
    return proposals
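
# A minimal numpy sketch of the constraint test used in the loop above: a template
# fits a parcel when its attribute lies between the parcel's minimum and maximum
# constraint, with -1 meaning "unconstrained" on that side. The numbers below are
# made up for illustration only.
from numpy import array, logical_and

template_attribute = 12.0                  # e.g. units_per_acre of one template
min_constraint = array([0.0, -1.0, 20.0])  # per-parcel minimum (-1 = unconstrained)
max_constraint = array([30.0, -1.0, 25.0]) # per-parcel maximum (-1 = unconstrained)

# replace -1 by the template value so that side of the test is always satisfied
min_constraint[min_constraint == -1] = template_attribute
max_constraint[max_constraint == -1] = template_attribute

fit_indicator = logical_and(template_attribute >= min_constraint,
                            template_attribute <= max_constraint)
# fit_indicator -> [ True  True False]: the template fits the first two parcels only
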
Exemple #21
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            stratum=None,
            weight=None,
            sample_size=1,
            sample_size_from_each_stratum=None,
            sample_size_from_chosen_stratum=None,
            sample_rate=None,
            include_chosen_choice=False,
            resources=None,
            with_replacement=False,
            dataset_pool=None,
            **kwargs):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alternatives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled from each stratum, and is overwritten
          by sample_size_from_each_stratum if it's not None
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""
        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()

        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                 "index1": index1, "index2": index2,
                 "with_replacement": with_replacement,
                 "stratum": stratum, "weight": weight,
                 "sample_size": sample_size,
                 "sample_size_from_each_stratum": sample_size_from_each_stratum,
                 "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
                 "sample_rate": sample_rate,
                 "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2'])
        index1 = local_resources.get("index1", None)

        agent = dataset1

        if index1 is None:
            agent.get_id_attribute()
            index1 = arange(agent.size())

        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)

        if index2 is None:
            choice.get_id_attribute()
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        weight = local_resources.get("weight", None)

        if isinstance(weight, str):
            choice.compute_variables(weight, resources=local_resources)
            weight = choice.get_attribute(weight)
            rank_of_weight = 1
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif weight is None:
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unknown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size <> index2.size) and (weight.shape[rank_of_weight - 1]
                                             <> index2.size):
            if weight.shape[rank_of_weight - 1] == choice.size():
                weight = take(weight, index2)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError, err_msg

        prob = normalize(weight)

        stratum = local_resources.get("stratum", None)
        if stratum is None:
            raise StandardError, "'stratum' must be defined for stratified sampling."
        if isinstance(stratum, str):
            choice.compute_variables(stratum, resources=local_resources)
            stratum = choice.get_attribute(stratum)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        ##TODO: check all chosen strata are in selectable strata
        #i.e. chosen_choice_index is in index2
        chosen_stratum = ones(chosen_choice_index.size,
                              dtype=DTYPE) * NO_STRATUM_ID
        chosen_stratum[where(
            chosen_choice_index != -1)] = stratum[chosen_choice_index[where(
                chosen_choice_index != -1)]]
        selectable_strata = stratum[index2]
        unique_strata = unique(selectable_strata)
        unique_strata = unique_strata[where(unique_strata != NO_STRATUM_ID)]

        #        if rank_of_weight == 2:
        #            raise RuntimeError, "stratified sampling for 2d weight is unimplemented yet"

        #        sampled_index = zeros((index1.size,1)) - 1

        sample_size = local_resources.get("sample_size", None)
        sample_size_from_each_stratum = local_resources.get(
            "sample_size_from_each_stratum", None)
        if sample_size_from_each_stratum is None:
            sample_size_from_each_stratum = sample_size
        strata_sample_size = ones(unique_strata.size,
                                  dtype=DTYPE) * sample_size_from_each_stratum
        sample_rate = local_resources.get("sample_rate", None)
        if sample_rate is not None:
            raise NotImplementedError, "sample_rate is not implemented yet."
            ##TODO: to be finished
            #num_elements_in_strata = histogram(selectable_strata, unique_strata)
            #strata_sample_size = round(num_elements_in_strata * sample_rate)

        sample_size_from_chosen_stratum = local_resources.get(
            "sample_size_from_chosen_stratum", None)
        if sample_size_from_chosen_stratum is None and not include_chosen_choice:
            strata_sample_pairs = array(
                map(lambda x, y: [x, y], unique_strata, strata_sample_size))
            if rank_of_weight == 1:
                sampled_index = self._sample_by_stratum(
                    index1, index2, selectable_strata, prob,
                    chosen_choice_index_to_index2, strata_sample_pairs)
            elif rank_of_weight == 2:
                sampled_index = self._sample_by_agent_and_stratum(
                    index1, index2, selectable_strata, prob,
                    chosen_choice_index_to_index2, strata_sample_pairs)
        else:
            strata_sample_setting = zeros((index1.size, unique_strata.size, 2),
                                          dtype=DTYPE)
            for i in range(index1.size):
                agents_strata_sample_size = copy.copy(strata_sample_size)
                if sample_size_from_chosen_stratum is None:
                    ## if sample_size_from_chosen_stratum is None and include_chosen_choice is True,
                    ## sample one less from the chosen stratum
                    agents_strata_sample_size[where(
                        unique_strata == chosen_stratum[i])] += -1
                else:
                    agents_strata_sample_size[where(
                        unique_strata ==
                        chosen_stratum[i])] = sample_size_from_chosen_stratum
                strata_sample_pairs = array(
                    map(lambda x, y: [x, y], unique_strata,
                        agents_strata_sample_size))
                strata_sample_setting[i, ...] = strata_sample_pairs

            sampled_index = self._sample_by_agent_and_stratum(
                index1, index2, selectable_strata, prob,
                chosen_choice_index_to_index2, strata_sample_setting)
        #chosen_choice = None
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            sampled_index = concatenate(
                (chosen_choice_index[:, newaxis], sampled_index), axis=1)
            #chosen_choice = zeros(chosen_choice_index.shape, dtype="int32") - 1
            #chosen_choice[where(chosen_choice_index>UNPLACED_ID)] = 0 #make chosen_choice index to sampled_index, instead of choice (as chosen_choice_index does)
            #since the chosen choice index is attached to the first column, the chosen choice should be all zeros
            #for valid chosen_choice_index
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

            chosen_probability = zeros(
                (chosen_choice_index.size, ), dtype=float32) - 1
            for stratum in unique_strata:
                w = chosen_stratum == stratum
                chosen_probability[w] = (
                    prob[chosen_choice_index[w]] /
                    prob[selectable_strata == stratum].sum()).astype(float32)
            self._sampling_probability = concatenate(
                (chosen_probability[:, newaxis], self._sampling_probability),
                axis=1)
            self._stratum_id = concatenate(
                (chosen_stratum[:, newaxis], self._stratum_id), axis=1)

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(self._sampling_probability,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
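
# A minimal numpy sketch of the within-stratum probability computed above for the chosen
# alternatives: the normalized weight of the chosen choice divided by the total weight of
# its stratum. All arrays below are hypothetical illustration data, not part of the API.
from numpy import array, float32, zeros

prob = array([0.1, 0.2, 0.3, 0.4])       # normalized sampling weights of 4 choices
stratum_of_choice = array([1, 1, 2, 2])  # stratum id of each choice
chosen_choice_index = array([0, 3])      # chosen choice of 2 agents

chosen_stratum = stratum_of_choice[chosen_choice_index]
chosen_probability = zeros(chosen_choice_index.size, dtype=float32) - 1
for s in [1, 2]:
    w = chosen_stratum == s
    chosen_probability[w] = (prob[chosen_choice_index[w]] /
                             prob[stratum_of_choice == s].sum()).astype(float32)
# chosen_probability -> [0.1/0.3, 0.4/0.7], i.e. about [0.333, 0.571]
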
Exemple #22
class PandasDataset(Dataset):
    """
    This is under construction.
    It is an attempt to have an analogous to an Opus Dataset that would use 
    Pandas DataFrame. The actual data is stored in an attribute called df 
    which is a DataFrame and is indexed by the dataset's unique identifier. 
    The dataset can be created from the same inputs as Opus dataset.
    Alternatively, it can be created from an existing Opus dataset using 
    the constructor PandasClassFactory.
    """
    def __init__(self, create_from_data=True, **kwargs):
        if create_from_data:
            self.create_from_data(**kwargs)

    
    def create_from_data(self, resources=None, id_name=None, in_storage=None, dataset_name=None,
            out_storage=None, in_table_name=None, out_table_name=None):
        self.resources = Resources(resources)
        self.resources.merge_if_not_None({ "id_name":id_name,
                            "dataset_name":dataset_name,
                            "in_storage":in_storage,
                            "out_storage":out_storage,
                            "in_table_name":in_table_name,
                            "out_table_name":out_table_name})
        self.resources.merge_with_defaults({"dataset_name":"dataset"})
        self.dataset_name = self.resources.get("dataset_name", None)
        self.attribute_cache = AttributeCache()
        self._aliases = {}
        self._id_names = self.resources.get("id_name", [])
        if not isinstance(self._id_names, list):
            self._id_names = [self._id_names]
        self.variable_factory = VariableFactory()
        self.debug = self.resources.get("debug",  0)
        self.df = pd.DataFrame(self.resources.get('in_storage').load_table(self.resources.get('in_table_name')))
        self._primary_attribute_names = self.get_attribute_names()
        self.df.set_index(self._id_names, inplace=True)
        self.attribute_boxes = {}
        for attr in self._primary_attribute_names:
            self.attribute_boxes[attr] = AttributeBox(self, [],
                                                variable_name=self.create_and_check_qualified_variable_name(attr),
                                                type=AttributeType.PRIMARY,
                                                is_in_memory=True,
                                                header=None,
                                                version=0)
        self.n = self.df.shape[0]
            
    def __getitem__(self, attr):
        """ dataset[attr]
        """
        return self.get_attribute(attr)

    def __setitem__(self, attr, values):
        """ dataset[attr] = values
        """
        self.df[attr] = values

    def get_attribute(self, name):
        if isinstance(name, VariableName):
            name = name.get_alias()
        else:
            name = VariableName(name).get_alias()
        if name in self.get_id_name():
            return self.get_id_attribute()
        return self.df[name].values
    
    def get_id_attribute(self):
        return self.df.index.values
    
    def get_attribute_by_id(self, name, id):
        return self.df[name][id]
    
    def get_attribute_names(self):
        return self.df.columns
    
    def _do_flush_attribute(self, name):
        """For now don't do anything."""
        pass
        
    def load_dataset(self, resources=None, attributes=None, in_storage=None,
                     in_table_name=None, lowercase=None, **kwargs):

        #set defaults
        attributes_default = '*'
        lower_default = 1 # if 1, use lowercase for attribute names

        # merge arguments with dictionaries and add missing entries
        local_resources = Resources(self.resources)
        if resources is not None:
            local_resources.merge_if_not_None(resources)
        local_resources.merge_if_not_None({"attributes":attributes,
                                           "in_storage":in_storage,
                                           "in_table_name":in_table_name,
                                           "lowercase":lowercase})
        local_resources.merge_with_defaults({"attributes":attributes_default,
                                             "lowercase":lower_default,
                                            })

        # check obligatory entries
        local_resources.check_obligatory_keys(["in_storage", "in_table_name"])

        # prepare for loading
        in_storage = local_resources["in_storage"]

        if not self._is_hidden_id():
            local_resources.merge({"id_name":self._id_names})
            
        table_name = local_resources['in_table_name']
        column_names = local_resources['attributes']
        chunked_attributes = self.chunk_columns(storage=in_storage,
                                                   table_name=table_name, 
                                                   column_names=column_names,
                                                   nchunks=1)
        # flatten list
        column_names = [name for name in chunked_attributes[0]
                                if name in in_storage.get_column_names(table_name)]
        data = in_storage.load_table(table_name = table_name, 
                                             column_names = column_names)
        self.df = pd.DataFrame(data)
        self.df.set_index(self._id_names, inplace=True)
        data_computed = {}
        if table_name+".computed" in in_storage.get_table_names():
            column_names_computed = [name for name in column_names
                                if name in in_storage.get_column_names(table_name+".computed")]
            data_computed = in_storage.load_table(table_name = table_name+".computed", 
                                                 column_names = column_names_computed)
            dfcomp = pd.DataFrame(data_computed)
            dfcomp.set_index(self._id_names, inplace=True)
            self.df = pd.concat([self.df, dfcomp], axis=1)
                      
        for attr in data:
            if not ((attr in self._id_names) and self.attribute_boxes.has_key(attr)): #do not store id_name every time
                self.attribute_boxes[attr] = AttributeBox(self, [],
                                                variable_name=self.create_and_check_qualified_variable_name(attr),
                                                type=AttributeType.PRIMARY,
                                                is_in_memory=True,
                                                header=None,
                                                version=0)

        for attr in data_computed:
            if not ((attr in self._id_names) and self.attribute_boxes.has_key(attr)): #do not store id_name every time
                self.attribute_boxes[attr] = AttributeBox(self, [],
                                                variable_name=self.create_and_check_qualified_variable_name(attr),
                                                type=AttributeType.COMPUTED,
                                                is_in_memory=True,
                                                header=None,
                                                version=0)
                                                                        
        self.n = self.df.shape[0]

    def add_attribute(self, data, name, metadata=2):
        """Add values given in argument 'data' to dataset as an attribute 'name' as type 'metadata'. If this
        attribute already exists, its values are overwritten. 
        'metadata' should be of type AttributeType (PRIMARY=1, COMPUTED=2).
        The method increments and returns the version number of the attribute.
        """
        if not (isinstance(data, ndarray) or is_masked_array(data)):
            data=array(data)
        name = self.create_and_check_qualified_variable_name(name)
        short_name = name.get_alias()
        if short_name in self.get_attribute_names():
            self.attribute_boxes[short_name].set_is_in_memory(True)
            self.attribute_boxes[short_name].set_type(metadata)
        else:
            self.attribute_boxes[short_name] = AttributeBox(self, data=[], variable_name=name,
                                                type=metadata)
        if metadata == AttributeType.PRIMARY:
            self._add_to_primary_attribute_names(short_name)
        self.df[short_name] = data
        self.__increment_version(short_name)
        return self.get_version(short_name)
    
    def attribute_sum(self, name):
        """Return the sum of values of the attribute 'name'.
        """
        return self.df[name].sum()

    def attribute_average(self, name):
        """Return the value of the given attribute averaged over the dataset.
        """
        return self.df[name].mean()

    def summary(self, index=None):
        if index is not None:
            self.df[index].describe()
        else:
            self.df.describe()

    def size(self):
        """Return size of the dataset."""
        return self.df.shape[0]
    
    def get_data_element_by_id(self, id, all_attributes=False):
        """Return an object of class DataElement of the given identifier id. See get_data_element."""
        return self.get_data_element(id, all_attributes)
    
    def get_data_element(self, id, **kwargs):
        """Return an object of class DataElement of the given index. 
        """
        object = DataElement()
        for col in self.get_attribute_names():
            setattr(object, col, self.df[col][id])
        return object
    
    def subset_by_ids(self, ids, **kwargs):
        """Shrink the dataset to values given by 'index'. The removed data are then lost.
        """
        self.df = self.df.loc[ids]
        self.n = self.df.shape[0]

    def aggregate_dataset_over_ids(self, dataset, function='sum', attribute_name=None, constant=None):
        """Aggregate attribute (given by 'attribute_name') of the given 'dataset' over
        self by applying the given function. The dataset is expected to have an attribute of the same
        name as the unique identifier of self. If attribute_name is not given, the
        argument 'constant' must be given, which is either a scalar or a numpy array. if it
        is a scalar, for each individual to be counted the constant value is taken into the function;
        if it is a numpy array of the same size as dataset, the value in the same index as
        individual is counted into the function.
        """
        workdf = dataset.df
        if attribute_name is None:
            if constant is None:
                self._raise_error(StandardError,
                                  "Either 'attribute_name' or 'constant' must be given.")
            elif isinstance(constant, ndarray):
                if constant.size != dataset.size():
                    self._raise_error(StandardError,
                                      "constant's size (%d) must be the same as dataset's size (%d)"
                                      % (constant.size, dataset.size()))
                values = constant
            else:
                values = resize(array([constant]), dataset.size())
            attribute_name = '__constant__'
            workdf[attribute_name] = values 
        else: 
            if is_masked_array(dataset[attribute_name]):
                w = where(ma.getmask(dataset[attribute_name]))
                if len(w)>0:
                    where_masked = w[0]
                    # do not consider those elements in the computation
                    workdf[attribute_name] = ma.filled(workdf[attribute_name], NaN)
        #logger.start_block('Aggregate Pandas')
        grouped = workdf.groupby(self.get_id_name())[attribute_name]
        f = getattr(np, function)
        res = grouped.aggregate(f)
        #logger.end_block()
        return res

    def get_join_data(self, dataset, name, join_attribute=None, return_value_if_not_found=None, **kwargs):
        """Does a join on a attribute of two datasets (self and 'dataset').
        'join_attribute' specifies the join attribute of self. If this is None it is
        assumed to be identical to dataset._id_names which is the join attribute of 'dataset'.
        The method returns values of the attribute 'name' (which is an attribute of 'dataset')
        for the joined ids, i.e. the resulting array should have the same size as self.
        """
        default_return_values_by_type = default_filled_values_by_type = {'S':'',
                                                                         'U':'',
                                                                         'b':False,
                                                                         'i':-1,
                                                                         'u':0,
                                                                         'f':-1.0}
        id_name = dataset.get_id_name()
        jattr = join_attribute
        if jattr == None:
            jattr = id_name
        if not isinstance(jattr, list):
            jattr = [jattr]
        if not isinstance(name, list):
            name = [name]
        #logger.start_block('Disaggregate Pandas')
        result = self.df[jattr].join(dataset.df[name], on=jattr)[name]
        #result = dataset.df[name].loc[self.df[jattr[0]]]
        #logger.end_block()
        for attr in result.columns:
            if result[attr].dtype == object:
                result[attr] = result[attr].astype(dataset.df[attr].dtype)
            if np.isnan(result[attr].values).any():
                k = dataset.df[attr].values.dtype.kind
                if return_value_if_not_found is None and default_return_values_by_type.has_key(k):
                    val = default_return_values_by_type[k]
                else:
                    val = return_value_if_not_found
                result[attr].iloc[where(np.isnan(result[attr].values))] = val                
        return result
    
    def __set_version(self, name, version):
        self.attribute_boxes[name].set_version(version)

    def __increment_version(self, name):
        if self.get_version(name) == None:
            self.__set_version(name, 0)
        else:
            self.__set_version(name, self.get_version(name)+1)
    def run( self, building_set, building_types_table, vacancy_table, year, location_set,
            building_categories=None, dataset_pool=None, resources=None ):
        building_types = building_types_table.get_attribute("name")
        building_id_name = building_set.get_id_name()[0]
        location_id_name = location_set.get_id_name()[0]
        new_buildings = {building_id_name: array([], dtype=building_set.get_data_type(building_id_name)),
                         "building_type_id":array([], dtype=building_set.get_data_type("building_type_id", int8)),
                         "year_built": array([], dtype=building_set.get_data_type("year_built", int32)),
                         "sqft": array([], dtype=building_set.get_data_type("sqft", int32)),
                         "residential_units": array([], dtype=building_set.get_data_type("residential_units", int32)),
                         "improvement_value": array([], dtype= building_set.get_data_type("improvement_value", float32)),
                         "land_value": array([], dtype= building_set.get_data_type("land_value", float32)),
                         location_id_name: array([], dtype=building_set.get_data_type(location_id_name, int32))}
        max_id = building_set.get_id_attribute().max()
        buildings_set_size_orig = building_set.size()

        for itype in range(building_types_table.size()): # iterate over building types
            type = building_types[itype]
            type_code = building_types_table.get_id_attribute()[itype]
            is_residential = building_types_table.get_attribute("is_residential")[itype]
            vacancy_attribute = 'target_total_%s_vacancy' % type
            if vacancy_attribute not in vacancy_table.get_known_attribute_names():
                logger.log_warning("No target vacancy for building type '%s'. Transition model for this building type skipped." % type)
                continue
            vacancy_table.get_attribute(vacancy_attribute)  # ensures that the attribute is loaded
            target_vacancy_rate = eval("vacancy_table.get_data_element_by_id( year ).%s" % vacancy_attribute)

            compute_resources = Resources(resources)
            compute_resources.merge({"debug":self.debug})
            units_attribute = building_types_table.get_attribute('units')[itype]

            # determine current-year vacancy rates
            if is_residential:
                default_vacancy_variable = "urbansim.%s.vacant_%s_units_from_buildings" % (
                                                                   location_set.get_dataset_name(), type)
            else:
                default_vacancy_variable = "urbansim.%s.vacant_%s_sqft_from_buildings" % (
                                                                   location_set.get_dataset_name(), type)
            variable_for_vacancy = compute_resources.get(
                                    "%s_vacant_variable" % type, default_vacancy_variable)
            location_set.compute_variables([variable_for_vacancy, "urbansim.%s.buildings_%s_space" % (
                                                                      location_set.get_dataset_name(),type)],
                                        dataset_pool=dataset_pool, resources = compute_resources)

            vacant_units_sum = location_set.get_attribute(variable_for_vacancy).sum()
            units_sum = float( location_set.get_attribute("buildings_%s_space" % type).sum() )
            vacant_rate = self.safe_divide(vacant_units_sum, units_sum)

            should_develop_units = int(round(max( 0, ( target_vacancy_rate * units_sum - vacant_units_sum ) /
                                         ( 1 - target_vacancy_rate ) )))
            logger.log_status(type + ": vacant units: %d, should be vacant: %f, sum units: %d"
                          % (vacant_units_sum, target_vacancy_rate * units_sum, units_sum))

            if not should_develop_units:
                logger.log_note(("Will not build any " + type + " units, because the current vacancy of %d units\n"
                             + "is more than the %d units desired for the vacancy rate of %f.")
                            % (vacant_units_sum,
                               target_vacancy_rate * units_sum,
                               target_vacancy_rate))
                continue

            improvement_value = building_set.compute_variables("urbansim.%s.%s_improvement_value" % (
                                                                     building_set.get_dataset_name(), type),
                                                                   dataset_pool=dataset_pool,
                                                                   resources=compute_resources)
            average_improvement_value = improvement_value.sum()/ units_sum

            #create buildings
            is_building_type = building_set.compute_variables("urbansim.building.is_building_type_%s" % type,
                                                              dataset_pool=dataset_pool,
                                                              resources=compute_resources)
            units_of_this_type = building_set.compute_variables(units_attribute, dataset_pool=dataset_pool,
                                           resources=compute_resources)
            units_of_this_type = units_of_this_type*is_building_type
            units_without_zeros_idx = where(units_of_this_type > 0)[0]
            history_values_without_zeros = units_of_this_type[units_without_zeros_idx]
            history_improvement_values_without_zeros = where(improvement_value[units_without_zeros_idx]>0,
                                                             improvement_value[units_without_zeros_idx],
                                                             average_improvement_value)
            mean_size = history_values_without_zeros.mean()
            idx = array( [], dtype="int32" )
            # Ensure that there are some development projects to choose from.
            num_of_projects_to_select = max( 10, int( should_develop_units / mean_size ) )
            while True:
                idx = concatenate( ( idx, randint( 0, history_values_without_zeros.size,
                                                   size=num_of_projects_to_select) ) )
                csum = history_values_without_zeros[idx].cumsum()
                idx = idx[where( csum <= should_develop_units )]
                if csum[-1] >= should_develop_units:
                    break
            nbuildings = idx.size
            new_buildings["building_type_id"] = concatenate((new_buildings["building_type_id"], type_code*ones(nbuildings)))
            new_buildings["year_built"] = concatenate((new_buildings["year_built"], year*ones(nbuildings)))
            new_max_id = max_id + nbuildings
            new_buildings[building_id_name]=concatenate((new_buildings[building_id_name], arange(max_id+1, new_max_id+1)))
            max_id = new_max_id
            new_buildings["improvement_value"] = concatenate((new_buildings["improvement_value"],
                                                              history_improvement_values_without_zeros[idx]))

            if is_residential:
                target_size_attribute = "residential_units"
                zero_attribute = "sqft"
            else:
                target_size_attribute = "sqft"
                zero_attribute = "residential_units"
            new_buildings[target_size_attribute] = concatenate((new_buildings[target_size_attribute], history_values_without_zeros[idx]))
            new_buildings[zero_attribute] = concatenate((new_buildings[zero_attribute], zeros(nbuildings)))
            new_buildings[location_id_name] = concatenate((new_buildings[location_id_name], zeros(nbuildings)))
            new_buildings["land_value"] = concatenate((new_buildings["land_value"], zeros(nbuildings)))
            logger.log_status("Creating %s %s of %s %s buildings." % (history_values_without_zeros[idx].sum(),
                                                                   target_size_attribute, nbuildings, type))

        building_set.add_elements(new_buildings, require_all_attributes=False)
        if building_categories: # should be a dictionary of categories for each building type
            building_set.resources['building_categories'] = building_categories
        # add submodel attribute
        category_variables = map(lambda type: "urbansim.%s.size_category_%s" % (building_set.get_dataset_name(), type),
                                           building_types)

        for category_var in category_variables:
            var = VariableName(category_var)
            if var.get_alias() in building_set.get_known_attribute_names():
                building_set.delete_one_attribute(var)
            building_set.compute_variables(var, dataset_pool=dataset_pool, resources = compute_resources)
            building_set.add_primary_attribute(building_set.get_attribute(var), var.get_alias())

        difference = building_set.size() - buildings_set_size_orig
        return difference
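
# A minimal sketch of the target-vacancy arithmetic used above: the number of units to
# add so that the vacancy rate reaches the target is (target * units - vacant) / (1 - target),
# floored at zero. The helper name and the numbers are made up for illustration only.
def units_to_develop(units_sum, vacant_units_sum, target_vacancy_rate):
    return int(round(max(0, (target_vacancy_rate * units_sum - vacant_units_sum) /
                            (1 - target_vacancy_rate))))

# 10000 existing units, 300 vacant, 5% target vacancy -> about 211 new units, since
# (10000 + 211) * 0.05 is roughly 511, which matches the 300 + 211 vacant units afterwards.
assert units_to_develop(10000, 300, 0.05) == 211
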
Exemple #24
    def run(self,
            dataset1,
            dataset2,
            index1=None,
            index2=None,
            sample_size=10,
            weight=None,
            include_chosen_choice=None,
            with_replacement=True,
            resources=None,
            dataset_pool=None):
        """
        
        
        this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            sc = SessionConfiguration()
            try:
                dataset_pool = sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool(sc.package_order)

        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                 "index1": index1, "index2": index2,
                 "sample_size": sample_size, "weight": weight,
                 "with_replacement": with_replacement,
                 "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(
            ['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        choice = local_resources["dataset2"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)

        agent_category_definition = local_resources.get(
            "agent_category_definition", [])
        choice_category_definition = local_resources.get(
            "choice_category_definition", [])
        agent_filter_attribute = local_resources.get("agent_filter_attribute",
                                                     None)
        category_inflating_factor = local_resources.get(
            "category_inflating_factor", 10)

        frequency, unique_agent_category_id, unique_choice_category_id, agent_category_id, choice_category_id = \
                get_category_and_frequency(agent, agent_category_definition,
                                           choice, choice_category_definition,
                                           agent_filter_attribute, category_inflating_factor,
                                           dataset_pool=dataset_pool)

        include_chosen_choice = local_resources.get("include_chosen_choice",
                                                    False)
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        chosen_choice_index = choice.try_get_id_index(
            chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index,
                                               index2,
                                               index_if_not_found=UNPLACED_ID)

        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1
        local_resources.merge_with_defaults(
            {'with_replacement': with_replacement})
        with_replacement = local_resources.get("with_replacement")

        sampled_index = empty((index1.size, J), dtype=DTYPE)
        sampling_prob = empty((index1.size, J), dtype="float64")

        _digitize, _where, _normalize = digitize, where, normalize
        _ncumsum, _rand, _searchsorted = ncumsum, rand, searchsorted  #speed hack
        for i in range(unique_agent_category_id.size):
            category_id = unique_agent_category_id[i]
            agents_in_this_category = _where(
                agent_category_id[index1] == category_id)[0]
            num_agents = agents_in_this_category.size
            if num_agents == 0: continue
            #import pdb; pdb.set_trace()

            ## divide frequency by the mean frequency to avoid overflow
            weights = frequency[
                i,
                _digitize(choice_category_id[index2], unique_choice_category_id
                          ) - 1] / frequency[i, :].mean()
            prob = _normalize(weights)
            index = _searchsorted(_ncumsum(prob),
                                  _rand(num_agents * J)).reshape(-1, J)

            if not with_replacement:
                raise NotImplementedError, "Sample without replacement is not implemented for this sampler yet."
                #    nz = nonzero(prob)[0].size
                #    if J < nz:
                #        ## number of non zero weight less than alternatives, sample with replacement
                #        logger.log_warning("There are %s non zero weights and are less than the number of alternatives proposed %s. " % (nz, J) +
                #                           "Sample with replacement instead.")
                #        continue
                #    i=0; max_iterations=200
                #    while True:
                #        index = sort(index, axis=1)
                #        where_repeats = nonzero( logical_not(diff(index, axis=1)) )
                #        num_repeats = where_repeats[0].size
                #        if num_repeats == 0: break
                #        index[where_repeats] = _searchsorted(_rand(num_repeats), prob)
                #        i += 1
                #        if i > max_iterations:
                #            logger.log_warning("weight_sampler_by_category is unable to sample %i alternatives without replacement in %i iterations; " % \
                #                               (J, max_iterations) +
                #                               "give up sampling without replacement and results may contain replacement."
                #                              )
                #            break

            sampled_index[agents_in_this_category, :] = index
            sampling_prob[agents_in_this_category, :] = prob[index]

        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack(
                (chosen_choice_index[:, newaxis], sampled_index))
            is_chosen_choice[chosen_choice_index != UNPLACED_ID, 0] = 1

            sampling_prob_for_chosen_choices = take(
                prob, chosen_choice_index_to_index2[:, newaxis])
            ## if the chosen choice is unplaced, its sampling probability is 0
            sampling_prob_for_chosen_choices[where(
                chosen_choice_index == UNPLACED_ID)[0], ] = 0.0
            sampling_prob = column_stack(
                [sampling_prob_for_chosen_choices, sampling_prob])

        #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]

        interaction_dataset = self.create_interaction_dataset(
            dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob,
                                          '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32")
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)

        return interaction_dataset
    def run(self, dataset1, dataset2, index1=None, index2=None, sample_size=10, weight=None,
            include_chosen_choice=False, with_replacement=False, resources=None, dataset_pool=None):
        
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alterantives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled for each agent.
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""

        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool=sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()
        
        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                "index1":index1, "index2": index2,
                "sample_size": sample_size, "weight": weight,
                "with_replacement": with_replacement,
                "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2', 'sample_size'])
        agent = local_resources["dataset1"]
        index1 = local_resources.get("index1", None)
        if index1 is None:
            index1 = arange(agent.size())
        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)
        if index2 is None:
            index2 = arange(choice.size())
            
        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return None
        
        include_chosen_choice = local_resources.get("include_chosen_choice",  False)
        J = local_resources["sample_size"]
        if include_chosen_choice:
            J = J - 1
            
        with_replacement = local_resources.get("with_replacement")
            
        weight = local_resources.get("weight", None)
        if isinstance(weight, str):
            if weight in choice.get_known_attribute_names():
                weight=choice.get_attribute(weight)
                rank_of_weight = 1 
            elif VariableName(weight).get_dataset_name() == choice.get_dataset_name():
                weight=choice.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 1
            else:
                ## weights can be an interaction variable
                interaction_dataset = InteractionDataset(local_resources)
                weight=interaction_dataset.compute_variables(weight, dataset_pool=dataset_pool)
                rank_of_weight = 2
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif not weight:  ## weight is None or empty string
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unkown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size <> index2.size) and (weight.shape[rank_of_weight-1] <> index2.size):
            if weight.shape[rank_of_weight-1] == choice.size():
                if rank_of_weight == 1:
                    weight = take(weight, index2)
                if rank_of_weight == 2:
                    weight = take(weight, index2, axis=1)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError, err_msg

        prob = normalize(weight)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=UNPLACED_ID)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)
        
        if rank_of_weight == 1: # if weight_array is 1d, then each agent shares the same weight for choices
            replace = with_replacement           # sampling with no replacement 
            if nonzerocounts(weight) < J:
                logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
                replace = True
            sampled_index = prob2dsample( index2, sample_size=(index1.size, J),
                                        prob_array=prob, exclude_index=chosen_choice_index_to_index2,
                                        replace=replace, return_index=True )
            #return index2[sampled_index]

        if rank_of_weight == 2:
            sampled_index = zeros((index1.size,J), dtype="int32") - 1
                
            for i in range(index1.size):
                replace = with_replacement          # sampling with/without replacement
                i_prob = prob[i,:]
                if nonzerocounts(i_prob) < J:
                    logger.log_warning("weight array dosen't have enough non-zero counts, use sample with replacement")
                    replace = True

                #exclude_index passed to probsample_noreplace needs to be indexed to index2
                sampled_index[i,:] = probsample_noreplace( index2, sample_size=J, prob_array=i_prob,
                                                     exclude_index=chosen_choice_index_to_index2[i],
                                                     return_index=True )
        sampling_prob = take(prob, sampled_index)
        sampled_index = index2[sampled_index]
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        #chosen_choice = -1 * ones(chosen_choice_index.size, dtype="int32")
        if include_chosen_choice:
            sampled_index = column_stack((chosen_choice_index[:,newaxis],sampled_index))
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1
            #chosen_choice[where(is_chosen_choice)[0]] = where(is_chosen_choice)[1]
            ## this is necessary because prob is indexed to index2, not to the choice set (as is chosen_choice_index)
            sampling_prob_for_chosen_choices = take(prob, chosen_choice_index_to_index2[:, newaxis])
            ## if the chosen choice equals unplaced_id then the sampling prob is 0
            sampling_prob_for_chosen_choices[where(chosen_choice_index==UNPLACED_ID)[0],] = 0.0
            sampling_prob = column_stack([sampling_prob_for_chosen_choices, sampling_prob])
        
        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(sampling_prob, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        
        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32") 
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)
        
        return interaction_dataset
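    # Illustrative usage (a minimal sketch, not part of the original source): this
    # sampler is normally driven by a choice model, but it can be called directly.
    # The names 'households', 'gridcells', the weight expression and the class name
    # 'weighted_sampler' below are hypothetical placeholders; parameter names are
    # assumed to match the companion stratified sampler further below.
    #
    #   sampler = weighted_sampler()
    #   sampled = sampler.run(households, gridcells,
    #                         weight="urbansim.gridcell.vacant_residential_units",
    #                         sample_size=30, include_chosen_choice=True)
    #   probabilities = sampled.get_attribute('__sampling_probability')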
Exemple #26
    def run(self, dataset1, dataset2, index1=None, index2=None, stratum=None, weight=None,
            sample_size=1, sample_size_from_each_stratum=None, sample_size_from_chosen_stratum=None, sample_rate=None,
            include_chosen_choice=False, resources=None, with_replacement=False, dataset_pool=None, **kwargs):
        """this function samples number of sample_size (scalar value) alternatives from dataset2
        for agent set specified by dataset1.
        If index1 is not None, only samples alternatives for agents with indices in index1;
        if index2 is not None, only samples alternatives from indices in index2.
        sample_size specifies number of alternatives to be sampled from each stratum, and is overwritten
          by sample_size_from_each_stratum if it's not None
        weight, to be used as sampling weight, is either an attribute name of dataset2, or a 1d
        array of the same length as index2 or 2d array of shape (index1.size, index2.size).

        Also refer to document of interaction_dataset"""
        if dataset_pool is None:
            try:
                sc = SessionConfiguration()
                dataset_pool=sc.get_dataset_pool()
            except:
                dataset_pool = DatasetPool()
                        
        local_resources = Resources(resources)
        local_resources.merge_if_not_None(
                {"dataset1": dataset1, "dataset2": dataset2,
                "index1":index1, "index2": index2,
                "with_replacement": with_replacement,
                "stratum":stratum, "weight": weight,
                "sample_size": sample_size,
                "sample_size_from_each_stratum": sample_size_from_each_stratum,
                "sample_size_from_chosen_stratum": sample_size_from_chosen_stratum,
                
                "sample_rate": sample_rate,
                "include_chosen_choice": include_chosen_choice})

        local_resources.check_obligatory_keys(['dataset1', 'dataset2'])
        index1 = local_resources.get("index1", None)

        agent = dataset1

        if index1 is None:
            agent.get_id_attribute()
            index1 = arange(agent.size())

        choice = local_resources["dataset2"]
        index2 = local_resources.get("index2", None)

        if index2 is None:
            choice.get_id_attribute()
            index2 = arange(choice.size())

        if index1.size == 0 or index2.size == 0:
            err_msg = "either choice size or agent size is zero, return None"
            logger.log_warning(err_msg)
            return (None, None)

        include_chosen_choice = local_resources.get("include_chosen_choice",  False)
        weight = local_resources.get("weight", None)

        if isinstance(weight, str):
            choice.compute_variables(weight,
                resources = local_resources )
            weight=choice.get_attribute(weight)
            rank_of_weight = 1
        elif isinstance(weight, ndarray):
            rank_of_weight = weight.ndim
        elif weight is None:
            weight = ones(index2.size)
            rank_of_weight = 1
        else:
            err_msg = "unknown weight type"
            logger.log_error(err_msg)
            raise TypeError, err_msg

        if (weight.size != index2.size) and (weight.shape[rank_of_weight-1] != index2.size):
            if weight.shape[rank_of_weight-1] == choice.size():
                weight = take(weight, index2)
            else:
                err_msg = "weight array size doesn't match to size of dataset2 or its index"
                logger.log_error(err_msg)
                raise ValueError, err_msg

        prob = normalize(weight)

        stratum = local_resources.get("stratum", None)
        if stratum is None:
            raise StandardError, "'stratum' must be defined for stratified sampling."
        if isinstance(stratum, str):
            choice.compute_variables(stratum,
                resources = local_resources )
            stratum=choice.get_attribute(stratum)

        #chosen_choice = ones(index1.size) * UNPLACED_ID
        chosen_choice_id = agent.get_attribute(choice.get_id_name()[0])[index1]
        #index_of_placed_agent = where(greater(chosen_choice_id, UNPLACED_ID))[0]
        chosen_choice_index = choice.try_get_id_index(chosen_choice_id, return_value_if_not_found=-1)
        chosen_choice_index_to_index2 = lookup(chosen_choice_index, index2, index_if_not_found=UNPLACED_ID)
        
        ##TODO: check all chosen strata are in selectable strata
        #i.e. chosen_choice_index is in index2
        chosen_stratum = ones(chosen_choice_index.size, dtype=DTYPE) * NO_STRATUM_ID
        chosen_stratum[where(chosen_choice_index!=-1)] = stratum[chosen_choice_index[where(chosen_choice_index!=-1)]]
        selectable_strata = stratum[index2]
        unique_strata = unique(selectable_strata)
        unique_strata = unique_strata[where(unique_strata!=NO_STRATUM_ID)]

#        if rank_of_weight == 2:
#            raise RuntimeError, "stratified sampling for 2d weight is unimplemented yet"

#        sampled_index = zeros((index1.size,1)) - 1

        sample_size = local_resources.get("sample_size", None)
        sample_size_from_each_stratum = local_resources.get("sample_size_from_each_stratum", None)
        if sample_size_from_each_stratum is None:
            sample_size_from_each_stratum = sample_size
        strata_sample_size = ones(unique_strata.size, dtype=DTYPE) * sample_size_from_each_stratum
        sample_rate = local_resources.get("sample_rate", None)
        if sample_rate is not None:
            raise NotImplementedError, "sample_rate is not implemented yet."
            ##TODO: to be finished
            #num_elements_in_strata = histogram(selectable_strata, unique_strata)
            #strata_sample_size = round(num_elements_in_strata * sample_rate)

        sample_size_from_chosen_stratum = local_resources.get("sample_size_from_chosen_stratum", None)
        if sample_size_from_chosen_stratum is None and not include_chosen_choice:
            strata_sample_pairs = array(map(lambda x,y: [x,y], unique_strata, strata_sample_size))
            if rank_of_weight == 1:
                sampled_index = self._sample_by_stratum(index1, index2, selectable_strata, prob,
                                                        chosen_choice_index_to_index2, strata_sample_pairs)
            elif rank_of_weight == 2:
                sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                                  chosen_choice_index_to_index2, strata_sample_pairs)
        else:
            strata_sample_setting = zeros((index1.size,unique_strata.size,2), dtype=DTYPE)
            for i in range(index1.size):
                agents_strata_sample_size = copy.copy(strata_sample_size)
                if sample_size_from_chosen_stratum is None:
                    ## if sample_size_from_chosen_stratum is None and include_chosen_choice is True, 
                    ## sample one less from the chosen stratum
                    agents_strata_sample_size[where(unique_strata==chosen_stratum[i])] -= 1
                else:
                    agents_strata_sample_size[where(unique_strata==chosen_stratum[i])] = sample_size_from_chosen_stratum
                strata_sample_pairs = array(map(lambda x,y: [x,y], unique_strata, agents_strata_sample_size))
                strata_sample_setting[i,...] = strata_sample_pairs

            sampled_index = self._sample_by_agent_and_stratum(index1, index2, selectable_strata, prob,
                                                              chosen_choice_index_to_index2, strata_sample_setting)
        #chosen_choice = None
        is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
        if include_chosen_choice:
            sampled_index = concatenate((chosen_choice_index[:,newaxis],sampled_index), axis=1)
            #chosen_choice = zeros(chosen_choice_index.shape, dtype="int32") - 1
            #chosen_choice[where(chosen_choice_index>UNPLACED_ID)] = 0 #make chosen_choice index to sampled_index, instead of choice (as chosen_choice_index does)
                                                                      #since the chosen choice index is attached to the first column, the chosen choice should be all zeros
                                                                      #for valid chosen_choice_index
            is_chosen_choice = zeros(sampled_index.shape, dtype="bool")
            is_chosen_choice[chosen_choice_index!=UNPLACED_ID, 0] = 1
            
            chosen_probability = zeros((chosen_choice_index.size,),dtype=float32) - 1
            for stratum in unique_strata:
                w = chosen_stratum==stratum
                chosen_probability[w] = (prob[chosen_choice_index[w]] / prob[selectable_strata==stratum].sum()).astype(float32)
            self._sampling_probability = concatenate((chosen_probability[:,newaxis], self._sampling_probability), axis=1)
            self._stratum_id = concatenate((chosen_stratum[:,newaxis], self._stratum_id), axis=1)

        interaction_dataset = self.create_interaction_dataset(dataset1, dataset2, index1, sampled_index)
        interaction_dataset.add_attribute(self._sampling_probability, '__sampling_probability')
        interaction_dataset.add_attribute(is_chosen_choice, 'chosen_choice')
        interaction_dataset.add_attribute(self._stratum_id, 'stratum_id')

        ## to get the older returns
        #sampled_index = interaction_dataset.get_2d_index()
        #chosen_choices = UNPLACED_ID * ones(index1.size, dtype="int32") 
        #where_chosen = where(interaction_dataset.get_attribute("chosen_choice"))
        #chosen_choices[where_chosen[0]]=where_chosen[1]
        #return (sampled_index, chosen_choice)
        
        return interaction_dataset    
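    # Illustrative usage (a hedged sketch, not part of the original source):
    # stratified sampling draws a fixed number of alternatives per stratum.
    # 'households', 'zones', the stratum and weight expressions, and the class
    # name 'stratified_sampler' are all hypothetical placeholders.
    #
    #   sampler = stratified_sampler()
    #   sampled = sampler.run(households, zones,
    #                         stratum="zone.large_area_id",
    #                         weight="urbansim.zone.vacant_residential_units",
    #                         sample_size=2,                 # drawn from each stratum
    #                         include_chosen_choice=True)
    #   # the result carries '__sampling_probability', 'chosen_choice' and 'stratum_id'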
class RegressionModel(ChunkModel):

    model_name = "Regression Model"
    model_short_name = "RM"

    def __init__(self, regression_procedure="opus_core.linear_regression",
                  submodel_string=None,
                  run_config=None, estimate_config=None, debuglevel=0, dataset_pool=None):
 
        self.debug = DebugPrinter(debuglevel)

        self.dataset_pool = self.create_dataset_pool(dataset_pool)

        self.regression = RegressionModelFactory().get_model(name=regression_procedure)
        if self.regression == None:
            raise StandardError, "No regression procedure given."

        self.submodel_string = submodel_string

        self.run_config = run_config
        if self.run_config == None:
            self.run_config = Resources()
        if not isinstance(self.run_config,Resources) and isinstance(self.run_config, dict):
            self.run_config = Resources(self.run_config)

        self.estimate_config = estimate_config
        if self.estimate_config == None:
            self.estimate_config = Resources()
        if not isinstance(self.estimate_config,Resources) and isinstance(self.estimate_config, dict):
            self.estimate_config = Resources(self.estimate_config)
            
        self.data = {}
        self.coefficient_names = {}
        ChunkModel.__init__(self)
        self.get_status_for_gui().initialize_pieces(3, pieces_description = array(['initialization', 'computing variables', 'submodel: 1']))

    def run(self, specification, coefficients, dataset, index=None, chunk_specification=None,
            data_objects=None, run_config=None, initial_values=None, procedure=None, debuglevel=0):
        """'specification' is of type EquationSpecification,
            'coefficients' is of type Coefficients,
            'dataset' is of type Dataset,
            'index' are indices of individuals in dataset for which
                        the model runs. If it is None, the whole dataset is considered.
            'chunk_specification' determines the number of chunks in which the simulation is processed.
            'data_objects' is a dictionary where each key is the name of a data object
            ('zone', ...) and its value is an object of class Dataset.
            'run_config' is of type Resources; it gives additional arguments for the run.
            If 'procedure' is given, it overwrites the regression_procedure of the constructor.
            'initial_values' is an array of the initial values of the results. It will be overwritten
            by the results for those elements that are handled by the model (defined by submodels in the specification).
            By default the results are initialized with 0.
            'debuglevel' overwrites the constructor 'debuglevel'.
        """
        self.debug.flag = debuglevel
        if run_config == None:
            run_config = Resources()
        if not isinstance(run_config,Resources) and isinstance(run_config, dict):
            run_config = Resources(run_config)
        self.run_config = run_config.merge_with_defaults(self.run_config)
        self.run_config.merge({"debug":self.debug})
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.dataset_pool.replace_dataset(dataset.get_dataset_name(), dataset)
        if procedure is not None: 
            self.regression = RegressionModelFactory().get_model(name=procedure)
        if initial_values is None:
            self.initial_values = zeros((dataset.size(),), dtype=float32)
        else:
            self.initial_values = zeros((dataset.size(),), dtype=initial_values.dtype)
            self.initial_values[index] = initial_values
            
        if dataset.size()<=0: # no data loaded yet
            dataset.get_id_attribute()
        if index == None:
            index = arange(dataset.size())
            
        result = ChunkModel.run(self, chunk_specification, dataset, index, float32,
                                 specification=specification, coefficients=coefficients)
        return result
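    # Illustrative call (a minimal sketch under stated assumptions, not from the
    # original source): 'my_specification', 'my_coefficients' and 'gridcells' are
    # hypothetical placeholders for an EquationSpecification, a Coefficients object
    # and a Dataset.
    #
    #   model = RegressionModel(regression_procedure="opus_core.linear_regression",
    #                           submodel_string="development_type_id")
    #   outcome = model.run(my_specification, my_coefficients, gridcells,
    #                       chunk_specification={'nchunks': 2})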

    def run_chunk (self, index, dataset, specification, coefficients):
        self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1)
        compute_resources = Resources({"debug":self.debug})
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels, self.submodel_string, dataset, index,
                                      dataset_pool=self.dataset_pool, resources = compute_resources)
        variables = self.specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...",4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables, dataset_pool = self.dataset_pool, resources = compute_resources)
        data = {}
        coef = {}
        outcome=self.initial_values[index].copy()
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,submodel)
            self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0,:]
            self.debug.print_debug("Compute regression for submodel " +str(submodel),4)
            self.increment_current_status_piece()
            self.data[submodel] = dataset.create_regression_data(coef[submodel],
                                                                index = index[self.observations_mapping[submodel]])
            nan_index = where(isnan(self.data[submodel]))[1]
            inf_index = where(isinf(self.data[submodel]))[1]
            vnames = asarray(coef[submodel].get_variable_names())
            if nan_index.size > 0:
                nan_var_index = unique(nan_index)
                self.data[submodel] = nan_to_num(self.data[submodel])
                logger.log_warning("NaN(Not A Number) is returned from variable %s; it is replaced with %s." % (vnames[nan_var_index], nan_to_num(nan)))
                #raise ValueError, "NaN(Not A Number) is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % vnames[nan_var_index]
            if inf_index.size > 0:
                inf_var_index = unique(inf_index)
                self.data[submodel] = nan_to_num(self.data[submodel])
                logger.log_warning("Inf is returned from variable %s; it is replaced with %s." % (vnames[inf_var_index], nan_to_num(inf)))
                #raise ValueError, "Inf is returned from variable %s; check the model specification table and/or attribute values used in the computation for the variable." % vnames[inf_var_index]
            
            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0): # observations for this submodel available
                outcome[self.observations_mapping[submodel]] = \
                    self.regression.run(self.data[submodel], coef[submodel].get_coefficient_values()[0,:],
                        resources=self.run_config).astype(outcome.dtype)
        return outcome

    def correct_infinite_values(self, dataset, outcome_attribute_name, maxvalue=1e+38, clip_all_larger_values=False):
        """Check if the model resulted in infinite values. If yes,
        print a warning and clip the values to maxvalue.
        If clip_all_larger_values is True, all values larger than maxvalue are clipped to maxvalue.
        """
        infidx = where(dataset.get_attribute(outcome_attribute_name) == inf)[0]

        if infidx.size > 0:
            logger.log_warning("Infinite values in %s. Clipped to %s." % (outcome_attribute_name, maxvalue))
            dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, infidx)
        if clip_all_larger_values:
            idx = where(dataset.get_attribute(outcome_attribute_name) > maxvalue)[0]
            if idx.size > 0:
                logger.log_warning("Values in %s larger than %s. Clipped to %s." % (outcome_attribute_name, maxvalue, maxvalue))
                dataset.set_values_of_one_attribute(outcome_attribute_name, maxvalue, idx)
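    # For instance (a hypothetical call; 'gridcells' and the attribute name are
    # placeholders for the dataset and outcome attribute the model just wrote):
    #
    #   model.correct_infinite_values(gridcells, 'ln_total_land_value',
    #                                 maxvalue=1e+10, clip_all_larger_values=True)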
            
    def estimate(self, specification, dataset, outcome_attribute, index = None, procedure=None, data_objects=None,
                        estimate_config=None,  debuglevel=0):
        """'specification' is of type EquationSpecification,
            'dataset' is of type Dataset,
            'outcome_attribute' - string that determines the dependent variable,
            'index' are indices of individuals in dataset for which
                    the model runs. If it is None, the whole dataset is considered.
            'procedure' - name of the estimation procedure. If it is None,
                there should be an entry "estimation" in 'estimate_config' that determines the procedure. The class
                must have a method 'run' that takes as arguments 'data', 'regression_procedure' and 'resources'.
                It returns a dictionary with entries 'estimators', 'standard_errors' and 't_values' (all 1D numpy arrays).
            'data_objects' is a dictionary where each key is the name of a data object
                    ('zone', ...) and its value is an object of class Dataset.
            'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure.
            'debuglevel' overwrites the class 'debuglevel'.
        """
        #import wingdbstub
        self.debug.flag = debuglevel
        if estimate_config == None:
            estimate_config = Resources()
        if not isinstance(estimate_config,Resources) and isinstance(estimate_config, dict):
            estimate_config = Resources(estimate_config)
        self.estimate_config = estimate_config.merge_with_defaults(self.estimate_config)
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.procedure=procedure
        if self.procedure == None:
            self.procedure = self.estimate_config.get("estimation", None)
        if self.procedure is not None:
            self.procedure = ModelComponentCreator().get_model_component(self.procedure)
        else:
            logger.log_warning("No estimation procedure given, or problems with loading the corresponding module.")

        compute_resources = Resources({"debug":self.debug})
        if dataset.size()<=0: # no data loaded yet
            dataset.get_id_attribute()
        if index == None:
            index = arange(dataset.size())
        if not isinstance(index,ndarray):
            index=array(index)

        estimation_size_agents = self.estimate_config.get("estimation_size_agents", None) # should be a proportion of the agent_set
        if estimation_size_agents == None:
            estimation_size_agents = 1.0
        else:
            estimation_size_agents = max(min(estimation_size_agents,1.0),0.0) # between 0 and 1

        if estimation_size_agents < 1.0:
            self.debug.print_debug("Sampling agents for estimation ...",3)
            estimation_idx = sample_noreplace(arange(index.size),
                                                         int(index.size*estimation_size_agents))
        else:
            estimation_idx = arange(index.size)

        estimation_idx = index[estimation_idx]
        self.debug.print_debug("Number of observations for estimation: " + str(estimation_idx.size),2)
        if estimation_idx.size <= 0:
            self.debug.print_debug("Nothing to be done.",2)
            return (None, None)

        coefficients = create_coefficient_from_specification(specification)
        self.specified_coefficients = SpecifiedCoefficients().create(coefficients, specification, neqs=1)
        submodels = self.specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(submodels, self.submodel_string, dataset, estimation_idx,
                                      dataset_pool=self.dataset_pool, resources = compute_resources,
                                      submodel_size_max=self.estimate_config.get('submodel_size_max', None))
        variables = self.specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...",4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables, dataset_pool=self.dataset_pool, resources = compute_resources)

        coef = {}
        estimated_coef={}
        self.outcome = {}
        dataset.compute_variables([outcome_attribute], dataset_pool=self.dataset_pool, resources=compute_resources)
        regression_resources=Resources(estimate_config)
        regression_resources.merge({"debug":self.debug})
        outcome_variable_name = VariableName(outcome_attribute)
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(self.specified_coefficients,submodel)
            self.increment_current_status_piece()
            logger.log_status("Estimate regression for submodel " +str(submodel),
                               tags=["estimate"], verbosity_level=2)
            #logger.log_status("Number of observations: " +str(self.observations_mapping[submodel].size),
                               #tags=["estimate"], verbosity_level=2)
            self.data[submodel] = dataset.create_regression_data_for_estimation(coef[submodel],
                                                            index = estimation_idx[self.observations_mapping[submodel]])
            self.coefficient_names[submodel] = coef[submodel].get_coefficient_names_without_constant()[0,:]
            if (self.data[submodel].shape[0] > 0) and (self.data[submodel].size > 0) and (self.procedure is not None): # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(outcome_variable_name.get_alias(), estimation_idx[self.observations_mapping[submodel]])   
                regression_resources.merge({"outcome":  self.outcome[submodel]})
                regression_resources.merge({"coefficient_names":self.coefficient_names[submodel].tolist(),
                            "constant_position": coef[submodel].get_constants_positions()})
                regression_resources.merge({"submodel": submodel})
                estimated_coef[submodel] = self.procedure.run(self.data[submodel], self.regression,
                                                        resources=regression_resources)
                if "estimators" in estimated_coef[submodel].keys():
                    coef[submodel].set_coefficient_values(estimated_coef[submodel]["estimators"])
                if "standard_errors" in estimated_coef[submodel].keys():
                    coef[submodel].set_standard_errors(estimated_coef[submodel]["standard_errors"])
                if "other_measures" in estimated_coef[submodel].keys():
                    for measure in estimated_coef[submodel]["other_measures"].keys():
                        coef[submodel].set_measure(measure,
                              estimated_coef[submodel]["other_measures"][measure])
                if "other_info" in estimated_coef[submodel].keys():
                    for info in estimated_coef[submodel]["other_info"]:
                        coef[submodel].set_other_info(info,
                              estimated_coef[submodel]["other_info"][info])
        coefficients.fill_coefficients(coef)
        self.specified_coefficients.coefficients = coefficients
        self.save_predicted_values_and_errors(specification, coefficients, dataset, outcome_variable_name, index=index, data_objects=data_objects)
            
        return (coefficients, estimated_coef)
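    # Illustrative estimation call (a sketch, not from the original source):
    # 'my_specification' and 'gridcells' are hypothetical placeholders, the outcome
    # attribute can be any expression computable on the dataset, and the procedure
    # module path is an assumption.
    #
    #   coefficients, est_results = model.estimate(my_specification, gridcells,
    #                                   outcome_attribute="ln(urbansim.gridcell.total_land_value)",
    #                                   procedure="opus_core.estimate_linear_regression")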

    def prepare_for_run(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        spec, coef = prepare_specification_and_coefficients(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, coef, index)

    def prepare_for_estimate(self, dataset=None, dataset_filter=None, filter_threshold=0, **kwargs):
        spec = get_specification_for_estimation(**kwargs)
        if (dataset is not None) and (dataset_filter is not None):
            filter_values = dataset.compute_variables([dataset_filter], dataset_pool=self.dataset_pool)
            index = where(filter_values > filter_threshold)[0]
        else:
            index = None
        return (spec, index)
    
    def get_data_as_dataset(self, submodel=-2):
        """Like get_all_data, but the retuning value is a Dataset containing attributes that
        correspond to the data columns. Their names are coefficient names."""
        all_data = self.get_all_data(submodel)
        if all_data is None:
            return None
        names = self.get_coefficient_names(submodel)
        if names is None:
            return None
        dataset_data = {}
        for i in range(names.size):
            dataset_data[names[i]] = all_data[:, i].reshape(all_data.shape[0])
        dataset_data["id"] = arange(all_data.shape[0])+1
        storage = StorageFactory().get_storage('dict_storage')
        storage.write_table(table_name='dataset', table_data=dataset_data)
        ds = Dataset(in_storage=storage, id_name="id", in_table_name='dataset')
        return ds

    def save_predicted_values_and_errors(self, specification, coefficients, dataset, outcome_variable, index=None, data_objects=None):
        if self.estimate_config.get('save_predicted_values_and_errors', False):
            logger.log_status('Computing predicted values and residuals.')
            original_values = dataset.get_attribute_by_index(outcome_variable, index)
            predicted_values = zeros(dataset.size(), dtype='float32')
            predicted_values[index] = self.run_after_estimation(specification, coefficients, dataset, index=index, data_objects=data_objects)
            predicted_attribute_name = 'predicted_%s' % outcome_variable.get_alias()
            dataset.add_primary_attribute(name=predicted_attribute_name, data=predicted_values)
            dataset.flush_attribute(predicted_attribute_name)
            predicted_error_attribute_name = 'residuals_%s' % outcome_variable.get_alias()
            error_values = zeros(dataset.size(), dtype='float32')
            error_values[index] = (original_values - predicted_values[index]).astype(error_values.dtype)
            dataset.add_primary_attribute(name=predicted_error_attribute_name, data = error_values)
            dataset.flush_attribute(predicted_error_attribute_name)
            logger.log_status('Predicted values saved as %s (for the %s dataset)' % (predicted_attribute_name, dataset.get_dataset_name()))
            logger.log_status('Residuals saved as %s (for the %s dataset)' % (predicted_error_attribute_name, dataset.get_dataset_name()))
        
    def export_estimation_data(self, submodel=-2, file_name='./estimation_data_regression.txt', delimiter = '\t'):
        import os
        from numpy import newaxis
        data = concatenate((self.outcome[submodel][...,newaxis], self.get_all_data(submodel=submodel)), axis=1)
        header = ['outcome'] + self.get_coefficient_names(submodel).tolist()
        nrows = data.shape[0]
        file_name_root, file_name_ext = os.path.splitext(file_name)
        out_file = "%s_submodel_%s.txt" % (file_name_root, submodel)
        fh = open(out_file,'w')
        fh.write(delimiter.join(header) + '\n')   #file header
        for row in range(nrows):
            line = [str(x) for x in data[row,]]
            fh.write(delimiter.join(line) + '\n')
        fh.flush()
        fh.close()
        print 'Data written into %s' % out_file
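        # Illustrative call (a hedged sketch): writes the outcome column and the
        # estimation data of one submodel to a tab-delimited text file. The
        # submodel id and file path below are hypothetical.
        #
        #   model.export_estimation_data(submodel=1, file_name='/tmp/estimation_data.txt')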
        
    def run_after_estimation(self, *args, **kwargs):
        return self.run(*args, **kwargs)
            
    def _get_status_total_pieces(self):
        return ChunkModel._get_status_total_pieces(self) * self.get_status_for_gui().get_total_number_of_pieces()
    
    def _get_status_current_piece(self):
        return ChunkModel._get_status_current_piece(self)*self.get_status_for_gui().get_total_number_of_pieces() + self.get_status_for_gui().get_current_piece()
        
    def _get_status_piece_description(self):
        return "%s %s" % (ChunkModel._get_status_piece_description(self), self.get_status_for_gui().get_current_piece_description())
    
    def get_specified_coefficients(self):
        return self.specified_coefficients
Exemple #28
class Estimator(GenericModelExplorer):
    def __init__(self, config=None, save_estimation_results=False):
        if 'cache_directory' not in config or config['cache_directory'] is None:
            raise KeyError(
                "The cache directory must be specified in the "
                "given configuration, giving the filesystem path to the cache "
                "directory containing the data with which to estimate. Please "
                "check that your configuration contains the 'cache_directory' "
                "entry and that it is not None.")

        self.simulation_state = SimulationState(new_instance=True)
        self.simulation_state.set_cache_directory(config['cache_directory'])

        SessionConfiguration(
            new_instance=True,
            package_order=config['dataset_pool_configuration'].package_order,
            in_storage=AttributeCache())
        self.config = Resources(config)
        self.save_estimation_results = save_estimation_results
        self.debuglevel = self.config.get("debuglevel", 4)
        self.model_system = ModelSystem()
        self.agents_index_for_prediction = None

        models = self.config.get('models', [])

        self.model_name = None
        if "model_name" in config.keys():
            self.model_name = config["model_name"]
        else:
            for model in models:
                if isinstance(model, dict):
                    model_name = model.keys()[0]
                    if (model[model_name] == "estimate") or (
                            isinstance(model[model_name], list) and
                        ("estimate" in model[model_name])):
                        self.model_name = model_name
                        break
        estimate_config_changes = self.config.get(
            'config_changes_for_estimation', {}).get('estimate_config', {})
        if len(estimate_config_changes) > 0:
            change = Resources({
                'models_configuration': {
                    self.model_name: {
                        'controller': {
                            'init': {
                                'arguments': {}
                            }
                        }
                    }
                }
            })
            estimate_config_str = self.config['models_configuration'].get(
                self.model_name,
                {}).get('controller',
                        {}).get('init',
                                {}).get('arguments',
                                        {}).get('estimate_config', '{}')
            estimate_config = Resources({})
            try:
                estimate_config = eval(estimate_config_str)
            except:
                pass

            estimate_config.merge(estimate_config_changes)
            self.config.merge(change)
            self.config['models_configuration'][
                self.model_name]['controller']['init']['arguments'][
                    'estimate_config'] = 'Resources(%s)' % estimate_config

    def estimate(self, out_storage=None):
        self.model_system.run(self.config,
                              write_datasets_to_cache_at_end_of_year=False)
        self.extract_coefficients_and_specification()

        if self.save_estimation_results:
            self.save_results(out_storage=out_storage)

    def reestimate(self,
                   specification_module_name=None,
                   specification_dict=None,
                   out_storage=None,
                   type=None,
                   submodels=None):
        """specification_module_name is name of a module that contains a dictionary called
        'specification'. If it is not given, the argument specification_dict must be given which is a dictionary object.
        'type' is the name of model member, such as 'commercial', 'residential'. The specification dictionary
        is expected to have an entry of this name. If 'submodels' is given (list or a number),
        the restimation is done only for those submodels.
        """
        if specification_module_name is not None:
            exec("import " + specification_module_name)
            eval("reload (" + specification_module_name + ")")
            exec("specification_dict =" + specification_module_name +
                 ".specification")

        if type is not None:
            specification_dict = specification_dict[type]
        if submodels is not None:  #remove all submodels but the given ones from specification
            submodels_to_be_deleted = specification_dict.keys()
            if not isinstance(submodels, list):
                submodels = [submodels]
            for sm in submodels:
                if sm not in submodels_to_be_deleted:
                    raise ValueError, "Submodel %s not in the specification." % sm
                submodels_to_be_deleted.remove(sm)
                if "_definition_" in submodels_to_be_deleted:
                    submodels_to_be_deleted.remove("_definition_")
            for sm in submodels_to_be_deleted:
                del specification_dict[sm]
        self.specification = EquationSpecification(
            specification_dict=specification_dict)
        new_namespace = self.model_system.run_year_namespace
        keys_coeff_spec = self.get_keys_for_coefficients_and_specification()
        new_namespace[keys_coeff_spec["specification"]] = self.specification
        self.coefficients, coeff_dict_dummy = self.model_system.do_process(
            new_namespace)
        ## update run_year_namespace since it has not been updated by do_process
        self.model_system.run_year_namespace = new_namespace
        self.model_system.run_year_namespace[
            keys_coeff_spec["coefficients"]] = self.coefficients

        ## this gets coeff and spec from run_year_namespace and is only updated in the _run_year method
        #self.extract_coefficients_and_specification()
        if self.save_estimation_results:
            self.save_results(out_storage=out_storage)
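    # Illustrative re-estimation (a hedged sketch, not from the original source):
    # after estimate() has run, selected submodels can be re-estimated from an
    # edited specification dictionary. 'my_specification' and the submodel ids
    # below are hypothetical.
    #
    #   estimator.reestimate(specification_dict=my_specification,
    #                        type='residential', submodels=[1, 2])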

    def predict(self, predicted_choice_id_name, agents_index=None):
        """ Run prediction. Currently makes sense only for choice models."""
        # Create temporary configuration where all words 'estimate' are replaced by 'run'
        tmp_config = Resources(self.config)

        if self.agents_index_for_prediction is None:
            self.agents_index_for_prediction = self.get_agent_set_index().copy(
            )

        if agents_index is None:
            agents_index = self.agents_index_for_prediction

        tmp_config['models_configuration'][self.model_name]['controller'][
            'run']['arguments']['coefficients'] = "coeff_est"
        tmp_config['models_configuration'][self.model_name]['controller'][
            'run']['arguments']['agents_index'] = "agents_index"
        tmp_config['models_configuration'][self.model_name]['controller'][
            'run']['arguments']['chunk_specification'] = "{'nchunks':1}"

        ### save specification and coefficients to cache (no matter the save_estimation_results flag)
        ### so that the prepare_for_run method could load specification and coefficients from there
        #output_configuration = self.config['output_configuration']
        #del self.config['output_configuration']
        #self.save_results()

        #self.config['output_configuration'] = output_configuration

        #self.model_system.run_year_namespace["coefficients"] = self.coefficients
        #del tmp_config['models_configuration'][self.model_name]['controller']['prepare_for_run']

        try:
            run_year_namespace = copy.copy(
                self.model_system.run_year_namespace)
        except:
            logger.log_error("The estimate() method must be run first")
            return False

        try:
            agents = self.get_agent_set()
            choice_id_name = self.get_choice_set().get_id_name()[0]
            # save current locations of agents
            current_choices = agents.get_attribute(choice_id_name).copy()
            dummy_data = zeros(current_choices.size,
                               dtype=current_choices.dtype) - 1
            agents.modify_attribute(name=choice_id_name,
                                    data=dummy_data)  #reset all choices

            run_year_namespace["process"] = "run"
            run_year_namespace["coeff_est"] = self.coefficients
            run_year_namespace["agents_index"] = agents_index
            run_year_namespace["processmodel_config"] = tmp_config[
                'models_configuration'][self.model_name]['controller']['run']
            new_choices = self.model_system.do_process(run_year_namespace)

            #self.model_system.run(tmp_config, write_datasets_to_cache_at_end_of_year=False)
            #new_choices = agents.get_attribute(choice_id_name).copy()
            agents.modify_attribute(name=choice_id_name, data=current_choices)
            dummy_data[agents_index] = new_choices
            if predicted_choice_id_name not in agents.get_known_attribute_names(
            ):
                agents.add_primary_attribute(name=predicted_choice_id_name,
                                             data=dummy_data)
            else:
                agents.modify_attribute(name=predicted_choice_id_name,
                                        data=dummy_data)
            logger.log_status("Predictions saved into attribute " +
                              predicted_choice_id_name)
            return True
        except Exception, e:
            logger.log_error("Error encountered in prediction: %s" % e)
            logger.log_stack_trace()

        return False
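    # Typical workflow (an illustrative sketch, not part of the original source):
    # the configuration 'my_estimation_config' and the attribute name
    # 'predicted_building_id' are hypothetical placeholders.
    #
    #   estimator = Estimator(my_estimation_config, save_estimation_results=True)
    #   estimator.estimate()
    #   estimator.predict('predicted_building_id')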
Exemple #29
def match_parcels_to_constraints_and_templates(parcel_dataset,
                                                development_template_dataset,
                                                output_dir, log_scale=True, strict=True,
                                                output_points=False,
                                                parcel_index=None,
                                                template_index=None,
                                                consider_constraints_as_rules=True,
                                                template_opus_path="urbansim_parcel.development_template",
                                                dataset_pool=None,
                                                resources=None):
    """
    This function matches parcels to their constraints and templates and gives a summary about how many parcels have no match.
    It also creates a plot for each GLU and unit type of template ranges and densities.
    parcel_index - 1D array, indices of parcel_dataset (default is all parcels).
    template_index - index to templates that are available (default is all templates).
    If strict is True, parcels without templates are considered across GLU, otherwise only within each GLU. 
    """

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    resources = Resources(resources)
    debug = resources.get("debug",  0)
    if not isinstance(debug, DebugPrinter):
        debug = DebugPrinter(debug)

    if parcel_index is not None and parcel_index.size <= 0:
        return None
        
    if parcel_index is not None:
        index1 = parcel_index
    else:
        index1 = arange(parcel_dataset.size())

    if template_index is not None:
        index2 = template_index
    else:
        index2 = arange(development_template_dataset.size())

    has_constraint_dataset = True
    try:
        constraints = dataset_pool.get_dataset("development_constraint") 
        constraints.load_dataset_if_not_loaded()
    except:
        has_constraint_dataset = False

    parcels_glu = parcel_dataset.compute_variables(['parcel.disaggregate(land_use_type.generic_land_use_type_id)'], dataset_pool=dataset_pool)
    if has_constraint_dataset:
        constraint_types = unique(constraints.get_attribute("constraint_type"))  #unit_per_acre, far etc
        development_template_dataset.compute_variables(map(lambda x: "%s.%s" % (template_opus_path, x), constraint_types), dataset_pool)
            
        parcel_dataset.get_development_constraints(constraints, dataset_pool, 
                                                   index=index1, 
                                                   consider_constraints_as_rules=consider_constraints_as_rules)
        generic_land_use_type_ids = development_template_dataset.compute_variables("urbansim_parcel.development_template.generic_land_use_type_id",
                                                       dataset_pool=dataset_pool)

    parcel_ids = parcel_dataset.get_id_attribute()
    template_ids = development_template_dataset.get_id_attribute()
    
    
    has_template = zeros(index1.size, dtype="int32")
    vacant_land = parcel_dataset.compute_variables(['urbansim_parcel.parcel.vacant_land_area'],
                                                                dataset_pool=dataset_pool)[index1]
    is_vacant = vacant_land>0
    #vacant_land = vacant_land*logical_or(parcels_glu==1, parcels_glu==2)                                                            
    is_developable_parcel = zeros(index1.size, dtype="int32")
    accepted_by_constraints = zeros(index1.size, dtype="int32")
    
    #parcels_to_template = {} 
    parcels_to_template_acc_by_constr = {}
    density_types = development_template_dataset['density_type']
    parcels_acc_by_constr_wo_templ = {}
    parcels_acc_by_constr = {}
    #pidx = parcel_dataset.get_id_index(804461)
    logger.start_block("Combine parcels, templates and constraints")
    for i_template in index2:
        this_template_id = template_ids[i_template]
        
        fit_indicator = ones(index1.size, dtype="bool8")
        parcels_to_template_acc_by_constr[this_template_id] = []
        this_templ_accepted_by_constraints = zeros(index1.size, dtype="int32")
        has_this_template = zeros(index1.size, dtype="int32")
        if has_constraint_dataset:
            generic_land_use_type_id = generic_land_use_type_ids[i_template]
            if generic_land_use_type_id not in parcels_acc_by_constr_wo_templ.keys():
                parcels_acc_by_constr_wo_templ[generic_land_use_type_id] = zeros(index1.size, dtype="int32")
            if generic_land_use_type_id not in parcels_acc_by_constr.keys():
                parcels_acc_by_constr[generic_land_use_type_id] = zeros(index1.size, dtype="int32")
            #if generic_land_use_type_id not in [1,2]:
            #    continue
            units_proposed = parcel_dataset.compute_variables(['psrc_parcel.parcel.units_proposed_for_template_%s' % this_template_id],
                                                                dataset_pool=dataset_pool)[index1]
            is_size_fit = parcel_dataset.compute_variables(['psrc_parcel.parcel.is_size_fit_for_template_%s' % this_template_id],
                                                                dataset_pool=dataset_pool)[index1]
            for constraint_type, constraint in parcel_dataset.development_constraints[generic_land_use_type_id].iteritems():
                if density_types[i_template] != constraint_type:
                    continue
                template_attribute = development_template_dataset.get_attribute(constraint_type)[i_template]  #density converted to constraint variable name
                if template_attribute == 0:
                    continue
                min_constraint = constraint[:, 0].copy()
                max_constraint = constraint[:, 1].copy()
                ## treat -1 as unconstrained
                w_unconstr = min_constraint == -1
                if w_unconstr.any():
                    min_constraint[w_unconstr] = template_attribute
                
                w_unconstr = max_constraint == -1
                if w_unconstr.any():
                    max_constraint[w_unconstr] = template_attribute

                this_accepted_by_constraints = logical_and(template_attribute >= min_constraint,
                                                        template_attribute <= max_constraint)
                fit_indicator = logical_and(fit_indicator, 
                                            logical_and(logical_and(this_accepted_by_constraints, units_proposed > 0), is_size_fit))
                
                is_developable_parcel = logical_or(is_developable_parcel, max_constraint > 0)
                this_templ_accepted_by_constraints = logical_or(this_templ_accepted_by_constraints, 
                                                                logical_and(is_developable_parcel, 
                                                                            logical_and(this_accepted_by_constraints, units_proposed > 0)))
                has_this_template = logical_or(has_this_template, fit_indicator)
            accepted_by_constraints = logical_or(accepted_by_constraints, this_templ_accepted_by_constraints)
            has_template = logical_or(has_template, has_this_template)
            #parcels_to_template[this_template_id] = where(logical_and(vacant_land>0, 
            #                    logical_and(logical_and(is_developable_parcel, this_accepted_by_constraints),
            #                                logical_not(fit_indicator))))[0]
            #parcels_to_template_acc_by_constr[this_template_id].append(where(accepted_by_constraints)[0].tolist())
            not_accepted = logical_and(this_templ_accepted_by_constraints, logical_and(logical_not(has_this_template), is_vacant))
            parcels_to_template_acc_by_constr[this_template_id].append(where(not_accepted)[0].tolist())
            parcels_acc_by_constr_wo_templ[generic_land_use_type_id] = logical_or(parcels_acc_by_constr_wo_templ[generic_land_use_type_id], 
                                            not_accepted)
            parcels_acc_by_constr[generic_land_use_type_id] = logical_or(parcels_acc_by_constr[generic_land_use_type_id], 
                                                            logical_and(this_templ_accepted_by_constraints, is_vacant))
            #if fit_indicator[pidx]:
            #    print 'Parcel 804461: template %s accepted.' %  this_template_id
            
    logger.end_block()
    ### Print summary
    ##################
    unique_glu = parcels_acc_by_constr_wo_templ.keys()
    #parcels_wo_templ = zeros(index1.size, dtype="int32")
    
    #parcels_wo_templ = where(logical_and(vacant_land>0, logical_and(is_developable_parcel, logical_not(has_template))))[0]
    #nr_parcels_wo_templ = parcels_wo_templ.size
    #is_vacant = vacant_land>0
    #logger.log_status("\nGLU\tvacant land\tconstraint out\tno template")
    logger.log_status("\nGLU\tconsidered\tno template")
    no_glu_templ = []
    parcels_wo_temp_by_glu = {}
    sum1 = 0
    sum2 = 0
    parcels_wo_templ = logical_not(has_template)
    for glu in unique_glu:
        if strict:
            parcels_acc_by_constr_wo_templ[glu] = logical_and(parcels_acc_by_constr_wo_templ[glu], parcels_wo_templ)
        #if glu == 3:
        #parcels_wo_templ = logical_or(parcels_wo_templ, parcels_acc_by_constr_wo_templ[glu])
#        if glu not in generic_land_use_type_ids:
#            no_glu_templ.append(glu)
        #idx = parcels_glu==glu
#        if idx.sum() > 0:
#            logger.log_status("%s\t%7i\t\t%7i\t\t%7i" % (glu, is_vacant[idx].sum(), 
#                        is_vacant[idx].sum() - logical_and(is_vacant[idx], is_developable_parcel[idx]).sum(),
#                        logical_and(is_vacant[idx], logical_and(is_developable_parcel[idx], logical_not(has_template[idx]))).sum()))
#            parcels_wo_temp_by_glu[glu] = where(logical_and(idx, logical_and(is_vacant, 
#                                    logical_and(is_developable_parcel, logical_not(has_template)))))[0]
        logger.log_status("%s\t%7i\t\t%7i" % (glu, parcels_acc_by_constr[glu].sum(), parcels_acc_by_constr_wo_templ[glu].sum()))
        sum1 = sum1 + parcels_acc_by_constr[glu].sum()
        sum2 = sum2 + parcels_acc_by_constr_wo_templ[glu].sum()      
    logger.log_status("\nall\t%7i\t\t%7i" % (sum1, sum2))
    #if len(no_glu_templ) > 0:
    #    logger.log_status("\nNo templates for GLUs: %s" % no_glu_templ)
        
    ### Create plots
    #################
    
    templ_min_max = {}
    for glu in unique_glu:
        gidx = where(parcels_acc_by_constr_wo_templ[glu])[0]
        logger.start_block("Creating figures for GLU %s using %s parcels" % (glu,gidx.size))
        templ_min_max[glu] = []
        max_land_sqft = {'far': 0, 'units_per_acre': 0}
        min_land_sqft = {'far': 9999999, 'units_per_acre': 9999999}
        max_templ_attr = {'far': 0, 'units_per_acre': 0}
        min_templ_attr = {'far': 999999, 'units_per_acre': 9999999}
        xy = {'far':[], 'units_per_acre':[]}
        points = {'far':zeros((0,3)), 'units_per_acre':zeros((0,3))}
        npoints = {'far': 0, 'units_per_acre': 0}
        for i_template in index2:
            if glu != generic_land_use_type_ids[i_template]:
                continue
            this_template_id = template_ids[i_template]
            #units_proposed = parcel_dataset['units_proposed_for_template_%s' % this_template_id]
            #is_size_fit = parcel_dataset['is_size_fit_for_template_%s' % this_template_id]
            #is_constraint = zeros(parcel_dataset.size(), dtype='bool8')
            #is_constraint[array(parcels_to_template_acc_by_constr[this_template_id])]=True
            #is_size_fit = logical_and(logical_and(logical_not(is_size_fit), 
            #                                      logical_and(is_vacant, units_proposed>0)), 
            #                          logical_and(is_constraint,
            #                                      is_developable_parcel))
            missed_to_match = zeros(parcel_dataset.size(), dtype='bool8')
            missed_to_match[(unique(array(parcels_to_template_acc_by_constr[this_template_id]).flatten())).astype('int32')] = True
            missed_to_match = where(logical_and(missed_to_match, parcels_acc_by_constr_wo_templ[glu]))[0]
            #missed_to_match = unique(array(parcels_to_template_acc_by_constr[this_template_id]).flatten())
            for constraint_type, constraint in parcel_dataset.development_constraints[glu].iteritems():
                if density_types[i_template] != constraint_type:
                    continue
                template_attribute = development_template_dataset.get_attribute(constraint_type)[i_template]  #density converted to constraint variable name
                if template_attribute == 0:
                    continue
                templ_min_max[glu].append([development_template_dataset["land_sqft_min"][i_template], 
                                           development_template_dataset["land_sqft_max"][i_template]])
                xy[constraint_type] = xy[constraint_type] + [[development_template_dataset["land_sqft_min"][i_template], 
                            development_template_dataset["land_sqft_max"][i_template]], 
                            [template_attribute, template_attribute]]
                #if is_size_fit[gidx].sum() > 0:
                if missed_to_match.size > 0:
                    npoints[constraint_type] = npoints[constraint_type] + missed_to_match.size #is_size_fit[gidx].sum()
                    #if is_size_fit[gidx].sum() > 100:
                    if missed_to_match.size > 100:
                        draw = sample_noreplace(missed_to_match, 100)
                        thisidx = draw
                    else:
                        thisidx = missed_to_match
                    points[constraint_type] = concatenate((points[constraint_type], 
                                      concatenate((parcel_dataset['vacant_land_area'][thisidx][:,newaxis], 
                                                   template_attribute*ones((thisidx.size,1)), 
                                                   parcel_ids[thisidx][:,newaxis]), axis=1)), axis=0)
                    max_land_sqft[constraint_type] = max(max_land_sqft[constraint_type], parcel_dataset['vacant_land_area'][thisidx].max())
                    min_land_sqft[constraint_type] = min(min_land_sqft[constraint_type], parcel_dataset['vacant_land_area'][thisidx].min())
                    max_templ_attr[constraint_type] = max(max_templ_attr[constraint_type], template_attribute)
                    min_templ_attr[constraint_type] = min(min_templ_attr[constraint_type], template_attribute)

        import matplotlib.ticker as ticker
        import matplotlib.pyplot as plt
        def myexp(x, pos):
            return '%i' % (round(exp(x)))
        def myexp2(x, pos):
            return '%.2f' % (round(exp(x), 2))

        for type in ['far', 'units_per_acre']:
            if points[type].size == 0:
                continue
            #print xy[type]
            lxy = array(xy[type])
            dots = points[type][:,0:2]
            minx = min_land_sqft[type]-100
            maxx = max_land_sqft[type]+100
            miny = min_templ_attr[type]-0.05
            maxy = max_templ_attr[type]+0.05
            if log_scale:
                lxy = log(lxy)
                dots = log(dots)
                minx = log(minx)
                maxx = log(maxx)
                miny = log(miny)
                maxy = log(maxy)
            fig = plt.figure()
            ax = fig.add_subplot(111)
            lines = ax.plot(*lxy) # template lines
            po = ax.plot(dots[:,0], dots[:,1]) # parcel points
            if log_scale:
                xformatter = ticker.FuncFormatter(myexp)
                yformatter = ticker.FuncFormatter(myexp2)
                ax.xaxis.set_major_formatter(xformatter)
                ax.yaxis.set_major_formatter(yformatter)
                # The following would be better but throws an error
                #locator = ticker.LogLocator(base=2.718282, subs=0.1)
                #ax.xaxis.set_major_locator(locator)
            plt.setp(lines, color='b', linewidth=1)
            plt.setp(po, marker='o', linestyle='None', linewidth=0)

            ax.axis([min(dots[:,0].min(), minx), 
                     max(dots[:,0].max(), maxx), 
                     min(dots[:,1].min(), miny), 
                     max(dots[:,1].max(), maxy)])
            plt.title('GLU: %s, units: %s, missing: %s' % (glu, type, npoints[type]))
            #ax.grid(True)
            plt.xlabel('land sqft range')
            plt.ylabel('density')
            log_suffix = ''
            if log_scale:
                log_suffix = '_log'
            plt.savefig(os.path.join(output_dir, 'match_templates%s_%s_%s.pdf' % (log_suffix, glu, type)))
            plt.close()
            #plt.show()
            if output_points:
            #if glu == 3:
                write_table_to_text_file(os.path.join(output_dir, 'points_%s_%s.txt' % (glu, type)), points[type], delimiter=', ')
        logger.end_block()

    logger.log_status('Resulting figures stored into %s' % output_dir)               
    return parcel_ids[index1][parcels_wo_templ]
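# Illustrative call (a hedged sketch; the dataset and pool names are hypothetical):
#
#   unmatched_parcel_ids = match_parcels_to_constraints_and_templates(
#       parcels, templates, output_dir='/tmp/template_match',
#       log_scale=True, strict=True, dataset_pool=dataset_pool)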