Example #1
 def get_before_after_attribute(self, attribute_name):
     """Return a dictionary with elements 'before' (contains an array of the given attribute
     that is reloaded from the cache) and 'after' (contains an array of the given attribute 
     with the current values).
     """
     from opus_core.store.attribute_cache import AttributeCache
     var_name = VariableName(attribute_name)
     storage = AttributeCache(self.simulation_state.get_cache_directory())
     ds = self._get_before_after_dataset_from_attribute(var_name, storage=storage,
                package_order=self.get_dataset_pool().get_package_order())       
     return {'after': ds[var_name.get_alias()],
             'before': ds.get_attribute('%s_reload__' % var_name.get_alias())}
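These examples all revolve around the VariableName accessors. A minimal standalone sketch of how an expression is split into its parts, assuming VariableName is importable from opus_core.variables.variable_name (the asserted values follow the tests further down this page):

    from opus_core.variables.variable_name import VariableName  # assumed import path

    # a fully qualified variable: package.dataset.short_name
    name = VariableName("opus_core.test_agent.income_times_2")
    assert name.get_package_name() == 'opus_core'
    assert name.get_dataset_name() == 'test_agent'
    assert name.get_short_name() == 'income_times_2'
    assert name.get_alias() == 'income_times_2'   # no explicit alias, so alias == short name

    # an aliased expression: the alias is the name the result is stored under
    aliased = VariableName("x = opus_core.tests.a_test_variable")
    assert aliased.get_alias() == 'x'
    assert aliased.get_dataset_name() == 'tests'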
Example #2
 def test_alias_attribute_same_name(self):
     # this tests an expression whose alias has the same name as the primary attribute it aliases
     expr = "persons = persons"
     storage = StorageFactory().get_storage('dict_storage')
     storage.write_table(table_name='tests',
                         table_data={
                             "persons": array([1, 5, 10]),
                             "id": array([1, 3, 4])
                         })
     dataset = Dataset(in_storage=storage,
                       in_table_name='tests',
                       id_name="id",
                       dataset_name="tests")
     result = dataset.compute_variables([expr])
     self.assertEqual(ma.allclose(result, [1, 5, 10], rtol=1e-7),
                      True,
                      msg="error in test_alias_attribute")
     name = VariableName(expr)
     self.assertEqual(name.get_short_name(),
                      'persons',
                      msg="bad value for shortname")
     self.assertEqual(name.get_alias(),
                      'persons',
                      msg="bad value for alias")
     self.assertEqual(name.get_autogen_class(),
                      None,
                      msg="bad value for autogen_class")
    def get_attribute(self, name):
        """ Return an array of the (by the argument name) given attribute. """
        if not isinstance(name, VariableName):
            attr_name = VariableName(name)
        else:
            attr_name = name
        alias = attr_name.get_alias()
        dataset_name = attr_name.get_dataset_name()
        if not (alias in self.get_attribute_names()):
            if dataset_name == self.get_dataset(1).dataset_name:
                index = self.get_2d_index_of_dataset1()
                return self.get_dataset(1).get_attribute_by_index(
                    attr_name, index)
            if dataset_name == self.get_dataset(2).dataset_name:
                index = self.get_2d_index()
                return self.get_dataset(2).get_attribute_by_index(
                    attr_name, index)

            if alias in self.get_dataset(1).get_known_attribute_names():
                index = self.get_2d_index_of_dataset1()
                return self.get_dataset(1).get_attribute_by_index(
                    attr_name, index)
            if alias in self.get_dataset(2).get_known_attribute_names():
                index = self.get_2d_index()
                return self.get_dataset(2).get_attribute_by_index(
                    attr_name, index)
            self._raise_error(NameError, "Variable %s not found!" % alias)
        return self.attribute_boxes[alias].get_data()
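The get_attribute_by_index calls above amount to fancy indexing: the interaction set keeps an agents-by-choices index into the 1-D attribute arrays of its component datasets. A toy numpy sketch of that kind of lookup (arrays and shapes here are illustrative, not opus_core API):

    from numpy import array

    attribute = array([100, 200, 300])          # 1-D attribute of dataset 1 (one value per agent)
    index2d = array([[0, 0], [1, 1], [2, 2]])   # 3 agents x 2 choices; each row repeats the agent's row index
    expanded = attribute[index2d]               # broadcast the agent attribute across choices
    assert (expanded == array([[100, 100], [200, 200], [300, 300]])).all()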
Example #4
 def test_unary_functions_fully_qualified_name(self):
     # this tests expressions with unary functions applied to a fully qualified name
     expr = "sqrt(opus_core.tests.a_test_variable)"
     storage = StorageFactory().get_storage('dict_storage')
     storage.write_table(
         table_name='tests',
         table_data={
             "a_dependent_variable":array([1,5,10]),
             "id":array([1,3,4])
             }
         )
     dataset = Dataset(in_storage=storage, in_table_name='tests', id_name="id", dataset_name="tests")
     result = dataset.compute_variables([expr])
     should_be = array([3.16227766, 7.0710678, 10])
     self.assertEqual(ma.allclose(result, should_be, rtol=1e-3), True, msg="error in test_unary_functions_fully_qualified_name")
     # check that the access methods for the variable all return the correct values
     name = VariableName(expr)
     autogen = name.get_autogen_class()
     self.assert_(issubclass(autogen, Variable), msg="autogen'd class isn't a Variable")
     self.assertEqual(name.get_package_name(), None, msg="bad value for package")
     self.assertEqual(name.get_dataset_name(), 'tests', msg="bad value for dataset")
     self.assertEqual(name.get_short_name(), autogen.__name__, msg="bad value for shortname")
     self.assertEqual(name.get_alias(), autogen.__name__, msg="bad value for alias")
     # make an instance of the class and check the dependencies (since the dependent variables
     # all have fully-qualified names we don't need to associate a dataset with the variable
     # for this test)
     self.assertEqual(autogen().dependencies(), ['opus_core.tests.a_test_variable'], 
                      msg="dependencies are incorrect")
Example #5
 def _do_flush_dependent_variables_if_required(self):
     try:
         if not SessionConfiguration().get('flush_variables', False):
             return
     except:
         return
     from opus_core.datasets.interaction_dataset import InteractionDataset
     dataset = self.get_dataset()
     dependencies = self.get_current_dependencies()
     my_dataset_name = dataset.get_dataset_name()
     for iattr in range(len(dependencies)):  # iterate over dependent variables
         dep_item = dependencies[iattr][0]
         if isinstance(dep_item, str):
             depvar_name = VariableName(dep_item)
         else:
             depvar_name = dep_item.get_variable_name()  # dep_item should be an instance of AttributeBox
         dataset_name = depvar_name.get_dataset_name()
         if dataset_name == my_dataset_name:
             ds = dataset
         else:
             ds = SessionConfiguration().get_dataset_from_pool(dataset_name)
             #ds = dataset_pool.get_dataset('dataset_name')
         if not isinstance(ds, InteractionDataset):
             short_name = depvar_name.get_alias()
             if short_name not in ds.get_id_name():
                 ds.flush_attribute(depvar_name)
Example #6
 def test_alias_complex_expression(self):
     # aliasing a complex expression
     expr = "x = 2*sqrt(var1+var2)"
     storage = StorageFactory().get_storage('dict_storage')
     storage.write_table(table_name='dataset',
                         table_data={
                             "var1": array([4, -8, 0.5, 1]),
                             "var2": array([3, 3, 7, 7]),
                             "id": array([1, 2, 3, 4])
                         })
     dataset = Dataset(in_storage=storage,
                       in_table_name='dataset',
                       id_name="id",
                       dataset_name="mydataset")
     result = dataset.compute_variables([expr])
     should_be = array([5.29150262, 0.0, 5.47722558, 5.65685425])
     self.assert_(ma.allclose(result, should_be, rtol=1e-6),
                  "Error in test_alias_complex_expression")
     # check that the new var has x as an alias
     v = VariableName(expr)
     self.assertEqual(v.get_alias(), 'x', msg="bad value for alias")
     # check that the alias gives the correct value
     result2 = dataset.compute_variables(['x'])
     self.assert_(ma.allclose(result2, should_be, rtol=1e-6),
                  "Error in accessing a_test_variable")
Example #7
 def test_alias_attribute(self):
     # this tests an expression consisting of an alias for a primary attribute
     expr = "p = persons"
     storage = StorageFactory().get_storage('dict_storage')
     storage.write_table(table_name='tests',
                         table_data={
                             "persons": array([1, 5, 10]),
                             "id": array([1, 3, 4])
                         })
     dataset = Dataset(in_storage=storage,
                       in_table_name='tests',
                       id_name="id",
                       dataset_name="tests")
     result = dataset.compute_variables([expr])
     self.assertEqual(ma.allclose(result, [1, 5, 10], rtol=1e-7),
                      True,
                      msg="error in test_alias_attribute")
     # check that the access methods for the variable all return the correct values
     name = VariableName(expr)
     self.assertEqual(name.get_package_name(),
                      None,
                      msg="bad value for package")
     self.assertEqual(name.get_dataset_name(),
                      None,
                      msg="bad value for dataset")
     self.assert_(name.get_short_name().startswith('autogen'),
                  msg="bad value for shortname")
     self.assertEqual(name.get_alias(), 'p', msg="bad value for alias")
     self.assertNotEqual(name.get_autogen_class(),
                         None,
                         msg="bad value for autogen_class")
Example #8
 def test_alias_fully_qualified_variable(self):
     expr = "x = opus_core.tests.a_test_variable"
     storage = StorageFactory().get_storage('dict_storage')
     storage.write_table(table_name='tests',
                         table_data={
                             "a_dependent_variable": array([1, 5, 10]),
                             "id": array([1, 3, 4])
                         })
     dataset = Dataset(in_storage=storage,
                       in_table_name='tests',
                       id_name="id",
                       dataset_name="tests")
     result = dataset.compute_variables([expr])
     should_be = array([10, 50, 100])
     self.assert_(ma.allclose(result, should_be, rtol=1e-6),
                  "Error in test_alias_fully_qualified_variable")
     # check that the new var has x as an alias
     v = VariableName(expr)
     self.assertEqual(v.get_package_name(),
                      None,
                      msg="bad value for package_name")
     self.assertEqual(v.get_dataset_name(),
                      'tests',
                      msg="bad value for dataset_name")
     self.assert_(v.get_short_name().startswith('autogen'),
                  msg="bad value for shortname")
     self.assertEqual(v.get_alias(), 'x', msg="bad value for alias")
     self.assertNotEqual(v.get_autogen_class(),
                         None,
                         msg="bad value for autogen_class")
     # check that the alias has the correct value
     result2 = dataset.compute_variables(['x'])
     self.assert_(ma.allclose(result2, should_be, rtol=1e-6),
                  "Error in accessing a_test_variable")
Example #9
    def apply_filter(self, filter, agent_set, agents_index, submodel=-2):
        """ apply filter comparing to mean project size by submodel instead of 0, by shifting self.filter
        """
        project_size_filter = None
        if (filter is not None):
            if isinstance(filter, dict):
                submodel_filter = filter[submodel]
            else:
                submodel_filter = filter

            mean_project_size = agent_set.get_attribute(
                agent_set.get_attribute_name())[agents_index].mean()

            if isinstance(submodel_filter, str):
                resources = Resources({"debug": self.debug})
                self.choice_set.compute_variables(
                    [submodel_filter],
                    dataset_pool=self.dataset_pool,
                    resources=resources)
                filter_name = VariableName(submodel_filter)
                project_size_filter = self.choice_set.get_attribute(
                    filter_name.get_alias()) - mean_project_size
            else:
                project_size_filter = submodel_filter - mean_project_size

        return LocationChoiceModel.apply_filter(self,
                                                project_size_filter,
                                                agent_set=agent_set,
                                                agents_index=agents_index,
                                                submodel=submodel)
Example #10
 def test_alias_fully_qualified_variable_same_name(self):
     expr = "a_test_variable = opus_core.tests.a_test_variable"
     storage = StorageFactory().get_storage('dict_storage')
     storage.write_table(table_name='tests',
                         table_data={
                             "a_dependent_variable": array([1, 5, 10]),
                             "id": array([1, 3, 4])
                         })
     dataset = Dataset(in_storage=storage,
                       in_table_name='tests',
                       id_name="id",
                       dataset_name="tests")
     result = dataset.compute_variables([expr])
     should_be = array([10, 50, 100])
     self.assert_(ma.allclose(result, should_be, rtol=1e-6),
                  "Error in test_alias_fully_qualified_variable")
     result2 = dataset.compute_variables(['a_test_variable'])
     self.assert_(ma.allclose(result2, should_be, rtol=1e-6),
                  "Error in accessing a_test_variable")
     v = VariableName(expr)
     # check that no autogen class was generated
     self.assertEqual(v.get_autogen_class(),
                      None,
                      msg="bad value for autogen_class")
     # check that the alias is correct
     self.assertEqual(v.get_alias(),
                      'a_test_variable',
                      msg="bad value for alias")
Example #11
 def _compute_if_needed(self, name, dataset_pool, resources=None, quiet=False, version=None):
     """ Compute variable given by the argument 'name' only if this variable
     has not been computed before.
     Check first if this variable belongs to dataset1 or dataset2.
     dataset_pool holds available datasets.
     """
     if not isinstance(name, VariableName):
         variable_name = VariableName(name)
     else:
         variable_name = name
     short_name = variable_name.get_alias()
     if (short_name in self.get_attribute_names()) and (self.are_dependent_variables_up_to_date(
                         variable_name, version=version)):
         return version #nothing to be done
     dataset_name = variable_name.get_dataset_name()
     if dataset_name == self.get_dataset_name():
         new_version = self._compute_one_variable(variable_name, dataset_pool, resources)
     else:
         owner_dataset, index = self.get_owner_dataset_and_index(dataset_name)
         if owner_dataset is None:
             self._raise_error(StandardError, "Cannot find variable '%s'\nin either dataset or in the interaction set." %
                             variable_name.get_expression())
         owner_dataset.compute_variables([variable_name], dataset_pool, resources=resources, quiet=True)
         new_version = self.add_attribute(data = owner_dataset.get_attribute_by_index(variable_name, index),
             name = variable_name, metadata = AttributeType.COMPUTED)
         attribute_box = owner_dataset._get_attribute_box(variable_name)
         variable = attribute_box.get_variable_instance()
         my_attribute_box = self._get_attribute_box(variable_name)
         my_attribute_box.set_variable_instance(variable)
     return new_version
Example #12
    def compute(self, indicator, year):
        year_replaced_attribute = indicator.attribute.replace(
            'DDDD', repr(year))
        name = VariableName(year_replaced_attribute)

        if name.get_alias() not in self.dataset.get_known_attribute_names():
            self.dataset.compute_variables(name)
        self.computed.append((year_replaced_attribute, year))
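The 'DDDD' placeholder in the indicator attribute is resolved by plain string replacement, with repr(year) rendering the integer. For a hypothetical attribute name:

    attribute = "urbansim.gridcell.population_DDDD"   # hypothetical indicator attribute
    year = 2005
    assert attribute.replace('DDDD', repr(year)) == "urbansim.gridcell.population_2005"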
Example #13
 def summary_before_after(self, attribute_name):
     """Print summary of the given attribute 'before' (values
     reloaded from the cache) and 'after' (current values).
     """
     from opus_core.store.attribute_cache import AttributeCache
     var_name = VariableName(attribute_name)
     storage = AttributeCache(self.simulation_state.get_cache_directory())
     ds = self._get_before_after_dataset_from_attribute(var_name, storage=storage, 
                package_order=self.get_dataset_pool().get_package_order())
     print ''
     print 'Before model run:'
     print '================='
     ds.summary(names=['%s_reload__' % var_name.get_alias()])
     print ''
     print 'After model run:'
     print '================='
     #ds.summary(names=[var_name.get_alias()])
     ds.summary(names=[var_name.get_alias()])
Example #14
 def test_fully_qualified_variable(self):
     # this tests an expression consisting of a fully-qualified variable
     expr = "opus_core.test_agent.income_times_2"
     storage = StorageFactory().get_storage('dict_storage')
     storage.write_table(table_name='test_agents',
                         table_data={
                             "income": array([1, 5, 10]),
                             "id": array([1, 3, 4])
                         })
     dataset = Dataset(in_storage=storage,
                       in_table_name='test_agents',
                       id_name="id",
                       dataset_name="test_agent")
     result = dataset.compute_variables([expr])
     should_be = array([2, 10, 20])
     self.assert_(ma.allclose(result, should_be, rtol=1e-6),
                  "Error in test_fully_qualified_variable")
     # check that expr is in the cache of known expressions
     # (normally we shouldn't be accessing this private field, but just this once ...)
     cache = VariableName._cache
     self.assert_(expr in cache, msg="did not find expr in cache")
     # check that the access methods for the variable all return the correct values
     name = VariableName(expr)
     self.assertEqual(name.get_package_name(),
                      'opus_core',
                      msg="bad value for package")
     self.assertEqual(name.get_dataset_name(),
                      'test_agent',
                      msg="bad value for dataset")
     self.assertEqual(name.get_short_name(),
                      'income_times_2',
                      msg="bad value for shortname")
     self.assertEqual(name.get_alias(),
                      'income_times_2',
                      msg="bad value for alias")
     self.assertEqual(name.get_autogen_class(),
                      None,
                      msg="bad value for autogen_class")
     # test that the variable can now also be accessed using its short name in an expression
     result2 = dataset.compute_variables(['income_times_2'])
     self.assert_(ma.allclose(result2, should_be, rtol=1e-6),
                  "Error in accessing a_test_variable")
     # check that the cache uses the variable name with whitespace removed
     oldsize = len(cache)
     expr_with_spaces = "opus_core . test_agent. income_times_2  "
     name2 = VariableName(expr_with_spaces)
     newsize = len(cache)
     self.assertEqual(oldsize, newsize, msg="caching error")
     self.assert_(expr_with_spaces not in cache, msg="caching error")
     self.assertEqual(expr_with_spaces,
                      name2.get_expression(),
                      msg="caching error")
     self.assertEqual(name2.get_short_name(),
                      'income_times_2',
                      msg="bad value for shortname")
    def _compute_if_needed(self,
                           name,
                           dataset_pool,
                           resources=None,
                           quiet=False,
                           version=None):
        """ Compute variable given by the argument 'name' only if this variable
        has not been computed before.
        Check first if this variable belongs to dataset1 or dataset2.
        dataset_pool holds available datasets.
        """
        if not isinstance(name, VariableName):
            variable_name = VariableName(name)
        else:
            variable_name = name
        short_name = variable_name.get_alias()

        dataset_name = variable_name.get_dataset_name()
        if dataset_name == self.get_dataset_name():
            new_version = UrbansimDataset._compute_if_needed(self,
                                                             variable_name,
                                                             dataset_pool,
                                                             resources,
                                                             quiet=quiet,
                                                             version=version)
        else:
            if dataset_name == self.dataset1.get_dataset_name():
                owner_dataset = self.dataset1
#                index = self.get_2d_index_of_dataset1()
            elif dataset_name == self.dataset2.get_dataset_name():
                owner_dataset = self.dataset2
#                index = self.get_2d_index()
            else:
                self._raise_error(
                    StandardError,
                    "Cannot find variable '%s'\nin either dataset or in the interaction set."
                    % variable_name.get_expression())
            owner_dataset.compute_variables([variable_name],
                                            dataset_pool,
                                            resources=resources,
                                            quiet=True)
            new_version = self.compute_variables_return_versions_and_final_value(
                "%s = %s.disaggregate(%s.%s)" % (short_name, self.get_dataset_name(),
                                                 owner_dataset.get_dataset_name(), short_name),
                dataset_pool=dataset_pool, resources=resources, quiet=quiet)[0]
        return new_version
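The expression handed to compute_variables_return_versions_and_final_value above is assembled by plain string formatting. With hypothetical dataset names, it would read:

    short_name = 'average_income'
    expr = "%s = %s.disaggregate(%s.%s)" % (short_name, 'gridcell', 'zone', short_name)
    assert expr == "average_income = gridcell.disaggregate(zone.average_income)"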
Example #16
 def variable_dependencies(self, name):
     """Prints out dependencies of this variable. 'name' can be either an alias from 
     the model specification or an expression."""
     from opus_core.variables.dependency_query import DependencyChart
     varname = None
     allvars = self.get_specification().get_variable_names()
     for ivar in range(len(allvars)):
         thisvar = allvars[ivar]
         if not isinstance(thisvar, VariableName):
             thisvar = VariableName(thisvar)
         if name == thisvar.get_alias():
             varname = thisvar
             break
     if varname is None:
         varname = VariableName(name)
     chart = DependencyChart(self.xml_configuration)
     chart.print_dependencies(varname.get_expression())
Example #17
 def run(self,
         datasets_variables={},
         dataset_pool=None,
         flush_dataset=True):
     """
     datasets_variables is a dictionary where keys are dataset objects and each 
     value is a list of variables (as fully qualified names) to be computed.
     'dataset_pool' is a dataset pool object to be passed into the variable computation.
     If 'flush_dataset' is True, the datasets given as keys in 'datasets_variables'
     are flushed to cache.
     """
     for dataset in datasets_variables.keys():
         variables = datasets_variables[dataset]
         dataset.compute_variables(variables, dataset_pool=dataset_pool)
         for var in variables:
             varname = VariableName(var)
             values = dataset.get_attribute(varname)
             dataset.delete_one_attribute(varname)
             dataset.add_primary_attribute(values, varname.get_alias())
         if flush_dataset:
             dataset.flush_dataset()
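The compute-then-materialize pattern in run() can be exercised with the same dict_storage scaffolding the tests on this page use. A sketch, assuming the usual opus_core import paths for StorageFactory, Dataset, and VariableName:

    from numpy import array
    from opus_core.storage_factory import StorageFactory          # assumed import path
    from opus_core.datasets.dataset import Dataset                # assumed import path
    from opus_core.variables.variable_name import VariableName   # assumed import path

    storage = StorageFactory().get_storage('dict_storage')
    storage.write_table(table_name='tests',
                        table_data={"persons": array([1, 5, 10]),
                                    "id": array([1, 3, 4])})
    dataset = Dataset(in_storage=storage, in_table_name='tests',
                      id_name="id", dataset_name="tests")
    varname = VariableName("p = persons")
    dataset.compute_variables([varname.get_expression()])
    # replace the computed attribute with a primary one stored under its alias
    values = dataset.get_attribute(varname)
    dataset.delete_one_attribute(varname)
    dataset.add_primary_attribute(values, varname.get_alias())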
Example #18
 def test_fully_qualified_DDD_SSS_variable(self):
     # this should use the test variable a_test_SSS_variable_DDD_SSS
     expr = "opus_core.tests.a_test_squid_variable_42_clam"
     storage = StorageFactory().get_storage('dict_storage')
     storage.write_table(table_name='tests',
                         table_data={
                             "a_dependent_variable": array([1, 5, 10]),
                             "id": array([1, 3, 4])
                         })
     dataset = Dataset(in_storage=storage,
                       in_table_name='tests',
                       id_name="id",
                       dataset_name="tests")
     result = dataset.compute_variables([expr])
     should_be = array([10, 50, 100])
     self.assert_(ma.allclose(result, should_be, rtol=1e-6),
                  "Error in test_fully_qualified_DDD_SSS_variable")
     # check that the access methods for the variable all return the correct values
     name = VariableName(expr)
     self.assertEqual(name.get_package_name(),
                      'opus_core',
                      msg="bad value for package")
     self.assertEqual(name.get_dataset_name(),
                      'tests',
                      msg="bad value for dataset")
     self.assertEqual(name.get_short_name(),
                      'a_test_squid_variable_42_clam',
                      msg="bad value for shortname")
     self.assertEqual(name.get_alias(),
                      'a_test_squid_variable_42_clam',
                      msg="bad value for alias")
     self.assertEqual(name.get_autogen_class(),
                      None,
                      msg="bad value for autogen_class")
     # test that the variable can now also be accessed using its short name in an expression
     result2 = dataset.compute_variables(['a_test_squid_variable_42_clam'])
     self.assert_(ma.allclose(result2, should_be, rtol=1e-6),
                  "Error in accessing a_test_squid_variable_42_clam")
Example #19
        def _proxy_for_get_attribute(name):
            try:
                return native_get_attribute(name)

            except NameError:
                if not isinstance(name, VariableName):
                    name = VariableName(name)
                short_name = name.get_alias()

                current_year = SimulationState().get_current_time()

                if short_name in dataset.exogenous_attribute_names.keys():
                    exogenous_table_name = dataset.exogenous_attribute_names[
                        short_name]

                    temporary_dataset = Dataset(
                        in_storage=dataset.resources['in_storage'],
                        in_table_name=exogenous_table_name,
                        id_name='id')

                    if ('year' not in dataset.get_attribute_names() or
                            not self.attribute_boxes['year'].is_in_memory()):
                        # Load the data into a temporary dataset because we
                        # don't want dataset to save the values we retrieve,
                        # since then we can't filter them by year.
                        temporary_dataset.load_dataset(
                            nchunks=1,
                            attributes=[short_name, 'year', 'base_table_id'],
                            in_table_name=exogenous_table_name)

                else:
                    raise  # re-raise NameError

                exogenous_data = temporary_dataset.attribute_boxes[
                    short_name].get_data()
                year_data = temporary_dataset.attribute_boxes['year'].get_data()
                base_table_id_data = temporary_dataset.attribute_boxes[
                    'base_table_id'].get_data()

                exogenous_table_data = zip(exogenous_data, year_data,
                                           base_table_id_data)

                exogenous_attribute_values = [
                    _attribute for _attribute, _year, _base_table_id in
                    exogenous_table_data if _year == current_year
                ]

                exogenous_base_table_ids = [
                    _base_table_id for _attribute, _year, _base_table_id in
                    exogenous_table_data if _year == current_year
                ]

                base_table_ids = native_get_attribute(
                    dataset.resources['id_name'])

                exogenous_attributes_by_base_table_id = {}
                for base_table_id, value in zip(exogenous_base_table_ids,
                                                exogenous_attribute_values):
                    try:
                        exogenous_attributes_by_base_table_id[base_table_id]
                    except KeyError:
                        exogenous_attributes_by_base_table_id[
                            base_table_id] = value
                    else:
                        raise AttributeError(
                            "Duplicate data for base_table_id "
                            "'%s', year %s." % (base_table_id, current_year))

                result = [None] * len(base_table_ids)
                for index in range(len(base_table_ids)):
                    try:
                        result[index] = exogenous_attributes_by_base_table_id[
                            base_table_ids[index]]
                    except KeyError:
                        raise AttributeError(
                            "Missing exogenous data for "
                            "base_table_id '%s', year %s." %
                            (base_table_ids[index], current_year))

                return result
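The year filtering at the heart of the proxy pairs three parallel columns and keeps only the rows for the current year. The core step, reduced to a standalone sketch with made-up values:

    exogenous_data = [1.5, 2.5, 3.5]
    year_data = [2000, 2001, 2000]
    base_table_id_data = [10, 20, 30]
    current_year = 2000

    rows = list(zip(exogenous_data, year_data, base_table_id_data))
    values = [v for v, y, b in rows if y == current_year]
    ids = [b for v, y, b in rows if y == current_year]
    assert values == [1.5, 3.5] and ids == [10, 30]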
Example #20
class ObservedDataOneQuantity:
    """  Class for storing information about one quantity measure. It is to be grouped in 
    an object of class ObservedData.
    """
    # pairs of inverse transformations
    transformation_pairs = {"sqrt": "**2", "log":"exp", "exp": "log", "**2": "sqrt"}

    def __init__(self, variable_name, observed_data, filename=None,  transformation=None, inverse_transformation=None, 
                 filter=None, match=False, dependent_datasets={}, **kwargs):
        """  'variable_name' is a quantity about which we have data available.
        'observed_data' is of type ObservedData, it is the grouping parent. 
        'filename' is the name of file where 
        the data is stored. It can be None, if the observed_data.directory is a cache.
        'transformation' is an operation to be performed on the data (e.g. sqrt, log),
        'inverse_transformation' is the inverse function of 'transformation'. If it is not given, it
        is determined automatically.
        'filter' is a variable that will be applied to both the observed data and the simulated data.
        'match' (logical) determines if the dataset should be matched (by ids) with the simulated dataset. Elements
        that don't match are eliminated from the simulated dataset.
        'dependent_datasets' (if any) should be a dictionary of dataset_name:{'filename': filename, 'match': True|False, **kwargs}. 
        They will be added to the dataset_pool. 
        Remaining arguments are passed into DatasetFactory, thus it can contain information about how 
        to create the corresponding dataset.
        """
        self.variable_name = VariableName(variable_name)
        self.dataset_name = self.variable_name.get_dataset_name()
        dataset_pool = observed_data.get_dataset_pool()
        self.matching_datasets = {}
        
        if dataset_pool is None:
            kwargs.update({'in_storage':observed_data.get_storage(), 'in_table_name': filename})
            try:
                self.dataset = DatasetFactory().search_for_dataset(self.dataset_name, observed_data.get_package_order(), arguments=kwargs)
            except: # take generic dataset
                self.dataset = Dataset(dataset_name=self.dataset_name, **kwargs)
        else:
            self.dataset = dataset_pool.get_dataset(self.dataset_name)
        if match:
            self.add_match(self.dataset)
        for dep_dataset_name, info in dependent_datasets.iteritems():
            if dataset_pool is None:
                dataset_pool = DatasetPool(storage=observed_data.get_storage(), package_order=observed_data.get_package_order())
            info.update({'in_storage':observed_data.get_storage(), 'in_table_name': info.get('filename')})
            del info['filename']
            match = False
            if 'match' in info.keys():
                match = info['match']
                del info['match']
            try:
                dep_dataset = DatasetFactory().search_for_dataset(dep_dataset_name, observed_data.get_package_order(), arguments=info)
            except:
                dep_dataset = Dataset(dataset_name=dep_dataset_name, **info)
            dataset_pool.replace_dataset(dep_dataset_name, dep_dataset)
            if match:
                self.add_match(dep_dataset)
        if self.variable_name.get_alias() not in self.dataset.get_known_attribute_names():
            self.dataset.compute_variables([self.variable_name], dataset_pool=dataset_pool)
        if filter is not None:
            filter_values = self.dataset.compute_variables([filter], dataset_pool=dataset_pool)
            idx = where(filter_values > 0)[0]
            self.add_match(self.dataset, idx)
            self.dataset.subset_by_index(idx)
        self.transformation = transformation
        self.inverse_transformation = inverse_transformation
        if (self.transformation is not None) and (self.inverse_transformation is None):
            self.inverse_transformation = self.transformation_pairs[self.transformation]
                
    def get_values(self):
        return self.dataset.get_attribute(self.variable_name)
        
    def get_transformed_values(self):
        return try_transformation(self.get_values(), self.transformation)
        
    def get_variable_name(self):
        return self.variable_name
    
    def get_dataset(self):
        return self.dataset
    
    def get_dataset_name(self):
        return self.dataset_name
    
    def get_transformation(self):
        return self.transformation
    
    def get_transformation_pair(self):
        return (self.transformation, self.inverse_transformation)
    
    def add_match(self, dataset, index = None):
        dataset_name = dataset.get_dataset_name()
        result = zeros(dataset.size(), dtype='bool8')
        idx = index
        if index is None:
            idx = arange(dataset.size())
        result[idx] = 1
        if dataset_name in self.matching_datasets.keys():
            tmp = zeros(dataset.size(), dtype='bool8')
            tmp[dataset.get_id_index(self.matching_datasets[dataset_name])]=1
            result = result*tmp
        self.matching_datasets[dataset_name] = dataset.get_id_attribute()[where(result)]
        
    def get_matching_datasets(self):
        return self.matching_datasets
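The transformation bookkeeping at the end of the constructor is a simple fallback rule: an explicitly supplied inverse wins, otherwise the inverse is looked up in transformation_pairs. A standalone sketch of that rule (resolve_inverse is an illustrative name, not part of the class):

    transformation_pairs = {"sqrt": "**2", "log": "exp", "exp": "log", "**2": "sqrt"}

    def resolve_inverse(transformation, inverse=None):
        # explicit inverse wins; otherwise pair it from the table
        if (transformation is not None) and (inverse is None):
            return transformation_pairs[transformation]
        return inverse

    assert resolve_inverse("log") == "exp"
    assert resolve_inverse("sqrt", inverse="**2") == "**2"
    assert resolve_inverse(None) is None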
Example #21
    def get_variable(self,
                     variable_name,
                     dataset,
                     quiet=False,
                     debug=0,
                     index_name=None):
        """Returns an instance of class Variable. 
        'variable_name' is an instance of class VariableName. 
        'dataset' is an object of class Dataset to which the variable belongs to. 
        In case of an error in either importing the module or evaluating its constructor, 
        the method returns None.
        If quiet is True no warnings are printed.
        index_name is used for lag variables only.
        """
        lag_attribute_name = None
        lag_offset = 0

        if not isinstance(debug, DebugPrinter):
            debug = DebugPrinter(debug)

        if variable_name.get_autogen_class() is not None:
            # variable_name has an autogenerated class -- just use that
            variable_subclass = variable_name.get_autogen_class()
            substrings = ()
        else:
            # either find the variable name in the expression library (if present), in an appropriate 'aliases' file,
            # or load our variable class as 'variable_subclass' using an import statement
            short_name = variable_name.get_short_name()
            dataset_name = variable_name.get_dataset_name()
            package_name = variable_name.get_package_name()
            # if there isn't a package name, first look in the expression library (if there is a package name, look elsewhere)
            if package_name is None:
                e = VariableFactory._expression_library.get(
                    (dataset_name, short_name), None)
                if e is not None:
                    if e == variable_name.get_expression():  # it is a primary attribute
                        return None
                    v = VariableName(e)
                    return VariableFactory().get_variable(v,
                                                          dataset,
                                                          quiet=quiet,
                                                          debug=debug)
            else:
                # not in the expression library - next look in the appropriate 'aliases' file, if one is present
                # (but only if we have a package name in the first place)
                try:
                    stmt = 'from %s.%s.aliases import aliases' % (package_name,
                                                                  dataset_name)
                    exec(stmt)
                except ImportError:
                    aliases = []
                for a in aliases:
                    # for each definition, see if the alias is equal to the short_name.  If it is,
                    # then use that definition for the variable
                    v = VariableName(a)
                    if v.get_alias() == short_name:
                        return VariableFactory().get_variable(v,
                                                              dataset,
                                                              quiet=quiet,
                                                              debug=debug)

            lag_variable_parser = LagVariableParser()
            if lag_variable_parser.is_short_name_for_lag_variable(short_name):
                lag_attribute_name, lag_offset = lag_variable_parser.parse_lag_variable_short_name(
                    short_name)
                true_short_name = "VVV_lagLLL"
                substrings = (package_name, lag_attribute_name, lag_offset,
                              dataset_name, index_name)
                opus_path = 'opus_core.variables'

            else:
                if package_name is None:
                    raise LookupError(
                        "Incomplete variable specification for '%s.%s' (missing package name, "
                        "and variable is not in expression library not a lag variable)."
                        % (dataset_name, short_name))

                opus_path = '%s.%s' % (package_name, dataset_name)

                true_short_name, substrings = VariableFamilyNameTranslator().\
                        get_translated_variable_name_and_substring_arguments(opus_path, short_name)

            module = '%s.%s' % (opus_path, true_short_name)

            # Note that simply checking for the .py module file would not
            # be safe here, as objects could be instantiated in __init__.py files.
            try:
                ev = "from %s import %s as variable_subclass" % (
                    module, true_short_name)
                debug.print_debug("Evaluating '" + ev + "'.", 12)
                exec(ev)
                debug.print_debug("Successful.", 12)
            except ImportError, e:
                if not quiet:
                    from opus_core.simulation_state import SimulationState
                    time = SimulationState().get_current_time()
                    desc = '\n'.join((
                        "Opus variable '%s' does not exist for dataset '%s' in year %s. "
                        "The following error occured when finally trying to import "
                        "the variable '%s' from the Python module "
                        "'%s':",
                        "%s",
                    )) % (true_short_name, opus_path, time, true_short_name,
                          module,
                          indent_text(
                              formatPlainTextExceptionInfoWithoutLog('')))
                    raise NameError(desc)
                return None
Example #22
class RegressionModelWithAdditionInitialResiduals(RegressionModel):
    """
    It is a RegressionModel that computes an initial error of the observations to the predictions
    when run for the first time. Then every time it runs, it adds this error to the outcome. The 'error' attribute
    is called '_init_error_%s' % outcome_attribute and it is stored as a primary attribute.
    """
    model_name = "Regression Model With Addition of Initial Residuals"
    model_short_name = "RMWAIR"

    def __init__(self,
                 regression_procedure="opus_core.linear_regression",
                 submodel_string=None,
                 outcome_attribute=None,
                 run_config=None,
                 estimate_config=None,
                 debuglevel=None,
                 dataset_pool=None):
        """'outcome_attribute' must be specified in order to compute the residuals.
        """
        RegressionModel.__init__(self,
                                 regression_procedure=regression_procedure,
                                 submodel_string=submodel_string,
                                 run_config=run_config,
                                 estimate_config=estimate_config,
                                 debuglevel=debuglevel,
                                 dataset_pool=dataset_pool)
        self.outcome_attribute = outcome_attribute
        if (self.outcome_attribute is not None) and not isinstance(
                self.outcome_attribute, VariableName):
            self.outcome_attribute = VariableName(self.outcome_attribute)

    def run(self, specification, coefficients, dataset, index=None, **kwargs):
        """
        See description above. If missing values of the outcome attribute are supposed to be excluded from
        the addition of the initial residuals, set the run_config entry 'exclude_missing_values_from_initial_error' to True.
        Additionally, an entry 'outcome_attribute_missing_value' specifies the missing value (default is 0).
        Similarly, if outliers are to be excluded, the run_config entry "exclude_outliers_from_initial_error" should be set to True.
        In such a case, run_config entries 'outlier_is_less_than' and 'outlier_is_greater_than' can define lower and upper bounds for outliers. 
        By default, an outlier is a data point smaller than 0. There is no default upper bound.
        """
        if self.outcome_attribute is None:
            raise StandardError, "An outcome attribute must be specified for this model. Pass it into the initialization."

        if self.outcome_attribute.get_alias() not in dataset.get_known_attribute_names():
            try:
                dataset.compute_variables(self.outcome_attribute,
                                          dataset_pool=self.dataset_pool)
            except:
                raise StandardError, "The outcome attribute %s must be a known attribute of the dataset %s." % (
                    self.outcome_attribute.get_alias(),
                    dataset.get_dataset_name())

        if index is None:
            index = arange(dataset.size())
        original_data = dataset.get_attribute_by_index(self.outcome_attribute,
                                                       index)

        outcome = RegressionModel.run(
            self,
            specification,
            coefficients,
            dataset,
            index,
            initial_values=original_data.astype('float32'),
            **kwargs)
        initial_error_name = "_init_error_%s" % self.outcome_attribute.get_alias(
        )

        if initial_error_name not in dataset.get_known_attribute_names():
            initial_error = original_data - outcome
            dataset.add_primary_attribute(name=initial_error_name,
                                          data=zeros(dataset.size(),
                                                     dtype="float32"))
            exclude_missing_values = self.run_config.get(
                "exclude_missing_values_from_initial_error", False)
            exclude_outliers = self.run_config.get(
                "exclude_outliers_from_initial_error", False)
            if exclude_missing_values:
                missing_value = self.run_config.get(
                    "outcome_attribute_missing_value", 0)
                initial_error[original_data == missing_value] = 0
                logger.log_status(
                    'Values equal %s were excluded from adding residuals.' %
                    missing_value)
            if exclude_outliers:
                outlier_low = self.run_config.get("outlier_is_less_than", 0)
                initial_error[original_data < outlier_low] = 0
                outlier_high = self.run_config.get("outlier_is_greater_than",
                                                   original_data.max())
                initial_error[original_data > outlier_high] = 0
                logger.log_status(
                    'Values less than %s and larger than %s were excluded from adding residuals.'
                    % (outlier_low, outlier_high))
            dataset.set_values_of_one_attribute(initial_error_name,
                                                initial_error, index)
        else:
            initial_error = dataset.get_attribute_by_index(
                initial_error_name, index)
        return outcome + initial_error

    def run_after_estimation(self, *args, **kwargs):
        return RegressionModel.run(self, *args, **kwargs)
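The residual bookkeeping in run() is ordinary array arithmetic. A minimal numpy sketch of the first-run branch, with made-up values and the 'exclude missing values' option switched on:

    from numpy import array

    original = array([10.0, 0.0, 30.0])    # observed outcome; 0 marks a missing value here
    predicted = array([8.0, 5.0, 33.0])    # regression outcome
    initial_error = original - predicted   # [2, -5, -3]
    missing_value = 0
    initial_error[original == missing_value] = 0   # don't correct missing observations
    corrected = predicted + initial_error
    assert (corrected == array([10.0, 5.0, 30.0])).all()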
Example #23
    def estimate(self,
                 specification,
                 dataset,
                 outcome_attribute,
                 index=None,
                 procedure=None,
                 data_objects=None,
                 estimate_config=None,
                 debuglevel=0):
        """'specification' is of type EquationSpecification,
            'dataset' is of type Dataset,
            'outcome_attribute' - string that determines the dependent variable,
            'index' are indices of individuals in dataset for which
                    the model runs. If it is None, the whole dataset is considered.
            'procedure' - name of the estimation procedure. If it is None,
                there should be an entry "estimation" in 'estimate_config' that determines the procedure. The class
                must have a method 'run' that takes as arguments 'data', 'regression_procedure' and 'resources'.
                It returns a dictionary with entries 'estimators', 'standard_errors' and 't_values' (all 1D numpy arrays).
            'data_objects' is a dictionary where each key is the name of a data object
                    ('zone', ...) and its value is an object of class Dataset.
            'estimate_config' is of type Resources, it gives additional arguments for the estimation procedure.
            'debuglevel' overwrites the class 'debuglevel'.
        """
        #import wingdbstub
        self.debug.flag = debuglevel
        if estimate_config == None:
            estimate_config = Resources()
        if not isinstance(estimate_config, Resources) and isinstance(
                estimate_config, dict):
            estimate_config = Resources(estimate_config)
        self.estimate_config = estimate_config.merge_with_defaults(
            self.estimate_config)
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        self.procedure = procedure
        if self.procedure == None:
            self.procedure = self.estimate_config.get("estimation", None)
        if self.procedure is not None:
            self.procedure = ModelComponentCreator().get_model_component(
                self.procedure)
        else:
            logger.log_warning(
                "No estimation procedure given, or problems with loading the corresponding module."
            )

        compute_resources = Resources({"debug": self.debug})
        if dataset.size() <= 0:  # no data loaded yet
            dataset.get_id_attribute()
        if index == None:
            index = arange(dataset.size())
        if not isinstance(index, ndarray):
            index = array(index)

        estimation_size_agents = self.estimate_config.get(
            "estimation_size_agents",
            None)  # should be a proportion of the agent_set
        if estimation_size_agents == None:
            estimation_size_agents = 1.0
        else:
            estimation_size_agents = max(min(estimation_size_agents, 1.0),
                                         0.0)  # between 0 and 1

        if estimation_size_agents < 1.0:
            self.debug.print_debug("Sampling agents for estimation ...", 3)
            estimation_idx = sample_noreplace(
                arange(index.size), int(index.size * estimation_size_agents))
        else:
            estimation_idx = arange(index.size)

        estimation_idx = index[estimation_idx]
        self.debug.print_debug(
            "Number of observations for estimation: " +
            str(estimation_idx.size), 2)
        if estimation_idx.size <= 0:
            self.debug.print_debug("Nothing to be done.", 2)
            return (None, None)

        coefficients = create_coefficient_from_specification(specification)
        specified_coefficients = SpecifiedCoefficients().create(coefficients,
                                                                specification,
                                                                neqs=1)
        submodels = specified_coefficients.get_submodels()
        self.get_status_for_gui().update_pieces_using_submodels(
            submodels=submodels, leave_pieces=2)
        self.map_agents_to_submodels(
            submodels,
            self.submodel_string,
            dataset,
            estimation_idx,
            dataset_pool=self.dataset_pool,
            resources=compute_resources,
            submodel_size_max=self.estimate_config.get('submodel_size_max',
                                                       None))
        variables = specified_coefficients.get_full_variable_names_without_constants()
        self.debug.print_debug("Compute variables ...", 4)
        self.increment_current_status_piece()
        dataset.compute_variables(variables,
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)

        coef = {}
        estimated_coef = {}
        self.outcome = {}
        dataset.compute_variables([outcome_attribute],
                                  dataset_pool=self.dataset_pool,
                                  resources=compute_resources)
        regression_resources = Resources(estimate_config)
        regression_resources.merge({"debug": self.debug})
        outcome_variable_name = VariableName(outcome_attribute)
        for submodel in submodels:
            coef[submodel] = SpecifiedCoefficientsFor1Submodel(
                specified_coefficients, submodel)
            self.increment_current_status_piece()
            logger.log_status("Estimate regression for submodel " +
                              str(submodel),
                              tags=["estimate"],
                              verbosity_level=2)
            logger.log_status("Number of observations: " +
                              str(self.observations_mapping[submodel].size),
                              tags=["estimate"],
                              verbosity_level=2)
            self.data[submodel] = dataset.create_regression_data_for_estimation(
                coef[submodel],
                index=estimation_idx[self.observations_mapping[submodel]])
            self.coefficient_names[submodel] = coef[
                submodel].get_coefficient_names_without_constant()[0, :]
            if (self.data[submodel].shape[0] > 0
                ) and (self.data[submodel].size > 0) and (
                    self.procedure
                    is not None):  # observations for this submodel available
                self.outcome[submodel] = dataset.get_attribute_by_index(
                    outcome_variable_name.get_alias(),
                    estimation_idx[self.observations_mapping[submodel]])
                regression_resources.merge({"outcome": self.outcome[submodel]})
                regression_resources.merge({
                    "coefficient_names":
                    self.coefficient_names[submodel].tolist(),
                    "constant_position":
                    coef[submodel].get_constants_positions()
                })
                estimated_coef[submodel] = self.procedure.run(
                    self.data[submodel],
                    self.regression,
                    resources=regression_resources)
                if "estimators" in estimated_coef[submodel].keys():
                    coef[submodel].set_coefficient_values(
                        estimated_coef[submodel]["estimators"])
                if "standard_errors" in estimated_coef[submodel].keys():
                    coef[submodel].set_standard_errors(
                        estimated_coef[submodel]["standard_errors"])
                if "other_measures" in estimated_coef[submodel].keys():
                    for measure in estimated_coef[submodel][
                            "other_measures"].keys():
                        coef[submodel].set_measure(
                            measure, estimated_coef[submodel]["other_measures"]
                            [measure])
                if "other_info" in estimated_coef[submodel].keys():
                    for info in estimated_coef[submodel]["other_info"]:
                        coef[submodel].set_other_info(
                            info, estimated_coef[submodel]["other_info"][info])
        coefficients.fill_coefficients(coef)

        self.save_predicted_values_and_errors(specification,
                                              coefficients,
                                              dataset,
                                              outcome_variable_name,
                                              index=index,
                                              data_objects=data_objects)

        return (coefficients, estimated_coef)
Example #24
class WaterDemandModel(RegressionModel):
    """
    """
    #    filter_attribute = "include_in_housing_value_estimation"
    model_name = "Water Demand Model"
    model_short_name = "WDM"

    def __init__(self,
                 regression_procedure="opus_core.linear_regression",
                 outcome_attribute="month_combination_2",
                 filter_attribute=None,
                 submodel_string="land_use_type_id",
                 run_config=None,
                 estimate_config=None,
                 debuglevel=0,
                 dataset_pool=None):
        self.outcome_attribute = outcome_attribute
        if (self.outcome_attribute is not None) and not isinstance(
                self.outcome_attribute, VariableName):
            self.outcome_attribute = VariableName(self.outcome_attribute)

        self.filter_attribute = filter_attribute
        RegressionModel.__init__(self,
                                 regression_procedure=regression_procedure,
                                 submodel_string=submodel_string,
                                 run_config=run_config,
                                 estimate_config=estimate_config,
                                 debuglevel=debuglevel,
                                 dataset_pool=dataset_pool)

    def run(self,
            specification,
            coefficients,
            dataset,
            index=None,
            chunk_specification=None,
            data_objects=None,
            run_config=None,
            debuglevel=0):
        """ For info on the arguments see RegressionModel.
        """
        outcome_attribute_short = self.outcome_attribute.get_alias()
        if data_objects is not None:
            self.dataset_pool.add_datasets_if_not_included(data_objects)
        if self.filter_attribute != None:
            res = Resources({"debug": debuglevel})
            index = dataset.get_filtered_index(self.filter_attribute,
                                               threshold=0,
                                               index=index,
                                               dataset_pool=self.dataset_pool,
                                               resources=res)

        current_year = SimulationState().get_current_time()
        current_month = int(re.search('\d+$', outcome_attribute_short).group())
        # date in YYYYMM format, matching the id_name field of the weather dataset
        date = int("%d%02d" % (current_year, current_month))
        date = array([date] * dataset.size())

        if "date" in dataset.get_known_attribute_names():
            dataset.set_values_of_one_attribute("date", date)
        else:
            dataset.add_primary_attribute(date, "date")

        water_demand = RegressionModel.run(self,
                                           specification,
                                           coefficients,
                                           dataset,
                                           index,
                                           chunk_specification,
                                           run_config=run_config,
                                           debuglevel=debuglevel)
        if (water_demand is None) or (water_demand.size <= 0):
            return water_demand

        if index is None:
            index = arange(dataset.size())

        if re.search("^ln_", outcome_attribute_short):
            # if the outcome attr. name starts with 'ln_' the results will be exponentiated.
            outcome_attribute_name = outcome_attribute_short[
                3:len(outcome_attribute_short)]
            water_demand = exp(water_demand)
        else:
            outcome_attribute_name = outcome_attribute_short

        if outcome_attribute_name in dataset.get_known_attribute_names():
            dataset.set_values_of_one_attribute(outcome_attribute_name,
                                                water_demand, index)
        else:
            results = zeros(dataset.size(), dtype=water_demand.dtype)
            results[index] = water_demand
            dataset.add_primary_attribute(results, outcome_attribute_name)

        return water_demand
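
A minimal, self-contained sketch (not from the source) of the 'ln_' convention that run() handles above; the attribute alias and prediction values are hypothetical:

# Outcome attributes prefixed with 'ln_' are modeled on the log scale, so the
# predictions are exponentiated and stored under the unprefixed attribute name.
from numpy import array, exp

outcome_attribute_short = "ln_month_combination_7"  # hypothetical alias
water_demand = array([2.0, 2.5])                    # stand-in log-scale predictions
if outcome_attribute_short.startswith("ln_"):
    water_demand = exp(water_demand)                # back-transform to demand units
    outcome_attribute_name = outcome_attribute_short[3:]  # -> 'month_combination_7'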
Exemple #25
0
    def run( self, building_set, building_types_table, vacancy_table, year, location_set,
            building_categories=None, dataset_pool=None, resources=None ):
        building_types = building_types_table.get_attribute("name")
        building_id_name = building_set.get_id_name()[0]
        location_id_name = location_set.get_id_name()[0]
        new_buildings = {building_id_name: array([], dtype=building_set.get_data_type(building_id_name)),
                         "building_type_id":array([], dtype=building_set.get_data_type("building_type_id", int8)),
                         "year_built": array([], dtype=building_set.get_data_type("year_built", int32)),
                         "sqft": array([], dtype=building_set.get_data_type("sqft", int32)),
                         "residential_units": array([], dtype=building_set.get_data_type("residential_units", int32)),
                         "improvement_value": array([], dtype= building_set.get_data_type("improvement_value", float32)),
                         "land_value": array([], dtype= building_set.get_data_type("land_value", float32)),
                         location_id_name: array([], dtype=building_set.get_data_type(location_id_name, int32))}
        max_id = building_set.get_id_attribute().max()
        buildings_set_size_orig = building_set.size()

        for itype in range(building_types_table.size()): # iterate over building types
            type = building_types[itype]
            type_code = building_types_table.get_id_attribute()[itype]
            is_residential = building_types_table.get_attribute("is_residential")[itype]
            vacancy_attribute = 'target_total_%s_vacancy' % type
            if vacancy_attribute not in vacancy_table.get_known_attribute_names():
                logger.log_warning("No target vacancy for building type '%s'. Transition model for this building type skipped." % type)
                continue
            vacancy_table.get_attribute(vacancy_attribute)  # ensures that the attribute is loaded
            # look up this year's target vacancy rate for the building type
            # (getattr replaces the original eval of the same attribute access)
            target_vacancy_rate = getattr(
                vacancy_table.get_data_element_by_id(year), vacancy_attribute)

            compute_resources = Resources(resources)
            compute_resources.merge({"debug":self.debug})
            units_attribute = building_types_table.get_attribute('units')[itype]

            # determine current-year vacancy rates
            if is_residential:
                default_vacancy_variable = "urbansim.%s.vacant_%s_units_from_buildings" % (
                                                                   location_set.get_dataset_name(), type)
            else:
                default_vacancy_variable = "urbansim.%s.vacant_%s_sqft_from_buildings" % (
                                                                   location_set.get_dataset_name(), type)
            variable_for_vacancy = compute_resources.get(
                                    "%s_vacant_variable" % type, default_vacancy_variable)
            location_set.compute_variables([variable_for_vacancy, "urbansim.%s.buildings_%s_space" % (
                                                                      location_set.get_dataset_name(),type)],
                                        dataset_pool=dataset_pool, resources = compute_resources)

            vacant_units_sum = location_set.get_attribute(variable_for_vacancy).sum()
            units_sum = float( location_set.get_attribute("buildings_%s_space" % type).sum() )
            vacant_rate = self.safe_divide(vacant_units_sum, units_sum)

            # Solve (vacant + n) / (units + n) = target for n, the number of new
            # (initially vacant) units needed to reach the target vacancy rate:
            # n = (target * units - vacant) / (1 - target), floored at zero.
            should_develop_units = int(round(max(0,
                (target_vacancy_rate * units_sum - vacant_units_sum) /
                (1 - target_vacancy_rate))))
            logger.log_status(type + ": vacant units: %d, should be vacant: %f, sum units: %d"
                          % (vacant_units_sum, target_vacancy_rate * units_sum, units_sum))

            if not should_develop_units:
                logger.log_note(("Will not build any " + type + " units, because the current vacancy of %d units\n"
                             + "is more than the %d units desired for the vacancy rate of %f.")
                            % (vacant_units_sum,
                               target_vacancy_rate * units_sum,
                               target_vacancy_rate))
                continue

            improvement_value = building_set.compute_variables("urbansim.%s.%s_improvement_value" % (
                                                                     building_set.get_dataset_name(), type),
                                                                   dataset_pool=dataset_pool,
                                                                   resources=compute_resources)
            average_improvement_value = improvement_value.sum() / units_sum

            #create buildings
            is_building_type = building_set.compute_variables("urbansim.building.is_building_type_%s" % type,
                                                              dataset_pool=dataset_pool,
                                                              resources=compute_resources)
            units_of_this_type = building_set.compute_variables(units_attribute, dataset_pool=dataset_pool,
                                           resources=compute_resources)
            units_of_this_type = units_of_this_type*is_building_type
            units_without_zeros_idx = where(units_of_this_type > 0)[0]
            history_values_without_zeros = units_of_this_type[units_without_zeros_idx]
            history_improvement_values_without_zeros = where(improvement_value[units_without_zeros_idx]>0,
                                                             improvement_value[units_without_zeros_idx],
                                                             average_improvement_value)
            mean_size = history_values_without_zeros.mean()
            idx = array( [], dtype="int32" )
            # Ensure that there are some development projects to choose from.
            num_of_projects_to_select = max( 10, int( should_develop_units / mean_size ) )
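            # Sample random existing buildings of this type (with replacement),
            # accumulate their sizes, drop draws past the point where the cumulative
            # size exceeds the target, and stop once the target has been reached.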
            while True:
                idx = concatenate( ( idx, randint( 0, history_values_without_zeros.size,
                                                   size=num_of_projects_to_select) ) )
                csum = history_values_without_zeros[idx].cumsum()
                idx = idx[where( csum <= should_develop_units )]
                if csum[-1] >= should_develop_units:
                    break
            nbuildings = idx.size
            new_buildings["building_type_id"] = concatenate((new_buildings["building_type_id"], type_code*ones(nbuildings)))
            new_buildings["year_built"] = concatenate((new_buildings["year_built"], year*ones(nbuildings)))
            new_max_id = max_id + nbuildings
            new_buildings[building_id_name]=concatenate((new_buildings[building_id_name], arange(max_id+1, new_max_id+1)))
            max_id = new_max_id
            new_buildings["improvement_value"] = concatenate((new_buildings["improvement_value"],
                                                              history_improvement_values_without_zeros[idx]))

            if is_residential:
                target_size_attribute = "residential_units"
                zero_attribute = "sqft"
            else:
                target_size_attribute = "sqft"
                zero_attribute = "residential_units"
            new_buildings[target_size_attribute] = concatenate((new_buildings[target_size_attribute], history_values_without_zeros[idx]))
            new_buildings[zero_attribute] = concatenate((new_buildings[zero_attribute],
                                                         zeros(nbuildings, dtype=new_buildings[zero_attribute].dtype)))
            new_buildings[location_id_name] = concatenate((new_buildings[location_id_name],
                                                           zeros(nbuildings, dtype=new_buildings[location_id_name].dtype)))
            new_buildings["land_value"] = concatenate((new_buildings["land_value"],
                                                       zeros(nbuildings, dtype=new_buildings["land_value"].dtype)))
            logger.log_status("Creating %s %s of %s %s buildings." % (history_values_without_zeros[idx].sum(),
                                                                   target_size_attribute, nbuildings, type))

        building_set.add_elements(new_buildings, require_all_attributes=False)
        if building_categories: # should be a dictionary of categories for each building type
            building_set.resources['building_categories'] = building_categories
        # add submodel attribute
        category_variables = ["urbansim.%s.size_category_%s" % (building_set.get_dataset_name(), type_name)
                              for type_name in building_types]

        for category_var in category_variables:
            var = VariableName(category_var)
            if var.get_alias() in building_set.get_known_attribute_names():
                building_set.delete_one_attribute(var)
            building_set.compute_variables(var, dataset_pool=dataset_pool, resources = compute_resources)
            building_set.add_primary_attribute(building_set.get_attribute(var), var.get_alias())

        difference = building_set.size() - buildings_set_size_orig
        return difference
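
As a quick numeric check of the target-vacancy formula used in the transition model above (illustrative inputs, not from the source):

# With 10000 existing units, 300 of them vacant, and a 5% target vacancy rate,
# the model should develop n new (initially vacant) units such that
# (300 + n) / (10000 + n) == 0.05.
units_sum = 10000.0
vacant_units_sum = 300
target_vacancy_rate = 0.05
n = int(round(max(0, (target_vacancy_rate * units_sum - vacant_units_sum)
                  / (1 - target_vacancy_rate))))
print(n)  # 211; check: (300 + 211) / (10000 + 211) is approximately 0.05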