def export_dataset(self, dataset_name, in_storage, out_storage, overwrite=True, out_dataset_name=None, nchunks=1, **kwargs):
    if not overwrite and dataset_name in out_storage.get_table_names():
        logger.log_note('Dataset %s ignored because it already exists in OPUS' % dataset_name)
        return
    with logger.block('Exporting dataset %s' % dataset_name):
        if out_dataset_name is None:
            out_dataset_name = dataset_name
        cols_in_this_chunk = in_storage.ALL_COLUMNS
        if nchunks > 1:
            colnames = in_storage.get_column_names(dataset_name)
            chunk_size = int(ceil(len(colnames) / float(nchunks)))
        for chunk in range(nchunks):
            if nchunks > 1:
                cols_in_this_chunk = colnames[int(chunk * chunk_size):int((chunk + 1) * chunk_size)]
            with logger.block('Loading %s - chunk %s out of %s' % (dataset_name, chunk + 1, nchunks)):
                values_from_storage = in_storage.load_table(dataset_name, column_names=cols_in_this_chunk)
                length = len(values_from_storage) and len(values_from_storage.values()[0])
                if length == 0:
                    logger.log_warning("Dataset %s ignored because it's empty" % dataset_name)
                    return
            with logger.block('Storing %s' % dataset_name):
                if chunk > 0:
                    kwargs['mode'] = out_storage.APPEND
                out_storage.write_table(out_dataset_name, values_from_storage, **kwargs)
        logger.log_note("Exported %s records for dataset %s" % (length, dataset_name))
def compute(self, dataset_pool): with logger.block(name="compute variable persons_within_DDD_of_parcel with DDD=%s" % self.radius, verbose=False): results = None with logger.block(name="trying to read cache file %s" % self.cache_file_name, verbose=False): try: results = self._load_results() except IOError: logger.log_warning("Cache file could not be loaded") with logger.block(name="initialize datasets", verbose=False): parcels = self.get_dataset() arr = self.get_dataset().sum_dataset_over_ids(dataset_pool.get_dataset('household'), attribute_name="persons") if not results: with logger.block(name="initialize coords", verbose=False): coords = column_stack( (parcels.get_attribute("x_coord_sp"), parcels.get_attribute("y_coord_sp")) ) with logger.block(name="build KDTree", verbose=False): kd_tree = KDTree(coords, 100) with logger.block(name="compute"): results = kd_tree.query_ball_tree(kd_tree, self.radius) with logger.block(name="cache"): if not SimulationState().cache_directory_exists(): logger.log_warning("Cache does not exist and is created.") SimulationState().create_cache_directory() self._cache_results(results) with logger.block(name="sum results", verbose=False): return_values = array(map(lambda l: arr[l].sum(), results)) return return_values
def compute(self, dataset_pool):
    dataset = self.get_dataset()
    with logger.block("Compute sc_residential_sqm", verbose=True):
        residential_sqm = dataset.compute_variables(["sc_residential_sqm"], dataset_pool=dataset_pool)
        logger.log_note("residential_sqm in development event history: %s" % sum(residential_sqm))
    attr_names_matches = [re.match('sqm_sector([0-9]+)', n) for n in dataset.get_known_attribute_names()]
    sector_ids = sorted([int(m.group(1)) for m in attr_names_matches if m])
    sqm_sector_array = reshape(residential_sqm, (-1, 1))
    for sector_id in sector_ids:
        sqm_sector = dataset.compute_one_variable_with_unknown_package("sqm_sector%s" % sector_id, dataset_pool=dataset_pool)
        logger.log_note("sqm_sector%s in development event history: %s" % (sector_id, sum(sqm_sector)))
        sqm_sector_array = append(sqm_sector_array, reshape(sqm_sector, (-1, 1)), 1)
    sqm_sector_argmax = argmax(sqm_sector_array, 1)
    #logger.log_note("sqm_sector_argmax: %s" % sqm_sector_argmax)
    sector_id_array = array([0] + sector_ids)
    #logger.log_note("sector_id_array: %s" % sector_id_array)
    val = sector_id_array[sqm_sector_argmax]
    #logger.log_note("val: %s" % val)
    return val
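# Stand-alone sketch of the argmax bookkeeping used above: one column per
# candidate (residential first, then each sector), argmax over columns, and a
# lookup array that maps column 0 back to "no sector" (id 0).  The numbers are
# illustrative only.
from numpy import array, reshape, append, argmax

residential_sqm = array([100.0,  0.0, 10.0])
sqm_sector_3    = array([ 20.0, 50.0, 10.0])
sqm_sector_7    = array([  5.0, 40.0, 60.0])
sector_ids = [3, 7]

columns = reshape(residential_sqm, (-1, 1))
for sqm_sector in (sqm_sector_3, sqm_sector_7):
    columns = append(columns, reshape(sqm_sector, (-1, 1)), 1)

winner_column = argmax(columns, 1)            # 0 = residential, 1.. = sectors
sector_id_array = array([0] + sector_ids)
dominant_sector = sector_id_array[winner_column]
# dominant_sector -> array([0, 3, 7])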
def compute(self, dataset_pool):
    with logger.block('Analyzing sector'):
        sectors = dataset_pool.get_dataset("sector")
        name_equals_sector = sectors.get_attribute("name") == self.sector
        name_equals_sector_indexes = where(name_equals_sector)
        assert(len(name_equals_sector_indexes) == 1)
        name_equals_sector_index = name_equals_sector_indexes[0]
        sector_ids = sectors.get_attribute("sector_id")
        sector_id = sector_ids[name_equals_sector_index][0]
    with logger.block('Analyzing buildings'):
        buildings = self.get_dataset()
        sqm_our_sector = buildings.get_attribute("sqm_sector%s" % sector_id)  # get column of observed jobs
        logger.log_note("sqm_sector%s: %s" % (sector_id, sum(sqm_our_sector)))
    return sqm_our_sector
def run(self, chunk_specification, dataset, dataset_index=None, result_array_type=float32, **kwargs):
    """ 'chunk_specification' - determines number of chunks to use when computing over the dataset set.
        'dataset' - an object of class Dataset that is to be chunked.
        'dataset_index' - index of individuals in dataset to be chunked.
        'result_array_type' - type of the resulting array. Can be any numerical type of numpy array.
        **kwargs - keyword arguments.

        The method chunks dataset_index into the desired number of chunks (minimum is 1) and for each chunk
        it calls the method 'run_chunk'. The order of the individuals entering the chunking is determined
        by the method 'get_agents_order'.
    """
    if dataset_index is None:
        dataset_index = arange(dataset.size())
    if not isinstance(dataset_index, ndarray):
        dataset_index = array(dataset_index)
    logger.log_status("Total number of individuals: %s" % dataset_index.size)
    result_array = zeros(dataset_index.size, dtype=result_array_type)
    if dataset_index.size <= 0:
        logger.log_status("Nothing to be done.")
        return result_array
    all_indexed_individuals = DatasetSubset(dataset, dataset_index)
    ordered_agent_indices = self.get_agents_order(all_indexed_individuals)  # set order of individuals in chunks
    # TODO: Remove next six lines after we inherit chunk specification as a text string.
    if chunk_specification is None:
        chunk_specification = {'nchunks': 1}
    chunker = ChunkSpecification(chunk_specification)
    self.number_of_chunks = chunker.nchunks(dataset_index)
    chunksize = int(ceil(all_indexed_individuals.size() / float(self.number_of_chunks)))
    for ichunk in range(self.number_of_chunks):
        with logger.block("%s chunk %d out of %d." % (self.model_short_name, (ichunk + 1), self.number_of_chunks)):
            self.index_of_current_chunk = ichunk
            chunk_agent_indices = ordered_agent_indices[arange((ichunk * chunksize), min((ichunk + 1) * chunksize, all_indexed_individuals.size()))]
            logger.log_status("Number of agents in this chunk: %s" % chunk_agent_indices.size)
            result_array[chunk_agent_indices] = self.run_chunk(dataset_index[chunk_agent_indices], dataset, **kwargs).astype(result_array_type)
    return result_array
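# A self-contained illustration of the chunk arithmetic used by run() above:
# split an ordered index array into nchunks roughly equal slices and process
# each slice in turn.  The toy "model" just copies an attribute per chunk; the
# Opus ChunkSpecification/DatasetSubset machinery is left out, and the arrays
# are made up for illustration.
from math import ceil
from numpy import arange, zeros, float32

values = arange(10, dtype=float32) * 2.0     # stand-in for a dataset attribute
dataset_index = arange(values.size)          # individuals to process
ordered_agent_indices = dataset_index[::-1]  # some ordering, e.g. reversed

nchunks = 3
chunksize = int(ceil(dataset_index.size / float(nchunks)))
result_array = zeros(dataset_index.size, dtype=float32)

for ichunk in range(nchunks):
    start = ichunk * chunksize
    stop = min((ichunk + 1) * chunksize, dataset_index.size)
    chunk_agent_indices = ordered_agent_indices[arange(start, stop)]
    # stand-in for run_chunk(): write one result per agent in this chunk
    result_array[chunk_agent_indices] = values[chunk_agent_indices]

# chunks are [9..6], [5..2], [1..0]; every agent is processed exactly once
assert (result_array == values).all()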
def compute(self, dataset_pool): with logger.block( name="compute variable jobs_within_DDD_of_parcel_weighted with DDD=%s" % self.radius, verbose=False ): results = None distances = None with logger.block(name="trying to read cache files", verbose=False): try: results = self._load_results() except IOError: logger.log_warning("Cache file %s could not be loaded" % self.cache_file_name) try: distances = self._load_distances() except IOError: logger.log_warning("Cache file %s could not be loaded" % self.cache_distances_file_name) with logger.block(name="initialize datasets", verbose=False): parcels = self.get_dataset() arr = parcels.sum_dataset_over_ids(dataset_pool.get_dataset("job"), constant=1) if not results or not distances: with logger.block(name="initialize coords", verbose=False): coords = column_stack((parcels.get_attribute("x_coord_sp"), parcels.get_attribute("y_coord_sp"))) with logger.block(name="build KDTree", verbose=False): kd_tree = KDTree(coords, 100) with logger.block(name="compute neighbourhoods"): results = kd_tree.query_ball_tree(kd_tree, self.radius) with logger.block(name="compute euclidean distances"): distances = kd_tree.sparse_distance_matrix(kd_tree, self.radius) with logger.block(name="cache neighbourhoods"): if not SimulationState().cache_directory_exists(): logger.log_warning("Cache does not exist and is created.") SimulationState().create_cache_directory() self._cache_results(results) self._cache_distances(distances) with logger.block(name="Sum weighted jobs in neighbourhood", verbose=False): # return_values = array(map(lambda l: arr[l].sum(), results)) return_values = array(self.euclidean_accessibility_for_parcel(results, distances, arr)) return return_values
def export(self, in_storage, out_storage, **kwargs):
    dataset_names = in_storage.get_table_names()
    with logger.block('Exporting tables'):
        logger.log_status("Reading tables from '%s'" % in_storage.get_storage_location())
        if not dataset_names:
            logger.log_warning('This location has no tables to export!')
            logger.log_warning('Did you specify a location containing the data for a single year?')
        else:
            for dataset_name in dataset_names:
                self.export_dataset(dataset_name, in_storage, out_storage, **kwargs)
def compute(self, dataset_pool):
    with logger.block('Analyzing sectors'):
        sectors = dataset_pool.get_dataset("sector")
        name_equals_sector = sectors.get_attribute("name") == self.sector
        name_equals_sector_indexes = where(name_equals_sector)
        assert(len(name_equals_sector_indexes) == 1)
        name_equals_sector_index = name_equals_sector_indexes[0]
        sector_ids = sectors.get_attribute("sector_id")
        sector_id = sector_ids[name_equals_sector_index][0]
        sqft_per_jobs = sectors.get_attribute("sqm_per_job")
        sqft_per_job = sqft_per_jobs[name_equals_sector_index][0]
        logger.log_note("sqft_per_job: %s" % sqft_per_job)
    with logger.block('Analyzing jobs'):
        logger.log_note("sector_id: %s" % sector_id)
        jobs = dataset_pool.get_dataset("job")
        logger.log_note("jobs.size: %s" % jobs.size())
        buildings = self.get_dataset()
        logger.log_note("buildings.size: %s" % buildings.size())
        job_sqft = ma.masked_where(jobs.get_attribute('sector_id') != sector_id, [sqft_per_job] * jobs.size(), 0)
        logger.log_note("job_sqft: %s" % repr(job_sqft))
        logger.log_note("job_sqft.sum(): %s" % (job_sqft.sum()))
        logger.log_note("job_sqft.sum() / sqft_per_job: %s" % (job_sqft.sum() / sqft_per_job))
        jobs_building_id = jobs.get_attribute('building_id')
        buildings_id = buildings.get_id_attribute()
        logger.log_note("building_id difference: %s" % (set(jobs_building_id) - set(buildings_id)))
        job_area_raw = buildings.sum_over_ids(jobs_building_id, job_sqft)
        logger.log_note("job_area_raw: %s" % repr(job_area_raw))
        logger.log_note("job_area_raw.sum(): %s" % (job_area_raw.sum()))
        logger.log_note("job_area_raw.sum() / sqft_per_job: %s" % (job_area_raw.sum() / sqft_per_job))
        job_area = clip(job_area_raw, 0, buildings.get_attribute("building_sqft"))
        logger.log_note("job_area: %s" % repr(job_area))
        logger.log_note("job_area.sum(): %s" % (job_area.sum()))
        logger.log_note("job_area.sum() / sqft_per_job: %s" % (job_area.sum() / sqft_per_job))
    return job_area
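# Core numpy pattern from the compute() above, without the Opus dataset layer:
# give every job in the target sector a fixed floor area, mask out the rest,
# aggregate to buildings, then clip at each building's total floor area.  The
# small arrays and the explicit aggregation loop (a stand-in for
# buildings.sum_over_ids()) are illustrative only.
from numpy import array, clip, zeros
from numpy import ma

sector_id = 2
sqft_per_job = 30.0
job_sector_ids = array([1, 2, 2, 3, 2])
job_building_ids = array([0, 0, 1, 1, 1])          # index of each job's building
building_sqft = array([50.0, 40.0])

# Floor area claimed by each job; jobs outside the sector are masked out.
job_sqft = ma.masked_where(job_sector_ids != sector_id,
                           [sqft_per_job] * job_sector_ids.size)

# Sum job areas into buildings.
job_area_raw = zeros(building_sqft.size)
for b, sqft in zip(job_building_ids, job_sqft.filled(0.0)):
    job_area_raw[b] += sqft

# A building cannot host more job area than it has floor area.
job_area = clip(job_area_raw, 0, building_sqft)
# job_area_raw -> [30., 60.]; job_area -> [30., 40.]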
def run(self, resources, write_datasets_to_cache_at_end_of_year=True,
        log_file_name="run_model_system.log", cleanup_datasets=True):
    """Entries in resources: (entries with no defaults are required)
        models - a list containing names of models to be run. Each name must correspond
                 to the name of the module/class of that model. Default(object): None
        years - a tuple (start year, end year)
        debuglevel - an integer. The higher the more output will be printed. Default: 0
        expression_library - a dictionary. The keys in the dictionary are pairs
                 (dataset_name, variable_name) and the values are the corresponding expressions.
                 The model system needs to set the expression library (if it isn't None) in
                 DatasetFactory for DatasetFactory to know about variables defined as
                 expressions in the xml expression library. Default: None

    This method is called both to start up the simulation for all years, and also for each year
    when running with one process per year. In the latter case, 'years' consists of just
    (current_year, current_year) rather than the real start and end years for the simulation.
    """
    if not isinstance(resources, Resources):
        raise TypeError, "Argument 'resources' must be of type 'Resources'."

    logger_settings = resources.get("log", {"tags": [], "verbosity_level": 3})
    logger.set_tags(logger_settings.get("tags", []))
    logger.set_verbosity_level(logger_settings.get("verbosity_level", 3))

    self.simulation_state = SimulationState()
    self.simulation_state.set_low_memory_run(resources.get("low_memory_mode", False))
    self.simulation_state.set_start_time(resources.get("base_year", 0))
    self.run_year_namespace = {}

    if resources.get("cache_directory", None) is not None:
        self.simulation_state.set_cache_directory(resources["cache_directory"])
    if "expression_library" in resources:
        VariableFactory().set_expression_library(resources["expression_library"])
    if resources.get("sample_input", False):
        self.update_config_for_multiple_runs(resources)

    cache_directory = self.simulation_state.get_cache_directory()
    log_file = os.path.join(cache_directory, log_file_name)
    logger.enable_file_logging(log_file, verbose=False)
    try:
        logger.log_status("Cache Directory set to: " + cache_directory)
        with logger.block("Start simulation run"):
            models = resources.get("models", [])
            models_in_years = resources.get("models_in_year", {})

            resources.check_obligatory_keys(["years"])
            years = resources["years"]
            if (not isinstance(years, tuple)) and (not isinstance(years, list)):
                raise TypeError, "Entry 'years' in resources must be a tuple."
            if len(years) < 2:
                print years
                raise StandardError, "Entry 'years' in resources must be of length at least 2."

            start_year = years[0]
            end_year = years[-1]
            debuglevel = resources.get("debuglevel", 0)
            seed_values = resources.get("seed", NO_SEED)
            logger.log_status("random seed = %s" % str(seed_values))
            seed(seed_values)

            for year in range(start_year, end_year + 1):
                with logger.block("Starting simulation for year " + str(year)):
                    self.simulation_state.set_current_time(year)
                    SessionConfiguration().get_dataset_pool().remove_all_datasets()
                    logger.disable_file_logging(log_file)
                    try:
                        if models_in_years.get(year, None) is not None:
                            models_to_run = models_in_years[year]
                        else:
                            models_to_run = models
                        self._run_year(
                            year=year,
                            models=models_to_run,
                            simulation_state=self.simulation_state,
                            debuglevel=debuglevel,
                            resources=resources,
                            write_datasets_to_cache_at_end_of_year=write_datasets_to_cache_at_end_of_year,
                            cleanup_datasets=cleanup_datasets)
                    finally:
                        logger.enable_file_logging(log_file, verbose=False)
                collect()
    finally:
        logger.disable_file_logging(log_file)
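# Hedged sketch of the 'resources' entries that run() above actually reads.
# The model names and cache path are illustrative only, and it is assumed here
# that Resources behaves like (and can be built from) a plain dict, which is
# how run() accesses it (.get(), [], check_obligatory_keys()).
resources = Resources({
    "models": ["model_a", "model_b"],        # hypothetical model names
    "years": (2024, 2026),                   # obligatory: (start year, end year)
    "debuglevel": 0,
    "base_year": 2023,
    "cache_directory": "/tmp/opus_cache",    # hypothetical path
    "seed": 1,                               # passed to seed() before the year loop
})
model_system.run(resources)                  # 'model_system' is an instance of the class above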
def logged_method(*req_args, **opt_args):
    with logger.block(name=an_instance.name(), verbose=False):
        results = compute_method(*req_args, **opt_args)
        an_instance._do_flush_dependent_variables_if_required()
    return results
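# All of the snippets above rely on logger.block() behaving as a context manager
# that opens a named, optionally quiet, timed section of the log.  The actual
# Opus implementation is not reproduced here; this is a minimal stand-in built
# with contextlib that matches the call signature used above (positional name
# or name=..., plus verbose=...).
import time
from contextlib import contextmanager

@contextmanager
def block(name, verbose=True):
    start = time.time()
    if verbose:
        print("BEGIN %s" % name)
    try:
        yield
    finally:
        if verbose:
            print("END %s (%.2fs)" % (name, time.time() - start))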
cache_path = options.cache_path
database_name = options.database_name
if database_name is None or cache_path is None:
    parser.print_help()
    sys.exit(1)
table_name = options.table_name

logger.log_status('Initializing database...')
db_server = DatabaseServer(EstimationDatabaseConfiguration(
    database_name=database_name,
    database_configuration=options.database_configuration))
if not db_server.has_database(database_name):   # if only one table should be exported,
    db_server.create_database(database_name)    # the database can exist
db = db_server.get_database(database_name)

input_storage = flt_storage(storage_location=cache_path)
output_storage = sql_storage(storage_location=db)

with logger.block('Exporting cache to sql...'):
    if table_name is None:
        ExportStorage().export(in_storage=input_storage, out_storage=output_storage)
    else:
        db.drop_table(table_name)
        ExportStorage().export_dataset(table_name, in_storage=input_storage, out_storage=output_storage)