def load_def(localdir, ent_name, section_def, required_fields):
    """Load the data behind one YAML "globals" or "entities" section.

    Parameters
    ----------
    localdir : str
        Base directory used to resolve relative csv paths
        (via complete_path).
    ent_name : str
        Name of the entity/global being loaded; also the default csv
        file name stem.
    section_def : dict
        Parsed YAML definition for this section. Recognized keys include
        'type', 'fields', 'path', 'oldnames', 'newnames', 'transposed',
        'interpolate' and 'files'.
    required_fields : list of (name, type) tuples
        Fields that must always be loaded (e.g. period/id for entities).

    Returns
    -------
    tuple
        Either ('ndarray', array) for homogeneous 'type' sections, or
        ('table', (fields, numlines, row_stream, csv_file_or_None)).
        In the single-file case the CSV object is returned open so the
        caller can close it after consuming the stream.
    """
    # 'type' (homogeneous array) and 'fields' (table) are exclusive ways
    # to describe a section
    if 'type' in section_def and 'fields' in section_def:
        raise Exception("invalid structure for '%s': "
                        "type and fields sections are mutually exclusive"
                        % ent_name)

    if 'type' in section_def:
        # homogeneous ndarray case: one cell type for the whole file
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        str_type = section_def['type']
        if isinstance(str_type, basestring):
            celltype = field_str_to_type(str_type, "array '%s'" % ent_name)
        else:
            # already a Python type object (e.g. when called from code
            # rather than raw YAML)
            assert isinstance(str_type, type)
            celltype = str_type
        return 'ndarray', load_ndarray(csv_filepath, celltype)

    fields_def = section_def.get('fields')
    if fields_def is not None:
        # a bare string means the YAML "name: type" mapping was mistyped
        # as a plain list item
        for fdef in fields_def:
            if isinstance(fdef, basestring):
                raise SyntaxError("invalid field declaration: '%s', you are "
                                  "probably missing a ':'" % fdef)
        if all(isinstance(fdef, dict) for fdef in fields_def):
            fields = fields_yaml_to_type(fields_def)
        else:
            # fields may also be passed pre-converted as (name, type)
            # tuples
            assert all(isinstance(fdef, tuple) for fdef in fields_def)
            fields = fields_def
    else:
        fields = None

    # oldnames maps new -> old, so invert it to get old -> new, then let
    # explicit newnames win
    newnames = merge_dicts(invert_dict(section_def.get('oldnames', {})),
                           section_def.get('newnames', {}))
    transpose = section_def.get('transposed', False)

    interpolate_def = section_def.get('interpolate')
    files_def = section_def.get('files')
    if files_def is None:
        # single-file case
        # XXX: it might be cleaner to use the same code path than for the
        # multi-file case (however, that would lose the "import any file
        # size" feature that I'm fond of).
        # we can simply return the stream as-is
        # FIXME: stream is not sorted
        # csv file is assumed to be in the correct order (ie by period
        # then id)
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        csv_file = CSV(csv_filepath, newnames,
                       delimiter=',', transpose=transpose)
        if fields is not None:
            # NOTE(review): required fields are prepended unconditionally;
            # if the user also declared e.g. 'period' in fields, it would
            # appear twice — TODO confirm whether that can happen
            fields = required_fields + fields
        stream = csv_file.read(fields)
        if fields is None:
            # take the field list detected from the csv header
            fields = csv_file.fields
        if interpolate_def is not None:
            raise Exception('interpolate is currently only supported with '
                            'multiple files')
        # csv_file is returned open: the stream is lazy, the caller must
        # close it
        return 'table', (fields, csv_file.numlines, stream, csv_file)
    else:
        # we have to load all files, merge them and return a stream out
        # of that
        print(" * computing number of rows...")

        # 1) only load required fields
        default_args = dict(newnames=newnames, transpose=transpose)
        if isinstance(files_def, dict):
            files_items = files_def.items()
        elif isinstance(files_def, list) and files_def:
            if isinstance(files_def[0], dict):
                # handle YAML ordered dict structure (list of single-key
                # dicts)
                files_items = [d.items()[0] for d in files_def]
            elif isinstance(files_def[0], basestring):
                # plain list of paths, no per-file options
                files_items = [(path, {}) for path in files_def]
            else:
                raise Exception("invalid structure for 'files'")
        else:
            raise Exception("invalid structure for 'files'")

        # XXX: shouldn't we use the "path" defined for the whole entity
        # if any? section_def.get('path')
        files = []
        for path, kwargs in files_items:
            # per-file oldnames/newnames override the section-level ones
            kwargs['newnames'] = \
                merge_dicts(invert_dict(kwargs.pop('oldnames', {})),
                            kwargs.get('newnames', {}))
            f = CSV(complete_path(localdir, path),
                    **merge_dicts(default_args, kwargs))
            files.append(f)
        # union of all (period, id) combinations present in any file
        id_periods = union1d(f.as_array(required_fields) for f in files)

        print(" * reading files...")
        # 2) load all fields
        if fields is None:
            # no explicit fields: take the union of fields of all files
            target_fields = merge_items(*[f.fields for f in files])
            fields_per_file = [None for f in files]
        else:
            target_fields = required_fields + fields
            # for each file, keep only the target fields it actually has
            fields_per_file = [[(name, type_)
                                for name, type_ in target_fields
                                if name in f.field_names]
                               for f in files]
            total_fields = set.union(*[set(f.field_names) for f in files])
            missing = set(name for name, _ in target_fields) - total_fields
            if missing:
                raise Exception("the following fields were not found in any "
                                "file: %s" % ", ".join(missing))

        total_lines = len(id_periods)

        # allocate main array
        target = np.empty(total_lines, dtype=np.dtype(target_fields))
        # fill with default values
        target[:] = tuple(missing_values[ftype]
                          for _, ftype in target_fields)
        target['period'] = id_periods['period']
        target['id'] = id_periods['id']

        arrays = [f.as_array(fields_to_load)
                  for f, fields_to_load in zip(files, fields_per_file)]

        # close all files
        for f in files:
            f.close()

        # FIXME: interpolation currently only interpolates missing data
        # points, not data points with their value equal the missing
        # value corresponding to the field type. This can only be fixed
        # once booleans are loaded as int8.
        if interpolate_def is not None:
            if any(v != 'previous_value'
                   for v in interpolate_def.itervalues()):
                raise Exception("currently, only 'previous_value' "
                                "interpolation is supported")
            to_interpolate = [k for k, v in interpolate_def.iteritems()
                              if v == 'previous_value']
        else:
            to_interpolate = []

        # merge the per-file arrays into target, filling gaps for the
        # requested fields
        interpolate(target, arrays, id_periods, to_interpolate)
        # no open csv file to return in the multi-file case (hence None)
        return 'table', (target_fields, total_lines, iter(target), None)
def file2h5(fpath, input_dir='', buffersize=10 * 2 ** 20):
    """Import the csv (or Rdata) files described by the YAML file `fpath`
    into a single HDF5 file.

    Parameters
    ----------
    fpath : str
        Path to the YAML import-description file.
    input_dir : str
        Prefix used to build the default per-entity input file name.
    buffersize : int
        Buffer size (bytes) passed to stream_to_table; defaults to 10 MB.

    The output HDF5 path is taken from the 'output' key of the YAML file,
    resolved relative to the YAML file's own directory.
    """
    with open(fpath) as f:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; presumably import files are trusted — confirm
        content = yaml.load(f)

    # validation schema: '*' matches any key; presumably a leading '#'
    # marks a required key in validate_dict's layout language — TODO
    # confirm (note 'output' is read unconditionally below)
    yaml_layout = {
        '#output': str,
        'compression': str,
        'globals': {
            'periodic': {
                'path': str,
                'fields': [{
                    '*': str
                }],
                'oldnames': {
                    '*': str
                },
                'newnames': {
                    '*': str
                },
                'invert': [str],
                'transposed': bool
            },
            '*': {
                'path': str,
                'type': str,
                'fields': [{
                    '*': str
                }],
                'oldnames': {
                    '*': str
                },
                'newnames': {
                    '*': str
                },
                'invert': [str],
                'transposed': bool
            }
        },
        '#entities': {
            '*': {
                'path': str,
                'fields': [{
                    '*': str
                }],
                'oldnames': {
                    '*': str
                },
                'newnames': {
                    '*': str
                },
                'invert': [str],
                'transposed': bool,
                'files': None,
                # {
                #     '*': None
                # }
                'interpolate': {
                    '*': str
                }
            }
        }
    }

    validate_dict(content, yaml_layout)
    localdir = os.path.dirname(os.path.abspath(fpath))

    h5_filename = content['output']
    compression = content.get('compression')
    h5_filepath = complete_path(localdir, h5_filename)
    print("Importing in", h5_filepath)
    # NOTE(review): if tables.open_file raises, h5file is unbound and the
    # finally clause below raises NameError instead of the real error
    try:
        h5file = tables.open_file(h5_filepath, mode="w", title="CSV import")

        globals_def = content.get('globals', {})
        if globals_def:
            print()
            print("globals")
            print("-------")
            const_node = h5file.create_group("/", "globals", "Globals")
            for global_name, global_def in globals_def.iteritems():
                print()
                print(" %s" % global_name)
                # 'periodic' globals are indexed by an uppercase PERIOD
                # column
                req_fields = ([('PERIOD', int)]
                              if global_name == 'periodic' else [])
                kind, info = load_def(localdir, global_name,
                                      global_def, req_fields)
                if kind == 'ndarray':
                    array_to_disk_array(h5file, const_node, global_name,
                                        info, title=global_name,
                                        compression=compression)
                else:
                    assert kind == 'table'
                    fields, numlines, datastream, csvfile = info
                    stream_to_table(h5file, const_node, global_name,
                                    fields, datastream, numlines,
                                    title="%s table" % global_name,
                                    buffersize=buffersize,
                                    # FIXME: handle invert
                                    compression=compression)
                    if csvfile is not None:
                        csvfile.close()

        print()
        print("entities")
        print("--------")
        ent_node = h5file.create_group("/", "entities", "Entities")
        for ent_name, entity_def in content['entities'].iteritems():
            print()
            print(" %s" % ent_name)
            input_filename = entity_def.get('path',
                                            input_dir + ent_name + ".csv")
            # NOTE(review): only '.csv' and '.Rdata' suffixes are handled;
            # any other extension is silently skipped
            if input_filename[-4:] == '.csv':
                kind, info = load_def(localdir, ent_name, entity_def,
                                      [('period', int), ('id', int)])
                assert kind == "table"
                fields, numlines, datastream, csvfile = info
                stream_to_table(h5file, ent_node, ent_name, fields,
                                datastream, numlines,
                                title="%s table" % ent_name,
                                invert=entity_def.get('invert', []),
                                buffersize=buffersize,
                                compression=compression)
                if csvfile is not None:
                    csvfile.close()
            if input_filename[-6:] == '.Rdata':
                # R data file import via rpy; 'files' names the R object
                # to extract (defaults to the entity name)
                files_def = entity_def.get('files')
                if files_def is None:
                    files_def = ent_name
                print(" - reading", input_filename, ",file", files_def)
                rpy.set_default_mode(rpy.NO_CONVERSION)
                msg, filters = compression_str2filter(compression)
                # NOTE(review): bare except — falls back to loading the
                # bare filename but hides the original error; should
                # catch the specific rpy exception
                try:
                    rpy.r.load(input_dir + input_filename)
                except:
                    rpy.r.load(input_filename)
                print(" - storing %s..." % msg)
                array_pandas = com.load_data(files_def)
                fields_def = entity_def.get('fields')
                if fields_def is not None:
                    for fdef in fields_def:
                        if isinstance(fdef, basestring):
                            raise SyntaxError("invalid field declaration: "
                                              "'%s', you are "
                                              "probably missing a ':'"
                                              % fdef)
                    fields = fields_yaml_to_type(fields_def)
                    columns = [col[0] for col in fields] + ['id', 'period']
                else:
                    fields = None
                    columns = array_pandas.columns
                array_pandas = array_pandas.loc[:, columns]
                # TODO: handle conflicts
                # NOTE(review): the np.dtype(fields) value is immediately
                # overwritten by the records dtype (dead assignment), and
                # filters=None discards the compression filters computed
                # above — confirm both are intentional
                dtype = np.dtype(fields)
                dtype = array_pandas.to_records(index=False).dtype
                filters = None
                # NOTE(review): createTable is the PyTables 2.x spelling;
                # the rest of the function uses the 3.x API (open_file,
                # create_group)
                table = h5file.createTable(ent_node, ent_name, dtype,
                                           title="%s table" % ent_name,
                                           filters=filters)
                table.append(array_pandas.to_records(index=False))
                table.flush()
    finally:
        h5file.close()
    print()
    print("done.")
def from_str(cls, yaml_str, simulation_dir='',
             input_dir=None, input_file=None,
             output_dir=None, output_file=None,
             start_period=None, periods=None, seed=None,
             skip_shows=None, skip_timings=None, log_level=None,
             assertions=None, autodump=None, autodiff=None,
             runs=None):
    """Build a Simulation from a YAML model given as a string.

    All keyword arguments are overrides: when None, the corresponding
    value is taken from the YAML 'simulation' section (or from the
    current `config` defaults).

    Side effects: seeds `random` and `np.random` when a seed is
    available, and mutates `config` module globals (skip_shows,
    assertions, log_level, show_timings, autodump, autodiff,
    input_directory, output_directory).

    Returns a Simulation instance.
    """
    content = yaml.load(yaml_str)
    expand_periodic_fields(content)
    content = handle_imports(content, simulation_dir)
    validate_dict(content, cls.yaml_layout)

    # the goal is to get something like:
    # globals_def = {'periodic': {'fields': [('a': int), ...], ...},
    #                'MIG': {'type': int}}
    globals_def = {}
    for k, v in content.get('globals', {}).iteritems():
        # note: v is mutated in place before being stored
        if "type" in v:
            v["type"] = field_str_to_type(v["type"], "array '%s'" % k)
        else:
            # TODO: fields should be optional (would use all the fields
            # provided in the file)
            v["fields"] = fields_yaml_to_type(v["fields"])
        globals_def[k] = v

    simulation_def = content['simulation']
    if seed is None:
        seed = simulation_def.get('random_seed')
    if seed is not None:
        seed = int(seed)
        print("using fixed random seed: %d" % seed)
        random.seed(seed)
        np.random.seed(seed)

    if periods is None:
        periods = simulation_def['periods']
    if start_period is None:
        start_period = simulation_def['start_period']

    if skip_shows is None:
        skip_shows = simulation_def.get('skip_shows', config.skip_shows)
    config.skip_shows = skip_shows

    if assertions is None:
        assertions = simulation_def.get('assertions', config.assertions)
    # TODO: check that the value is one of "raise", "skip", "warn"
    config.assertions = assertions

    logging_def = simulation_def.get('logging', {})
    if log_level is None:
        log_level = logging_def.get('level', config.log_level)
    config.log_level = log_level
    # 'procedures' was renamed to 'functions'; keep accepting the old
    # spelling with a deprecation warning
    if config.log_level == 'procedures':
        config.log_level = 'functions'
        warnings.warn("'procedures' logging.level is deprecated, "
                      "please use 'functions' instead",
                      UserDeprecationWarning)

    if 'timings' in simulation_def:
        warnings.warn("simulation.timings is deprecated, please use "
                      "simulation.logging.timings instead",
                      UserDeprecationWarning)
        # deprecated value acts as the default; logging.timings below
        # still wins if present
        config.show_timings = simulation_def['timings']
    if skip_timings is None:
        show_timings = logging_def.get('timings', config.show_timings)
    else:
        show_timings = not skip_timings
    config.show_timings = show_timings

    if autodump is None:
        autodump = simulation_def.get('autodump')
    if autodump is True:
        autodump = 'autodump.h5'
    if isinstance(autodump, basestring):
        # by default autodump will dump all rows
        autodump = (autodump, None)
    config.autodump = autodump

    if autodiff is None:
        autodiff = simulation_def.get('autodiff')
    if autodiff is True:
        autodiff = 'autodump.h5'
    if isinstance(autodiff, basestring):
        # by default autodiff will compare all rows
        autodiff = (autodiff, None)
    config.autodiff = autodiff

    input_def = simulation_def['input']
    if input_dir is None:
        input_dir = input_def.get('path', '')
    # relative paths are resolved against the simulation file directory
    if not os.path.isabs(input_dir):
        input_dir = os.path.join(simulation_dir, input_dir)
    config.input_directory = input_dir

    if input_file is None:
        input_file = input_def.get('file', '')
    input_path = os.path.join(input_dir, input_file)

    output_def = simulation_def['output']
    if output_dir is None:
        output_dir = output_def.get('path', '')
    if not os.path.isabs(output_dir):
        output_dir = os.path.join(simulation_dir, output_dir)
    if not os.path.exists(output_dir):
        print("creating directory: '%s'" % output_dir)
        os.makedirs(output_dir)
    config.output_directory = output_dir

    # when no output file is given, write to a throwaway h5 file and
    # restrict stored fields to the minimum needed (see minimal_output
    # handling below)
    minimal_output = False
    if output_file is None:
        output_file = output_def.get('file', '')
    if output_file:
        output_path = os.path.join(output_dir, output_file)
    else:
        # using a temporary directory instead of a temporary file
        # because tempfile.* only returns file-like objects (which
        # pytables does not support) or directories, not file names.
        tmp_dir = tempfile.mkdtemp(prefix='liam2-', suffix='-tmp',
                                   dir=output_dir)
        output_path = os.path.join(tmp_dir, 'simulation.h5')
        minimal_output = True

    entities = {}
    for k, v in content['entities'].iteritems():
        entities[k] = Entity.from_yaml(k, v)

    for entity in entities.itervalues():
        entity.attach_and_resolve_links(entities)

    global_context = {'__globals__': global_symbols(globals_def),
                      '__entities__': entities}
    parsing_context = global_context.copy()
    parsing_context.update((entity.name,
                            entity.all_symbols(global_context))
                           for entity in entities.itervalues())

    # compute the lag variable for each entity (an entity can cause
    # fields from other entities to be added via links)
    # dict of sets
    lag_vars_by_entity = defaultdict(set)
    for entity in entities.itervalues():
        parsing_context['__entity__'] = entity.name
        entity.parse_processes(parsing_context)
        entity_lag_vars = entity.compute_lagged_fields()
        for e in entity_lag_vars:
            lag_vars_by_entity[e.name] |= entity_lag_vars[e]

    # store that in entity.lag_fields and create entity.array_lag
    for entity in entities.itervalues():
        entity_lag_vars = lag_vars_by_entity[entity.name]
        if entity_lag_vars:
            # make sure we have an 'id' column, and that it comes first
            # (makes debugging easier). 'id' is always necessary for lag
            # expressions to be able to "expand" the vector of values to
            # the "current" individuals.
            entity_lag_vars.discard('id')
            sorted_vars = ['id'] + sorted(entity_lag_vars)
            field_type = dict(entity.fields.name_types)
            lag_fields = [(v, field_type[v]) for v in sorted_vars]
            # FIXME: this should be initialized to the data from
            # start_period - 2, if any so that we can use lag() in an
            # init process
            entity.array_lag = np.empty(0, dtype=np.dtype(lag_fields))
        else:
            lag_fields = []
        entity.lag_fields = lag_fields

    # compute minimal fields for each entity and set all which are not
    # minimal to output=False
    if minimal_output:
        min_fields_by_entity = defaultdict(set)
        for entity in entities.itervalues():
            entity_lag_vars = entity.compute_lagged_fields(
                inspect_one_period=False)
            for e in entity_lag_vars:
                min_fields_by_entity[e.name] |= entity_lag_vars[e]
        for entity in entities.itervalues():
            minimal_fields = min_fields_by_entity[entity.name]
            if minimal_fields:
                minimal_fields.add('id')
                minimal_fields.add('period')
            for field in entity.fields.in_output:
                if field.name not in minimal_fields:
                    field.output = False

    if ('init' not in simulation_def and
            'processes' not in simulation_def):
        raise SyntaxError("the 'simulation' section must have at least "
                          "one of 'processes' or 'init' subsection")

    # for entity in entities.itervalues():
    #     entity.resolve_method_calls()

    used_entities = set()

    # init processes always run with periodicity 1
    init_def = [d.items()[0] for d in simulation_def.get('init', [])]
    init_processes = []
    for ent_name, proc_names in init_def:
        if ent_name not in entities:
            raise Exception("Entity '%s' not found" % ent_name)
        entity = entities[ent_name]
        used_entities.add(ent_name)
        init_processes.extend([(entity.processes[proc_name], 1)
                               for proc_name in proc_names])

    processes_def = [d.items()[0]
                     for d in simulation_def.get('processes', [])]
    processes = []
    for ent_name, proc_defs in processes_def:
        entity = entities[ent_name]
        used_entities.add(ent_name)
        for proc_def in proc_defs:
            # proc_def is simply a process name
            if isinstance(proc_def, basestring):
                # use the default periodicity of 1
                proc_name, periodicity = proc_def, 1
            else:
                proc_name, periodicity = proc_def
            processes.append((entity.processes[proc_name], periodicity))

    entities_list = sorted(entities.values(), key=lambda e: e.name)
    # warn about entities declared but never used by init/processes
    declared_entities = set(e.name for e in entities_list)
    unused_entities = declared_entities - used_entities
    if unused_entities:
        suffix = 'y' if len(unused_entities) == 1 else 'ies'
        print("WARNING: entit%s without any executed process:" % suffix,
              ','.join(sorted(unused_entities)))

    input_method = input_def.get('method', 'h5')

    default_entity = simulation_def.get('default_entity')
    if runs is None:
        runs = simulation_def.get('runs', 1)
    return Simulation(globals_def, periods, start_period, init_processes,
                      processes, entities_list, input_method, input_path,
                      output_path, default_entity, runs, minimal_output)
def from_yaml(cls, fpath, input_dir=None, input_file=None, output_dir=None,
              output_file=None):
    """Build a Simulation from the YAML model file `fpath`.

    The optional `input_dir`, `input_file`, `output_dir` and
    `output_file` arguments override the corresponding values of the
    file's simulation/input and simulation/output sections.

    Side effects: seeds `random` and `np.random` when a random_seed is
    given, and mutates `config` module globals (skip_shows, assertions,
    log_level, show_timings, autodump, autodiff, input_directory,
    output_directory).

    Returns a Simulation instance.
    """
    simulation_path = os.path.abspath(fpath)
    simulation_dir = os.path.dirname(simulation_path)
    with open(fpath) as f:
        content = yaml.load(f)
    expand_periodic_fields(content)
    content = handle_imports(content, simulation_dir)
    validate_dict(content, cls.yaml_layout)

    # the goal is to get something like:
    # globals_def = {'periodic': [('a': int), ...],
    #                'MIG': int}
    globals_def = content.get('globals', {})
    for k, v in content.get('globals', {}).iteritems():
        if "type" in v:
            v["type"] = field_str_to_type(v["type"], "array '%s'" % k)
        else:
            # TODO: fields should be optional (would use all the fields
            # provided in the file)
            v["fields"] = fields_yaml_to_type(v["fields"])
        globals_def[k] = v

    simulation_def = content['simulation']
    seed = simulation_def.get('random_seed')
    if seed is not None:
        seed = int(seed)
        print("using fixed random seed: %d" % seed)
        random.seed(seed)
        np.random.seed(seed)

    periods = simulation_def['periods']
    time_scale = simulation_def.get('time_scale', 'year')
    retro = simulation_def.get('retro', False)
    # exactly one of start_period/init_period must be given; when only
    # start_period is given, init_period is derived by stepping one
    # time_scale unit (backward when retro is true)
    start_period = simulation_def.get('start_period', None)
    init_period = simulation_def.get('init_period', None)
    if start_period is None and init_period is None:
        raise Exception("Either start_period either init_period should "
                        "be given.")
    if start_period is not None:
        if init_period is not None:
            raise Exception("Start_period can't be given if init_period "
                            "is.")
        step = time_period[time_scale] * (1 - 2 * (retro))
        init_period = addmonth(start_period, step)

    config.skip_shows = simulation_def.get('skip_shows', config.skip_shows)
    # TODO: check that the value is one of "raise", "skip", "warn"
    config.assertions = simulation_def.get('assertions', config.assertions)

    logging_def = simulation_def.get('logging', {})
    config.log_level = logging_def.get('level', config.log_level)
    if 'timings' in simulation_def:
        warnings.warn("simulation.timings is deprecated, please use "
                      "simulation.logging.timings instead",
                      DeprecationWarning)
        # deprecated value acts as the default; logging.timings below
        # still wins if present
        config.show_timings = simulation_def['timings']
    config.show_timings = logging_def.get('timings', config.show_timings)

    autodump = simulation_def.get('autodump', None)
    if autodump is True:
        autodump = 'autodump.h5'
    if isinstance(autodump, basestring):
        # by default autodump will dump all rows
        autodump = (autodump, None)
    config.autodump = autodump

    autodiff = simulation_def.get('autodiff', None)
    if autodiff is True:
        autodiff = 'autodump.h5'
    if isinstance(autodiff, basestring):
        # by default autodiff will compare all rows
        autodiff = (autodiff, None)
    config.autodiff = autodiff

    legislation = simulation_def.get('legislation', None)
    final_stat = simulation_def.get('final_stat', None)

    input_def = simulation_def['input']
    input_directory = (input_dir if input_dir is not None
                       else input_def.get('path', ''))
    # relative paths are resolved against the simulation file directory
    if not os.path.isabs(input_directory):
        input_directory = os.path.join(simulation_dir, input_directory)
    config.input_directory = input_directory

    output_def = simulation_def['output']
    output_directory = (output_dir if output_dir is not None
                        else output_def.get('path', ''))
    # FIX: removed "assert os.path.isabs(output_directory)" which both
    # contradicted and made dead the relative-path handling below (the
    # sibling from_yaml implementations resolve relative paths the same
    # way as for input_directory)
    if not os.path.isabs(output_directory):
        output_directory = os.path.join(simulation_dir, output_directory)
    if not os.path.exists(output_directory):
        print("creating directory: '%s'" % output_directory)
        os.makedirs(output_directory)
    config.output_directory = output_directory

    if output_file is None:
        output_file = output_def['file']
    output_path = os.path.join(output_directory, output_file)

    method = input_def.get('method', 'h5')
    # needs to be done before processes because in case of legislation,
    # we need input_table for now
    if method == 'h5':
        if input_file is None:
            input_file = input_def['file']
        input_path = os.path.join(input_directory, input_file)
        data_source = H5Data(input_path, output_path)
    elif method == 'void':
        input_path = None
        data_source = Void(output_path)
    else:
        # FIX: previously only printed the bad value and crashed later
        # with a NameError on data_source; fail fast instead (same
        # message as the sibling implementations, with the placeholder
        # actually filled)
        raise ValueError("'%s' is an invalid value for 'method'. It should "
                         "be either 'h5' or 'void'" % method)

    # FIX: the entities dict was never initialized (NameError)
    entities = {}
    for k, v in content['entities'].iteritems():
        entities[k] = Entity.from_yaml(k, v)

    for entity in entities.itervalues():
        entity.attach_and_resolve_links(entities)

    global_context = {'__globals__': global_symbols(globals_def),
                      '__entities__': entities}
    parsing_context = global_context.copy()
    parsing_context.update((entity.name,
                            entity.all_symbols(global_context))
                           for entity in entities.itervalues())
    for entity in entities.itervalues():
        parsing_context['__entity__'] = entity.name
        entity.parse_processes(parsing_context)
        entity.compute_lagged_fields()

    # init processes run with (periodicity, start) == (1, 1)
    init_def = [d.items()[0] for d in simulation_def.get('init', {})]
    # FIX: init_processes/init_entities were never initialized
    # (NameError on first use)
    init_processes, init_entities = [], set()
    for ent_name, proc_names in init_def:
        if ent_name != 'legislation':
            # FIX: was entity_registry, which is not defined in this
            # function; the entities dict built above is the registry
            if ent_name not in entities:
                raise Exception("Entity '%s' not found" % ent_name)
            entity = entities[ent_name]
            init_entities.add(entity)
            init_processes.extend([(entity.processes[proc_name], 1, 1)
                                   for proc_name in proc_names])
        else:
            # special 'legislation' pseudo-entity: run the external
            # "of_on_liam" process
            proc2 = ExtProcess('of_on_liam',
                               ['simulation', 2009, 'period'])
            init_processes.append((proc2, 1, 1))

    processes_def = [d.items()[0] for d in simulation_def['processes']]
    # FIX: processes/entity_set were never initialized (NameError)
    processes, entity_set = [], set()
    for ent_name, proc_defs in processes_def:
        if ent_name != 'legislation':
            entity = entities[ent_name]
            entity_set.add(entity)
            for proc_def in proc_defs:
                # proc_def is simply a process name
                if isinstance(proc_def, basestring):
                    # use the default periodicity and start of 1
                    proc_name, periodicity, start = proc_def, 1, 1
                else:
                    if len(proc_def) == 3:
                        proc_name, periodicity, start = proc_def
                    elif len(proc_def) == 2:
                        proc_name, periodicity = proc_def
                        start = 1
                processes.append((entity.processes[proc_name],
                                  periodicity, start))
        else:
            # external legislation process, run yearly starting at 12
            proc2 = ExtProcess('of_on_liam',
                               ['simulation', proc_defs[0], 'period'])
            processes.append((proc2, 'year', 12))

    entities = sorted(entity_set, key=lambda e: e.name)
    default_entity = simulation_def.get('default_entity')
    return Simulation(globals_def, periods, init_period, init_processes,
                      init_entities, processes, entities, data_source,
                      default_entity, legislation, final_stat, time_scale,
                      retro)
def from_yaml(cls, fpath, input_dir=None, input_file=None, output_dir=None,
              output_file=None):
    """Build a Simulation from the YAML model file `fpath`.

    The optional `input_dir`, `input_file`, `output_dir` and
    `output_file` arguments override the corresponding values of the
    file's simulation/input and simulation/output sections.

    Side effects: seeds `random` and `np.random` when a random_seed is
    given, mutates `config` module globals and populates the global
    entity_registry.

    Returns a Simulation instance.
    """
    simulation_path = os.path.abspath(fpath)
    simulation_dir = os.path.dirname(simulation_path)
    with open(fpath) as f:
        content = yaml.load(f)
    content = handle_imports(content, simulation_dir)
    validate_dict(content, cls.yaml_layout)

    # the goal is to get something like:
    # globals_def = {'periodic': [('a': int), ...],
    #                'MIG': int}
    globals_def = {}
    for k, v in content.get('globals', {}).iteritems():
        # periodic is a special case
        if k == 'periodic':
            type_ = fields_yaml_to_type(v)
        else:
            # "fields" and "type" are synonyms
            type_def = v.get('fields') or v.get('type')
            if isinstance(type_def, basestring):
                type_ = field_str_to_type(type_def, "array '%s'" % k)
            else:
                if not isinstance(type_def, list):
                    raise SyntaxError("invalid structure for globals")
                type_ = fields_yaml_to_type(type_def)
        globals_def[k] = type_

    simulation_def = content['simulation']
    seed = simulation_def.get('random_seed')
    if seed is not None:
        seed = int(seed)
        print("using fixed random seed: %d" % seed)
        random.seed(seed)
        np.random.seed(seed)

    periods = simulation_def['periods']
    start_period = simulation_def['start_period']

    config.skip_shows = simulation_def.get('skip_shows', False)
    # TODO: check that the value is one of "raise", "skip", "warn"
    config.assertions = simulation_def.get('assertions', 'raise')
    config.show_timings = simulation_def.get('timings', True)

    autodump = simulation_def.get('autodump', None)
    if autodump is True:
        autodump = 'autodump.h5'
    if isinstance(autodump, basestring):
        # by default autodump will dump all rows
        autodump = (autodump, None)
    config.autodump = autodump

    autodiff = simulation_def.get('autodiff', None)
    if autodiff is True:
        autodiff = 'autodump.h5'
    if isinstance(autodiff, basestring):
        # by default autodiff will compare all rows
        autodiff = (autodiff, None)
    config.autodiff = autodiff

    input_def = simulation_def['input']
    input_directory = (input_dir if input_dir is not None
                       else input_def.get('path', ''))
    # relative paths are resolved against the simulation file directory
    if not os.path.isabs(input_directory):
        input_directory = os.path.join(simulation_dir, input_directory)
    config.input_directory = input_directory

    output_def = simulation_def['output']
    output_directory = (output_dir if output_dir is not None
                        else output_def.get('path', ''))
    if not os.path.isabs(output_directory):
        output_directory = os.path.join(simulation_dir, output_directory)
    if not os.path.exists(output_directory):
        print("creating directory: '%s'" % output_directory)
        os.makedirs(output_directory)
    config.output_directory = output_directory

    if output_file is None:
        output_file = output_def['file']
    output_path = os.path.join(output_directory, output_file)

    for k, v in content['entities'].iteritems():
        entity_registry.add(Entity.from_yaml(k, v))

    for entity in entity_registry.itervalues():
        entity.check_links()
        entity.parse_processes(globals_def)
        entity.compute_lagged_fields()

    # init processes always run with periodicity 1
    init_def = [d.items()[0] for d in simulation_def.get('init', {})]
    init_processes, init_entities = [], set()
    for ent_name, proc_names in init_def:
        if ent_name not in entity_registry:
            raise Exception("Entity '%s' not found" % ent_name)
        entity = entity_registry[ent_name]
        init_entities.add(entity)
        init_processes.extend([(entity.processes[proc_name], 1)
                               for proc_name in proc_names])

    processes_def = [d.items()[0] for d in simulation_def['processes']]
    processes, entity_set = [], set()
    for ent_name, proc_defs in processes_def:
        entity = entity_registry[ent_name]
        entity_set.add(entity)
        for proc_def in proc_defs:
            # proc_def is simply a process name
            if isinstance(proc_def, basestring):
                # use the default periodicity of 1
                proc_name, periodicity = proc_def, 1
            else:
                proc_name, periodicity = proc_def
            processes.append((entity.processes[proc_name], periodicity))

    entities = sorted(entity_set, key=lambda e: e.name)

    method = input_def.get('method', 'h5')
    if method == 'h5':
        if input_file is None:
            input_file = input_def['file']
        input_path = os.path.join(input_directory, input_file)
        data_source = H5Data(input_path, output_path)
    elif method == 'void':
        data_source = Void(output_path)
    else:
        # FIX: the %s placeholder was never filled ("% method" was
        # missing), so the error printed a literal '%s'
        raise ValueError("'%s' is an invalid value for 'method'. It should "
                         "be either 'h5' or 'void'" % method)

    default_entity = simulation_def.get('default_entity')
    return Simulation(globals_def, periods, start_period, init_processes,
                      init_entities, processes, entities, data_source,
                      default_entity)
def from_yaml(cls, fpath, input_dir=None, input_file=None, output_dir=None,
              output_file=None):
    """Build a Simulation from the YAML model file `fpath`.

    The optional `input_dir`, `input_file`, `output_dir` and
    `output_file` arguments override the corresponding values of the
    file's simulation/input and simulation/output sections.

    Side effects: seeds `random` and `np.random` when a random_seed is
    given, and mutates `config` module globals (skip_shows, assertions,
    log_level, show_timings, autodump, autodiff, input_directory,
    output_directory).

    Returns a Simulation instance.
    """
    simulation_path = os.path.abspath(fpath)
    simulation_dir = os.path.dirname(simulation_path)
    with open(fpath) as f:
        content = yaml.load(f)
    expand_periodic_fields(content)
    content = handle_imports(content, simulation_dir)
    validate_dict(content, cls.yaml_layout)

    # the goal is to get something like:
    # globals_def = {'periodic': [('a': int), ...],
    #                'MIG': int}
    globals_def = content.get('globals', {})
    for k, v in content.get('globals', {}).iteritems():
        if "type" in v:
            v["type"] = field_str_to_type(v["type"], "array '%s'" % k)
        else:
            # TODO: fields should be optional (would use all the fields
            # provided in the file)
            v["fields"] = fields_yaml_to_type(v["fields"])
        globals_def[k] = v

    simulation_def = content['simulation']
    seed = simulation_def.get('random_seed')
    if seed is not None:
        seed = int(seed)
        print("using fixed random seed: %d" % seed)
        random.seed(seed)
        np.random.seed(seed)

    periods = simulation_def['periods']
    start_period = simulation_def['start_period']

    config.skip_shows = simulation_def.get('skip_shows', config.skip_shows)
    # TODO: check that the value is one of "raise", "skip", "warn"
    config.assertions = simulation_def.get('assertions', config.assertions)

    logging_def = simulation_def.get('logging', {})
    config.log_level = logging_def.get('level', config.log_level)
    if 'timings' in simulation_def:
        warnings.warn("simulation.timings is deprecated, please use "
                      "simulation.logging.timings instead",
                      DeprecationWarning)
        # deprecated value acts as the default; logging.timings below
        # still wins if present
        config.show_timings = simulation_def['timings']
    config.show_timings = logging_def.get('timings', config.show_timings)

    autodump = simulation_def.get('autodump', None)
    if autodump is True:
        autodump = 'autodump.h5'
    if isinstance(autodump, basestring):
        # by default autodump will dump all rows
        autodump = (autodump, None)
    config.autodump = autodump

    autodiff = simulation_def.get('autodiff', None)
    if autodiff is True:
        autodiff = 'autodump.h5'
    if isinstance(autodiff, basestring):
        # by default autodiff will compare all rows
        autodiff = (autodiff, None)
    config.autodiff = autodiff

    input_def = simulation_def['input']
    input_directory = (input_dir if input_dir is not None
                       else input_def.get('path', ''))
    # relative paths are resolved against the simulation file directory
    if not os.path.isabs(input_directory):
        input_directory = os.path.join(simulation_dir, input_directory)
    config.input_directory = input_directory

    output_def = simulation_def['output']
    output_directory = (output_dir if output_dir is not None
                        else output_def.get('path', ''))
    if not os.path.isabs(output_directory):
        output_directory = os.path.join(simulation_dir, output_directory)
    if not os.path.exists(output_directory):
        print("creating directory: '%s'" % output_directory)
        os.makedirs(output_directory)
    config.output_directory = output_directory

    if output_file is None:
        output_file = output_def['file']
    output_path = os.path.join(output_directory, output_file)

    entities = {}
    for k, v in content['entities'].iteritems():
        entities[k] = Entity.from_yaml(k, v)

    for entity in entities.itervalues():
        entity.attach_and_resolve_links(entities)

    global_context = {'__globals__': global_symbols(globals_def),
                      '__entities__': entities}
    parsing_context = global_context.copy()
    parsing_context.update((entity.name,
                            entity.all_symbols(global_context))
                           for entity in entities.itervalues())
    for entity in entities.itervalues():
        parsing_context['__entity__'] = entity.name
        entity.parse_processes(parsing_context)
        entity.compute_lagged_fields()

    used_entities = set()

    # init processes always run with periodicity 1
    init_def = [d.items()[0] for d in simulation_def.get('init', {})]
    init_processes = []
    for ent_name, proc_names in init_def:
        if ent_name not in entities:
            raise Exception("Entity '%s' not found" % ent_name)
        entity = entities[ent_name]
        used_entities.add(ent_name)
        init_processes.extend([(entity.processes[proc_name], 1)
                               for proc_name in proc_names])

    processes_def = [d.items()[0] for d in simulation_def['processes']]
    processes = []
    for ent_name, proc_defs in processes_def:
        entity = entities[ent_name]
        used_entities.add(ent_name)
        for proc_def in proc_defs:
            # proc_def is simply a process name
            if isinstance(proc_def, basestring):
                # use the default periodicity of 1
                proc_name, periodicity = proc_def, 1
            else:
                proc_name, periodicity = proc_def
            processes.append((entity.processes[proc_name], periodicity))

    entities_list = sorted(entities.values(), key=lambda e: e.name)
    # warn about entities declared but never used by init/processes
    declared_entities = set(e.name for e in entities_list)
    unused_entities = declared_entities - used_entities
    if unused_entities:
        suffix = 'y' if len(unused_entities) == 1 else 'ies'
        print("WARNING: entit%s without any executed process:" % suffix,
              ','.join(sorted(unused_entities)))

    method = input_def.get('method', 'h5')
    if method == 'h5':
        if input_file is None:
            input_file = input_def['file']
        input_path = os.path.join(input_directory, input_file)
        data_source = H5Data(input_path, output_path)
    elif method == 'void':
        data_source = Void(output_path)
    else:
        # FIX: the %s placeholder was never filled ("% method" was
        # missing), so the error printed a literal '%s'
        raise ValueError("'%s' is an invalid value for 'method'. It should "
                         "be either 'h5' or 'void'" % method)

    default_entity = simulation_def.get('default_entity')
    return Simulation(globals_def, periods, start_period, init_processes,
                      processes, entities_list, data_source,
                      default_entity)
def load_def(localdir, ent_name, section_def, required_fields):
    """Load the data definition `section_def` for `ent_name`.

    Returns either ('ndarray', array) when the section declares a plain
    typed array, or ('table', (fields, numlines, stream, csv_file)) when
    it declares tabular data loaded from one or several CSV files.

    localdir -- base directory used to resolve relative CSV paths
    ent_name -- name of the entity/array being loaded (also the default
                CSV file name stem)
    section_def -- parsed YAML dict for this section ('type', 'fields',
                   'path', 'files', 'oldnames'/'newnames', 'transposed',
                   'interpolate')
    required_fields -- list of (name, type) tuples that must always be
                       present (presumably includes 'period' and 'id',
                       which are assigned below -- confirm with callers)

    Raises Exception for mutually exclusive/invalid structures and
    SyntaxError for malformed field declarations.
    """
    # 'type' (plain array) and 'fields' (table) are mutually exclusive
    if 'type' in section_def and 'fields' in section_def:
        raise Exception("invalid structure for '%s': "
                        "type and fields sections are mutually exclusive"
                        % ent_name)

    if 'type' in section_def:
        # plain ndarray: a single CSV holding homogeneous cells
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        str_type = section_def['type']
        if isinstance(str_type, basestring):
            celltype = field_str_to_type(str_type, "array '%s'" % ent_name)
        else:
            assert isinstance(str_type, type)
            celltype = str_type
        return 'ndarray', load_ndarray(csv_filepath, celltype)

    fields_def = section_def.get('fields')
    if fields_def is not None:
        # a bare string in the list means the user forgot the ':' that
        # would have made it a one-entry dict
        for fdef in fields_def:
            if isinstance(fdef, basestring):
                raise SyntaxError("invalid field declaration: '%s', you are "
                                  "probably missing a ':'" % fdef)
        if all(isinstance(fdef, dict) for fdef in fields_def):
            fields = fields_yaml_to_type(fields_def)
        else:
            # already normalized to (name, type) tuples
            assert all(isinstance(fdef, tuple) for fdef in fields_def)
            fields = fields_def
        # prepend any required field the user did not declare, preserving
        # the order of required_fields (hence the reversed iteration)
        fnames = {name for name, _ in fields}
        for reqname, reqtype in required_fields[::-1]:
            if reqname not in fnames:
                fields.insert(0, (reqname, reqtype))
    else:
        # None means "load every field found in the file(s)"
        fields = None
    # 'oldnames' maps new -> old, 'newnames' maps old -> new; normalize
    # both to a single old -> new mapping
    newnames = merge_dicts(invert_dict(section_def.get('oldnames', {})),
                           section_def.get('newnames', {}))
    transpose = section_def.get('transposed', False)

    interpolate_def = section_def.get('interpolate')
    files_def = section_def.get('files')
    if files_def is None:
        # single-file case
        # XXX: it might be cleaner to use the same code path than for the
        # multi-file case (however, that would loose the "import any file
        # size" feature that I'm fond of.

        # we can simply return the stream as-is
        # FIXME: stream is not sorted
        # csv file is assumed to be in the correct order (ie by period
        # then id)
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        csv_file = CSV(csv_filepath, newnames,
                       delimiter=',', transpose=transpose)
        stream = csv_file.read(fields)
        if fields is None:
            fields = csv_file.fields
        if interpolate_def is not None:
            raise Exception('interpolate is currently only supported with '
                            'multiple files')
        return 'table', (fields, csv_file.numlines, stream, csv_file)
    else:
        # we have to load all files, merge them and return a stream out
        # of that
        print(" * computing number of rows...")

        # 1) only load required fields
        default_args = dict(newnames=newnames, transpose=transpose)
        # 'files' can be a dict {path: options}, a list of one-entry
        # dicts (YAML ordered-dict style) or a list of plain paths
        if isinstance(files_def, dict):
            files_items = files_def.items()
        elif isinstance(files_def, list) and files_def:
            if isinstance(files_def[0], dict):
                # handle YAML ordered dict structure
                files_items = [d.items()[0] for d in files_def]
            elif isinstance(files_def[0], basestring):
                files_items = [(path, {}) for path in files_def]
            else:
                raise Exception("invalid structure for 'files'")
        else:
            raise Exception("invalid structure for 'files'")

        # XXX: shouldn't we use the "path" defined for the whole entity
        # if any?
        # section_def.get('path')
        files = []
        for path, kwargs in files_items:
            # per-file renames combined the same way as the global ones
            kwargs['newnames'] = \
                merge_dicts(invert_dict(kwargs.pop('oldnames', {})),
                            kwargs.get('newnames', {}))
            f = CSV(complete_path(localdir, path),
                    **merge_dicts(default_args, kwargs))
            files.append(f)
        # union of all (period, id) combinations present in any file
        id_periods = union1d(f.as_array(required_fields) for f in files)

        print(" * reading files...")
        # 2) load all fields
        if fields is None:
            # take every field present in any file
            target_fields = merge_items(*[f.fields for f in files])
            fields_per_file = [None for _ in files]
        else:
            target_fields = fields
            # each file only contributes the target fields it actually has
            fields_per_file = [[(name, type_)
                                for name, type_ in target_fields
                                if name in f.field_names]
                               for f in files]
            total_fields = set.union(*[set(f.field_names) for f in files])
            missing = set(name for name, _ in target_fields) - total_fields
            if missing:
                raise Exception("the following fields were not found in any "
                                "file: %s" % ", ".join(missing))

        total_lines = len(id_periods)

        # allocate main array
        target = get_default_array(total_lines, np.dtype(target_fields))
        target['period'] = id_periods['period']
        target['id'] = id_periods['id']

        arrays = [f.as_array(fields_to_load)
                  for f, fields_to_load in zip(files, fields_per_file)]

        # close all files
        for f in files:
            f.close()

        # FIXME: interpolation currently only interpolates missing data
        # points, not data points with their value equal the missing value
        # corresponding to the field type. This can only be fixed once
        # booleans are loaded as int8.
        if interpolate_def is not None:
            if any(v != 'previous_value'
                   for v in interpolate_def.itervalues()):
                raise Exception("currently, only 'previous_value' "
                                "interpolation is supported")
            to_interpolate = [k for k, v in interpolate_def.iteritems()
                              if v == 'previous_value']
        else:
            to_interpolate = []

        # merge all per-file arrays into `target`, filling gaps for the
        # fields listed in to_interpolate
        interpolate(target, arrays, id_periods, to_interpolate)
        return 'table', (target_fields, total_lines, iter(target), None)
def from_yaml(cls, main_fpath, input_dir=None, input_file=None,
              output_dir=None, output_file=None):
    """Build a Simulation from the YAML description at `main_fpath`.

    Files listed under the top-level 'import' key are merged into the
    main description; 'globals' and 'simulation' may be defined only
    once overall, entity sub-sections are merged key by key ('fields'
    may not be redefined at all).

    input_dir/input_file/output_dir/output_file, when given, override
    the corresponding entries of the YAML 'input'/'output' sections.

    Raises Exception on duplicate definitions or unknown entities, and
    ValueError on an unknown input 'method'.
    """
    simulation_path = os.path.abspath(main_fpath)
    simulation_dir = os.path.dirname(simulation_path)
    # TODO: add an instruction in yaml_layout so as to call another YAML
    # file and use it instead of a list.
    with open(main_fpath) as f:
        # NOTE(review): yaml.load without an explicit Loader can execute
        # arbitrary tags; consider yaml.safe_load if files are untrusted.
        content = yaml.load(f)

    list_fpath = []
    if "import" in content.keys():
        list_fpath = content["import"].values()
    # merge every imported file into `content`, refusing duplicates
    for fname in list_fpath:
        fpath = os.path.join(simulation_dir, fname)
        with open(fpath) as f:
            content1 = yaml.load(f)
        if "globals" in content1:
            if "globals" in content:
                raise Exception("globals can be defined only once")
            else:
                content["globals"] = content1["globals"]
        if "entities" in content1:
            if "entities" in content:
                for name1 in content1["entities"].keys():
                    if name1 in content["entities"].keys():
                        for name2 in content1["entities"][name1].keys():
                            if name2 in content["entities"][name1].keys():
                                # processes/macros/links are merged item
                                # by item, duplicates are errors
                                if name2 in ("processes", "macros", "links"):
                                    for name3 in content1["entities"][name1][name2].keys():
                                        if name3 in content["entities"][name1][name2].keys():
                                            raise Exception(
                                                "%s of %s is defined a second "
                                                "time in %s"
                                                % (name3, name1, fpath))
                                        else:
                                            content["entities"][name1][name2][name3] = \
                                                content1["entities"][name1][name2][name3]
                                if name2 == "fields":
                                    raise Exception(
                                        "fields of entities can be defined "
                                        "only once, it is not the case for "
                                        "%s in %s" % (name1, fpath))
                            else:
                                content["entities"][name1][name2] = \
                                    content1["entities"][name1][name2]
                    else:
                        content["entities"][name1] = content1["entities"][name1]
            else:
                content["entities"] = content1["entities"]
        if "simulation" in content1:
            if "simulation" in content:
                # fixed typo in message (was "simualtion")
                raise Exception("simulation can be defined only once")
            else:
                content["simulation"] = content1["simulation"]

    validate_dict(content, cls.yaml_layout)

    # the goal is to get something like:
    # globals_def = {'periodic': [('a': int), ...],
    #                'MIG': int}
    globals_def = {}
    for k, v in content.get("globals", {}).iteritems():
        # periodic is a special case
        if k == "periodic":
            type_ = fields_yaml_to_type(v)
        else:
            # "fields" and "type" are synonyms
            type_def = v.get("fields") or v.get("type")
            if isinstance(type_def, basestring):
                type_ = field_str_to_type(type_def, "array '%s'" % k)
            else:
                if not isinstance(type_def, list):
                    raise SyntaxError("invalid structure for globals")
                type_ = fields_yaml_to_type(type_def)
        globals_def[k] = type_

    simulation_def = content["simulation"]
    seed = simulation_def.get("random_seed")
    if seed is not None:
        seed = int(seed)
        # use print() like the rest of the module (was a py2-only
        # print statement)
        print("using fixed random seed: %d" % seed)
        random.seed(seed)
        np.random.seed(seed)

    periods = simulation_def["periods"]
    start_period = simulation_def["start_period"]
    config.skip_shows = simulation_def.get("skip_shows", False)
    # TODO: check that the value is one of "raise", "skip", "warn"
    config.assertions = simulation_def.get("assertions", "raise")

    input_def = simulation_def["input"]
    input_directory = input_dir if input_dir is not None \
        else input_def.get("path", "")
    if not os.path.isabs(input_directory):
        input_directory = os.path.join(simulation_dir, input_directory)
    config.input_directory = input_directory

    output_def = simulation_def["output"]
    output_directory = output_dir if output_dir is not None \
        else output_def.get("path", "")
    if not os.path.isabs(output_directory):
        output_directory = os.path.join(simulation_dir, output_directory)
    # NOTE(review): unlike the other variant of this method, the output
    # directory is not created when missing -- confirm this is intended
    config.output_directory = output_directory
    if output_file is None:
        output_file = output_def["file"]
    output_path = os.path.join(output_directory, output_file)

    for k, v in content["entities"].iteritems():
        entity_registry.add(Entity.from_yaml(k, v))
    for entity in entity_registry.itervalues():
        entity.check_links()
        entity.parse_processes(globals_def)
        entity.compute_lagged_fields()

    # init processes run once before the simulation proper
    init_def = [d.items()[0] for d in simulation_def.get("init", {})]
    init_processes, init_entities = [], set()
    for ent_name, proc_names in init_def:
        if ent_name not in entity_registry:
            raise Exception("Entity '%s' not found" % ent_name)
        entity = entity_registry[ent_name]
        init_entities.add(entity)
        init_processes.extend([(entity.processes[proc_name], 1)
                               for proc_name in proc_names])

    processes_def = [d.items()[0] for d in simulation_def["processes"]]
    processes, entities = [], set()
    for ent_name, proc_defs in processes_def:
        entity = entity_registry[ent_name]
        entities.add(entity)
        for proc_def in proc_defs:
            # proc_def is simply a process name
            if isinstance(proc_def, basestring):
                # use the default periodicity of 1
                proc_name, periodicity = proc_def, 1
            else:
                proc_name, periodicity = proc_def
            processes.append((entity.processes[proc_name], periodicity))

    method = input_def.get("method", "h5")
    if method == "h5":
        if input_file is None:
            input_file = input_def["file"]
        input_path = os.path.join(input_directory, input_file)
        data_source = H5Data(input_path, output_path)
    elif method == "void":
        input_path = None
        data_source = Void(output_path)
    else:
        # was: print method, type(method) -- which fell through with
        # data_source unbound and crashed with a NameError at the return
        raise ValueError("'%s' is an invalid value for 'method'. It should "
                         "be either 'h5' or 'void'" % method)

    default_entity = simulation_def.get("default_entity")
    return Simulation(globals_def, periods, start_period, init_processes,
                      init_entities, processes, entities, data_source,
                      default_entity)
def from_yaml(cls, fpath, input_dir=None, input_file=None,
              output_dir=None, output_file=None):
    """Build a Simulation from the YAML description at `fpath`.

    This variant supports time_scale/retro scheduling, an external
    'legislation' process and per-process start values; imported files
    are merged by handle_imports().

    input_dir/input_file/output_dir/output_file, when given, override
    the corresponding entries of the YAML 'input'/'output' sections.

    Raises Exception on invalid period/process declarations and
    ValueError on an unknown input 'method'.
    """
    simulation_path = os.path.abspath(fpath)
    simulation_dir = os.path.dirname(simulation_path)
    with open(fpath) as f:
        # NOTE(review): yaml.load without an explicit Loader can execute
        # arbitrary tags; consider yaml.safe_load if files are untrusted.
        content = yaml.load(f)
    content = handle_imports(content, simulation_dir)
    validate_dict(content, cls.yaml_layout)

    # the goal is to get something like:
    # globals_def = {'periodic': [('a': int), ...],
    #                'MIG': int}
    globals_def = {}
    for k, v in content.get('globals', {}).iteritems():
        # periodic is a special case
        if k == 'periodic':
            type_ = fields_yaml_to_type(v)
        else:
            # "fields" and "type" are synonyms
            type_def = v.get('fields') or v.get('type')
            if isinstance(type_def, basestring):
                type_ = field_str_to_type(type_def, "array '%s'" % k)
            else:
                if not isinstance(type_def, list):
                    raise SyntaxError("invalid structure for globals")
                type_ = fields_yaml_to_type(type_def)
        globals_def[k] = type_

    simulation_def = content['simulation']
    seed = simulation_def.get('random_seed')
    if seed is not None:
        seed = int(seed)
        print("using fixed random seed: %d" % seed)
        random.seed(seed)
        np.random.seed(seed)

    periods = simulation_def['periods']
    time_scale = simulation_def.get('time_scale', 'year')
    retro = simulation_def.get('retro', False)
    # exactly one of start_period/init_period must be given; init_period
    # is derived from start_period when only the latter is present
    start_period = simulation_def.get('start_period', None)
    init_period = simulation_def.get('init_period', None)
    if start_period is None and init_period is None:
        # fixed wording (was "Either ... either ...")
        raise Exception("Either start_period or init_period should be "
                        "given.")
    if start_period is not None:
        if init_period is not None:
            raise Exception("Start_period can't be given if init_period is.")
        # one time_scale unit backward (forward when retro is set)
        step = time_period[time_scale] * (1 - 2 * (retro))
        init_period = addmonth(start_period, step)

    config.skip_shows = simulation_def.get('skip_shows', False)
    # TODO: check that the value is one of "raise", "skip", "warn"
    config.assertions = simulation_def.get('assertions', 'raise')
    config.show_timings = simulation_def.get('timings', True)

    autodump = simulation_def.get('autodump', None)
    if autodump is True:
        autodump = 'autodump.h5'
    if isinstance(autodump, basestring):
        # by default autodump will dump all rows
        autodump = (autodump, None)
    config.autodump = autodump

    autodiff = simulation_def.get('autodiff', None)
    if autodiff is True:
        autodiff = 'autodump.h5'
    if isinstance(autodiff, basestring):
        # by default autodiff will compare all rows
        autodiff = (autodiff, None)
    config.autodiff = autodiff

    legislation = simulation_def.get('legislation', None)
    final_stat = simulation_def.get('final_stat', None)

    input_def = simulation_def['input']
    input_directory = input_dir if input_dir is not None \
        else input_def.get('path', '')
    if not os.path.isabs(input_directory):
        input_directory = os.path.join(simulation_dir, input_directory)
    config.input_directory = input_directory

    output_def = simulation_def['output']
    output_directory = output_dir if output_dir is not None \
        else output_def.get('path', '')
    if not os.path.isabs(output_directory):
        output_directory = os.path.join(simulation_dir, output_directory)
    if not os.path.exists(output_directory):
        print("creating directory: '%s'" % output_directory)
        os.makedirs(output_directory)
    config.output_directory = output_directory
    if output_file is None:
        output_file = output_def['file']
    output_path = os.path.join(output_directory, output_file)

    # need to be before processes because in case of legislation, we need
    # input_table for now
    method = input_def.get('method', 'h5')
    if method == 'h5':
        if input_file is None:
            input_file = input_def['file']
        input_path = os.path.join(input_directory, input_file)
        data_source = H5Data(input_path, output_path)
    elif method == 'void':
        input_path = None
        data_source = Void(output_path)
    else:
        # was: print(method, type(method)) -- which fell through with
        # data_source unbound and crashed later with a NameError
        raise ValueError("'%s' is an invalid value for 'method'. It should "
                         "be either 'h5' or 'void'" % method)

    for k, v in content['entities'].iteritems():
        entity_registry.add(Entity.from_yaml(k, v))
    for entity in entity_registry.itervalues():
        entity.check_links()
        entity.parse_processes(globals_def)
        entity.compute_lagged_fields()

    # init processes run once; items are (process, periodicity, start)
    init_def = [d.items()[0] for d in simulation_def.get('init', {})]
    init_processes, init_entities = [], set()
    for ent_name, proc_names in init_def:
        if ent_name != 'legislation':
            if ent_name not in entity_registry:
                raise Exception("Entity '%s' not found" % ent_name)
            entity = entity_registry[ent_name]
            init_entities.add(entity)
            init_processes.extend([(entity.processes[proc_name], 1, 1)
                                   for proc_name in proc_names])
        else:
            # 'legislation' is delegated to an external process
            proc2 = ExtProcess('of_on_liam', ['simulation', 2009, 'period'])
            init_processes.append((proc2, 1, 1))

    processes_def = [d.items()[0] for d in simulation_def['processes']]
    processes, entity_set = [], set()
    for ent_name, proc_defs in processes_def:
        if ent_name != 'legislation':
            entity = entity_registry[ent_name]
            entity_set.add(entity)
            for proc_def in proc_defs:
                # proc_def is simply a process name
                if isinstance(proc_def, basestring):
                    # use the default periodicity and start of 1
                    proc_name, periodicity, start = proc_def, 1, 1
                else:
                    if len(proc_def) == 3:
                        proc_name, periodicity, start = proc_def
                    elif len(proc_def) == 2:
                        proc_name, periodicity = proc_def
                        start = 1
                    else:
                        # was silently ignored, leaving proc_name either
                        # unbound or stale from a previous iteration
                        raise Exception("invalid process declaration: %s"
                                        % str(proc_def))
                processes.append((entity.processes[proc_name],
                                  periodicity, start))
        else:
            # 'legislation' is delegated to an external process
            proc2 = ExtProcess('of_on_liam',
                               ['simulation', proc_defs[0], 'period'])
            processes.append((proc2, 'year', 12))

    entities = sorted(entity_set, key=lambda e: e.name)
    default_entity = simulation_def.get('default_entity')
    return Simulation(globals_def, periods, init_period, init_processes,
                      init_entities, processes, entities, data_source,
                      default_entity, legislation, final_stat, time_scale,
                      retro)