def handle_imports(content, directory):
    import_files = content.get('import', [])
    if isinstance(import_files, basestring):
        import_files = [import_files]
    for fname in import_files[::-1]:
        import_path = os.path.abspath(os.path.join(directory, fname))
        print("importing: '%s'" % import_path)
        import_directory = os.path.dirname(import_path)
        with open(import_path) as f:
            import_content = handle_imports(yaml.load(f), import_directory)
        expand_periodic_fields(import_content)
        for wild_key in ('globals/*/fields', 'entities/*/fields'):
            multi_keys = expand_wild(wild_key, import_content)
            for multi_key in multi_keys:
                import_fields = multi_get(import_content, multi_key)
                local_fields = multi_get(content, multi_key, [])
                # fields are in "yaml ordered dict" format and we want
                # a simple list of items
                import_fields = [d.items()[0] for d in import_fields]
                local_fields = [d.items()[0] for d in local_fields]
                # merge the lists
                merged_fields = merge_items(import_fields, local_fields)
                # convert them back to "yaml ordered dict"
                merged_fields = [{k: v} for k, v in merged_fields]
                multi_set(content, multi_key, merged_fields)
        content = merge_dicts(import_content, content)
    return content


def handle_imports(content, directory):
    import_files = content.get('import', [])
    if isinstance(import_files, basestring):
        import_files = [import_files]
    for fname in import_files[::-1]:
        import_path = os.path.join(directory, fname)
        print("importing: '%s'" % import_path)
        import_directory = os.path.dirname(import_path)
        with open(import_path) as f:
            import_content = handle_imports(yaml.load(f), import_directory)
        expand_periodic_fields(import_content)
        for wild_key in ('globals/*/fields', 'entities/*/fields'):
            multi_keys = expand_wild(wild_key, import_content)
            for multi_key in multi_keys:
                import_fields = multi_get(import_content, multi_key)
                local_fields = multi_get(content, multi_key, [])
                # fields are in "yaml ordered dict" format and we want
                # a simple list of items
                import_fields = [d.items()[0] for d in import_fields]
                local_fields = [d.items()[0] for d in local_fields]
                # merge the lists
                merged_fields = merge_items(import_fields, local_fields)
                # convert them back to "yaml ordered dict"
                merged_fields = [{k: v} for k, v in merged_fields]
                multi_set(content, multi_key, merged_fields)
        content = merge_dicts(import_content, content)
    return content


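# Example (a minimal sketch, not part of the original module): how
# handle_imports() might be called on a top-level simulation YAML file before
# any further processing. The path used here is hypothetical.
def example_handle_imports(simulation_path='simulation.yml'):
    with open(simulation_path) as f:
        content = yaml.load(f)
    # resolve every file listed under the 'import' key relative to the
    # directory of the importing file, and merge their globals/entities
    # fields into 'content' (local definitions take precedence)
    directory = os.path.dirname(os.path.abspath(simulation_path))
    return handle_imports(content, directory)

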
def load_def(localdir, ent_name, section_def, required_fields):
    if 'type' in section_def and 'fields' in section_def:
        raise Exception("invalid structure for '%s': "
                        "type and fields sections are mutually exclusive"
                        % ent_name)

    if 'type' in section_def:
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        str_type = section_def['type']
        if isinstance(str_type, basestring):
            celltype = field_str_to_type(str_type, "array '%s'" % ent_name)
        else:
            assert isinstance(str_type, type)
            celltype = str_type
        return 'ndarray', load_ndarray(csv_filepath, celltype)

    fields_def = section_def.get('fields')
    if fields_def is not None:
        for fdef in fields_def:
            if isinstance(fdef, basestring):
                raise SyntaxError("invalid field declaration: '%s', you are "
                                  "probably missing a ':'" % fdef)
        if all(isinstance(fdef, dict) for fdef in fields_def):
            fields = fields_yaml_to_type(fields_def)
        else:
            assert all(isinstance(fdef, tuple) for fdef in fields_def)
            fields = fields_def
    else:
        fields = None

    newnames = merge_dicts(invert_dict(section_def.get('oldnames', {})),
                           section_def.get('newnames', {}))
    transpose = section_def.get('transposed', False)

    interpolate_def = section_def.get('interpolate')
    files_def = section_def.get('files')
    if files_def is None:
        #XXX: it might be cleaner to use the same code path as for the
        # multi-file case (however, that would lose the "import any file
        # size" feature that I'm fond of).
        # we can simply return the stream as-is
        #FIXME: stream is not sorted
        # csv file is assumed to be in the correct order (ie by period then id)
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        csv_file = CSV(csv_filepath, newnames,
                       delimiter=',', transpose=transpose)
        if fields is not None:
            fields = required_fields + fields
        stream = csv_file.read(fields)
        if fields is None:
            fields = csv_file.fields
        if interpolate_def is not None:
            raise Exception('interpolate is currently only supported with '
                            'multiple files')
        return 'table', (fields, csv_file.numlines, stream, csv_file)
    else:
        # we have to load all files, merge them and return a stream out of
        # that
        print(" * computing number of rows...")

        # 1) only load required fields
        default_args = dict(newnames=newnames, transpose=transpose)
        if isinstance(files_def, dict):
            files_items = files_def.items()
        elif isinstance(files_def, list) and files_def:
            if isinstance(files_def[0], dict):
                # handle YAML ordered dict structure
                files_items = [d.items()[0] for d in files_def]
            elif isinstance(files_def[0], basestring):
                files_items = [(path, {}) for path in files_def]
            else:
                raise Exception("invalid structure for 'files'")
        else:
            raise Exception("invalid structure for 'files'")

        #XXX: shouldn't we use the "path" defined for the whole entity if any?
        # section_def.get('path')
        files = []
        for path, kwargs in files_items:
            kwargs['newnames'] = \
                merge_dicts(invert_dict(kwargs.pop('oldnames', {})),
                            kwargs.get('newnames', {}))
            f = CSV(complete_path(localdir, path),
                    **merge_dicts(default_args, kwargs))
            files.append(f)
        id_periods = union1d(f.as_array(required_fields) for f in files)

        print(" * reading files...")
        # 2) load all fields
        if fields is None:
            target_fields = merge_items(*[f.fields for f in files])
            fields_per_file = [None for f in files]
        else:
            target_fields = required_fields + fields
            fields_per_file = [[(name, type_)
                                for name, type_ in target_fields
                                if name in f.field_names]
                               for f in files]
            total_fields = set.union(*[set(f.field_names) for f in files])
            missing = set(name for name, _ in target_fields) - total_fields
            if missing:
                raise Exception("the following fields were not found in any "
                                "file: %s" % ", ".join(missing))

        total_lines = len(id_periods)

        # allocate main array
        target = np.empty(total_lines, dtype=np.dtype(target_fields))
        # fill with default values
        target[:] = tuple(missing_values[ftype] for _, ftype in target_fields)
        target['period'] = id_periods['period']
        target['id'] = id_periods['id']

        arrays = [f.as_array(fields_to_load)
                  for f, fields_to_load in zip(files, fields_per_file)]

        # close all files
        for f in files:
            f.close()

        #FIXME: interpolation currently only interpolates missing data points,
        # not data points with their value equal to the missing value
        # corresponding to the field type. This can only be fixed once
        # booleans are loaded as int8.
        if interpolate_def is not None:
            if any(v != 'previous_value'
                   for v in interpolate_def.itervalues()):
                raise Exception("currently, only 'previous_value' "
                                "interpolation is supported")
            to_interpolate = [k for k, v in interpolate_def.iteritems()
                              if v == 'previous_value']
        else:
            to_interpolate = []

        interpolate(target, arrays, id_periods, to_interpolate)
        return 'table', (target_fields, total_lines, iter(target), None)


def merge_h5(input1_path, input2_path, output_path):
    input1_file = tables.openFile(input1_path, mode="r")
    input2_file = tables.openFile(input2_path, mode="r")
    output_file = tables.openFile(output_path, mode="w")

    output_globals = output_file.createGroup("/", "globals", "Globals")
    print "copying globals from", input1_path,
    copyTable(input1_file.root.globals.periodic, output_file, output_globals)
    print "done."

    input1_entities = input1_file.root.entities
    input2_entities = input2_file.root.entities

    fields1 = get_h5_fields(input1_file)
    fields2 = get_h5_fields(input2_file)

    ent_names1 = set(fields1.keys())
    ent_names2 = set(fields2.keys())

    output_entities = output_file.createGroup("/", "entities", "Entities")
    for ent_name in sorted(ent_names1 | ent_names2):
        print
        print ent_name
        ent_fields1 = fields1.get(ent_name, [])
        ent_fields2 = fields2.get(ent_name, [])
        output_fields = merge_items(ent_fields1, ent_fields2)
        output_table = output_file.createTable(output_entities, ent_name,
                                               np.dtype(output_fields))
        if ent_name in ent_names1:
            table1 = getattr(input1_entities, ent_name)
            print " * indexing table from %s ..." % input1_path,
            input1_rows = index_table_light(table1)
            print "done."
        else:
            table1 = None
            input1_rows = {}

        if ent_name in ent_names2:
            table2 = getattr(input2_entities, ent_name)
            print " * indexing table from %s ..." % input2_path,
            input2_rows = index_table_light(table2)
            print "done."
        else:
            table2 = None
            input2_rows = {}

        print " * merging: ",
        input1_periods = input1_rows.keys()
        input2_periods = input2_rows.keys()
        output_periods = sorted(set(input1_periods) | set(input2_periods))

        def merge_period(period_idx, period):
            if ent_name in ent_names1:
                start, stop = input1_rows.get(period, (0, 0))
                input1_array = table1.read(start, stop)
            else:
                input1_array = None

            if ent_name in ent_names2:
                start, stop = input2_rows.get(period, (0, 0))
                input2_array = table2.read(start, stop)
            else:
                input2_array = None

            if ent_name in ent_names1 and ent_name in ent_names2:
                output_array, _ = mergeArrays(input1_array, input2_array)
            elif ent_name in ent_names1:
                output_array = input1_array
            elif ent_name in ent_names2:
                output_array = input2_array
            else:
                raise Exception("this shouldn't have happened")
            output_table.append(output_array)
            output_table.flush()

        loop_wh_progress(merge_period, output_periods)
        print " done."

    input1_file.close()
    input2_file.close()
    output_file.close()


def merge_group(parent1, parent2, name, output_file, index_col):
    print()
    print(name)
    print('=' * len(name))

    group1 = getattr(parent1, name, None)
    group2 = getattr(parent2, name, None)
    if group1 is None and group2 is None:
        print("node not found in either input file, skipped")
        return

    output_group = output_file.create_group("/", name)

    fields1 = get_group_fields(group1)
    fields2 = get_group_fields(group2)

    ent_names1 = set(fields1.keys())
    ent_names2 = set(fields2.keys())

    for ent_name in sorted(ent_names1 | ent_names2):
        print()
        print(ent_name)
        ent_fields1 = fields1.get(ent_name, [])
        ent_fields2 = fields2.get(ent_name, [])
        output_fields = merge_items(ent_fields1, ent_fields2)
        output_table = output_file.create_table(output_group, ent_name,
                                                np.dtype(output_fields))

        if ent_name in ent_names1:
            table1 = getattr(group1, ent_name)
            # noinspection PyProtectedMember
            print(" * indexing table from %s ..." % group1._v_file.filename,
                  end=' ')
            input1_rows = index_table_light(table1, index_col)
            print("done.")
        else:
            table1 = None
            input1_rows = {}

        if ent_name in ent_names2:
            table2 = getattr(group2, ent_name)
            # noinspection PyProtectedMember
            print(" * indexing table from %s ..." % group2._v_file.filename,
                  end=' ')
            input2_rows = index_table_light(table2, index_col)
            print("done.")
        else:
            table2 = None
            input2_rows = {}

        print(" * merging: ", end=' ')
        input1_periods = input1_rows.keys()
        input2_periods = input2_rows.keys()
        output_periods = sorted(set(input1_periods) | set(input2_periods))

        # noinspection PyUnusedLocal
        def merge_period(period_idx, period):
            if ent_name in ent_names1:
                start, stop = input1_rows.get(period, (0, 0))
                input1_array = table1.read(start, stop)
            else:
                input1_array = None

            if ent_name in ent_names2:
                start, stop = input2_rows.get(period, (0, 0))
                input2_array = table2.read(start, stop)
            else:
                input2_array = None

            if ent_name in ent_names1 and ent_name in ent_names2:
                if 'id' in input1_array.dtype.names:
                    assert 'id' in input2_array.dtype.names
                    output_array, _ = merge_arrays(input1_array, input2_array)
                else:
                    output_array = merge_array_records(input1_array,
                                                       input2_array)
            elif ent_name in ent_names1:
                output_array = input1_array
            elif ent_name in ent_names2:
                output_array = input2_array
            else:
                raise Exception("this shouldn't have happened")
            output_table.append(output_array)
            output_table.flush()

        loop_wh_progress(merge_period, output_periods)
        print(" done.")


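# Example (a minimal sketch, not part of the original module): merge_group()
# is meant to be driven by a caller that opens both input files and an output
# file, then merges the top-level groups one by one. The file names and the
# index columns used here ('PERIOD' for globals, 'period' for entities) are
# assumptions, not taken from the original source.
def example_merge_groups(input1_path='input1.h5', input2_path='input2.h5',
                         output_path='merged.h5'):
    input1_file = tables.open_file(input1_path, mode="r")
    input2_file = tables.open_file(input2_path, mode="r")
    output_file = tables.open_file(output_path, mode="w")
    try:
        merge_group(input1_file.root, input2_file.root, 'globals',
                    output_file, 'PERIOD')
        merge_group(input1_file.root, input2_file.root, 'entities',
                    output_file, 'period')
    finally:
        input1_file.close()
        input2_file.close()
        output_file.close()

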
def diff_array(array1, array2, showdiffs=10, raiseondiff=False):
    if len(array1) != len(array2):
        print("length is different: %d vs %d" % (len(array1), len(array2)))
        ids1 = array1['id']
        ids2 = array2['id']
        all_ids = np.union1d(ids1, ids2)
        notin1 = np.setdiff1d(all_ids, ids1)
        notin2 = np.setdiff1d(all_ids, ids2)
        if len(notin1):
            print("the following ids are not present in file 1:", notin1)
        elif len(notin2):
            print("the following ids are not present in file 2:", notin2)
        else:
            # some ids must be duplicated
            if len(ids1) > len(all_ids):
                print("file 1 contains duplicate ids:", end=' ')
                uniques, dupes = unique_dupes(ids1)
                print(dupes)
                array1 = array1[uniques]
            if len(ids2) > len(all_ids):
                print("file 2 contains duplicate ids:", end=' ')
                uniques, dupes = unique_dupes(ids2)
                print(dupes)
                array2 = array2[uniques]

    fields1 = get_fields(array1)
    fields2 = get_fields(array2)
    fnames1 = set(array1.dtype.names)
    fnames2 = set(array2.dtype.names)
    # use merge_items instead of fnames1 | fnames2 to preserve ordering
    for fname, _ in merge_items(fields1, fields2):
        print(" - %s:" % fname, end=' ')
        if fname not in fnames1:
            print("missing in file 1")
            continue
        elif fname not in fnames2:
            print("missing in file 2")
            continue
        col1, col2 = array1[fname], array2[fname]
        if np.issubdtype(col1.dtype, np.inexact):
            if len(col1) == len(col2):
                both_nan = np.isnan(col1) & np.isnan(col2)
                eq = np.all(both_nan | (col1 == col2))
            else:
                eq = False
        else:
            eq = np.array_equal(col1, col2)

        if eq:
            print("ok")
        else:
            print("different", end=' ')
            if len(col1) != len(col2):
                print("(length)")
            else:
                diff = (col1 != col2).nonzero()[0]
                print("(%d differences)" % len(diff))
                ids = array1['id']
                if len(diff) > showdiffs:
                    diff = diff[:showdiffs]
                print(PrettyTable([['id', fname + ' (file1)',
                                    fname + ' (file2)']] +
                                  [[ids[idx], col1[idx], col2[idx]]
                                   for idx in diff]))
            if raiseondiff:
                raise Exception('different')


def merge_h5(input1_path, input2_path, output_path):
    input1_file = tables.open_file(input1_path, mode="r")
    input2_file = tables.open_file(input2_path, mode="r")
    output_file = tables.open_file(output_path, mode="w")

    print("copying globals from", input1_path, end=' ')
    #noinspection PyProtectedMember
    input1_file.root.globals._f_copy(output_file.root, recursive=True)
    print("done.")

    input1_entities = input1_file.root.entities
    input2_entities = input2_file.root.entities

    fields1 = get_h5_fields(input1_file)
    fields2 = get_h5_fields(input2_file)

    ent_names1 = set(fields1.keys())
    ent_names2 = set(fields2.keys())

    output_entities = output_file.create_group("/", "entities", "Entities")
    for ent_name in sorted(ent_names1 | ent_names2):
        print()
        print(ent_name)
        ent_fields1 = fields1.get(ent_name, [])
        ent_fields2 = fields2.get(ent_name, [])
        output_fields = merge_items(ent_fields1, ent_fields2)
        output_table = output_file.create_table(output_entities, ent_name,
                                                np.dtype(output_fields))
        if ent_name in ent_names1:
            table1 = getattr(input1_entities, ent_name)
            print(" * indexing table from %s ..." % input1_path, end=' ')
            input1_rows = index_table_light(table1)
            print("done.")
        else:
            table1 = None
            input1_rows = {}

        if ent_name in ent_names2:
            table2 = getattr(input2_entities, ent_name)
            print(" * indexing table from %s ..." % input2_path, end=' ')
            input2_rows = index_table_light(table2)
            print("done.")
        else:
            table2 = None
            input2_rows = {}

        print(" * merging: ", end=' ')
        input1_periods = input1_rows.keys()
        input2_periods = input2_rows.keys()
        output_periods = sorted(set(input1_periods) | set(input2_periods))

        #noinspection PyUnusedLocal
        def merge_period(period_idx, period):
            if ent_name in ent_names1:
                start, stop = input1_rows.get(period, (0, 0))
                input1_array = table1.read(start, stop)
            else:
                input1_array = None

            if ent_name in ent_names2:
                start, stop = input2_rows.get(period, (0, 0))
                input2_array = table2.read(start, stop)
            else:
                input2_array = None

            if ent_name in ent_names1 and ent_name in ent_names2:
                output_array, _ = merge_arrays(input1_array, input2_array)
            elif ent_name in ent_names1:
                output_array = input1_array
            elif ent_name in ent_names2:
                output_array = input2_array
            else:
                raise Exception("this shouldn't have happened")
            output_table.append(output_array)
            output_table.flush()

        loop_wh_progress(merge_period, output_periods)
        print(" done.")

    input1_file.close()
    input2_file.close()
    output_file.close()


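# Example (a minimal sketch, not part of the original module): merging two
# simulation output files into a third one. The paths are hypothetical;
# merge_h5() copies the globals from the first file and merges the entity
# tables period by period.
def example_merge_h5():
    merge_h5('simul1.h5', 'simul2.h5', 'merged.h5')

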
def load_def(localdir, ent_name, section_def, required_fields):
    if 'type' in section_def and 'fields' in section_def:
        raise Exception("invalid structure for '%s': "
                        "type and fields sections are mutually exclusive"
                        % ent_name)

    if 'type' in section_def:
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        str_type = section_def['type']
        if isinstance(str_type, basestring):
            celltype = field_str_to_type(str_type, "array '%s'" % ent_name)
        else:
            assert isinstance(str_type, type)
            celltype = str_type
        return 'ndarray', load_ndarray(csv_filepath, celltype)

    fields_def = section_def.get('fields')
    if fields_def is not None:
        for fdef in fields_def:
            if isinstance(fdef, basestring):
                raise SyntaxError("invalid field declaration: '%s', you are "
                                  "probably missing a ':'" % fdef)
        if all(isinstance(fdef, dict) for fdef in fields_def):
            fields = fields_yaml_to_type(fields_def)
        else:
            assert all(isinstance(fdef, tuple) for fdef in fields_def)
            fields = fields_def
        fnames = {name for name, _ in fields}
        for reqname, reqtype in required_fields[::-1]:
            if reqname not in fnames:
                fields.insert(0, (reqname, reqtype))
    else:
        fields = None

    newnames = merge_dicts(invert_dict(section_def.get('oldnames', {})),
                           section_def.get('newnames', {}))
    transpose = section_def.get('transposed', False)

    interpolate_def = section_def.get('interpolate')
    files_def = section_def.get('files')
    if files_def is None:
        # XXX: it might be cleaner to use the same code path as for the
        # multi-file case (however, that would lose the "import any file
        # size" feature that I'm fond of).
        # we can simply return the stream as-is
        # FIXME: stream is not sorted
        # csv file is assumed to be in the correct order (ie by period then id)
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        csv_file = CSV(csv_filepath, newnames,
                       delimiter=',', transpose=transpose)
        stream = csv_file.read(fields)
        if fields is None:
            fields = csv_file.fields
        if interpolate_def is not None:
            raise Exception('interpolate is currently only supported with '
                            'multiple files')
        return 'table', (fields, csv_file.numlines, stream, csv_file)
    else:
        # we have to load all files, merge them and return a stream out of
        # that
        print(" * computing number of rows...")

        # 1) only load required fields
        default_args = dict(newnames=newnames, transpose=transpose)
        if isinstance(files_def, dict):
            files_items = files_def.items()
        elif isinstance(files_def, list) and files_def:
            if isinstance(files_def[0], dict):
                # handle YAML ordered dict structure
                files_items = [d.items()[0] for d in files_def]
            elif isinstance(files_def[0], basestring):
                files_items = [(path, {}) for path in files_def]
            else:
                raise Exception("invalid structure for 'files'")
        else:
            raise Exception("invalid structure for 'files'")

        # XXX: shouldn't we use the "path" defined for the whole entity if
        # any? section_def.get('path')
        files = []
        for path, kwargs in files_items:
            kwargs['newnames'] = \
                merge_dicts(invert_dict(kwargs.pop('oldnames', {})),
                            kwargs.get('newnames', {}))
            f = CSV(complete_path(localdir, path),
                    **merge_dicts(default_args, kwargs))
            files.append(f)
        id_periods = union1d(f.as_array(required_fields) for f in files)

        print(" * reading files...")
        # 2) load all fields
        if fields is None:
            target_fields = merge_items(*[f.fields for f in files])
            fields_per_file = [None for _ in files]
        else:
            target_fields = fields
            fields_per_file = [[(name, type_)
                                for name, type_ in target_fields
                                if name in f.field_names]
                               for f in files]
            total_fields = set.union(*[set(f.field_names) for f in files])
            missing = set(name for name, _ in target_fields) - total_fields
            if missing:
                raise Exception("the following fields were not found in any "
                                "file: %s" % ", ".join(missing))

        total_lines = len(id_periods)

        # allocate main array
        target = get_default_array(total_lines, np.dtype(target_fields))
        target['period'] = id_periods['period']
        target['id'] = id_periods['id']

        arrays = [f.as_array(fields_to_load)
                  for f, fields_to_load in zip(files, fields_per_file)]

        # close all files
        for f in files:
            f.close()

        # FIXME: interpolation currently only interpolates missing data
        # points, not data points with their value equal to the missing value
        # corresponding to the field type. This can only be fixed once
        # booleans are loaded as int8.
        if interpolate_def is not None:
            if any(v != 'previous_value'
                   for v in interpolate_def.itervalues()):
                raise Exception("currently, only 'previous_value' "
                                "interpolation is supported")
            to_interpolate = [k for k, v in interpolate_def.iteritems()
                              if v == 'previous_value']
        else:
            to_interpolate = []

        interpolate(target, arrays, id_periods, to_interpolate)
        return 'table', (target_fields, total_lines, iter(target), None)


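# Example (a minimal sketch, not part of the original module): the kind of
# section definition load_def() consumes once the import YAML has been parsed.
# The directory, file name, field names and types below are hypothetical;
# 'fields' is shown in its already-parsed (name, type) form.
def example_load_def():
    section_def = {
        'path': 'person.csv',
        'fields': [('age', int), ('income', float)],
    }
    required = [('period', int), ('id', int)]
    # single-file case: returns 'table' and a tuple of
    # (fields, number of lines, row stream, csv file object)
    kind, data = load_def('data', 'person', section_def, required)
    return kind, data

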
def diff_array(array1, array2, numdiff=10, raiseondiff=False):
    if len(array1) != len(array2):
        print("length is different: %d vs %d" % (len(array1), len(array2)))
        ids1 = array1['id']
        ids2 = array2['id']
        all_ids = np.union1d(ids1, ids2)
        notin1 = np.setdiff1d(all_ids, ids1)
        notin2 = np.setdiff1d(all_ids, ids2)
        if len(notin1):
            print("the following ids are not present in file 1:", notin1)
        elif len(notin2):
            print("the following ids are not present in file 2:", notin2)
        else:
            # some ids must be duplicated
            if len(ids1) > len(all_ids):
                print("file 1 contains duplicate ids:", end=' ')
                uniques, dupes = unique_dupes(ids1)
                print(dupes)
                array1 = array1[uniques]
            if len(ids2) > len(all_ids):
                print("file 2 contains duplicate ids:", end=' ')
                uniques, dupes = unique_dupes(ids2)
                print(dupes)
                array2 = array2[uniques]

    fields1 = get_fields(array1)
    fields2 = get_fields(array2)
    fnames1 = set(array1.dtype.names)
    fnames2 = set(array2.dtype.names)
    # use merge_items instead of fnames1 | fnames2 to preserve ordering
    for fname, _ in merge_items(fields1, fields2):
        print(" - %s:" % fname, end=' ')
        if fname not in fnames1:
            print("missing in file 1")
            continue
        elif fname not in fnames2:
            print("missing in file 2")
            continue
        col1, col2 = array1[fname], array2[fname]
        if issubclass(col1.dtype.type, np.inexact):
            if len(col1) == len(col2):
                both_nan = np.isnan(col1) & np.isnan(col2)
                eq = np.all(both_nan | (col1 == col2))
            else:
                eq = False
        else:
            eq = np.array_equal(col1, col2)

        if eq:
            print("ok")
        else:
            print("different", end=' ')
            if len(col1) != len(col2):
                print("(length)")
            else:
                diff = (col1 != col2).nonzero()[0]
                print("(%d differences)" % len(diff))
                ids = array1['id']
                if len(diff) > numdiff:
                    diff = diff[:numdiff]
                print(PrettyTable([['id', fname + ' (file1)',
                                    fname + ' (file2)']] +
                                  [[ids[idx], col1[idx], col2[idx]]
                                   for idx in diff]))
            if raiseondiff:
                raise Exception('different')


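# Example (a minimal sketch, not part of the original module): comparing two
# tiny structured arrays. Both arrays need an 'id' column; the values below
# are made up.
def example_diff_array():
    dt = np.dtype([('id', int), ('age', int)])
    array1 = np.array([(1, 25), (2, 30)], dtype=dt)
    array2 = np.array([(1, 25), (2, 31)], dtype=dt)
    # prints "ok" for 'id' and reports the single differing row for 'age'
    diff_array(array1, array2, numdiff=10)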