def _write_coverage_file(self, track: Track, group_obs_counts: Dict[Optional[str], int],
                         group_var_counts: Dict[Optional[str], Dict[Tuple[str, ...], int]],
                         infix: str) -> None:
    """Write a CSV of per-group coverage fractions, one row per variable path.

    The output file is named ``<file_prefix>_<infix>.csv``. Rows follow the order returned by
    ``_get_sorted_vars``; each group gets a column holding the number of times the variable was
    observed in that group divided by the group's observation count, formatted with two decimals.

    :param track: Passed through to ``_get_sorted_vars``.
    :param group_obs_counts: Observation count per group (the denominators).
    :param group_var_counts: Per group, the observation count of each variable path.
    :param infix: Middle part of the output filename.
    """
    fn: str = self.file_prefix + "_" + infix + ".csv"
    logging.info("Writing coverage file to %s.", fn)

    # Group keys may be None, so they are stringified for use as column names.
    groups: List[str] = sorted(str(x) for x in group_obs_counts)
    columns: List[str] = ["variable", "in_schema", "var_id", "data_type"] + groups
    sorted_vars = _get_sorted_vars(group_var_counts, track)

    # The csv module requires newline="" so that its own line terminators are written
    # untranslated; without it, rows end in "\r\r\n" on platforms whose line separator is "\r\n".
    with open(fn, "w", newline="") as fh:
        writer: csv.DictWriter = csv.DictWriter(fh, columns)
        writer.writeheader()
        for var_path in sorted_vars:
            var_path_str: str = nesteddicts.path_to_str(var_path)
            logging.debug("Writing coverage for %s.", var_path_str)
            row: Dict = self._init_row(var_path)
            for group, n_in_group in group_obs_counts.items():
                times_var_observed: int = group_var_counts[group].get(var_path, 0)
                frac: float = times_var_observed / n_in_group
                if frac > 1.0:
                    # Coverage above 1 is reported but deliberately not treated as fatal.
                    logging.warning("Observed coverage of %.5f (>1) for variable %s.", frac, var_path_str)
                row[str(group)] = "%0.2f" % frac
            writer.writerow(row)
def __call__(self, fixture: Any, actual: Optional[Any], path: Optional[ListType[str]] = None) -> bool:
    """Compare a fixture value with an actual value, dispatching on the variable's data type.

    With a dict fixture and no path, the comparison starts at the root folder. Otherwise the
    variable at ``path`` is looked up in the schema and the matching comparer is invoked.

    :raises ValueError: If ``path`` does not resolve to a variable in the schema.
    """
    assert fixture is not None

    # No path plus a dictionary means we are at the root of the tree.
    if path is None and isinstance(fixture, dict):
        return self.compare_folders(fixture, actual, [])

    assert path is not None
    var: Optional[Variable] = self.schema.lookup(tuple(path))
    if var is None:
        raise ValueError("Unrecognized variable %s" % path_to_str(path))

    # Container types are compared by methods that need the path to recurse.
    container_comparers = {
        "Folder": self.compare_folders,
        "List": self.compare_lists,
        "KeyedList": self.compare_keyed_lists,
    }
    data_type: str = var.data_type
    if data_type in container_comparers:
        return container_comparers[data_type](fixture, actual, path)

    if data_type == "MultipleText":
        return compare_multiple_text(fixture, actual)

    # Anything else is handled as a primitive.
    return compare_primitives(fixture, actual)
def _handle_list(self, composite_id: str, child_path: Tuple[str, ...], value: Any, observed: Set) -> None:
    """Crawl every non-None item of a list value, all under the same child path.

    Items that are None are skipped, with a debug log entry naming the composite and path.
    """
    for item in value:
        if item is not None:
            self._crawl(composite_id, item, observed, child_path)
        else:
            logging.debug("Encountered empty list item in composite %s (path %s).", composite_id,
                          nesteddicts.path_to_str(child_path))
def _record_missing(self, path: ListType, data_type: str, value: Optional[Any]) -> None:
    """Append a MissingValue for ``path`` to the outcome's list of missing values.

    Values that ``_is_simple_value`` rejects are stored as JSON text with sorted keys.
    """
    recorded_value = value if _is_simple_value(value) else json.dumps(value, sort_keys=True)
    record: MissingValue = MissingValue(self.entity_id, self.label, nesteddicts.path_to_str(path), data_type,
                                        recorded_value)
    self.outcome.missings.append(record)
def _source_path(var: Variable, source_id: VariableId) -> str:
    """Return the absolute path, rendered as a string, of one of a variable's sources.

    :param var: Variable whose track's source track is searched.
    :param source_id: ID of the source variable within that source track.
    :raises AssertionError: If the variable's track has no source track.
    """
    source_track: Optional[Track] = var.track.source
    assert source_track is not None
    # Lookup failures propagate unchanged; the former debugging hook printed to stdout first,
    # which could end up interleaved with a caller's output.
    source_var: Variable = source_track[source_id]
    return path_to_str(source_var.absolute_path)
def _verify_source_parent(variable: "Variable", source_var_id: VariableId) -> None:
    """Check that a source of a list-descended variable descends from a source of that list.

    If ``variable`` has no list ancestor, nothing is checked. Otherwise the source variable's
    ancestry is walked upwards until a variable listed among the list ancestor's sources is found.

    :param variable: The variable whose source is being verified.
    :param source_var_id: ID of the source variable, resolved in the variable's source track.
    :raises ValueError: If neither the source nor any of its ancestors is a source of the list ancestor.
    """
    list_ancestor: Optional["Variable"] = variable.get_first_list_ancestor()
    if list_ancestor is None:
        return

    parent_sources: Set[VariableId] = set(list_ancestor.sources)
    source_track = variable.track.source
    assert source_track is not None
    source: "Variable" = source_track[source_var_id]

    # Walk with a separate cursor so that the error message can still name the declared source
    # rather than whichever ancestor the walk stopped at.
    ancestor: "Variable" = source
    while ancestor.parent is not None and ancestor.var_id not in parent_sources:
        ancestor = source_track[ancestor.parent]

    if ancestor.var_id not in parent_sources:
        template: str = 'Variable %s (%s), which descends from %s %s (%s), includes %s (%s) as a source, but that ' \
                        'does not descend from one of the root list\'s sources.'
        msg = template % (
            path_to_str(variable.absolute_path),
            variable.var_id,
            list_ancestor.data_type,
            path_to_str(list_ancestor.absolute_path),
            list_ancestor.var_id,
            path_to_str(source.absolute_path),
            source.var_id,
        )
        raise ValueError(msg)
def __call__(self) -> None:
    """Write one CSV row per schema variable: its ID, its absolute path, then its source paths.

    Variables without sources produce a two-column row.
    """
    writer = csv.writer(self.fh)
    for var in self.schema:  # type: Variable
        row: ListType = [var.var_id, path_to_str(var.absolute_path)]
        # A falsy sources value is treated as "no sources", so the row is left as it is.
        row.extend(_source_path(var, source_id) for source_id in var.sources or [])
        writer.writerow(row)
def _write_coverage_file(self) -> None:
    """Write the source-coverage results to a CSV file in the context's output directory.

    The filename is ``self.output_filename``, or ``source_coverage.csv`` when that is unset.
    One row is written per entry of ``self.coverage_result``, in sorted order, giving the source
    and target variable IDs and paths, the source variable's data type, and the recorded count.

    :raises AssertionError: If the schema has no source schema, or an ID cannot be resolved.
    """
    output_filename: str = self.output_filename or "source_coverage.csv"
    fn: str = os.path.join(self.context.output_dir, output_filename)
    logging.info("Writing coverage file to %s.", fn)

    columns: List[str] = [
        "source_var_id",
        "source_var_path",
        "target_var_id",
        "target_var_path",
        "data_type",
        "n",
    ]
    source_schema = self.schema.source
    assert source_schema is not None

    # The csv module requires newline="" so that its own line terminators are written
    # untranslated; without it, rows end in "\r\r\n" on platforms whose line separator is "\r\n".
    with open(fn, "w", newline="") as fh:
        writer: csv.DictWriter = csv.DictWriter(fh, columns)
        writer.writeheader()
        for var_info in sorted(self.coverage_result):
            source_var_id = var_info.source_var_id
            target_var_id = var_info.target_var_id
            logging.debug("Writing coverage for %s -> %s.", source_var_id, target_var_id)

            source_var: Optional[Variable] = source_schema.get(source_var_id)
            target_var: Optional[Variable] = self.schema.get(target_var_id)
            assert source_var is not None and target_var is not None

            writer.writerow({
                "source_var_id": source_var_id,
                "source_var_path": nesteddicts.path_to_str(source_var.absolute_path),
                "target_var_id": target_var_id,
                "target_var_path": nesteddicts.path_to_str(target_var.absolute_path),
                "data_type": source_var.data_type,
                "n": self.coverage_result[var_info],
            })
def _record_mismatch(self, path: ListType, data_type: str, expected: Optional[Any],
                     actual: Optional[Any]) -> None:
    """Append a ValueMismatch for ``path`` to the outcome's list of mismatches.

    Expected and actual values that ``_is_simple_value`` rejects are stored as JSON text with
    sorted keys.
    """
    expected_repr = expected if _is_simple_value(expected) else json.dumps(expected, sort_keys=True)
    actual_repr = actual if _is_simple_value(actual) else json.dumps(actual, sort_keys=True)
    record: ValueMismatch = ValueMismatch(self.entity_id, self.label, nesteddicts.path_to_str(path), data_type,
                                          expected_repr, actual_repr)
    self.outcome.mismatches.append(record)
def _crawl_folder(self, node: Dict, path: List, period: Optional[str]) -> None:
    """Visit every entry of a folder node, casting primitives in place and recursing into the rest.

    Keys starting with an underscore are skipped. Keys whose path is not in the composite's
    schema are recorded under "unknown_vars". Primitive values are replaced by their cast value;
    if casting raises ValueError the entry is recorded under "cast_errors" and removed from the node.

    :param node: The folder content; modified in place.
    :param path: Path of the folder itself.
    :param period: Period being crawled, or None (logged as "immutable").
    """
    # Iterate over a snapshot of the keys, because entries that fail casting are deleted below.
    for key in list(node.keys()):
        child_path = path + [key]
        if key.startswith("_"):
            logging.debug("Ignoring system variable %s", nesteddicts.path_to_str(child_path))
            continue

        value = node[key]
        var: Optional[Variable] = self.composite.schema.lookup(child_path)
        if var is None:
            # Report the path of the unknown child itself, not that of the enclosing folder.
            logging.warning("Unknown variable path %s in period %s of composite %s",
                            nesteddicts.path_to_str(child_path), period or "immutable",
                            self.composite.composite_id)
            self._record_exception("unknown_vars", child_path, value, period)
            continue

        # Only primitives have the "cast" method; everything else is crawled recursively.
        if not isinstance(var, Primitive):
            self._crawl(value, child_path, period)
            continue

        try:
            node[key] = var.cast(value)
        except ValueError:
            logging.warning('Could not cast value "%s" into data type "%s"', value, var.data_type)
            # Cast errors are recorded against the folder, keyed by the offending entry.
            self._record_exception("cast_errors", path, {key: value}, period)
            del node[key]
def _process_track(track: Track, temporality: str, writer: csv.DictWriter) -> None:
    """Write one row per variable of the track, ordered by absolute path.

    Each row carries the variable ID, absolute path, data type and the given temporality.
    Rows are keyed by absolute path, so a later variable with the same path replaces an earlier one.
    """
    rows_by_path: Dict[str, Dict[str, str]] = {}
    for var_id, variable in track.items():
        path_str: str = path_to_str(variable.absolute_path)
        rows_by_path[path_str] = {
            "variable_id": var_id,
            "absolute_path": path_str,
            "data_type": variable.data_type,
            "temporality": temporality,
        }

    for path_str in sorted(rows_by_path):
        writer.writerow(rows_by_path[path_str])
def _write_groups_file(self, group_obs_counts: Dict[Optional[str], int], grouping_var_id: Optional[str],
                       infix: str) -> None:
    """Write a two-column CSV listing each group and its number of observations.

    The file is named ``<file_prefix>_<infix>_groups.csv``. The first column is headed by the
    grouping variable's absolute path when it can be resolved in the schema, and by "Group"
    otherwise. Rows are sorted by the stringified group key.

    :param group_obs_counts: Observation count per group.
    :param grouping_var_id: Identifier of the grouping variable, or None.
    :param infix: Middle part of the output filename.
    """
    groups_fn: str = self.file_prefix + "_" + infix + "_groups.csv"

    # The csv module requires newline="" so that its own line terminators are written
    # untranslated; without it, rows end in "\r\r\n" on platforms whose line separator is "\r\n".
    with open(groups_fn, "w", newline="") as fh:
        group_var_path: str = "Group"
        if grouping_var_id is not None:
            # NOTE(review): lookup is handed a variable ID here - confirm the schema's lookup
            # accepts IDs and not only paths.
            group_var: Optional[Variable] = self.schema.lookup(grouping_var_id)
            if group_var is not None:
                group_var_path = nesteddicts.path_to_str(group_var.absolute_path)

        writer: csv.DictWriter = csv.DictWriter(fh, [group_var_path, "observations"])
        writer.writeheader()
        # Group keys may be None, so they are stringified before sorting.
        for group, count in sorted((str(key), value) for key, value in group_obs_counts.items()):
            writer.writerow({
                group_var_path: group,
                "observations": str(count),
            })
def _record_all_as_missing(self, f_subtree: Optional[Any], path: ListType[str]) -> None:
    """Recursively find all non-folders in the subtree, recording them as missing variables."""
    data_type: str
    if path:
        var: Optional[Variable] = self.schema.lookup(path)
        assert var is not None
        data_type = var.data_type
    else:
        # The empty path is the root, which is treated as a folder without a schema lookup.
        data_type = "Folder"

    if data_type != "Folder":
        record: MissingValue = MissingValue(self.entity_id, self.label, nesteddicts.path_to_str(path), data_type,
                                            f_subtree)
        self.outcome.missings.append(record)
        return

    # Folders are not recorded themselves; descend into each of their entries instead.
    assert f_subtree is not None
    for key, subfolder in f_subtree.items():
        self._record_all_as_missing(subfolder, path + [key])
def _init_row(self, var_path: Tuple) -> Dict:
    """Build the leading columns of a coverage row for the given variable path.

    When the path is found in the schema, the row is flagged "TRUE" and carries the variable's
    ID and data type; otherwise it is flagged "FALSE" and those two fields are empty strings.
    """
    row: Dict = {
        "variable": nesteddicts.path_to_str(var_path),
        "in_schema": "FALSE",
        "var_id": "",
        "data_type": "",
    }
    var: Optional[Variable] = self.schema.lookup(var_path)
    if var is not None:
        row.update(in_schema="TRUE", var_id=var.var_id, data_type=var.data_type)
    return row
def _inspect(self, key: str, f_tree: Optional[Any], a_tree: Dict, path: ListType[str]) -> None:
    """Inspect one child of a folder, delegating to the handler that fits its data type.

    :param key: Name of the child within the folder at ``path``.
    :param f_tree: The child's content on the fixture side.
    :param a_tree: Passed through unchanged to the selected handler.
    :param path: Path of the enclosing folder.
    :raises ValueError: If the child's path does not resolve to a variable in the schema.
    """
    child_path: ListType[str] = path + [key]
    var: Optional[Variable] = self.schema.lookup(child_path)
    if var is None:
        raise ValueError("No variable called %s" % nesteddicts.path_to_str(child_path))

    data_type: str = var.data_type

    # An explicit NA marker takes precedence over any type-specific handling.
    if f_tree == POLYTROPOS_NA:
        self._handle_explicit_na(data_type, a_tree, child_path)
        return

    if data_type == "Folder":
        assert isinstance(f_tree, dict)
        self._inspect_folder(f_tree, a_tree, child_path)
        return

    # NOTE(review): confirm "NamedList" is the current name of this data type in the schema.
    if data_type in {"List", "NamedList"}:
        self._inspect_complex(data_type, f_tree, a_tree, child_path)
        return

    self._inspect_primitive(data_type, f_tree, a_tree, child_path)
def _crawl(self, node: Any, path: List, period: Optional[str]) -> None:
    """Crawl a node by delegating to the handler for its variable's data type.

    The empty path is the root and is crawled as a folder. A path that is not in the composite's
    schema is logged and recorded under "unknown_vars".

    :param node: Content found at ``path``.
    :param path: Path of the node within the composite.
    :param period: Period being crawled, or None.
    :raises ValueError: If the variable is not a List, KeyedList or Folder.
    """
    if len(path) == 0:
        self._crawl_folder(node, path, period)
        return

    var: Optional[Variable] = self.composite.schema.lookup(path)
    if var is None:
        logging.warning("Unknown variable path %s in composite %s", nesteddicts.path_to_str(path),
                        self.composite.composite_id)
        self._record_exception("unknown_vars", path, node, period)
        return

    if var.data_type == "List":
        self._crawl_list(node, path, period)
    elif var.data_type == "KeyedList":
        self._crawl_keyed_list(node, path, period)
    elif var.data_type == "Folder":
        self._crawl_folder(node, path, period)
    else:
        # Say which variable was rejected, so the failure can be diagnosed from the message.
        raise ValueError('Cannot crawl variable %s of data type "%s"'
                         % (nesteddicts.path_to_str(path), var.data_type))
def _nested_case(self, descriptor: Dict) -> Iterator[str]:
    """Yield the column names contributed by a nested (List or KeyedList) column descriptor.

    The descriptor's first entry maps a variable ID to its specification. For a KeyedList, the
    key column name is yielded first: the specification's "key_column_name" if given, otherwise
    the variable's absolute path. Columns of any "children" are then yielded by calling this
    object on them.

    :raises ValueError: If the type is absent or unsupported, the variable ID is unknown, or the
        declared type differs from the variable's data type.
    """
    # Only the first entry of the descriptor is consulted.
    var_id, content = list(descriptor.items())[0]

    if "type" not in content:
        raise ValueError("Expected type specification for nested columns")

    declared_type: str = content["type"]
    if declared_type not in {"List", "KeyedList"}:
        raise ValueError('Unexpected type specification "%s"' % declared_type)

    variable = self.schema.get(var_id)
    if variable is None:
        raise ValueError('Unrecognized variable ID "%s"' % var_id)

    if variable.data_type != declared_type:
        raise ValueError('%s root "%s" is actually a %s' % (declared_type, var_id, variable.data_type))

    if declared_type == "KeyedList":
        if "key_column_name" not in content:
            yield nesteddicts.path_to_str(variable.absolute_path)
        else:
            yield content["key_column_name"]

    if "children" in content:
        yield from self(content["children"])
def __str__(self) -> str:
    """Return a message naming the unrecognized variable path held in ``self.path``."""
    return "Unrecognized variable %s" % path_to_str(self.path)
def _record_invalid(self, path: ListType[str], content: Optional[Any]) -> None:
    """Append an InvalidPath for ``path`` and its content to the outcome's list of invalids."""
    path_str = nesteddicts.path_to_str(path)
    record: InvalidPath = InvalidPath(self.entity_id, path_str, content)
    self.outcome.invalids.append(record)
def _record_match(self, path: ListType, data_type: str, value: Optional[Any]) -> None:
    """Append a ValueMatch for ``path`` to the outcome's list of matches."""
    record: ValueMatch = ValueMatch(self.entity_id, self.label, nesteddicts.path_to_str(path), data_type, value)
    self.outcome.matches.append(record)
def _record_missing(self, path: ListType, data_type: str, value: Optional[Any]) -> None:
    """Append a MissingValue for ``path`` to the outcome's list of missing values."""
    record: MissingValue = MissingValue(self.entity_id, self.label, nesteddicts.path_to_str(path), data_type,
                                        value)
    self.outcome.missings.append(record)
def _var_path(self, var_id: str) -> str:
    """Return the absolute path, rendered as a string, of the variable with the given ID.

    :raises ValueError: If the schema has no variable with that ID.
    """
    var = self.schema.get(var_id)
    if var is not None:
        return nesteddicts.path_to_str(var.absolute_path)
    raise ValueError('Unrecognized variable id "%s"' % var_id)
def _record_mismatch(self, path: ListType, data_type: str, expected: Optional[Any],
                     actual: Optional[Any]) -> None:
    """Append a ValueMismatch for ``path`` to the outcome's list of mismatches."""
    record: ValueMismatch = ValueMismatch(self.entity_id, self.label, nesteddicts.path_to_str(path), data_type,
                                          expected, actual)
    self.outcome.mismatches.append(record)