def load_from_args(self, args: Optional[Namespace]) -> None:
    for arg_name, full_conf_name in self._args.items():
        arg_val = getattr(args, arg_name, None)
        if arg_val is not None:
            conf_val = self._get_value_in_correct_type(full_conf_name, arg_val)
            assign(self, full_conf_name, conf_val)
def dict_update(dictionary, path, unic_key, value, value_format_callback=None):
    path_length = len(path) - 1
    for index, path_level_item in enumerate(path):
        is_last_iteration = path_length == index
        str_unic_key = unic_key
        step_path = ".".join(path[:index + 1])
        current_step_depth_level_value = {}
        try:
            current_step_depth_level_value = glom.glom(dictionary, step_path)
            if is_last_iteration:
                if value_format_callback is not None:
                    current_step_depth_level_value[str_unic_key] = value_format_callback(
                        current_step_depth_level_value[str_unic_key])
                step_path_level_value = current_step_depth_level_value
                dictionary = glom.assign(dictionary, step_path, step_path_level_value)
        except:
            step_path_level_value = {}
            if is_last_iteration:
                current_step_depth_level_value[str_unic_key] = value
                step_path_level_value = current_step_depth_level_value
            dictionary = glom.assign(dictionary, step_path, step_path_level_value)
def test_apply_labels_to_deployment_config(
    input_config,
    is_ci,
    exp_deployed_by,
    run_pipeline_gke,
    monkeypatch,
    deployment_config,
):
    monkeypatch.setitem(job_gke.os.environ, "USER", "stub-user")
    monkeypatch.setitem(job_gke.os.environ, "CI", is_ci)
    monkeypatch.setattr(job_gke, "klio_cli_version", "stub-version")
    # TODO: patch user config for user labels
    user_labels = [
        "label_a=value_a",
        "label-b=value-b",
        "label-c=",
        "labeld",  # invalid, expected to be ignored
    ]
    monkeypatch.setattr(
        run_pipeline_gke.klio_config.pipeline_options, "labels", user_labels
    )
    labels = {
        "app": "test-job",
        "role": "testjob",
        "klio/deployed_by": exp_deployed_by,
        "klio/klio_cli_version": "stub-version",
        "label_a": "value_a",
        "label-b": "value-b",
        "label-c": "",
    }
    expected_config = deployment_config.copy()
    glom.assign(expected_config, "spec.template.metadata.labels", labels)

    run_pipeline_gke._apply_labels_to_deployment_config(deployment_config)

    assert expected_config == deployment_config
def parse(self, stream, media_type=None, parser_context=None):
    parsed = MultiPartParser.parse(self, stream, media_type, parser_context)
    if len(parsed.data) > 0:
        if len(parsed.files) > 0:
            raise ParseError('Either pass data or files')
        return parsed.data

    data = {}

    # Find any JSON content first
    for name, content in parsed.files.items():
        if content.content_type != 'application/json':
            continue
        data.update(**json.load(content.file))

    # Now get any other content
    for name, content in parsed.files.items():
        if content.content_type == 'application/json':
            continue
        # name is the path into the object to assign
        glom.assign(data, name, content)

    return data
def test_assign_missing_unassignable():
    """Check that the final assignment to the target object comes last,
    ensuring that failed assignments don't leave targets in a bad state.
    """
    class Tarjay(object):
        init_count = 0

        def __init__(self):
            self.__class__.init_count += 1

        @property
        def unassignable(self):
            return

    value = object()
    target = {"preexisting": "ok"}

    with pytest.raises(PathAssignError):
        assign(target, 'tarjay.unassignable.a.b.c', value, missing=Tarjay)

    assert target == {'preexisting': 'ok'}

    # why 3? "c" gets the value of "value", while "b", "a", and
    # "tarjay" all succeed and are set to Tarjay instances. Then
    # unassignable is already present, but not possible to assign to,
    # raising the PathAssignError.
    assert Tarjay.init_count == 3
def _apply_image_to_deployment_config(self, deployment_config):
    image_tag = self.docker_runtime_config.image_tag
    pipeline_options = self.klio_config.pipeline_options

    if image_tag:
        image_path = "spec.template.spec.containers.0.image"
        # TODO: If more than one image deployed,
        # we need to search for correct container
        image_base = glom.glom(deployment_config, image_path)
        # Strip off existing image tag if any
        image_base = re.split(":", image_base)[0]
        full_image = f"{image_base}:{image_tag}"
        glom.assign(deployment_config, image_path, full_image)

        # Check to see if the kubernetes image to be deployed is the same
        # image that is built
        k8s_image = glom.glom(deployment_config, image_path)
        built_image_base = pipeline_options.worker_harness_container_image
        built_image = f"{built_image_base}:{image_tag}"
        if built_image != k8s_image:
            logging.warning(
                f"Image deployed by kubernetes {k8s_image} does not match "
                f"the built image {built_image}. "
                "This may result in an `ImagePullBackoff` for the deployment. "
                "If this is not intended, please change "
                "`pipeline_options.worker_harness_container_image` "
                "and rebuild, or change the container image "
                "set in the kubernetes/deployment.yaml file."
            )
def action(self, raw_crash, raw_dumps, processed_crash, processor_meta):
    params = {
        "command_pathname": self.command_pathname,
        "kill_timeout": self.kill_timeout,
        "dump_file_pathname": raw_dumps[self.dump_field],
    }
    command_line = self.command_line.format(**params)

    output, return_code = execute_external_process(
        command_pathname=self.command_pathname,
        command_line=command_line,
        processor_meta=processor_meta,
        interpret_output=self._interpret_output,
    )

    glom.assign(
        processed_crash, "classifications.jit.category", val=output, missing=dict
    )
    glom.assign(
        processed_crash,
        "classifications.jit.category_return_code",
        val=return_code,
        missing=dict,
    )
def test_invalid_assign_op_target():
    target = {'afunc': lambda x: 'hi %s' % x}
    spec = T['afunc'](x=1)

    with pytest.raises(ValueError):
        assign(target, spec, None)
    return
def make_nested(path_to_value: t.Dict[t.Tuple, t.Any]) -> dict:
    d = {}
    for path, value in sorted(
        path_to_value.items(), key=lambda path_value: len(path_value[0])
    ):
        func = dict
        glom.assign(d, ".".join(path), value, missing=func)
    return d
def load_from_env(self) -> None:
    for env_name, full_conf_name in self._envs.items():
        env_val = os.getenv(env_name, None)
        if env_val is not None:
            conf_val = self._get_value_in_correct_type(full_conf_name, env_val)
            assign(self, full_conf_name, conf_val)
def __camouflage_nested_dict(args_and_values: dict, keypaths: List[str]):
    # example keypath: "path.to.priv_key.subkey"
    for keypath in keypaths:
        # "priv_key.subkey"
        if (pos_start := keypath.find('priv_')) != -1:
            # "priv_key"
            pos_end = keypath[pos_start:].find('.')
            if pos_end == -1:
                # if keypath is only "priv_key", then end_pos is end
                # of string
                pos_end = len(keypath)
            # "path.to.priv_key"
            priv_keypath = keypath[:pos_start + pos_end]
            # camouflage sensitive value of argument
            try:
                glom.assign(
                    args_and_values, priv_keypath, CAMOUFLAGE_SIGN
                )
            except Exception as exc:
                logger.critical(
                    'Failed to camouflage sensitive argument '
                    'for path "%s". '
                    'Exception: "%s"',
                    keypath, exc
                )
                # Keep sensitive value in log instead of aborting
                # logging.
                continue
def test_assign_missing_object():
    val = object()

    class Container(object):
        pass

    target = Container()
    target.a = extant_a = Container()
    assign(target, 'a.b.c.d', val, missing=Container)

    assert target.a.b.c.d is val
    assert target.a is extant_a  # make sure we didn't overwrite anything on the path
def parse_multipart_resources_spec(forms, files):
    json_spec = {}

    # Loads options.
    if "compiler" in forms:
        json_spec["compiler"] = forms["compiler"]

    # "options." entries.
    for option_key in (param_key for param_key in forms if "options." in param_key):
        glom.assign(json_spec, option_key, forms[option_key], missing=dict)

    # Get resources specification.
    if "resources" in forms:
        try:
            json_spec["resources"] = json.loads(forms["resources"])
        except json.decoder.JSONDecodeError as jde:
            return (
                None,
                {
                    "error": "INVALID_RESOURCES_JSON",
                    "exception_content": str(jde),
                },
            )
    else:
        # TODO Else reconstruct resources spec with best guess:
        # one main tex file, with other non tex resources.
        json_spec["resources"], error = construct_resources_specification_from_files(
            files)
        if error:
            return None, error
        logger.info("Reconstructed resource spec: %s",
                    pprint.pformat(json_spec["resources"]))

    # Replace files in resource spec by uploaded multipart files.
    for resource in json_spec["resources"]:
        if "multipart" not in resource:
            continue
        # Does an uploaded file match?
        if resource["multipart"] not in files:
            return (
                None,
                {
                    "error": "MISSING_MULTIPART_FILE",
                    "filename": resource["multipart"],
                },
            )
        multipart_file = files[resource["multipart"]]
        # We use base64 for encoding file content.
        resource["file"] = base64.b64encode(multipart_file.read())
        if "path" not in resource:
            resource["path"] = multipart_file.filename

    return json_spec, None
def test_assign_missing_dict():
    target = {}
    val = object()

    def debugdict():
        ret = dict()
        # ret['id'] = id(ret)
        # ret['inc'] = counter.next()
        return ret

    assign(target, 'a.b.c.d', val, missing=debugdict)

    assert target == {'a': {'b': {'c': {'d': val}}}}
def update_table_with_new_entry(
        main_data: dict, new_entry: dict, config: dict, fidelity: dict
) -> dict:
    """Updates the benchmark dict-hierarchy with a new function evaluation entry.

    The storage is a nested dict structure whose keys are arranged in the order of the
    configuration parameters (ordered by name), the fidelity parameters (ordered by name),
    and finally the seed. The value at the deepest level is the dict returned by the actual
    function evaluation, containing the result, cost and other misc. information.

    Since the depth of this nested dict varies across parameter spaces, the package `glom`
    is used: given a sequence of keys, it allows retrieval and assignment of values at an
    arbitrary depth of a hierarchical dict.
    """
    seed = new_entry['info']['seed']
    key_nest = []
    for k, v in config.items():
        v = str(v) if isinstance(v, str) else np.float32(v)
        key_nest.append(v)
        if glom.glom(main_data, glom.Path(*key_nest), default=None) is None:
            glom.assign(main_data, glom.Path(*key_nest), dict())
    for k, v in fidelity.items():
        key_nest.append(np.float32(v))
        if glom.glom(main_data, glom.Path(*key_nest), default=None) is None:
            glom.assign(main_data, glom.Path(*key_nest), dict())
    key_nest.append(seed)
    if glom.glom(main_data, glom.Path(*key_nest), default=None) is None:
        glom.assign(main_data, glom.Path(*key_nest), dict())
    glom.assign(main_data, glom.Path(*key_nest), new_entry)
    return main_data
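# Usage sketch (hypothetical, not from the source): exercises update_table_with_new_entry
# above to show the nested layout the docstring describes -- config values, then fidelity
# values, then the seed, then the evaluation entry itself.
def _example_update_table_with_new_entry():
    main_data = {}
    entry = {"info": {"seed": 1}, "result": 0.42, "cost": 3.0}
    update_table_with_new_entry(
        main_data, entry, config={"x": 0.5}, fidelity={"budget": 10}
    )
    # main_data is now roughly:
    #   {np.float32(0.5): {np.float32(10.0): {1: entry}}}
    return main_data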
def test_assign():
    class Foo(object):
        pass

    assert glom({}, Assign(T['a'], 1)) == {'a': 1}
    assert glom({'a': {}}, Assign(T['a']['a'], 1)) == {'a': {'a': 1}}
    assert glom({'a': {}}, Assign('a.a', 1)) == {'a': {'a': 1}}
    assert glom(Foo(), Assign(T.a, 1)).a == 1
    assert glom({}, Assign('a', 1)) == {'a': 1}
    assert glom(Foo(), Assign('a', 1)).a == 1
    assert glom({'a': Foo()}, Assign('a.a', 1))['a'].a == 1

    def r():
        r = {}
        r['r'] = r
        return r

    assert glom(r(), Assign('r.r.r.r.r.r.r.r.r', 1)) == {'r': 1}
    assert glom(r(), Assign(T['r']['r']['r']['r'], 1)) == {'r': 1}
    assert glom(r(), Assign(Path('r', 'r', T['r']), 1)) == {'r': 1}
    assert assign(r(), Path('r', 'r', T['r']), 1) == {'r': 1}

    with pytest.raises(TypeError, match='path argument must be'):
        Assign(1, 'a')
    with pytest.raises(ValueError, match='path must have at least one element'):
        Assign(T, 1)

    assert repr(Assign(T.a, 1)) == 'Assign(T.a, 1)'

    assign_spec = Assign(T.a, 1, missing=dict)
    assert repr(assign_spec) == "Assign(T.a, 1, missing=dict)"
    assert repr(assign_spec) == repr(eval(repr(assign_spec)))
def set(meta, spec=None, value=None, missing=dict) -> dict:
    """
    Set metadata in a dataframe's columns
    :param meta: Metadata to be modified
    :param spec: path to the key to be modified
    :param value: dict value
    :param missing: factory used to create any missing intermediate elements along the path
    :return:
    """
    if spec is not None:
        data = copy.deepcopy(meta)
        assign(data, spec, value, missing=missing)
    else:
        data = value
    return data
def test_sequence_assign():
    target = {'alist': [0, 1, 2]}
    assign(target, 'alist.2', 3)
    assert target['alist'][2] == 3

    with pytest.raises(PathAssignError, match='could not assign') as exc_info:
        assign(target, 'alist.3', 4)

    # the following test is because pypy's IndexError is different than CPython's:
    # E - PathAssignError(IndexError('list index out of range',), Path('alist'), '3')
    # E + PathAssignError(IndexError('list assignment index out of range',), Path('alist'), '3')
    # E ?                            +++++++++++
    exc_repr = repr(exc_info.value)
    assert exc_repr.startswith('PathAssignError(')
    assert exc_repr.endswith("'3')")
    return
def test_bad_assign_target():
    class BadTarget(object):
        def __setattr__(self, name, val):
            raise Exception("and you trusted me?")

    # sanity check
    spec = Assign('a', 'b')
    ok_target = lambda: None

    glom(ok_target, spec)
    assert ok_target.a == 'b'

    with pytest.raises(PathAssignError, match='could not assign'):
        glom(BadTarget(), spec)

    with pytest.raises(PathAccessError, match='could not access'):
        assign({}, 'a.b.c', 'moot')
    return
def parse_json_resources_spec(json_payload):
    json_spec = {}

    # Select / copy several keys.
    for entry_key in (
        entry_key for entry_key in json_payload if entry_key in PAYLOAD_KEYS_TO_COPY
    ):
        json_spec[entry_key] = json_payload[entry_key]

    # Auto-spread "options." entries.
    for option_key in (
        param_key for param_key in json_payload if "options." in param_key
    ):
        glom.assign(json_spec, option_key, json_payload[option_key], missing=dict)

    return json_spec, None
def override_values(crash_data, values):
    """
    Takes a dict of path -> value to override in the original crash data. After
    the context is over, the crash data will return to the original value.

    :arg crash_data: the crash data that conforms to the schema
    :arg values: dict of path -> value to override

    :yields: dict with overridden values
    """
    crash_data = copy.deepcopy(crash_data)
    for path, value in values.items():
        assign(crash_data, path, val=value, missing=dict)
    yield crash_data
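# Usage sketch (hypothetical, not from the source): as written above, override_values is a
# plain generator that yields one deep copy with the given paths reassigned (the docstring
# suggests it is normally wrapped as a context manager). assign(..., missing=dict) creates
# any missing intermediate dicts, and the original crash_data stays untouched.
def _example_override_values():
    crash_data = {"os": {"name": "Linux"}}
    overridden = next(
        override_values(crash_data, {"os.version": "5.10", "cpu.arch": "x86_64"})
    )
    # overridden == {"os": {"name": "Linux", "version": "5.10"}, "cpu": {"arch": "x86_64"}}
    # crash_data  == {"os": {"name": "Linux"}}
    return overridden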
def test_apply_labels_to_deployment_config_overrides(
    run_pipeline_gke, monkeypatch, deployment_config
):
    monkeypatch.setitem(job_gke.os.environ, "USER", "stub-user")
    monkeypatch.setattr(job_gke, "klio_cli_version", "stub-version")
    labels = {
        "app": "different-app-name",
        "role": "differentappname",
        "klio/deployed_by": "stub-user",
        "klio/klio_cli_version": "stub-version",
    }
    expected_config = deployment_config.copy()
    glom.assign(expected_config, "spec.template.metadata.labels", labels)

    run_pipeline_gke._apply_labels_to_deployment_config(deployment_config)

    assert expected_config == deployment_config
def _set_config(self, target, value):
    try:
        glom.assign(self.config_data, target, value, missing=dict)
    except glom.mutation.PathAssignError as e:
        if "IndexError" not in str(e):
            raise e

        # handle if user is trying to append to a list - for some reason
        # glom can't handle that
        stems = target.split(".")
        last_index = 0
        for index, stem in enumerate(stems):
            try:
                int(stem)
            except Exception:
                continue
            new_target = ".".join(stems[last_index:index])
            property_list = glom.glom(self.config_data, new_target)
            property_list.insert(index, {})
        glom.assign(self.config_data, target, value, missing=dict)
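# Illustration (hypothetical, not from the source) of the limitation the comment in
# _set_config works around: glom.assign can replace an existing list element by index,
# but assigning to an index that does not exist yet fails with a PathAssignError
# wrapping an IndexError, even with missing=dict.
def _example_glom_list_index_limitation():
    data = {"jobs": [{"name": "a"}]}
    glom.assign(data, "jobs.0", {"name": "a2"})  # index 0 exists, so this works
    try:
        glom.assign(data, "jobs.1", {"name": "b"})
    except glom.mutation.PathAssignError:
        # index 1 is out of range; _set_config handles this case by inserting an
        # empty dict at the offending index and retrying the assignment
        pass
    return data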
def _apply_overrides(raw_config, overrides):
    """Applies overrides to raw klio config.

    If a key already exists in the raw config, it will be updated with the
    override value provided in the overrides dict. If a key does not yet exist
    in the raw config, it will be created and assigned the override value.

    Example formats include:

    RAW = {
        "allow_non_klio_messages": False,
        "events": {
            "inputs": {
                "file0": {
                    "type": "file",
                    "location": "gs://sigint-output/yesterday.txt",
                },
                "file1": {
                    "type": "file",
                    "location": "gs://sigint-output/today.txt",
                }
            }
        }
    }

    OVER = {
        "allow_non_klio_messages": True,  # Non-nested key
        "events.inputs.file1.location": "gs://sigint-output/01-01-2020.txt",
        "events.inputs.file2.location": "gs://sigint-output/01-02-2020.txt",
        "events.inputs.file2.type": "file"
    }

    Args:
        raw_config (dict): raw klio config dict
        overrides (dict): override field to override value
    Return:
        config (dict): config dict with overrides applied
    """  # NOQA E501
    for path, value in overrides.items():
        glom.assign(raw_config, path, value, missing=dict)
    return raw_config
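# Usage sketch (hypothetical, not from the source): mirrors the RAW/OVER example in the
# docstring above. Dotted override paths update existing keys and, thanks to missing=dict,
# create nested keys such as "events.inputs.file2" that the raw config does not have yet.
def _example_apply_overrides():
    raw = {"allow_non_klio_messages": False, "events": {"inputs": {}}}
    overrides = {
        "allow_non_klio_messages": True,
        "events.inputs.file2.type": "file",
    }
    config = _apply_overrides(raw, overrides)
    # config["events"]["inputs"]["file2"] == {"type": "file"}
    return config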
def test_assign_missing_with_extant_keys():
    """This test ensures that assign with missing doesn't overwrite
    perfectly fine extant keys that are along the path it needs to assign
    to. call count is also checked to make sure missing() isn't invoked
    too many times.
    """
    target = {}
    value = object()
    default_struct = {'b': {'c': {}}}

    call_count = [0]

    def _get_default_struct():
        call_count[0] += 1  # make sure this is only called once
        return default_struct

    assign(target, 'a.b.c', value, missing=_get_default_struct)

    assert target['a']['b']['c'] is value
    assert target['a']['b'] is default_struct['b']
    assert call_count == [1]
def _edit_deployment(self, deployment_config, replica_count=None, image_tag=None):
    """This will update a deployment with a provided replica count or image tag.
    This mutates the deployment_config object

    Args:
        deployment_config (dict): deployment configuration dict that will get
            mutated with updated fields
        replica_count (int): Number of replicas the deployment will be updated
            with. If not provided then this will not be changed
        image_tag (str): The image tag that will be applied to the updated
            deployment. If not provided then this will not be updated.
    """
    log_messages = []
    if replica_count is not None:
        glom.assign(deployment_config, "spec.replicas", replica_count)
        log_messages.append(f"Scaled deployment to {replica_count}")
    if image_tag:
        image_path = "spec.template.spec.containers.0.image"
        image_base = glom.glom(deployment_config, image_path)
        # Strip off existing image tag if present
        image_base = re.split(":", image_base)[0]
        full_image = image_base + f":{image_tag}"
        glom.assign(deployment_config, image_path, full_image)
        log_messages.append(
            f"Update deployment with image tag {image_tag}")

    for message in log_messages:
        logging.info(message)

    ui_link = self._build_ui_link_from_current_context(deployment_config)
    logging.info(f"Deployment details: {ui_link}")
def _count_data_types(self, df, columns, infer=False, mismatch=None):
    """
    Count the number of int, float, string, date and boolean values and output the count in json format
    :param df: Dataframe to be processed
    :param columns: Columns to be processed
    :param infer: infer the column datatype
    :return: json
    """
    columns = parse_columns(df, columns)
    count_by_data_type = df.cols.count_by_dtypes(columns, infer=infer, mismatch=mismatch)
    count_by_data_type_no_mismatch = copy.deepcopy(count_by_data_type)

    # Info from all the columns
    type_details = {}
    for col_name in columns:
        # Determine whether the column's values are mostly float, int or string.

        # Do not count mismatches
        if "mismatch" in count_by_data_type_no_mismatch[col_name]:
            count_by_data_type_no_mismatch[col_name].pop("mismatch")

        # Get the greatest count by column data type
        greatest_data_type_count = max(
            count_by_data_type_no_mismatch[col_name],
            key=count_by_data_type_no_mismatch[col_name].get)

        if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
            cat = "categorical"
        elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
            cat = "numeric"
        elif greatest_data_type_count == "date":
            cat = "date"
        elif greatest_data_type_count == "array":
            cat = "array"
        elif greatest_data_type_count == "binary":
            cat = "binary"
        elif greatest_data_type_count == "null":
            cat = "null"
        else:
            cat = None

        assign(type_details, col_name + ".dtype", greatest_data_type_count, dict)
        assign(type_details, col_name + ".type", cat, dict)
        assign(type_details, col_name + ".stats", count_by_data_type[col_name], dict)

    # print(type_details)
    return type_details
def set_meta(self, spec=None, value=None, missing=dict):
    """
    Set metadata in a dataframe's columns
    :param self:
    :param spec: path to the key to be modified
    :param value: dict value
    :param missing: factory used to create any missing intermediate elements along the path
    :return:
    """
    if spec is not None:
        target = self.get_meta()
        data = assign(target, spec, value, missing=missing)
    else:
        data = value

    df = self
    df.schema[-1].metadata = data
    return df
def unglom(d: T_StrAnyMapping, path: str, value: typing.Any) -> T_StrAnyMapping:
    """Create nested dictionary structure given a glom compatible path.

    This is essentially just a wrapper around :func:`glom.assign`, but works
    with nested paths.

    >>> unglom({}, 'foo.bar.baz', 'spam')
    {'foo': {'bar': {'baz': 'spam'}}}

    Args:
        d (:obj:`dict`): The target dictionary.
        path (str): The key path.
        value: Any value.

    Returns:
        :obj:`dict`: The original, now mutated dictionary.
    """
    try:
        return glom.assign(d, path, value)
    except KeyError:
        parent, child = path.rsplit(".", 1)
        return unglom(d, parent, {child: value})
def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=RELATIVE_ERROR,
                  approx_count=True, mismatch=None):
    """
    Return statistical information about a specific column in json format
    :param df: Dataframe to be processed
    :param columns: Columns that you want to profile
    :param buckets: Create buckets divided by range. Each bin is equal.
    :param infer: try to infer the column datatype
    :param relative_error: relative error when the percentile is calculated.
        0 is more exact but slower; 1 allows more error but is faster
    :param approx_count: Use the function approx_count_distinct or countDistinct.
        approx_count_distinct is faster
    :param mismatch:
    :return: json object
    """
    if self.rows_count is None:
        self.rows_count = df.count()

    columns = parse_columns(df, columns)

    # Initialize Objects
    logger.print("Processing Stats For columns...")

    # Get columns data types. This is necessary to make the pertinent histogram calculations.
    type_details = self._count_data_types(df, columns, infer, mismatch)

    # Count the categorical, numerical, boolean and date columns
    count_types = {}
    for value in type_details.values():
        name = value["dtype"]
        if name in count_types:
            count_types[name] += 1
        else:
            count_types[name] = 1

    # List the data types this data set has
    total = 0
    dtypes = []
    for key, value in count_types.items():
        if value > 0:
            dtypes.append(key)
            total = total + 1

    count_types = fill_missing_col_types(count_types)

    columns_info = {}
    columns_info["count_types"] = count_types
    columns_info["total_count_dtypes"] = total
    columns_info["dtypes_list"] = dtypes
    columns_info["columns"] = type_details

    # Aggregation
    stats = Profiler.columns_agg(df, columns, buckets, relative_error, approx_count)

    # Calculate Frequency
    logger.print("Processing Frequency ...")
    df_freq = df.cols.select("*", data_type=PYSPARK_NUMERIC_TYPES, invert=True)
    freq = None
    if df_freq is not None:
        freq = df_freq.cols.frequency("*", buckets, True, self.rows_count)

    # Calculate percentage
    for col_name in columns:
        col_info = {}
        assign(col_info, "stats", stats[col_name], dict)

        if freq is not None:
            if col_name in freq:
                assign(col_info, "frequency", freq[col_name])

        col_info["stats"].update(self.extra_columns_stats(df, col_name, stats))

        assign(col_info, "name", col_name)
        assign(col_info, "column_dtype", columns_info["columns"][col_name]['dtype'])
        assign(col_info, "dtypes_stats", columns_info["columns"][col_name]['stats'])
        assign(col_info, "column_type", columns_info["columns"][col_name]['type'])
        assign(columns_info, "columns." + col_name, col_info, dict)

        assign(col_info, "id", df.cols.get_meta(col_name, "id"))

    return columns_info