def test_flatten_nested_struct(self):
    """Check flattening of shallow, already flat and deeply nested inputs."""
    # each case is (nested input, expected flat dict with '//' joined keys)
    cases = [
        (
            {'a': {'b': 1, 'c': 2}, 'e': 3},
            {'a//b': 1, 'a//c': 2, 'e': 3},
        ),
        (
            {'a': 1, 'e': 3},
            {'a': 1, 'e': 3},
        ),
        (
            {
                'a': {'b': 1, 'c': 2, 'd': {'d1': 2, 'd2': 4}},
                # list items are expected to be keyed by their index
                'e': [1, 2, {'e1': 10}],
            },
            {
                'a//b': 1,
                'a//c': 2,
                'a//d//d1': 2,
                'a//d//d2': 4,
                'e//0': 1,
                'e//1': 2,
                'e//2//e1': 10,
            },
        ),
    ]
    for nested, flat_must in cases:
        self.assertDictEqual(
            nest_utils.flatten_nested_struct(nested), flat_must)
def build_inference_graph(self, features: Dict[str, tf.Tensor]) -> dict:
    """
    Build graph for inference

    Sets the mode to PREDICT, validates the genes, resets the tf graph,
    calls the model on the features and stores the connected inputs and
    the predictions inside of the corresponding graph collections.

    Parameters
    ----------
    features
        dict with mappings feature name to feature tensor

    Returns
    -------
    predictions_flatten
        flatten dict holding predictions

    Raises
    ------
    ValueError
        if no predictions were built; NOTE(review): nothing is raised
        directly in this method, presumably it comes from the validation
        or from the model call - confirm
    """
    logging.getLogger(__name__).info('Build inference graph')

    # order matters: mode must be set before validation and graph reset
    self.mode = tf.estimator.ModeKeys.PREDICT
    self._validate_genes_for_inference()
    self.reset_tf_graph()

    predictions = self(features).predictions
    inputs_connected = tf_utils.get_connected_inputs_to_predictions(
        features, predictions, tf.get_default_graph())

    tf_collections_utils.nested2collection(
        CollectionNames.INPUTS, inputs_connected)
    tf_collections_utils.nested2collection(
        CollectionNames.PREDICTIONS, predictions)

    return nest_utils.flatten_nested_struct(predictions)
def predict(self, **inputs) -> Dict[str, tf.Tensor]:
    """
    Concatenate lists of tensors key-wise

    Inputs are flattened (lists are kept as values), every value must be
    a non empty list or tuple of tensors and is concatenated along the
    axis taken from ``self.axis`` for that flat key or, if it does not
    exist, from its "default" entry.

    Parameters
    ----------
    **inputs
        possibly nested inputs with lists / tuples of tensors as values

    Returns
    -------
    result
        nested structure with same keys as inputs and concatenated tensors
        as values

    Raises
    ------
    ValueError
        if some input is not a list or tuple
    ValueError
        if neither key specific nor default axis was provided
    ValueError
        if the input list is empty
    ValueError
        if the axis is not smaller than the rank of the first input
    """
    result_flat = {}
    default_axis = self.axis.get("default")
    inputs_flat = nest_utils.flatten_nested_struct(inputs, flatten_lists=False)
    for each_key, each_input_list in inputs_flat.items():
        if not isinstance(each_input_list, (list, tuple)):
            msg = ("{}: all inputs to concat must be lists or tuples! "
                   "(input for key {} is of type {})").format(
                       self.name, each_key, type(each_input_list))
            raise ValueError(msg)

        axis_for_key = self.axis.get(each_key, default_axis)
        if axis_for_key is None:
            msg = ("{}: axis for key {} was not provided and default key "
                   "does not exist!").format(self.name, each_key)
            raise ValueError(msg)

        # the rank check below looks at the first item, so an empty list
        # must be rejected explicitly instead of failing with IndexError
        if not each_input_list:
            msg = ("{}: input for key {} is empty, nothing to concat!"
                   ).format(self.name, each_key)
            raise ValueError(msg)

        first_input_shape = each_input_list[0].shape
        if axis_for_key >= len(first_input_shape):
            msg = ("{}: axis {} for input key {} is not valid for "
                   "inputs with shape {}").format(
                       self.name, axis_for_key, each_key, first_input_shape)
            raise ValueError(msg)

        result_flat[each_key] = tf.concat(each_input_list, axis=axis_for_key)

    return nest_utils.unflatten_dict_to_nested(result_flat)
def predict_batch(self, inputs: Union[dict, list]) -> Tuple[dict, float]:
    """
    Make predictions given inputs

    Parameters
    ----------
    inputs
        inputs to the network; single inputs or list of them, where each
        item is predicted separately and the results are collapsed

    Returns
    -------
    predictions
        predictions of the network
    predict_exec_time
        execution time of network prediction
    """
    started_at = time.time()
    inputs_as_list = inputs if isinstance(inputs, list) else [inputs]

    flat_predictions = []
    for single_input in inputs_as_list:
        if self.model_incoming_keys_mapping is not None:
            single_input = nucleotide_utils.remap_and_collapse_inputs(
                [single_input], [self.model_incoming_keys_mapping])
        single_input_flat = nest_utils.flatten_nested_struct(single_input)
        flat_predictions.append(
            predictors.predict_using_predictor(
                predictor=self._predictor,
                inputs=single_input_flat,
                model_parameters=self.model_parameters))

    collapsed_flat = nucleotide_utils.collapse_inputs(flat_predictions)
    predictions = nest_utils.unflatten_dict_to_nested(collapsed_flat)
    return predictions, time.time() - started_at
def combine_fn(*list_of_features) -> tf.data.Dataset:
    """
    Method to combine the features

    First occurrence of a flat feature name wins; later occurrences are
    only checked to have the same shape and type as the first one.

    Parameters
    ----------
    list_of_features
        list of features to combine

    Returns
    -------
    data_with_combined_features
        data with combined features
    """
    combined_flat = {}
    for features in list_of_features:
        features_flat = nest_utils.flatten_nested_struct(features)
        for name, feature in features_flat.items():
            if name not in combined_flat:
                combined_flat[name] = feature
                continue
            _assert_tensors_have_same_shape(combined_flat[name], feature)
            _assert_tensors_have_same_type(combined_flat[name], feature)

    combined = nest_utils.unflatten_dict_to_nested(combined_flat)
    return tf.data.Dataset.from_tensors(combined)
def preprocess_dataset_inputs(
        inputs: Dict[str, tf.Tensor]) -> Dict[str, Dict[str, tf.Tensor]]:
    """
    Wrap dataset inputs with identity ops and nest them under dataset key

    Parameters
    ----------
    inputs
        inputs from datasets

    Returns
    -------
    inputs_with_identity
        same as inputs, but with added identity ops and stored under the
        "dataset" key
    """
    flat = nest_utils.flatten_nested_struct(inputs)
    identity_flat = {}
    # identity ops are created in sorted key order
    for name in sorted(flat):
        identity_flat[name] = tf.identity(flat[name])
    return {"dataset": nest_utils.unflatten_dict_to_nested(identity_flat)}
def _parse_single_example(example, features):
    """Join every flat ('/' separated) example value with its feature."""
    result = {}
    flat_items = nest_utils.flatten_nested_struct(example, "/").items()
    for key, value in flat_items:
        # features must have an entry for every flat example key
        result[key] = "-".join((str(value), features[key]))
    return result
def predict_using_predictor(predictor: Predictor, *,
                            inputs: dict,
                            model_parameters: Optional[dict] = None) -> dict:
    """
    Make predictions using predictor

    Parameters
    ----------
    predictor
        predictor to use
    inputs
        inputs for predictor; only keys that exist inside of
        predictor.feed_tensors are passed to it
    model_parameters
        model parameters to feed in nested view; they are flattened
        (lists are kept as values) and override inputs with same key

    Returns
    -------
    predictions
        predictions
    """
    parameters_flat = nest_utils.flatten_nested_struct(
        model_parameters or {}, flatten_lists=False)

    feed = {}
    for name, value in inputs.items():
        if name in predictor.feed_tensors:
            feed[name] = value
    feed.update(parameters_flat)

    return predictor(feed)
def maybe_cast_dtype(
        inputs: Dict[str, tf.Tensor],
        cast_dtypes: Dict[tf.DType, tf.DType] = None) -> Dict[str, tf.Tensor]:
    """
    Cast tensors from nested inputs according to the cast_dtypes mapping

    Values that are not tensors and tensors with dtype not inside of
    cast_dtypes keys are left untouched.

    Parameters
    ----------
    inputs
        possibly nested dict, with values as tensors
    cast_dtypes
        dict with mapping of which dtype should be casted to which, e.g.
        {float32: float16} means that all of float32 tensors will be casted
        to float16; if None, inputs are returned as is

    Returns
    -------
    inputs_casted : dict
        same structure as inputs, but with inputs casted according to
        cast_dtypes
    """
    if cast_dtypes is None:
        return inputs

    flat = nest_utils.flatten_nested_struct(inputs)
    for name, value in flat.items():
        # only existing keys are reassigned, so iteration stays valid
        if isinstance(value, tf.Tensor) and value.dtype in cast_dtypes:
            flat[name] = tf.cast(value, cast_dtypes[value.dtype])
    return nest_utils.unflatten_dict_to_nested(flat)
def _add_metrics_to_summaries(self, model_results: ModelResults):
    """Add every flattened metric, if any, to summaries by its name."""
    if model_results.metrics is None:
        return
    metrics_flat = nest_utils.flatten_nested_struct(model_results.metrics)
    for name, value in metrics_flat.items():
        model_utils.add_summary_by_name(name, value, self.max_outputs_tb)
def test_forward_pass(self, use_mixed_precision):
    """Check prediction keys and dtypes of the model forward pass."""
    inputs = self._get_inputs_for_model()
    model = self._get_model(inputs=inputs)
    if use_mixed_precision:
        model.mixed_precision_config = MixedPrecisionConfig(True, 100)
    model.mode = 'train'

    predictions = model.forward_pass(
        inputs_from_dataset={'dataset': inputs})
    predictions_flat = nest_utils.flatten_nested_struct(predictions)

    # every plugin must provide exactly one 'predictions' key
    keys_must = {
        '//'.join((plugin.name, 'predictions'))
        for plugin in model.plugins.values()
    }
    self.assertSetEqual(keys_must, set(predictions_flat.keys()))

    # test predictions dtype
    # since all plugins in model return float predictions, they should be
    # float16 with mixed precision and float32 otherwise; plugins that do
    # not allow mixed precision must always stay float32
    mixed_dtype_must = tf.float16 if use_mixed_precision else tf.float32
    for key, prediction in predictions_flat.items():
        plugin = model.plugins[key.split("//")[0]]
        if plugin.allow_mixed_precision:
            self.assertEqual(prediction.dtype, mixed_dtype_must)
        else:
            self.assertEqual(prediction.dtype, tf.float32)
def remap_single_input(inputs: dict,
                       mapping: Optional[dict] = None) -> dict:
    """
    Remap single input keys according to mapping

    Parameters
    ----------
    inputs
        dict with inputs, where keys should be remapped
    mapping
        mapping of old keys to new keys; if some key was not present,
        it will be passed as is; if new key is "_", it will be ignored in
        remapped result

    Returns
    -------
    remapped_inputs
        inputs with remapped keys
    """
    mapping = mapping or {}
    flat = nest_utils.flatten_nested_struct(
        inputs, separator=_NESTED_KEY_SEPARATOR)

    remapped_flat = {}
    # sorted order decides which value wins if new names collide
    for old_name in sorted(flat):
        new_names = _get_new_key_for_nested_input_and_map(old_name, mapping)
        for new_name in new_names:
            if new_name != NucleotideKeyFields.IGNORE_KEY:
                remapped_flat[new_name] = flat[old_name]

    return nest_utils.unflatten_dict_to_nested(
        remapped_flat, separator=_NESTED_KEY_SEPARATOR)
def _log_eval_result_to_mlflow(eval_result_filtered: dict):
    """Log every evaluation value to mlflow under its '--' joined name."""
    flat_results = nest_utils.flatten_nested_struct(
        eval_result_filtered, separator="--")
    for name, value in flat_results.items():
        mlflow_utils.log_metric_to_mlflow(name, value)
def filter_kpi_values(kpi: dict,
                      return_flattened: bool = False) -> (dict, dict):
    """
    Filter kpis according to its value type.

    If kpi value is not of type number, str or list, it will be filtered
    out. If value is numpy array of size 1, then its element will be
    selected, converted to python float / int / str if possible and not
    filtered out.

    Parameters
    ----------
    kpi
        dict, possibly nested, mapping kpi names to its values
    return_flattened
        flag to return flattened dict and do not unflatten it back

    Returns
    -------
    kpi_filtered
        dict with same structure as kpi, but only with values of numbers,
        string and list type
    kpi_filtered_out
        dict with same structure as kpi with all other values
    """
    logger = logging.getLogger(__name__)
    kpi_flatten = nest_utils.flatten_nested_struct(kpi)

    # set, since membership is tested for every key below
    names_filtered_out = set()
    for kpi_name, kpi_value in kpi_flatten.items():
        if isinstance(kpi_value, np.ndarray) and kpi_value.size == 1:
            kpi_value = np.reshape(kpi_value, (1, ))[0]
            # pylint: disable=no-member
            # numpy does have floating member
            if isinstance(kpi_value, np.floating):
                kpi_value = float(kpi_value)
            elif isinstance(kpi_value, np.integer):
                kpi_value = int(kpi_value)
            elif isinstance(kpi_value, str):
                # np.str was only an alias of builtin str and does not
                # exist in recent numpy; numpy unicode scalars subclass str
                kpi_value = str(kpi_value)
            # only the value of existing key is replaced, so iteration
            # over the dict stays valid
            kpi_flatten[kpi_name] = kpi_value
        if not isinstance(kpi_value, (numbers.Number, str, list)):
            names_filtered_out.add(kpi_name)

    kpi_filtered = {
        k: v for k, v in kpi_flatten.items() if k not in names_filtered_out
    }
    kpi_filtered_out = {
        k: v for k, v in kpi_flatten.items() if k in names_filtered_out
    }

    if kpi_filtered and not return_flattened:
        kpi_filtered = nest_utils.unflatten_dict_to_nested(kpi_filtered)
    if kpi_filtered_out:
        logger.warning(
            "Following kpi keys cannot be serialized to json: "
            "%s", kpi_filtered_out.keys())
        if not return_flattened:
            kpi_filtered_out = nest_utils.unflatten_dict_to_nested(
                kpi_filtered_out)
    return kpi_filtered, kpi_filtered_out
def select_inputs_by_sample_mask_np(sample_mask: np.ndarray,
                                    keys_to_exclude_from_sample_mask: Optional[
                                        List[str]] = None,
                                    **inputs) -> Dict[str, np.ndarray]:
    """
    Select inputs by masking out samples with sample_mask == 0

    Parameters
    ----------
    sample_mask
        tensor of shape [batch_size] with 1 indicating that sample should be
        leaved as is and 0 - remove sample
    keys_to_exclude_from_sample_mask
        list of flat keys that will not be masked using sample_mask
    **inputs
        inputs to mask

    Returns
    -------
    masked_inputs
        masked inputs sample-wise
    """
    flat = nest_utils.flatten_nested_struct(inputs)
    excluded_keys = keys_to_exclude_from_sample_mask or []
    # boolean mask is needed to select and not to index the samples
    bool_mask = sample_mask.astype(bool)

    masked_flat = {
        key: (value if key in excluded_keys else value[bool_mask])
        for key, value in flat.items()
    }
    return nest_utils.unflatten_dict_to_nested(masked_flat)
def parse_tfrecord_example(self, example) -> dict:
    """
    Parse tfrecord example

    The example is parsed using the '/' flattened tfrecords features,
    every parsed field is decoded with its output type (None if it was
    not provided) and the nested result is postprocessed.
    """
    features_flat = nest_utils.flatten_nested_struct(
        self.get_tfrecords_features(), '/')
    output_types_flat = nest_utils.flatten_nested_struct(
        self.get_tfrecords_output_types() or {}, '/')

    parsed = tf.parse_single_example(example, features_flat)
    decoded_flat = {
        name: self.decode_field(name, value, output_types_flat.get(name))
        for name, value in parsed.items()
    }

    decoded = nest_utils.unflatten_dict_to_nested(decoded_flat, '/')
    return self.postprocess_tfrecords(**decoded)
def _add_summary_to_summaries(self, model_results: ModelResults):
    """Add every flattened summary, if any, to summaries by its name."""
    if model_results.summary is None:
        return
    summary_flat = nest_utils.flatten_nested_struct(model_results.summary)
    for name, value in summary_flat.items():
        model_utils.add_summary_by_name(name, value, self.max_outputs_tb)
def _validate_features(self, train_features, infer_features):
    """
    Check that train and infer features have the same flat keys

    Raises
    ------
    ValueError
        if some flat key exists only in train or only in infer features
    """
    train_keys = set(nest_utils.flatten_nested_struct(train_features))
    infer_keys = set(nest_utils.flatten_nested_struct(infer_features))

    missing_in_train = infer_keys - train_keys
    missing_in_infer = train_keys - infer_keys
    if not missing_in_train and not missing_in_infer:
        return

    raise ValueError(
        ("{}: train and infer features differ! "
         "(not existing train features: {}, "
         "not existing infer features: {})"
         ).format(self.name, missing_in_train, missing_in_infer))
def _get_data_results(data: tf.data.Dataset, session_manager,
                      max_iteration=None) -> dict:
    """
    Run the dataset and collect its samples key-wise into lists

    Samples are fetched until the dataset is exhausted or, if provided,
    max_iteration samples were collected (the limit is checked after the
    sample was collected). Values of bytes type are decoded to str.
    """
    next_sample = data.make_one_shot_iterator().get_next()
    collected_flat = {}
    iterations_done = 0
    with session_manager as sess:
        while True:
            try:
                sample_flat = nest_utils.flatten_nested_struct(
                    sess.run(next_sample))
                for name, value in sample_flat.items():
                    values = collected_flat.setdefault(name, [])
                    values.append(
                        value.decode() if isinstance(value, bytes) else value)
                iterations_done += 1
            except tf.errors.OutOfRangeError:
                # dataset is exhausted
                break
            limit_reached = (max_iteration is not None
                             and iterations_done >= max_iteration)
            if limit_reached:
                break
    return nest_utils.unflatten_dict_to_nested(collected_flat)
def test_serialize_to_file(self, log_config_parameter_to_mlflow_fn):
    """Check that every flat config parameter is logged to mlflow once."""
    # the patched logging function must do nothing when it is called
    log_config_parameter_to_mlflow_fn.side_effect = (
        lambda param_name, param_value: None)

    serializer = project_serializer.MlflowConfigSerializer(
        save_dir="", single_config_names=["model", "dataset"])
    serializer.serialize_to_file(self.configs_to_log)

    params_must = nest_utils.flatten_nested_struct(
        self.config_serialized_must, separator="/")
    for name, value in params_must.items():
        log_config_parameter_to_mlflow_fn.assert_has_calls(
            [mock_call(name, value)])
    log_config_parameter_to_mlflow_fn.assert_has_calls(
        [mock_call("CLUSTER_SPEC", {})])

    # all config parameters + CLUSTER_SPEC
    self.assertEqual(len(params_must) + 1,
                     log_config_parameter_to_mlflow_fn.call_count)
    # serialization must not modify the configs
    self.assertDictEqual(self.configs_to_log_copy,
                         self.configs_to_log)
def _get_nested_shapes(nested_dict):
    """Replace every value of nested dict with its static shape as list."""
    shapes_flat = {}
    for name, tensor in nest_utils.flatten_nested_struct(nested_dict).items():
        shapes_flat[name] = tensor.get_shape().as_list()
    return nest_utils.unflatten_dict_to_nested(shapes_flat)
def _get_default_features(nested_features):
    """Build same nested structure with zeros-like tensors as values."""
    zeros_flat = {}
    flat_items = nest_utils.flatten_nested_struct(nested_features).items()
    for name, feature in flat_items:
        zeros_flat[name] = tf.zeros_like(feature)
    return nest_utils.unflatten_dict_to_nested(zeros_flat)
def add(self, **sample_inputs):
    """
    Add new sample to the buffer

    Sample is validated against the buffer, then each of its flat values
    is combined with the buffered one (0 for new keys) using add_value
    and the number of samples is incremented.
    """
    _validate_buffer_keys_and_shapes(buffer=self, new_sample=sample_inputs)
    flat_sample = nest_utils.flatten_nested_struct(sample_inputs)
    for key, value in flat_sample.items():
        # first sample for the key starts from 0
        accumulated = self._buffer_flat.setdefault(key, 0)
        self._buffer_flat[key] = self.add_value(accumulated, value)
    self._number_of_samples += 1
def nested_to_tfrecords_feature(nested_values: dict,
                                ignore_empty_arrays: bool = True) -> dict:
    """
    Create tf records features from nested dictionary structure

    Structure will be first flatten with separator '/' and then encoded
    using corresponding feature type.

    Parameters
    ----------
    nested_values
        nested dict holding data
    ignore_empty_arrays
        if the array is empty, e.g. it has 0 in the shape, it will not
        store it; tfrecords parser raises
        'Invalid argument: Key: {key}.  Can't parse serialized Example'
        on empty arrays

    Returns
    -------
    features
        dict with features from flatten elements of nested_values

    Raises
    ------
    ValueError
        if some value has a type that cannot be encoded
    """

    def _get_feature(value):
        """Encode single value; None means the value must be skipped."""
        # pylint: disable=too-many-return-statements
        if (ignore_empty_arrays
                and isinstance(value, np.ndarray)
                and value.size == 0):
            return None
        if (isinstance(value, np.ndarray)
                and all(isinstance(i, str) for i in value)):
            value = [val.encode('utf-8') for val in value]
            return _bytes_list_feature(value)
        if isinstance(value, list):
            value = np.array(value)
        if isinstance(value, np.ndarray):
            # tobytes returns the same raw bytes as the deprecated (and
            # removed in numpy 2.0) tostring
            return _bytes_feature(value.tobytes())
        if isinstance(value, int):
            return _int32_feature(value)
        if isinstance(value, str):
            return _bytes_feature(value.encode())
        if isinstance(value, float):
            return _float_feature(value)
        if isinstance(value, bytes):
            return _bytes_feature(value)
        raise ValueError('Value of type {} cannot be encoded!'.format(
            type(value)))

    flatten_values = nest_utils.flatten_nested_struct(
        nested_values, separator='/')
    features = {}
    for name, value in flatten_values.items():
        feature = _get_feature(value)
        if feature is not None:
            features[name] = feature
    return features
def get_estimator_spec(mode: str,
                       predictions: Optional[dict],
                       losses: Optional[dict],
                       metrics: Optional[dict],
                       train_op: Optional[tf.Operation]
                       ) -> tf.estimator.EstimatorSpec:
    """
    Construct the estimator spec

    Parameters
    ----------
    mode
        mode of the model
    predictions
        all predictions; passed to the spec in flattened form
    losses
        all losses; only its 'total_loss' item, if exists, is used
    metrics
        metrics of the model; each flattened metric is paired with a no_op
        to build the eval metric ops
    train_op
        training operation

    Returns
    -------
    estimator_spec
        estimator spec
    """
    has_total_loss = losses is not None and 'total_loss' in losses
    loss = losses['total_loss'] if has_total_loss else None

    if predictions is not None:
        predictions = nest_utils.flatten_nested_struct(predictions)

    eval_metric_ops = None
    if metrics:
        eval_metric_ops = {}
        for name, value in nest_utils.flatten_nested_struct(metrics).items():
            eval_metric_ops[name] = (value, tf.no_op())

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        predictions=predictions,
        eval_metric_ops=eval_metric_ops)
def cache(self, values):
    """
    Store flattened values as json inside of the cache file

    Existing cache file is never overwritten; a warning is logged instead.
    """

    def _to_list(value):
        # fallback for values that json cannot serialize itself
        return value.tolist()

    io_utils.maybe_mkdir(self.cache_target)
    cache_fname = self._get_cache_fname()
    values_flat = nest_utils.flatten_nested_struct(values)

    if not os.path.exists(cache_fname):
        with open(cache_fname, "w") as file:
            json.dump(values_flat, file, default=_to_list)
        return

    logging.getLogger(__name__).warning(
        "Cache with name %s already exist!", cache_fname)
def _format_data_for_log(data: Optional[dict] = None,
                         shape_fn: Callable = np.shape) -> Optional[dict]:
    """
    Build loggable representation of the nested data

    Tensors are kept as is, all other values are replaced with the result
    of shape_fn on them; None data results in None.
    """
    if data is None:
        return None

    def _describe(value):
        return value if isinstance(value, tf.Tensor) else shape_fn(value)

    repr_flat = {
        name: _describe(value)
        for name, value in nest_utils.flatten_nested_struct(data).items()
    }
    return nest_utils.unflatten_dict_to_nested(repr_flat)
def split_batch_inputs(
        inputs: dict,
        not_batch_keys: Optional[List[str]] = None,
        ignore_none_values=True,
) -> Tuple[List[dict], dict]:
    """
    Split batch inputs to sample inputs

    Parameters
    ----------
    inputs
        batch inputs to split
    not_batch_keys
        top level keys to exclude from split
    ignore_none_values
        if the keys with None values should be treated as not batch keys

    Returns
    -------
    batch_inputs_as_list
        list of split batch inputs, one nested dict per sample
    not_batch_inputs
        dict with not batch inputs
    """
    not_batch_keys = not_batch_keys or []

    # partition by top level key
    batch_inputs, not_batch_inputs = {}, {}
    for key, value in inputs.items():
        target = not_batch_inputs if key in not_batch_keys else batch_inputs
        target[key] = value

    batch_flat = nest_utils.flatten_nested_struct(batch_inputs)
    if ignore_none_values:
        none_keys = [key for key, value in batch_flat.items()
                     if value is None]
        batch_flat = {key: value for key, value in batch_flat.items()
                      if value is not None}
        # None values are moved, in nested form, to not batch inputs
        not_batch_inputs.update(
            nest_utils.unflatten_dict_to_nested(dict.fromkeys(none_keys)))

    samples_flat = nest_utils.dict_of_lists_to_list_of_dicts(batch_flat)
    samples = [nest_utils.unflatten_dict_to_nested(sample_flat)
               for sample_flat in samples_flat]
    return samples, not_batch_inputs
def _write_tfrecords(self):
    """
    Write self.data to the tfrecords files from self.file_names['data']

    Data is flattened with '/' separator and its samples are written
    consecutively, self.number_of_samples_per_file samples to each file,
    e.g. the sample index is continued over the files.
    """
    data_flatten = nest_utils.flatten_nested_struct(self.data,
                                                    separator='/')
    sample_index = 0
    for filename in self.file_names['data']:
        # context manager closes the writer also if writing fails
        with tf.python_io.TFRecordWriter(filename) as tfrecord_writer:
            for _ in range(self.number_of_samples_per_file):
                sample = {key: value[sample_index]
                          for key, value in data_flatten.items()}
                sample_index += 1
                _write_sample_to_tfrecord(sample, tfrecord_writer)
def __init__(self, meta_graph_path: str, checkpoint_path: str,
             graph: tf.Graph = None, config: tf.ConfigProto = None):
    """
    Restore the graph with its variables and collect feed / fetch tensors

    Parameters
    ----------
    meta_graph_path
        path to the meta graph to import
    checkpoint_path
        path to the checkpoint to restore the variables from
    graph
        graph to import to; new graph is created if not provided
    config
        config passed to the created session
    """
    self._graph = graph or tf.Graph()
    with self._graph.as_default():
        tf.train.import_meta_graph(meta_graph_path, clear_devices=True)
        try:
            saver = tf.train.Saver()
        except ValueError:
            # NOTE(review): presumably Saver fails if the variables
            # collection is empty, so variables are added from the graph
            # and saver is created again - confirm
            tf_utils.add_variables_from_graph_without_collection()
            saver = tf.train.Saver()

        self._session = tf.Session(config=config)
        saver.restore(self._session, checkpoint_path)

        inputs_nested = tf_collections_utils.collection2nested(
            CollectionNames.INPUTS)
        predictions_nested = tf_collections_utils.collection2nested(
            CollectionNames.PREDICTIONS)

    self._feed_tensors = nest_utils.flatten_nested_struct(inputs_nested)
    self._fetch_tensors = nest_utils.flatten_nested_struct(
        predictions_nested)