def _visit_apply_savedmodel_operation(self, operation_def, upstream_views):
    """Builds the flattened and per-dataset views of an ApplySavedModel op.

    Args:
      operation_def: The ApplySavedModel `OperationDef` being visited.
      upstream_views: A pair of upstream `_OptimizationView`s: the saved
        model path view followed by the input view.

    Returns:
      A 1-tuple holding an `_OptimizationView` whose `fine_grained_view` maps
      every key of `self._sorted_dataset_keys` to the operation applied on
      that dataset's extracted input, and whose `flattened_view` is the
      operation applied on the upstream flattened views.

    Raises:
      ValueError: If any upstream view already carries a fine grained view.
    """
    for view in upstream_views:
        if view.fine_grained_view:
            raise ValueError(
                'Was not expecting a fine_grained_view input for '
                'ApplySavedModel')

    model_path_view, input_view = upstream_views

    per_dataset_outputs = collections.OrderedDict()
    for position, dataset_key in enumerate(self._sorted_dataset_keys):
        # The position (not the key) goes into the labels in order to make
        # beam labels more stable.
        suffix = 'AnalysisIndex{}'.format(position)
        extracted_input = nodes.apply_operation(
            beam_nodes.ExtractInputForSavedModel,
            dataset_key=dataset_key,
            label='ExtractInputForSavedModel[{}]'.format(suffix))
        relabeled_def = operation_def._replace(
            label='{}[{}]'.format(operation_def.label, suffix))
        # Single-element unpacking: exactly one output is expected.
        (per_dataset_outputs[dataset_key],) = nodes.OperationNode(
            relabeled_def,
            (model_path_view.flattened_view, extracted_input)).outputs

    (merged_output,) = nodes.OperationNode(
        operation_def,
        (model_path_view.flattened_view, input_view.flattened_view)).outputs

    return (_OptimizationView(
        prefer_fine_grained_view=False,
        flattened_view=merged_output,
        fine_grained_view=per_dataset_outputs,
        hashed_path=b'APPLY_SAVEDMODEL'),)
def _apply_operation_on_fine_grained_view(self, operation_def,
                                          fine_grained_views,
                                          next_hashed_path):
    """Applies a shardable operation to each dataset of the fine grained views.

    For every dataset key, a cached result is decoded when one is available;
    otherwise the operation is applied to that dataset's inputs and, when the
    operation has a cache coder, an `EncodeCache` node is recorded in
    `self.cache_output_nodes`.

    Args:
      operation_def: A shardable `OperationDef`.
      fine_grained_views: A tuple of `_OptimizationView.fine_grained_view`s.
      next_hashed_path: The hashed path for the currently processed
        operation_def.

    Returns:
      An ordered mapping from dataset key to the resulting output node.
    """
    cache_entry_key = analyzer_cache.make_cache_entry_key(
        tf.compat.as_bytes(operation_def.label) + b'-' + next_hashed_path)
    coder = operation_def.cache_coder

    outputs_by_dataset = collections.OrderedDict()
    for position, dataset_key in enumerate(self._sorted_dataset_keys):
        # We use an index for the label in order to make beam labels more
        # stable.
        suffix = 'AnalysisIndex{}'.format(position)

        cache_hit = False
        if coder:
            dataset_cache = self._cache_dict.get(dataset_key, {})
            cache_hit = dataset_cache.get(cache_entry_key) is not None

        if cache_hit:
            # OR-ing with False leaves an already recorded miss untouched.
            self._dataset_has_cache_misses[dataset_key] |= False
            decode_def = analyzer_nodes.DecodeCache(
                dataset_key,
                cache_entry_key,
                coder=coder,
                label='DecodeCache[{}][{}]'.format(operation_def.label,
                                                   suffix))
            (op_output,) = nodes.OperationNode(decode_def, tuple()).outputs
        else:
            dataset_inputs = tuple(
                view[dataset_key] for view in fine_grained_views)
            relabeled_def = operation_def._replace(
                label='{}[{}]'.format(operation_def.label, suffix))
            (op_output,) = nodes.OperationNode(
                relabeled_def, dataset_inputs).outputs
            if coder:
                self._dataset_has_cache_misses[dataset_key] = True
                encode_node = nodes.apply_operation(
                    analyzer_nodes.EncodeCache,
                    op_output,
                    coder=coder,
                    label='EncodeCache[{}][{}]'.format(operation_def.label,
                                                       suffix))
                self.cache_output_nodes[
                    (dataset_key, cache_entry_key)] = encode_node

        outputs_by_dataset[dataset_key] = op_output
    return outputs_by_dataset
def _apply_operation_on_fine_grained_view(self, operation_def,
                                          fine_grained_view):
    """Applies a shardable operation on a fine grained view.

    This also updates `cache_output_nodes` when necessary.

    Args:
      operation_def: A shardable `OperationDef`.
      fine_grained_view: A `_OptimizationView.fine_grained_view`.

    Returns:
      An ordered mapping from each key of `self._dataset_keys` to the output
      node produced for that dataset (decoded from cache or freshly computed).
    """
    result_fine_grained_view = collections.OrderedDict()

    # TODO(b/37788560): Use a better cache key than label. A good alternative
    # is to reuse graph_tools logic to compose names that include properties
    # and fingerprint it.
    cache_entry_key = analyzer_cache.make_cache_entry_key(operation_def.label)

    for dataset_key in self._dataset_keys:
        # TODO(b/37788560): Add instrumentation.
        # Only decode from cache when the operation actually has a coder;
        # otherwise DecodeCache would be built with `coder=None`. This mirrors
        # the write side below, which only encodes when a coder is set.
        has_cached_value = bool(
            operation_def.cache_coder and
            self._cache_dict.get(dataset_key, {}).get(cache_entry_key)
            is not None)

        if has_cached_value:
            (op_output,) = nodes.OperationNode(
                analyzer_nodes.DecodeCache(
                    dataset_key,
                    cache_entry_key,
                    coder=operation_def.cache_coder),
                tuple()).outputs
        else:
            value_node = fine_grained_view[dataset_key]
            (op_output,) = nodes.OperationNode(
                operation_def._replace(
                    label='{}[{}]'.format(operation_def.label, dataset_key)),
                (value_node,)).outputs
            if operation_def.cache_coder:
                encoded_cache = nodes.apply_operation(
                    analyzer_nodes.EncodeCache,
                    op_output,
                    coder=operation_def.cache_coder,
                    label='EncodeCache[{}][{}]'.format(operation_def.label,
                                                       dataset_key))
                self.cache_output_nodes[
                    (dataset_key, cache_entry_key)] = encoded_cache

        result_fine_grained_view[dataset_key] = op_output

    return result_fine_grained_view
def _apply_operation_on_fine_grained_view(self, operation_def,
                                          fine_grained_view,
                                          next_hashed_path):
    """Applies a shardable operation to every dataset of a fine grained view.

    A cached result is decoded when the operation has a cache coder and an
    entry exists for the dataset; otherwise the operation is applied and, when
    it has a cache coder, an `EncodeCache` node is stored in
    `self.cache_output_nodes`.

    Args:
      operation_def: A shardable `OperationDef`.
      fine_grained_view: A `_OptimizationView.fine_grained_view`.
      next_hashed_path: The hashed path for the currently processed
        operation_def.

    Returns:
      An ordered mapping from dataset key to the resulting output node.
    """
    cache_entry_key = analyzer_cache.make_cache_entry_key(
        tf.compat.as_bytes(operation_def.label) + b'-' + next_hashed_path)
    coder = operation_def.cache_coder

    outputs_by_dataset = collections.OrderedDict()
    for dataset_key in self._dataset_keys:
        cache_hit = False
        if coder:
            dataset_cache = self._cache_dict.get(dataset_key, {})
            cache_hit = dataset_cache.get(cache_entry_key) is not None

        if cache_hit:
            decode_def = analyzer_nodes.DecodeCache(
                dataset_key,
                cache_entry_key,
                operation_def.label,
                coder=coder)
            (op_output,) = nodes.OperationNode(decode_def, tuple()).outputs
        else:
            relabeled_def = operation_def._replace(
                label='{}[{}]'.format(operation_def.label, dataset_key))
            dataset_input = fine_grained_view[dataset_key]
            (op_output,) = nodes.OperationNode(
                relabeled_def, (dataset_input,)).outputs
            if coder:
                encode_node = nodes.apply_operation(
                    analyzer_nodes.EncodeCache,
                    op_output,
                    coder=coder,
                    label='EncodeCache[{}][{}]'.format(operation_def.label,
                                                       dataset_key))
                self.cache_output_nodes[
                    (dataset_key, cache_entry_key)] = encode_node

        outputs_by_dataset[dataset_key] = op_output
    return outputs_by_dataset
def visit(self, operation_def, input_values):
    """Rebuilds a node, routing packable combines to the packed combine.

    Args:
      operation_def: The `OperationDef` of the node being visited.
      input_values: The already-visited inputs of the node.

    Returns:
      The result of `self._get_packed_combine` when the operation's label is
      present in `self._combine_to_grand_parent`; otherwise the outputs of a
      new `OperationNode` built from the arguments.
    """
    is_packable_combine = operation_def.label in self._combine_to_grand_parent
    if not is_packable_combine:
        return nodes.OperationNode(operation_def, input_values).outputs

    # A combine that can be packed is replaced by the packed combine node;
    # the same packed node is meant to serve every combine in the group.
    return self._get_packed_combine(operation_def, input_values)
def _visit_partitionable_operation(self, operation_def, upstream_views):
    """Applies a partitionable operation on flattened and fine grained views.

    Args:
      operation_def: The partitionable `OperationDef` being visited.
      upstream_views: A tuple of upstream `_OptimizationView`s.

    Returns:
      A tuple of `_OptimizationView`s, one per output of the operation. Each
      carries a fine grained view only when every upstream view had one.
    """
    upstream_fine_grained = tuple(
        view.fine_grained_view for view in upstream_views)
    all_fine_grained_views_available = all(upstream_fine_grained)

    # Hint for whether the `fine_grained_view` should be used downstream:
    # true if an upstream view has cacheing operations that haven't been
    # flattened yet, or if the current operation is cacheable.
    upstream_prefers_fine_grained = any(
        view.prefer_fine_grained_view for view in upstream_views)
    is_cacheable = operation_def.cache_coder is not None
    prefer_fine_grained_view = upstream_prefers_fine_grained or (
        all_fine_grained_views_available and is_cacheable)

    next_hashed_path = self._make_next_hashed_path(
        [view.hashed_path for view in upstream_views], operation_def)

    if all_fine_grained_views_available:
        sharded_result = self._apply_operation_on_fine_grained_view(
            operation_def, upstream_fine_grained, next_hashed_path)
        fine_grained_views = (sharded_result,)
    else:
        fine_grained_views = (None,) * operation_def.num_outputs

    flattened_inputs = tuple(view.flattened_view for view in upstream_views)
    flattened_views = nodes.OperationNode(
        operation_def, flattened_inputs).outputs

    assert len(fine_grained_views) == len(flattened_views)

    result_views = []
    for flat, fine in zip(flattened_views, fine_grained_views):
        result_views.append(
            _OptimizationView(
                prefer_fine_grained_view=prefer_fine_grained_view,
                flattened_view=flat,
                fine_grained_view=fine,
                hashed_path=next_hashed_path))
    return tuple(result_views)
def _remove_redundant_nodes(self, operation_def, input_values):
    """Rebuilds a node with inputs re-derived from the final packed combine.

    Args:
      operation_def: The `OperationDef` of the node to rebuild (the caller
        passes a CreateSavedModel operation).
      input_values: The original inputs of that node.

    Returns:
      The outputs of a new `OperationNode` whose inputs are, in order: the
      tensor bindings of the final packed merge combine, the non-redundant
      inputs, and the bindings recreated from the final packed merge combine.
    """
    redundant_values, non_redundant_values = (
        self._get_redundant_and_non_redundant_input_values(input_values))

    # Inputs that descend from redundant nodes are regenerated from the final
    # packed merge combine node, so that node is looked up first.
    final_combine, final_combine_bindings = (
        self._get_final_packed_combine_and_tensor_bindings(redundant_values))

    rebuilt_inputs = list(final_combine_bindings)
    # Non-redundant inputs are carried over untouched.
    rebuilt_inputs += non_redundant_values

    # Info needed to reconstruct the descendants of the redundant nodes.
    pending_bindings_info = (
        self._get_to_be_created_tensor_bindings_info(redundant_values))
    rebuilt_inputs += self._create_tensor_bindings(
        pending_bindings_info, final_combine)

    assert len(input_values) == len(rebuilt_inputs)
    return nodes.OperationNode(operation_def, tuple(rebuilt_inputs)).outputs
def visit(self, operation_def, input_values):
    """Rebuilds a node, stripping redundant inputs of CreateSavedModel.

    Args:
      operation_def: The `OperationDef` of the node being visited.
      input_values: The already-visited inputs of the node.

    Returns:
      The outputs of the rebuilt node.
    """
    self.validate_operation_def(operation_def)

    if not input_values or not isinstance(operation_def,
                                          beam_nodes.CreateSavedModel):
        return nodes.OperationNode(operation_def, input_values).outputs

    # This will only be called once since this is a single phase analysis
    # graph and in that case only the final CreateSavedModel node has inputs.
    return self._remove_redundant_nodes(operation_def, input_values)
def _visit_partitionable_operation(self, operation_def, upstream_views):
    """Applies a single-input partitionable operation on both view kinds.

    Args:
      operation_def: The partitionable `OperationDef` being visited.
      upstream_views: A 1-tuple holding the upstream `_OptimizationView`.

    Returns:
      A tuple of `_OptimizationView`s, one per output of the operation.
    """
    # TODO(b/37788560) Possibly support partitionable operations with
    # multiple inputs.
    (upstream_view,) = upstream_views
    upstream_fine_grained = upstream_view.fine_grained_view

    # Hint for whether the `fine_grained_view` should be used downstream:
    # set if the upstream view has cacheing operations that haven't been
    # flattened yet, or if the current operation is cacheable.
    prefer_fine_grained_view = (
        upstream_view.prefer_fine_grained_view or
        (upstream_fine_grained and operation_def.cache_coder is not None))

    next_hashed_path = self._make_next_hashed_path(
        [view.hashed_path for view in upstream_views], operation_def)

    if upstream_fine_grained:
        sharded_result = self._apply_operation_on_fine_grained_view(
            operation_def, upstream_fine_grained, next_hashed_path)
        fine_grained_views = (sharded_result,)
    else:
        fine_grained_views = (None,) * operation_def.num_outputs

    flattened_views = nodes.OperationNode(
        operation_def, (upstream_view.flattened_view,)).outputs

    assert len(fine_grained_views) == len(flattened_views)

    result_views = []
    for flat, fine in zip(flattened_views, fine_grained_views):
        result_views.append(
            _OptimizationView(
                prefer_fine_grained_view=prefer_fine_grained_view,
                flattened_view=flat,
                fine_grained_view=fine,
                hashed_path=next_hashed_path))
    return tuple(result_views)
def _visit_apply_savedmodel_operation(self, operation_def, upstream_views):
    """Builds the flattened and per-dataset views of an ApplySavedModel op.

    Args:
      operation_def: The ApplySavedModel `OperationDef` being visited.
      upstream_views: A 1-tuple holding the upstream `_OptimizationView`.

    Returns:
      A 1-tuple holding an `_OptimizationView` whose `fine_grained_view` maps
      every key of `self._dataset_keys` to the operation re-keyed for that
      dataset, and whose `flattened_view` is the unmodified operation.

    Raises:
      ValueError: If the upstream view already carries a fine grained view.
    """
    (upstream_view,) = upstream_views
    if upstream_view.fine_grained_view:
        raise ValueError(
            'Was not expecting a fine_grained_view input for ApplySavedModel')

    per_dataset_outputs = collections.OrderedDict()
    for dataset_key in self._dataset_keys:
        keyed_def = operation_def._replace(
            dataset_key=dataset_key,
            label='{}[{}]'.format(operation_def.label, dataset_key))
        # Single-element unpacking: exactly one output is expected.
        (per_dataset_outputs[dataset_key],) = nodes.OperationNode(
            keyed_def, (upstream_view.flattened_view,)).outputs

    (merged_output,) = nodes.OperationNode(
        operation_def, (upstream_view.flattened_view,)).outputs

    return (_OptimizationView(
        prefer_fine_grained_view=False,
        flattened_view=merged_output,
        fine_grained_view=per_dataset_outputs),)
def visit(self, operation_def, input_values):
    """Replaces `TensorSource` nodes by an extraction from a values dict.

    Args:
      operation_def: The `OperationDef` of the node being visited.
      input_values: The already-visited inputs of the node.

    Returns:
      For a `TensorSource`, a 1-tuple with an `ExtractFromDict` node reading
      the source tensors' names from `self.extracted_values_dict`; for any
      other operation, the outputs of a rebuilt `OperationNode`.
    """
    if not isinstance(operation_def, analyzer_nodes.TensorSource):
        return nodes.OperationNode(operation_def, input_values).outputs

    source_tensors = operation_def.tensors

    # Add tensor to signature so it gets produced by the SavedModel.
    for source_tensor in source_tensors:
        signature_name = _tensor_name(source_tensor)
        self.intermediate_output_signature[signature_name] = source_tensor

    extraction_keys = tuple(_tensor_name(t) for t in source_tensors)
    extracted = nodes.apply_operation(
        beam_nodes.ExtractFromDict,
        self.extracted_values_dict,
        keys=extraction_keys,
        label=operation_def.label)
    return (extracted,)
def visit(self, operation_def, input_values):
    """Translates one node into `_OptimizationView`s.

    Phase-0 ApplySavedModel operations and (when caching is enabled)
    partitionable operations are delegated to dedicated handlers. Any other
    operation is applied on flattened inputs.

    Args:
      operation_def: The `OperationDef` of the node being visited.
      input_values: The upstream `_OptimizationView`s.

    Returns:
      A tuple of `_OptimizationView`s, one per output of the operation.
    """
    self._validate_operation_def(operation_def)

    # TODO(b/37788560): Possibly make this generic instead of special casing
    # the ApplySavedModel operation.
    is_phase_zero_saved_model = (
        isinstance(operation_def, beam_nodes.ApplySavedModel) and
        operation_def.phase == 0)
    if is_phase_zero_saved_model:
        return self._visit_apply_savedmodel_operation(
            operation_def, input_values)

    # When self._cache_dict is None this means that we shouldn't do any
    # cacheing for this pipeline, and so there's no need to create any fine
    # grained views.
    if self._cache_dict is not None and operation_def.is_partitionable:
        return self._visit_partitionable_operation(
            operation_def, input_values)

    flatten_cached_inputs = input_values and any(
        view.fine_grained_view and view.prefer_fine_grained_view
        for view in input_values)

    if flatten_cached_inputs:
        # We can 'flatten' the cached outputs of the parent operation since
        # this operation doesn't support partitioning.
        disaggregated_input_values = []
        for view in input_values:
            disaggregated_input_values += view.fine_grained_view.values()

        # Checking that all cache has the same size.
        distinct_sizes = {len(value) for value in disaggregated_input_values}
        assert len(distinct_sizes) == 1

        next_inputs = nodes.apply_multi_output_operation(
            beam_nodes.Flatten,
            *disaggregated_input_values,
            label='FlattenCache[{}]'.format(operation_def.label))
    else:
        # Parent operation output is not cacheable, therefore we can just use
        # a flattened view.
        next_inputs = tuple(view.flattened_view for view in input_values)

    flattened_outputs = nodes.OperationNode(
        operation_def, next_inputs).outputs

    result_views = []
    for flat in flattened_outputs:
        result_views.append(
            _OptimizationView(
                prefer_fine_grained_view=False,
                flattened_view=flat,
                fine_grained_view=None,
                hashed_path=None))
    return tuple(result_views)
def visit(self, operation_def, input_values):
    """Translates one node into `_OptimizationView`s.

    Phase-0 ApplySavedModel operations and (when `self._cache_location` is
    set) partitionable operations are delegated to dedicated handlers. Any
    other operation is applied on flattened inputs.

    Args:
      operation_def: The `OperationDef` of the node being visited.
      input_values: The upstream `_OptimizationView`s.

    Returns:
      A tuple of `_OptimizationView`s, one per output of the operation.
    """
    self._validate_operation_def(operation_def)

    is_phase_zero_saved_model = (
        isinstance(operation_def, beam_nodes.ApplySavedModel) and
        operation_def.phase == 0)
    if is_phase_zero_saved_model:
        return self._visit_apply_savedmodel_operation(
            operation_def, input_values)

    if self._cache_location and operation_def.is_partitionable:
        return self._visit_partitionable_operation(
            operation_def, input_values)

    flatten_cached_inputs = input_values and any(
        view.fine_grained_view and view.prefer_fine_grained_view
        for view in input_values)

    if flatten_cached_inputs:
        # We can 'flatten' the cached outputs of the parent operation since
        # this operation doesn't support partitioning.
        disaggregated_input_values = []
        for view in input_values:
            disaggregated_input_values += view.fine_grained_view.values()

        # Checking that all cache has the same size.
        distinct_sizes = {len(value) for value in disaggregated_input_values}
        assert len(distinct_sizes) == 1

        next_inputs = nodes.apply_multi_output_operation(
            beam_nodes.Flatten,
            *disaggregated_input_values,
            label='FlattenCache[{}]'.format(operation_def.label))
    else:
        # Parent operation output is not cacheable, therefore we can just use
        # a flattened view.
        next_inputs = tuple(view.flattened_view for view in input_values)

    flattened_outputs = nodes.OperationNode(
        operation_def, next_inputs).outputs

    result_views = []
    for flat in flattened_outputs:
        result_views.append(
            _OptimizationView(
                prefer_fine_grained_view=False,
                flattened_view=flat,
                fine_grained_view=None))
    return tuple(result_views)
def _visit_partitionable_operation(self, operation_def, upstream_views):
    """Applies a partitionable operation per dataset key, using cache files.

    When the upstream view has a fine grained view, each key of
    `self._dataset_keys` is handled as follows: if the operation has a
    `cache_coder` and a matching cache file is found under
    `self._cache_location.input_cache_dir`, a `ReadCache` node replaces the
    computation; otherwise the operation is applied to that key's upstream
    value, followed by a `WriteCache` node when the operation has a
    `cache_coder`. The operation is always applied to the flattened view too.

    Args:
      operation_def: The partitionable `OperationDef` being visited.
      upstream_views: A 1-tuple holding the upstream `_OptimizationView`.

    Returns:
      A tuple of `_OptimizationView`s built by pairing each flattened output
      with the fine grained view of the same output index (or `None`).
    """
    (upstream_view,) = upstream_views

    # Hint for whether the fine grained view should be used downstream: set
    # when upstream already prefers it, or when upstream has a fine grained
    # view and this operation is cacheable.
    prefer_fine_grained_view = (
        upstream_view.prefer_fine_grained_view or
        upstream_view.fine_grained_view and
        operation_def.cache_coder is not None)

    if upstream_view.fine_grained_view:
        value_nodes = collections.OrderedDict()
        for key in self._dataset_keys:
            if operation_def.cache_coder is not None:
                cache_file_path = analyzer_cache.make_cache_file_path(
                    key, operation_def.label)
                pattern = '{}-00000*.gz'.format(
                    os.path.join(self._cache_location.input_cache_dir,
                                 cache_file_path))
                # Only the filesystem probe is guarded: a missing directory
                # simply means there is no cache to read for this key.
                try:
                    cache_exists = bool(tf.gfile.Glob(pattern))
                except tf.errors.NotFoundError:
                    cache_exists = False
                if cache_exists:
                    value_nodes[key] = nodes.apply_multi_output_operation(
                        analyzer_nodes.ReadCache,
                        path=cache_file_path,
                        coder=operation_def.cache_coder,
                        label='ReadCache[{}][{}]'.format(
                            operation_def.label, key))
                    continue
            else:
                cache_file_path = None

            values = upstream_view.fine_grained_view[key]
            op_outputs = nodes.OperationNode(
                operation_def._replace(
                    label='{}[{}]'.format(operation_def.label, key)),
                (values,)).outputs
            if cache_file_path is not None:
                op_outputs = nodes.apply_multi_output_operation(
                    analyzer_nodes.WriteCache,
                    *op_outputs,
                    path=cache_file_path,
                    coder=operation_def.cache_coder,
                    label='WriteCache[{}][{}]'.format(
                        operation_def.label, key))
            value_nodes[key] = op_outputs

        # Build one *distinct* mapping per output index. Using
        # `[OrderedDict()] * n` would repeat a single shared dict, so every
        # output would end up exposing the nodes of the last output index.
        fine_grained_views = [
            collections.OrderedDict(
                (key, value_nodes[key][idx]) for key in self._dataset_keys)
            for idx in range(operation_def.num_outputs)
        ]
    else:
        fine_grained_views = (None,) * operation_def.num_outputs

    flattened_views = nodes.OperationNode(
        operation_def, (upstream_view.flattened_view,)).outputs

    return tuple(
        _OptimizationView(
            prefer_fine_grained_view=prefer_fine_grained_view,
            flattened_view=flat,
            fine_grained_view=fine)
        for flat, fine in zip(flattened_views, fine_grained_views))
def _visit_partitionable_operation(self, operation_def, upstream_views):
    """Applies a partitionable operation per dataset key, using cache files.

    When the upstream view has a fine grained view, each key of
    `self._dataset_keys` is handled as follows: if the operation has a
    `cache_coder` and a matching cache file is found under
    `self._cache_location.input_cache_dir`, a `ReadCache` node replaces the
    computation; otherwise the operation is applied to that key's upstream
    value, followed by a `WriteCache` node when the operation has a
    `cache_coder`. The operation is always applied to the flattened view too.

    Args:
      operation_def: The partitionable `OperationDef` being visited.
      upstream_views: A 1-tuple holding the upstream `_OptimizationView`.

    Returns:
      A tuple of `_OptimizationView`s built by pairing each flattened output
      with the fine grained view of the same output index (or `None`).
    """
    # TODO(b/37788560) Possibly support partitionable operations with
    # multiple inputs.
    (upstream_view,) = upstream_views

    # Hint for whether the fine grained view should be used downstream: set
    # when upstream already prefers it, or when upstream has a fine grained
    # view and this operation is cacheable.
    prefer_fine_grained_view = (
        upstream_view.prefer_fine_grained_view or
        upstream_view.fine_grained_view and
        operation_def.cache_coder is not None)

    if upstream_view.fine_grained_view:
        value_nodes = collections.OrderedDict()
        for key in self._dataset_keys:
            if operation_def.cache_coder is not None:
                # TODO(b/37788560): Add instrumentation.
                # TODO(b/37788560): Use a better cache key than label. A good
                # alternative is to reuse graph_tools logic to compose names
                # that include properties and fingerprint it.
                cache_file_path = analyzer_cache.make_cache_file_path(
                    key, operation_def.label)
                # TODO(b/37788560): Come up with a more abstract way to do
                # this that also ensures consistency.
                pattern = '{}-00000*.gz'.format(
                    os.path.join(self._cache_location.input_cache_dir,
                                 cache_file_path))
                # Only the filesystem probe is guarded: a missing directory
                # simply means there is no cache to read for this key.
                try:
                    cache_exists = bool(tf.gfile.Glob(pattern))
                except tf.errors.NotFoundError:
                    cache_exists = False
                if cache_exists:
                    value_nodes[key] = nodes.apply_multi_output_operation(
                        analyzer_nodes.ReadCache,
                        path=cache_file_path,
                        coder=operation_def.cache_coder,
                        label='ReadCache[{}][{}]'.format(
                            operation_def.label, key))
                    continue
            else:
                cache_file_path = None

            values = upstream_view.fine_grained_view[key]
            op_outputs = nodes.OperationNode(
                operation_def._replace(
                    label='{}[{}]'.format(operation_def.label, key)),
                (values,)).outputs
            if cache_file_path is not None:
                op_outputs = nodes.apply_multi_output_operation(
                    analyzer_nodes.WriteCache,
                    *op_outputs,
                    path=cache_file_path,
                    coder=operation_def.cache_coder,
                    label='WriteCache[{}][{}]'.format(
                        operation_def.label, key))
            value_nodes[key] = op_outputs

        # Build one *distinct* mapping per output index. Using
        # `[OrderedDict()] * n` would repeat a single shared dict, so every
        # output would end up exposing the nodes of the last output index.
        fine_grained_views = [
            collections.OrderedDict(
                (key, value_nodes[key][idx]) for key in self._dataset_keys)
            for idx in range(operation_def.num_outputs)
        ]
    else:
        fine_grained_views = (None,) * operation_def.num_outputs

    flattened_views = nodes.OperationNode(
        operation_def, (upstream_view.flattened_view,)).outputs

    return tuple(
        _OptimizationView(
            prefer_fine_grained_view=prefer_fine_grained_view,
            flattened_view=flat,
            fine_grained_view=fine)
        for flat, fine in zip(flattened_views, fine_grained_views))
def testOperationNodeWithBadInputs(self):
    """Checks that `OperationNode` rejects `inputs` that is not a tuple."""
    # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
    # which no longer exists on `unittest.TestCase` in Python 3.12+.
    with self.assertRaisesRegex(TypeError, 'inputs must be a tuple, got'):
        nodes.OperationNode(_Concat(label='Concat'), 'not a tuple')
def _maybe_create_node(op_def, inputs):
    """Returns the outputs for `op_def`, creating the node at most once.

    Outputs are memoized in `labels_to_new_nodes`, keyed by the operation's
    label.

    Args:
      op_def: The `OperationDef` to build a node for.
      inputs: The inputs to use if a new node has to be created.

    Returns:
      The outputs of the (possibly previously created) node for this label.
    """
    label = op_def.label
    if label not in labels_to_new_nodes:
        labels_to_new_nodes[label] = nodes.OperationNode(
            op_def, inputs).outputs
    return labels_to_new_nodes[label]
def testOperationNodeWithBadInput(self):
    """Checks that `OperationNode` rejects an input that is not a ValueNode."""
    a = nodes.apply_operation(_Constant, value='a', label='Constant[a]')
    # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
    # which no longer exists on `unittest.TestCase` in Python 3.12+.
    with self.assertRaisesRegex(
            TypeError, 'Inputs to Operation must be a ValueNode, got'):
        nodes.OperationNode(_Concat(label='Concat'), (a, 'not a value_node'))
def testOperationNodeWithBadOperatonDef(self):
    """Checks that `OperationNode` rejects a non-`OperationDef` definition."""
    # `assertRaisesRegex` replaces the deprecated `assertRaisesRegexp` alias,
    # which no longer exists on `unittest.TestCase` in Python 3.12+.
    with self.assertRaisesRegex(
            TypeError, 'operation_def must be an OperationDef, got'):
        nodes.OperationNode('not a operation_def', ())
def visit(self, operation_def, input_values):
    """Rebuilds a node while recording the dataset keys it requires.

    Args:
      operation_def: The `OperationDef` of the node being visited.
      input_values: The already-visited inputs of the node.

    Returns:
      The outputs of a new `OperationNode` built from the arguments.
    """
    reads_dataset = isinstance(operation_def,
                               beam_nodes.ExtractInputForSavedModel)
    if reads_dataset:
        # Remember which dataset this extraction node depends on.
        required_keys = self._required_dataset_keys
        required_keys.add(operation_def.dataset_key)

    rebuilt_node = nodes.OperationNode(operation_def, input_values)
    return rebuilt_node.outputs
def visit(self, operation_def, input_values):
    """Rebuilds a node after offering it to the packable-combine tracker.

    Args:
      operation_def: The `OperationDef` of the node being visited.
      input_values: The already-visited inputs of the node.

    Returns:
      The outputs of a new `OperationNode` built from the arguments.
    """
    # Bookkeeping only; the node itself is rebuilt unchanged below.
    self._maybe_add_packable_combine(operation_def, input_values)

    rebuilt_node = nodes.OperationNode(operation_def, input_values)
    return rebuilt_node.outputs
def visit(self, operation_def, input_values):
    """Rebuilds a node, adding a flatten placeholder where required.

    Args:
      operation_def: The `OperationDef` of the node being visited.
      input_values: The already-visited inputs of the node.

    Returns:
      The result of `self._add_flatten_placeholder` when the operation's
      label is in `self._packable_combine_extract_outputs`; otherwise the
      outputs of a new `OperationNode` built from the arguments.
    """
    self.validate_operation_def(operation_def)

    # We look for the ExtractOutputs node of packable combines.
    is_extract_outputs = (
        operation_def.label in self._packable_combine_extract_outputs)
    if not is_extract_outputs:
        return nodes.OperationNode(operation_def, input_values).outputs

    return self._add_flatten_placeholder(operation_def, input_values)
def visit(self, operation_def, input_values):
    """Rebuilds a node while collecting tensors from `TensorSource` ops.

    Args:
      operation_def: The `OperationDef` of the node being visited.
      input_values: The already-visited inputs of the node.

    Returns:
      The outputs of a new `OperationNode` built from the arguments.
    """
    if isinstance(operation_def, analyzer_nodes.TensorSource):
        # Record every sourced tensor, in the order the operation lists them.
        self.sourced_tensors.extend(operation_def.tensors)

    rebuilt_node = nodes.OperationNode(operation_def, input_values)
    return rebuilt_node.outputs
def testValueNodeWithTooHighValueIndex(self):
    """Checks that `ValueNode` rejects an out-of-range `value_index`."""
    constant_op = nodes.OperationNode(_Constant('a'), ())
    expected_message = 'value_index was 2 but parent_operation had 1 outputs'
    with self.assertRaisesWithLiteralMatch(ValueError, expected_message):
        nodes.ValueNode(constant_op, 2)