Example #1
def _write_object_graph(saveable_view, export_dir, asset_file_def_index):
  """Save a SavedObjectGraph proto for `root`."""
  # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
  # checkpoint. It will eventually go into the SavedModel.
  proto = saved_object_graph_pb2.SavedObjectGraph()
  saveable_view.fill_object_graph_proto(proto)

  coder = nested_structure_coder.StructureCoder()
  for concrete_function in saveable_view.concrete_functions:
    serialized = function_serialization.serialize_concrete_function(
        concrete_function, saveable_view.captured_tensor_node_ids, coder)
    if serialized is not None:
      proto.concrete_functions[concrete_function.name].CopyFrom(
          serialized)

  for obj, obj_proto in zip(saveable_view.nodes, proto.nodes):
    _write_object_proto(obj, obj_proto, asset_file_def_index)

  extra_asset_dir = os.path.join(
      compat.as_bytes(export_dir),
      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY))
  file_io.recursive_create_dir(extra_asset_dir)
  object_graph_filename = os.path.join(
      extra_asset_dir, compat.as_bytes("object_graph.pb"))
  file_io.write_string_to_file(object_graph_filename, proto.SerializeToString())
Example #2
  def save(self, as_text=False):
    """Writes a `SavedModel` protocol buffer to disk.

    The function writes the SavedModel protocol buffer to the export directory
    in serialized format.

    Args:
      as_text: If `True`, writes the SavedModel protocol buffer in text format to disk.

    Returns:
      The path to which the SavedModel protocol buffer was written.
    """
    if not file_io.file_exists(self._export_dir):
      file_io.recursive_create_dir(self._export_dir)

    if as_text:
      path = os.path.join(
          compat.as_bytes(self._export_dir),
          compat.as_bytes(constants.SAVED_MODEL_FILENAME_PBTXT))
      file_io.write_string_to_file(path, str(self._saved_model))
    else:
      path = os.path.join(
          compat.as_bytes(self._export_dir),
          compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
      file_io.write_string_to_file(path, self._saved_model.SerializeToString())
    tf_logging.info("SavedModel written to: %s", path)

    return path
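
For orientation, a minimal sketch of how this `save` method is typically reached through the public TF 1.x `SavedModelBuilder` API; the export directory, tag, and variable below are illustrative, not taken from the example above.

import tensorflow as tf

export_dir = '/tmp/example_saved_model'  # illustrative; must not already contain a SavedModel
builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

with tf.Graph().as_default(), tf.Session() as sess:
  v = tf.get_variable('v', initializer=42)  # illustrative variable
  sess.run(tf.global_variables_initializer())
  builder.add_meta_graph_and_variables(sess, ['serve'])

# Writes saved_model.pbtxt when as_text=True, saved_model.pb otherwise,
# and returns the path that was written.
path = builder.save(as_text=True)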
Example #3
def run_analysis(args):
  """Builds an analysis file for training.

  Uses BigQuery tables to do the analysis.

  Args:
    args: command line args

  Raises:
    ValueError: If the schema contains an unsupported type.
  """
  import google.datalab.bigquery as bq
  if args.bigquery_table:
    table = bq.Table(args.bigquery_table)
    schema_list = table.schema._bq_schema
  else:
    schema_list = json.loads(
        file_io.read_file_to_string(args.schema_file).decode())
    table = bq.ExternalDataSource(
        source=args.input_file_pattern,
        schema=bq.Schema(schema_list))

  # Check the schema is supported.
  for col_schema in schema_list:
    col_type = col_schema['type'].lower()
    if col_type != 'string' and col_type != 'integer' and col_type != 'float':
      raise ValueError('Schema contains an unsupported type %s.' % col_type)

  run_numerical_analysis(table, schema_list, args)
  run_categorical_analysis(table, schema_list, args)

  # Save a copy of the schema to the output location.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, SCHEMA_FILE),
      json.dumps(schema_list, indent=2, separators=(',', ': ')))
Example #4
def _export_model_json(model, saved_model_path):
  """Saves model configuration as a json string under assets folder."""
  model_json = model.to_json()
  model_json_filepath = os.path.join(
      saved_model_utils.get_or_create_assets_dir(saved_model_path),
      compat.as_text(constants.SAVED_MODEL_FILENAME_JSON))
  file_io.write_string_to_file(model_json_filepath, model_json)
Example #5
  def test_numerics(self):
    test_folder = os.path.join(self._bucket_root, 'test_numerics')
    input_file_path = os.path.join(test_folder, 'input.csv')
    output_folder = os.path.join(test_folder, 'test_output')
    file_io.recursive_create_dir(output_folder)

    file_io.write_string_to_file(
      input_file_path,
      '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

    schema = [{'name': 'col1', 'type': 'INTEGER'},
              {'name': 'col2', 'type': 'FLOAT'}]
    features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                'col2': {'transform': 'identity', 'source_column': 'col2'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=input_file_path,
        bigquery_table=None,
        schema=schema,
        inverted_features=analyze.invert_features(features))

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

    self.assertEqual(stats['num_examples'], 100)
    col = stats['column_stats']['col1']
    self.assertAlmostEqual(col['max'], 99.0)
    self.assertAlmostEqual(col['min'], 0.0)
    self.assertAlmostEqual(col['mean'], 49.5)

    col = stats['column_stats']['col2']
    self.assertAlmostEqual(col['max'], 990.5)
    self.assertAlmostEqual(col['min'], 0.5)
    self.assertAlmostEqual(col['mean'], 495.5)
Example #6
def write_graph(graph_def, logdir, name, as_text=True):
  """Writes a graph proto to a file.

  The graph is written as a binary proto unless `as_text` is `True`.

  ```python
  v = tf.Variable(0, name='my_variable')
  sess = tf.Session()
  tf.train.write_graph(sess.graph_def, '/tmp/my-model', 'train.pbtxt')
  ```

  Args:
    graph_def: A `GraphDef` protocol buffer.
    logdir: Directory where to write the graph. This can refer to remote
      filesystems, such as Google Cloud Storage (GCS).
    name: Filename for the graph.
    as_text: If `True`, writes the graph as an ASCII proto.
  """
  # gcs does not have the concept of directory at the moment.
  if not file_io.file_exists(logdir) and not logdir.startswith("gs:"):
    file_io.recursive_create_dir(logdir)
  path = os.path.join(logdir, name)
  if as_text:
    file_io.write_string_to_file(path, str(graph_def))
  else:
    file_io.write_string_to_file(path, graph_def.SerializeToString())
Example #7
 def testCopy(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.copy(file_path, copy_path)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
Example #8
  def test_categorical(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      csv_file = ['red,apple', 'red,pepper', 'red,apple', 'blue,grape',
                  'blue,apple', 'green,pepper']
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

      schema = [{'name': 'color', 'type': 'STRING'},
                {'name': 'type', 'type': 'STRING'}]
      features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                  'type': {'transform': 'target'}}
      feature_analysis.run_local_analysis(
        output_folder, [input_file_path], schema, features)

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
      self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)

      # Color column.
      vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['color', 'count'])
      expected_vocab = pd.DataFrame(
          {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
          columns=['color', 'count'])
      pd.util.testing.assert_frame_equal(vocab, expected_vocab)

    finally:
      shutil.rmtree(output_folder)
Example #9
  def test_make_transform_graph_images(self):

    print('Testing make_transform_graph with image_to_vec. ' +
          'It may take a few minutes because it needs to download a large inception checkpoint.')

    def _open_and_encode_image(img_url):
      with file_io.FileIO(img_url, 'r') as f:
        img = Image.open(f).convert('RGB')
        output = cStringIO.StringIO()
        img.save(output, 'jpeg')
      return base64.urlsafe_b64encode(output.getvalue())

    try:
      output_folder = tempfile.mkdtemp()
      stats_file_path = os.path.join(output_folder, feature_transforms.STATS_FILE)
      stats = {'column_stats': {}}
      file_io.write_string_to_file(stats_file_path, json.dumps(stats))

      schema = [{'name': 'img', 'type': 'STRING'}]
      features = {'img': {'transform': 'image_to_vec', 'source_column': 'img'}}

      img_string1 = _open_and_encode_image(
          'gs://cloud-ml-data/img/flower_photos/daisy/15207766_fc2f1d692c_n.jpg')
      img_string2 = _open_and_encode_image(
          'gs://cloud-ml-data/img/flower_photos/dandelion/8980164828_04fbf64f79_n.jpg')
      input_data = [img_string1, img_string2]
      results = self._run_graph(output_folder, features, schema, stats, input_data)
      embeddings = results['img']
      self.assertEqual(len(embeddings), 2)
      self.assertEqual(len(embeddings[0]), 2048)
      self.assertEqual(embeddings[0].dtype, np.float32)
      self.assertTrue(any(x != 0.0 for x in embeddings[1]))

    finally:
      shutil.rmtree(output_folder)
Example #10
 def testRename(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   rename_path = os.path.join(self._base_dir, "rename_file")
   file_io.rename(file_path, rename_path)
   self.assertTrue(file_io.file_exists(rename_path))
   self.assertFalse(file_io.file_exists(file_path))
Example #11
def update_renames_v2(output_file_path):
  """Writes a Python dictionary mapping deprecated to canonical API names.

  Args:
    output_file_path: File path to write output to. Any existing contents
      would be replaced.
  """
  # Set of rename lines to write to output file in the form:
  #   'tf.deprecated_name': 'tf.canonical_name'
  rename_line_set = set()
  # _tf_api_names attribute name
  tensorflow_api_attr = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names

  def visit(unused_path, unused_parent, children):
    """Visitor that collects rename strings to add to rename_line_set."""
    for child in children:
      _, attr = tf_decorator.unwrap(child[1])
      if not hasattr(attr, '__dict__'):
        continue
      api_names = attr.__dict__.get(tensorflow_api_attr, [])
      deprecated_api_names = attr.__dict__.get('_tf_deprecated_api_names', [])
      canonical_name = tf_export.get_canonical_name(
          api_names, deprecated_api_names)
      for name in deprecated_api_names:
        rename_line_set.add('    \'tf.%s\': \'tf.%s\'' % (name, canonical_name))

  visitor = public_api.PublicAPIVisitor(visit)
  visitor.do_not_descend_map['tf'].append('contrib')
  traverse.traverse(tf, visitor)

  renames_file_text = '%srenames = {\n%s\n}\n' % (
      _FILE_HEADER, ',\n'.join(sorted(rename_line_set)))
  file_io.write_string_to_file(output_file_path, renames_file_text)
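
For illustration only, the file written above has roughly the following shape; the real file starts with `_FILE_HEADER` and contains one entry per deprecated symbol, while the entries below are hypothetical.

renames = {
    'tf.deprecated_name_a': 'tf.canonical_name_a',
    'tf.deprecated_name_b': 'tf.canonical_name_b'
}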
Example #12
def visualize_embeddings(summary_writer, config):
  """Stores a config file used by the embedding projector.

  Args:
    summary_writer: The summary writer used for writing events.
    config: `tf.contrib.tensorboard.plugins.projector.ProjectorConfig`
      proto that holds the configuration for the projector such as paths to
      checkpoint files and metadata files for the embeddings. If
      `config.model_checkpoint_path` is `None`, it defaults to the
      `logdir` used by the summary_writer.

  Raises:
    ValueError: If the summary writer does not have a `logdir`.
  """
  logdir = summary_writer.get_logdir()

  # Sanity checks.
  if logdir is None:
    raise ValueError('Summary writer must have a logdir')

  # Saving the config file in the logdir.
  config_pbtxt = text_format.MessageToString(config)
  # FYI - the 'projector_config.pbtxt' string is hardcoded in the projector
  # plugin.
  # TODO(dandelion): Restore this to a reference to the projector plugin
  file_io.write_string_to_file(
      os.path.join(logdir, 'projector_config.pbtxt'), config_pbtxt)
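
A minimal usage sketch for `visualize_embeddings`, assuming a TF 1.x runtime where the contrib projector plugin is available; the logdir, tensor name, and metadata path are illustrative.

import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

summary_writer = tf.summary.FileWriter('/tmp/logdir')  # illustrative logdir

config = projector.ProjectorConfig()
embedding = config.embeddings.add()
embedding.tensor_name = 'word_embedding'  # illustrative embedding variable name
embedding.metadata_path = 'metadata.tsv'  # illustrative metadata file

# Writes projector_config.pbtxt into the writer's logdir.
projector.visualize_embeddings(summary_writer, config)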
Example #13
 def testCopyOverwriteFalse(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.write_string_to_file(copy_path, "copy")
   with self.assertRaises(errors.AlreadyExistsError):
     file_io.copy(file_path, copy_path, overwrite=False)
Example #14
  def testAssets(self):
    export_dir = self._get_export_dir("test_assets")
    builder = saved_model_builder.SavedModelBuilder(export_dir)

    with self.test_session(graph=ops.Graph()) as sess:
      self._init_and_validate_variable(sess, "v", 42)

      # Build an asset collection.
      ignored_filepath = os.path.join(
          compat.as_bytes(test.get_temp_dir()), compat.as_bytes("ignored.txt"))
      file_io.write_string_to_file(ignored_filepath, "will be ignored")

      asset_collection = self._build_asset_collection("hello42.txt",
                                                      "foo bar baz",
                                                      "asset_file_tensor")

      builder.add_meta_graph_and_variables(
          sess, ["foo"], assets_collection=asset_collection)

    # Save the SavedModel to disk.
    builder.save()

    with self.test_session(graph=ops.Graph()) as sess:
      foo_graph = loader.load(sess, ["foo"], export_dir)
      self._validate_asset_collection(export_dir, foo_graph.collection_def,
                                      "hello42.txt", "foo bar baz",
                                      "asset_file_tensor:0")
      ignored_asset_path = os.path.join(
          compat.as_bytes(export_dir),
          compat.as_bytes(constants.ASSETS_DIRECTORY),
          compat.as_bytes("ignored.txt"))
      self.assertFalse(file_io.file_exists(ignored_asset_path))
Example #15
 def _build_asset_collection(self, asset_file_name, asset_file_contents,
                             asset_file_tensor_name):
   asset_filepath = os.path.join(
       compat.as_bytes(tf.test.get_temp_dir()),
       compat.as_bytes(asset_file_name))
   file_io.write_string_to_file(asset_filepath, asset_file_contents)
   asset_file_tensor = tf.constant(asset_filepath, name=asset_file_tensor_name)
   tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset_file_tensor)
   asset_collection = tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS)
   return asset_collection
Example #16
def _write_object_graph(saveable_view, export_dir, asset_file_def_index):
  """Save a SavedObjectGraph proto for `root`."""
  # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
  # checkpoint. It will eventually go into the SavedModel.
  proto = saved_object_graph_pb2.SavedObjectGraph()
  saveable_view.fill_object_graph_proto(proto)

  node_ids = util.ObjectIdentityDictionary()
  for i, obj in enumerate(saveable_view.nodes):
    node_ids[obj] = i
    if resource_variable_ops.is_resource_variable(obj):
      node_ids[obj.handle] = i
    elif isinstance(obj, tracking.TrackableAsset):
      node_ids[obj.asset_path.handle] = i

  for obj, obj_proto in zip(saveable_view.nodes, proto.nodes):
    _write_object_proto(obj, obj_proto, asset_file_def_index, node_ids)

  extra_asset_dir = os.path.join(
      compat.as_bytes(export_dir),
      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY))
  file_io.recursive_create_dir(extra_asset_dir)
  object_graph_filename = os.path.join(
      extra_asset_dir, compat.as_bytes("object_graph.pb"))
  file_io.write_string_to_file(object_graph_filename, proto.SerializeToString())
Example #17
  def test_numerics(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(['%s,%s,%s' % (i, 10 * i + 0.5, i + 0.5) for i in range(100)]))

      schema = [{'name': 'col1', 'type': 'INTEGER'},
                {'name': 'col2', 'type': 'FLOAT'},
                {'name': 'col3', 'type': 'FLOAT'}]
      features = {'col1': {'transform': 'scale', 'source_column': 'col1'},
                  'col2': {'transform': 'identity', 'source_column': 'col2'},
                  'col3': {'transform': 'target'}}
      feature_analysis.run_local_analysis(
          output_folder, [input_file_path], schema, features)

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())

      self.assertEqual(stats['num_examples'], 100)
      col = stats['column_stats']['col1']
      self.assertAlmostEqual(col['max'], 99.0)
      self.assertAlmostEqual(col['min'], 0.0)
      self.assertAlmostEqual(col['mean'], 49.5)

      col = stats['column_stats']['col2']
      self.assertAlmostEqual(col['max'], 990.5)
      self.assertAlmostEqual(col['min'], 0.5)
      self.assertAlmostEqual(col['mean'], 495.5)
    finally:
      shutil.rmtree(output_folder)
Example #18
def create_object_test():
  """Verifies file_io's object manipulation methods ."""
  starttime = int(round(time.time() * 1000))
  dir_name = "%s/tf_gcs_test_%s" % (FLAGS.gcs_bucket_url, starttime)
  print("Creating dir %s." % dir_name)
  file_io.create_dir(dir_name)

  # Create a file in this directory.
  file_name = "%s/test_file.txt" % dir_name
  print("Creating file %s." % file_name)
  file_io.write_string_to_file(file_name, "test file creation.")

  list_files_pattern = "%s/test_file*.txt" % dir_name
  print("Getting files matching pattern %s." % list_files_pattern)
  files_list = file_io.get_matching_files(list_files_pattern)
  print(files_list)

  assert len(files_list) == 1
  assert files_list[0] == file_name

  # Cleanup test files.
  print("Deleting file %s." % file_name)
  file_io.delete_file(file_name)

  # Delete directory.
  print("Deleting directory %s." % dir_name)
  file_io.delete_recursively(dir_name)
Example #19
 def testFileWrite(self):
   file_path = os.path.join(self.get_temp_dir(), "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   self.assertTrue(file_io.file_exists(file_path))
   file_contents = file_io.read_file_to_string(file_path)
   self.assertEqual(b"testing", file_contents)
   file_io.delete_file(file_path)
Example #20
 def testCreateRecursiveDir(self):
   dir_path = os.path.join(self._base_dir, "temp_dir/temp_dir1/temp_dir2")
   file_io.recursive_create_dir(dir_path)
   file_path = os.path.join(dir_path, "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   self.assertTrue(file_io.file_exists(file_path))
   file_io.delete_recursively(os.path.join(self._base_dir, "temp_dir"))
   self.assertFalse(file_io.file_exists(file_path))
Example #21
 def testStat(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   file_statistics = file_io.stat(file_path)
   os_statistics = os.stat(file_path)
   self.assertEquals(7, file_statistics.length)
   self.assertEqual(
       int(os_statistics.st_mtime), int(file_statistics.mtime_nsec / 1e9))
Example #22
 def testRenameOverwriteFalse(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   rename_path = os.path.join(self._base_dir, "rename_file")
   file_io.write_string_to_file(rename_path, "rename")
   with self.assertRaises(errors.AlreadyExistsError):
     file_io.rename(file_path, rename_path, overwrite=False)
   self.assertTrue(file_io.file_exists(rename_path))
   self.assertTrue(file_io.file_exists(file_path))
Example #23
 def testRenameOverwrite(self):
   file_path = os.path.join(self.get_temp_dir(), "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   rename_path = os.path.join(self.get_temp_dir(), "rename_file")
   file_io.write_string_to_file(rename_path, "rename")
   file_io.rename(file_path, rename_path, overwrite=True)
   self.assertTrue(file_io.file_exists(rename_path))
   self.assertFalse(file_io.file_exists(file_path))
   file_io.delete_file(rename_path)
Example #24
def save_schema_features(schema, features, output):
  # Save a copy of the schema and features in the output folder.
  file_io.write_string_to_file(
    os.path.join(output, constant.SCHEMA_FILE),
    json.dumps(schema, indent=2))

  file_io.write_string_to_file(
    os.path.join(output, constant.FEATURES_FILE),
    json.dumps(features, indent=2))
Example #25
 def testCopyOverwrite(self):
   file_path = os.path.join(self.get_temp_dir(), "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   copy_path = os.path.join(self.get_temp_dir(), "copy_file")
   file_io.write_string_to_file(copy_path, "copy")
   file_io.copy(file_path, copy_path, overwrite=True)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
   file_io.delete_file(file_path)
   file_io.delete_file(copy_path)
Example #26
  def test_text(self):
    output_folder = tempfile.mkdtemp()
    input_file_path = tempfile.mkstemp(dir=output_folder)[1]
    try:
      csv_file = ['the quick brown fox,raining in kir,cat1|cat2,true',
                  'quick   brown brown chicken,raining in pdx,cat2|cat3|cat4,false']
      file_io.write_string_to_file(
        input_file_path,
        '\n'.join(csv_file))

      schema = [{'name': 'col1', 'type': 'STRING'},
                {'name': 'col2', 'type': 'STRING'},
                {'name': 'col3', 'type': 'STRING'},
                {'name': 'col4', 'type': 'STRING'}]
      features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                  'col2': {'transform': 'tfidf', 'source_column': 'col2'},
                  'col3': {'transform': 'multi_hot', 'source_column': 'col3', 'separator': '|'},
                  'col4': {'transform': 'target'}}
      feature_analysis.run_local_analysis(
        output_folder, [input_file_path], schema, features)

      stats = json.loads(
          file_io.read_file_to_string(
              os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
      self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
      self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)
      self.assertEqual(stats['column_stats']['col3']['vocab_size'], 4)

      vocab_str = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['col1', 'count'])

      # vocabs are sorted by count only
      col1_vocab = vocab['col1'].tolist()
      self.assertItemsEqual(col1_vocab[:2], ['brown', 'quick'])
      self.assertItemsEqual(col1_vocab[2:], ['chicken', 'fox', 'the'])
      self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

      vocab_str = file_io.read_file_to_string(
          os.path.join(output_folder,
                       analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
      vocab = pd.read_csv(six.StringIO(vocab_str),
                          header=None,
                          names=['col2', 'count'])

      # vocabs are sorted by count only
      col2_vocab = vocab['col2'].tolist()
      self.assertItemsEqual(col2_vocab[:2], ['in', 'raining'])
      self.assertItemsEqual(col2_vocab[2:], ['kir', 'pdx'])
      self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
    finally:
      shutil.rmtree(output_folder)
Example #27
 def testIsDirectory(self):
   dir_path = os.path.join(self._base_dir, "test_dir")
   # Failure for a non-existing dir.
   with self.assertRaises(errors.NotFoundError):
     file_io.is_directory(dir_path)
   file_io.create_dir(dir_path)
   self.assertTrue(file_io.is_directory(dir_path))
   file_path = os.path.join(dir_path, "test_file")
   file_io.write_string_to_file(file_path, "test")
   # False for a file.
   self.assertFalse(file_io.is_directory(file_path))
Example #28
  def testAssets(self):
    export_dir = os.path.join(
        compat.as_bytes(tf.test.get_temp_dir()), compat.as_bytes("with-assets"))
    builder = saved_model_builder.SavedModelBuilder(export_dir)

    with self.test_session(graph=tf.Graph()) as sess:
      v = tf.Variable(42, name="v")
      sess.run(tf.initialize_all_variables())
      self.assertEqual(42, v.eval())

      # Build an asset collection.
      asset_filepath = os.path.join(
          compat.as_bytes(tf.test.get_temp_dir()),
          compat.as_bytes("hello42.txt"))
      file_io.write_string_to_file(asset_filepath, "foo bar baz")
      asset_file_tensor = tf.constant(asset_filepath, name="asset_file_tensor")
      tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset_file_tensor)

      ignored_filepath = os.path.join(
          compat.as_bytes(tf.test.get_temp_dir()),
          compat.as_bytes("ignored.txt"))
      file_io.write_string_to_file(ignored_filepath, "will be ignored")

      asset_collection = tf.get_collection(tf.GraphKeys.ASSET_FILEPATHS)

      builder.add_meta_graph_and_variables(
          sess, ["foo"], assets_collection=asset_collection)

    # Save the SavedModel to disk.
    builder.save()

    with self.test_session(graph=tf.Graph()) as sess:
      foo_graph = loader.load(sess, ["foo"], export_dir)

      # Validate the assets.
      collection_def = foo_graph.collection_def
      assets_any = collection_def[constants.ASSETS_KEY].any_list.value
      self.assertEqual(len(assets_any), 1)
      asset = manifest_pb2.AssetFile()
      assets_any[0].Unpack(asset)
      assets_path = os.path.join(
          compat.as_bytes(export_dir),
          compat.as_bytes(constants.ASSETS_DIRECTORY),
          compat.as_bytes("hello42.txt"))
      asset_contents = file_io.read_file_to_string(assets_path)
      self.assertEqual("foo bar baz", compat.as_text(asset_contents))
      self.assertEqual("hello42.txt", asset.filename)
      self.assertEqual("asset_file_tensor:0", asset.tensor_binding.tensor_name)
      ignored_asset_path = os.path.join(
          compat.as_bytes(export_dir),
          compat.as_bytes(constants.ASSETS_DIRECTORY),
          compat.as_bytes("ignored.txt"))
      self.assertFalse(file_io.file_exists(ignored_asset_path))
Example #29
  def test_text(self):
    test_folder = os.path.join(self._bucket_root, 'test_text')
    input_file_path = os.path.join(test_folder, 'input.csv')
    output_folder = os.path.join(test_folder, 'test_output')
    file_io.recursive_create_dir(output_folder)

    csv_file = ['the quick brown fox,raining in kir,cat1|cat2,true',
                'quick   brown brown chicken,raining in pdx,cat2|cat3|cat4,false']
    file_io.write_string_to_file(
      input_file_path,
      '\n'.join(csv_file))

    schema = [{'name': 'col1', 'type': 'STRING'},
              {'name': 'col2', 'type': 'STRING'},
              {'name': 'col3', 'type': 'STRING'},
              {'name': 'col4', 'type': 'STRING'}]
    features = {'col1': {'transform': 'bag_of_words', 'source_column': 'col1'},
                'col2': {'transform': 'tfidf', 'source_column': 'col2'},
                'col3': {'transform': 'multi_hot', 'source_column': 'col3', 'separator': '|'},
                'col4': {'transform': 'target'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=input_file_path,
        bigquery_table=None,
        schema=schema,
        features=features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['col1']['vocab_size'], 5)
    self.assertEqual(stats['column_stats']['col2']['vocab_size'], 4)
    self.assertEqual(stats['column_stats']['col3']['vocab_size'], 4)

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col1'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col1', 'count'])
    self.assertEqual(vocab['col1'].tolist(),
                     ['brown', 'quick', 'chicken', 'fox', 'the', ])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1, 1])

    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'col2'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['col2', 'count'])
    self.assertEqual(vocab['col2'].tolist(), ['in', 'raining', 'kir', 'pdx'])
    self.assertEqual(vocab['count'].tolist(), [2, 2, 1, 1])
Example #30
  def test_categorical(self):
    test_folder = os.path.join(self._bucket_root, 'test_categorical')
    input_file_path = os.path.join(test_folder, 'input.csv')
    output_folder = os.path.join(test_folder, 'test_output')
    file_io.recursive_create_dir(output_folder)

    csv_file = ['red,car,apple', 'red,truck,pepper', 'red,van,apple', 'blue,bike,grape',
                'blue,train,apple', 'green,airplane,pepper']
    file_io.write_string_to_file(
      input_file_path,
      '\n'.join(csv_file))

    schema = [{'name': 'color', 'type': 'STRING'},
              {'name': 'transport', 'type': 'STRING'},
              {'name': 'type', 'type': 'STRING'}]
    features = {'color': {'transform': 'one_hot', 'source_column': 'color'},
                'transport': {'transform': 'embedding', 'source_column': 'transport'},
                'type': {'transform': 'target'}}
    analyze.run_cloud_analysis(
        output_dir=output_folder,
        csv_file_pattern=input_file_path,
        bigquery_table=None,
        schema=schema,
        features=features)

    stats = json.loads(
        file_io.read_file_to_string(
            os.path.join(output_folder, analyze.constant.STATS_FILE)).decode())
    self.assertEqual(stats['column_stats']['color']['vocab_size'], 3)
    self.assertEqual(stats['column_stats']['transport']['vocab_size'], 6)

    # Color column.
    vocab_str = file_io.read_file_to_string(
      os.path.join(output_folder, analyze.constant.VOCAB_ANALYSIS_FILE % 'color'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['color', 'count'])
    expected_vocab = pd.DataFrame(
        {'color': ['red', 'blue', 'green'], 'count': [3, 2, 1]},
        columns=['color', 'count'])
    pd.util.testing.assert_frame_equal(vocab, expected_vocab)

    # transport column.
    vocab_str = file_io.read_file_to_string(
        os.path.join(output_folder,
                     analyze.constant.VOCAB_ANALYSIS_FILE % 'transport'))
    vocab = pd.read_csv(six.StringIO(vocab_str),
                        header=None,
                        names=['transport', 'count'])
    self.assertEqual(vocab['count'].tolist(), [1 for i in range(6)])
    self.assertEqual(vocab['transport'].tolist(),
                     ['airplane', 'bike', 'car', 'train', 'truck', 'van'])
Example #31
    def testRun(self, mock_publisher):
        mock_publisher.return_value.publish_execution.return_value = {}

        test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()

        pipeline_root = os.path.join(test_dir, 'Test')
        input_path = os.path.join(test_dir, 'input')
        tf.gfile.MakeDirs(os.path.dirname(input_path))
        file_io.write_string_to_file(input_path, 'test')

        input_artifact = types.Artifact(type_name='InputPath')
        input_artifact.uri = input_path

        component = test_utils._FakeComponent(
            name='FakeComponent',
            input_channel=channel_utils.as_channel([input_artifact]))

        pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                                pipeline_root=pipeline_root,
                                                run_id='123')

        driver_args = data_types.DriverArgs(enable_cache=True)

        # We use InProcessComponentLauncher to test BaseComponentLauncher logics.
        launcher = in_process_component_launcher.InProcessComponentLauncher.create(
            component=component,
            pipeline_info=pipeline_info,
            driver_args=driver_args,
            metadata_connection_config=connection_config,
            beam_pipeline_args=[],
            additional_pipeline_args={})
        self.assertEqual(
            launcher._component_info.component_type, '.'.join([
                test_utils._FakeComponent.__module__,
                test_utils._FakeComponent.__name__
            ]))
        launcher.launch()

        output_path = os.path.join(pipeline_root, 'output')
        self.assertTrue(tf.gfile.Exists(output_path))
        contents = file_io.read_file_to_string(output_path)
        self.assertEqual('test', contents)
Example #32
def local_batch_predict(model_dir,
                        csv_file_pattern,
                        output_dir,
                        output_format,
                        batch_size=100):
    """ Batch Predict with a specified model.

  It does batch prediction, saves results to output files and also creates an output
  schema file. The output file names are input file names prepended by 'predict_results_'.

  Args:
    model_dir: The model directory containing a SavedModel (usually saved_model.pb).
    csv_file_pattern: a pattern of csv files as batch prediction source.
    output_dir: the path of the output directory.
    output_format: csv or json.
    batch_size: Larger batch_size improves performance but may
        cause more memory usage.
  """

    file_io.recursive_create_dir(output_dir)
    csv_files = file_io.get_matching_files(csv_file_pattern)
    if len(csv_files) == 0:
        raise ValueError('No files found given ' + csv_file_pattern)

    with tf.Graph().as_default(), tf.Session() as sess:
        input_alias_map, output_alias_map = _tf_load_model(sess, model_dir)
        csv_tensor_name = list(input_alias_map.values())[0]
        output_schema = _get_output_schema(sess, output_alias_map)
        for csv_file in csv_files:
            output_file = os.path.join(
                output_dir, 'predict_results_' +
                os.path.splitext(os.path.basename(csv_file))[0] + '.' +
                output_format)
            with file_io.FileIO(output_file, 'w') as f:
                prediction_source = _batch_csv_reader(csv_file, batch_size)
                for batch in prediction_source:
                    batch = [l.rstrip() for l in batch if l]
                    predict_results = sess.run(
                        fetches=output_alias_map,
                        feed_dict={csv_tensor_name: batch})
                    formatted_results = _format_results(
                        output_format, output_schema, predict_results)
                    f.write('\n'.join(formatted_results) + '\n')

    file_io.write_string_to_file(
        os.path.join(output_dir, 'predict_results_schema.json'),
        json.dumps(output_schema, indent=2))
Example #33
    def test_make_transform_graph_images(self):

        print(
            'Testing make_transform_graph with image_to_vec. ' +
            'It may take a few minutes because it needs to download a large inception checkpoint.'
        )

        def _open_and_encode_image(img_url):
            with file_io.FileIO(img_url, 'r') as f:
                img = Image.open(f).convert('RGB')
                output = cStringIO.StringIO()
                img.save(output, 'jpeg')
            return output.getvalue()

        try:
            output_folder = tempfile.mkdtemp()
            stats_file_path = os.path.join(output_folder,
                                           analyze_data.STATS_FILE)
            file_io.write_string_to_file(stats_file_path,
                                         json.dumps({'column_stats': {}}))
            analyze_data.make_transform_graph(output_folder, [{
                'name': 'img',
                'type': 'STRING'
            }], {'img': {
                'transform': 'image_to_vec'
            }})

            model_path = os.path.join(output_folder, 'transform_fn')
            self.assertTrue(
                os.path.isfile(os.path.join(model_path, 'saved_model.pb')))

            img_string1 = _open_and_encode_image(
                'gs://cloud-ml-data/img/flower_photos/daisy/15207766_fc2f1d692c_n.jpg'
            )
            img_string2 = _open_and_encode_image(
                'gs://cloud-ml-data/img/flower_photos/dandelion/8980164828_04fbf64f79_n.jpg'
            )
            results = self._run_graph(model_path,
                                      {'img': [img_string1, img_string2]})
            embeddings = results['img']
            self.assertEqual(len(embeddings), 2)
            self.assertEqual(len(embeddings[0]), 2048)
            self.assertEqual(embeddings[0].dtype, np.float32)
            self.assertTrue(any(x != 0.0 for x in embeddings[1]))

        finally:
            shutil.rmtree(output_folder)
Example #34
  def _make_test_files(self):
    """Builds test files and folders"""

    # Make the output folders
    self._test_dir = tempfile.mkdtemp()
    self._preprocess_output = os.path.join(self._test_dir, 'preprocess')
    self._train_output = os.path.join(self._test_dir, 'train')
    self._batch_predict_output = os.path.join(self._test_dir, 'batch_predict')

    # Don't make train_output folder as it should not exist at training time.
    os.mkdir(self._preprocess_output)
    os.mkdir(self._batch_predict_output)

    # Make csv files
    self._csv_train_filename = os.path.join(self._test_dir,
                                            'train_csv_data.csv')
    self._csv_eval_filename = os.path.join(self._test_dir,
                                           'eval_csv_data.csv')
    self._csv_predict_filename = os.path.join(self._test_dir,
                                              'predict_csv_data.csv')
    e2e_functions.make_csv_data(self._csv_train_filename, 100, 'regression',
                                True)
    e2e_functions.make_csv_data(self._csv_eval_filename, 100, 'regression',
                                True)
    self._predict_num_rows = 10
    e2e_functions.make_csv_data(self._csv_predict_filename,
                                self._predict_num_rows, 'regression', False)

    # Make schema file
    self._schema_filename = os.path.join(self._test_dir, 'schema.json')
    e2e_functions.make_preprocess_schema(self._schema_filename, 'regression')

    # Make feature file
    self._input_features_filename = os.path.join(self._test_dir,
                                                 'input_features_file.json')
    transforms = {
        "num1": {"transform": "scale"},
        "num2": {"transform": "scale", "value": 4},
        "str1": {"transform": "one_hot"},
        "str2": {"transform": "embedding", "embedding_dim": 3},
        "target": {"transform": "target"},
        "key": {"transform": "key"},
    }
    file_io.write_string_to_file(
        self._input_features_filename,
        json.dumps(transforms, indent=2))
Example #35
def write_anomalies_text(anomalies, output_path):
    """Writes the Anomalies proto to a file in text format.

  Args:
    anomalies: An Anomalies protocol buffer.
    output_path: File path to which to write the Anomalies proto.

  Raises:
    TypeError: If the input Anomalies proto is not of the expected type.
  """
    if not isinstance(anomalies, anomalies_pb2.Anomalies):
        raise TypeError(
            'anomalies is of type %s; should be an Anomalies proto.' %
            type(anomalies).__name__)

    anomalies_text = text_format.MessageToString(anomalies)
    file_io.write_string_to_file(output_path, anomalies_text)
Example #36
    def test_numerics(self):
        output_folder = tempfile.mkdtemp()
        input_file_path = tempfile.mkstemp(dir=output_folder)[1]
        try:
            file_io.write_string_to_file(
                input_file_path,
                '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

            schema = [{
                'name': 'col1',
                'type': 'INTEGER'
            }, {
                'name': 'col2',
                'type': 'FLOAT'
            }]
            features = {
                'col1': {
                    'transform': 'scale',
                    'source_column': 'col1'
                },
                'col2': {
                    'transform': 'identity',
                    'source_column': 'col2'
                }
            }
            analyze.run_local_analysis(output_folder,
                                       [input_file_path], schema,
                                       analyze.invert_features(features))

            stats = json.loads(
                file_io.read_file_to_string(
                    os.path.join(output_folder,
                                 analyze.constant.STATS_FILE)).decode())

            self.assertEqual(stats['num_examples'], 100)
            col = stats['column_stats']['col1']
            self.assertAlmostEqual(col['max'], 99.0)
            self.assertAlmostEqual(col['min'], 0.0)
            self.assertAlmostEqual(col['mean'], 49.5)

            col = stats['column_stats']['col2']
            self.assertAlmostEqual(col['max'], 990.5)
            self.assertAlmostEqual(col['min'], 0.5)
            self.assertAlmostEqual(col['mean'], 495.5)
        finally:
            shutil.rmtree(output_folder)
Example #37
    def write(self, schema, path):
        """Writes a v1 `Schema` to disk as JSON.

    The function converts the in-memory Schema representation to the v1 Schema
    JSON representation, and writes it to the specified path.

    Args:
      schema: The Schema to write.
      path: the filename to write to.
    """
        schema_as_json = schema_io_v1_json_writer.to_schema_json(schema)

        basedir = os.path.dirname(path)
        if not file_io.file_exists(basedir):
            file_io.recursive_create_dir(basedir)

        file_io.write_string_to_file(path + ".json", schema_as_json)
Example #38
def write_stats_text(stats, output_path):
    """Writes a DatasetFeatureStatisticsList proto to a file in text format.

  Args:
    stats: A DatasetFeatureStatisticsList proto.
    output_path: File path to write the DatasetFeatureStatisticsList proto.

  Raises:
    TypeError: If the input proto is not of the expected type.
  """
    if not isinstance(stats, statistics_pb2.DatasetFeatureStatisticsList):
        raise TypeError('stats is of type %s, should be a '
                        'DatasetFeatureStatisticsList proto.' %
                        type(stats).__name__)

    stats_proto_text = text_format.MessageToString(stats)
    file_io.write_string_to_file(output_path, stats_proto_text)
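
As a complementary sketch, a text proto written this way can be parsed back with the standard protobuf text-format API; the path below is illustrative and `statistics_pb2` is the tensorflow_metadata proto module.

from google.protobuf import text_format
from tensorflow.python.lib.io import file_io
from tensorflow_metadata.proto.v0 import statistics_pb2

# Read the text-format proto written by write_stats_text and parse it back.
stats_text = file_io.read_file_to_string('/tmp/stats.pbtxt')  # illustrative path
stats = text_format.Parse(stats_text, statistics_pb2.DatasetFeatureStatisticsList())
print(len(stats.datasets))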
Example #39
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validates the statistics against the schema and materializes anomalies.

    Args:
      stats_path: Location of the stats used to infer the schema.
      schema_path: Location of the schema to be used for validation.
      anomalies_path: Location where the detected anomalies are materialized.
    """
    # Validating schema against the computed statistics
    schema = my_metadata.read_schema(schema_path)

    stats = tfdv.load_statistics(stats_path)
    anomalies = tfdv.validate_statistics(stats, schema)

    # Write the detected anomalies to the anomalies path.
    file_io.write_string_to_file(anomalies_path,
                                 text_format.MessageToString(anomalies))
Example #40
def train_step(sess, train_op, global_step, train_step_kwargs):
    """
  Function that takes a gradient step and specifies whether to stop.
  """
    start_time = time.time()

    trace_run_options = None
    run_metadata = None
    if 'should_trace' in train_step_kwargs:
        if 'logdir' not in train_step_kwargs:
            raise ValueError(
                'logdir must be present in train_step_kwargs when '
                'should_trace is present')
        if sess.run(train_step_kwargs['should_trace']):
            trace_run_options = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)
            run_metadata = config_pb2.RunMetadata()

    total_loss, _, np_global_step = sess.run(
        [train_op[0], train_op[1], global_step],
        options=trace_run_options,
        run_metadata=run_metadata)
    # sess.run(train_op[1])
    time_elapsed = time.time() - start_time

    if run_metadata is not None:
        tl = timeline.Timeline(run_metadata.step_stats)
        trace = tl.generate_chrome_trace_format()
        trace_filename = os.path.join(train_step_kwargs['logdir'],
                                      'tf_trace-%d.json' % np_global_step)
        logging.info('Writing trace to %s', trace_filename)
        file_io.write_string_to_file(trace_filename, trace)
        if 'summary_writer' in train_step_kwargs:
            train_step_kwargs['summary_writer'].add_run_metadata(
                run_metadata, 'run_metadata-%d' % np_global_step)

    if 'should_log' in train_step_kwargs:
        if sess.run(train_step_kwargs['should_log']):
            logging.info('global step %d: loss = %.4f (%.3f sec/step)',
                         np_global_step, total_loss, time_elapsed)
    if 'should_stop' in train_step_kwargs:
        should_stop = sess.run(train_step_kwargs['should_stop'])
    else:
        should_stop = False

    return total_loss, should_stop
Example #41
def _do_aiplatform_inference(model, version, serialized_examples):
    """Performs inference on the model:version in AI Platform."""
    working_dir = tempfile.mkdtemp()
    instances_file = os.path.join(working_dir, 'test.json')
    json_examples = []
    for serialized_example in serialized_examples:
        # The encoding follows the example in:
        # https://github.com/GoogleCloudPlatform/training-data-analyst/blob/master/quests/tpu/invoke_model.py
        json_examples.append(
            '{ "inputs": { "b64": "%s" } }' %
            base64.b64encode(serialized_example).decode('utf-8'))
    file_io.write_string_to_file(instances_file, '\n'.join(json_examples))
    gcloud_command = [
        'gcloud', 'ai-platform', 'predict', '--model', model, '--version',
        version, '--json-instances', instances_file
    ]
    print(subprocess.check_output(gcloud_command))
Example #42
def train_step(sess, train_op, global_step, train_step_kwargs):
    start_time = time.time()

    trace_run_options = None
    run_metadata = None
    if 'should_trace' in train_step_kwargs:
        if 'logdir' not in train_step_kwargs:
            raise ValueError('logdir must be present in train_step_kwargs when '
                             'should_trace is present')
        if sess.run(train_step_kwargs['should_trace']):
            trace_run_options = config_pb2.RunOptions(
                trace_level=config_pb2.RunOptions.FULL_TRACE)
            run_metadata = config_pb2.RunMetadata()

    total_loss, np_global_step = sess.run([train_op, global_step],
                                          options=trace_run_options,
                                          run_metadata=run_metadata)
    time_elapsed = time.time() - start_time

    if run_metadata is not None:
        tl = timeline.Timeline(run_metadata.step_stats)
        trace = tl.generate_chrome_trace_format()
        trace_filename = os.path.join(train_step_kwargs['logdir'],
                                      'tf_trace-%d.json' % np_global_step)
        logging.info('Writing trace to %s', trace_filename)
        file_io.write_string_to_file(trace_filename, trace)
        if 'summary_writer' in train_step_kwargs:
            train_step_kwargs['summary_writer'].add_run_metadata(run_metadata,
                                                                 'run_metadata-%d' %
                                                                 np_global_step)

    total_time_elapsed = (time.time() - train_step_kwargs['start_time'])
    if 'should_log' in train_step_kwargs:
        if np_global_step % train_step_kwargs['log_every_n_steps'] == 0:
            logging.info('global_step: %d\ttotal_time: %.3f\tloss: %.4f\tsec/step: %.3f)',
                         np_global_step, total_time_elapsed, total_loss, time_elapsed)

    if 'max_train_time_sec' in train_step_kwargs and 'start_time' in train_step_kwargs:
        should_stop = total_time_elapsed > train_step_kwargs['max_train_time_sec']
        if should_stop:
            logging.info('stopping after %.3f seconds' % total_time_elapsed)
    else:
        logging.warn('Time boundaries for training not given. Training will run until stopped.')
        should_stop = False

    return total_loss, should_stop
Example #43
    def after_run(self, run_context, run_values):  # pylint: disable=unused-argument
        if self._should_trigger:
            self._np_global_step = run_values.results
            # self._iter_count = self._np_global_step
            self._timer.update_last_triggered_step(self._iter_count)
            run_metadata = run_values.run_metadata
            if run_metadata is not None:
                tl = timeline.Timeline(run_metadata.step_stats)
                trace = tl.generate_chrome_trace_format()
                trace_filename = os.path.join(self._log_dir, f"tf_trace-{self._np_global_step}.json")
                tf.logging.info(f"Writing trace to {trace_filename}.")
                file_io.write_string_to_file(trace_filename, trace)
                # TODO: add run_metadata to summaries with summary_writer
                #   find how summaries are saved in the estimator and add them
                # summary_writer.add_run_metadata(run_metadata, f"run_metadata-{self._global_step}")

        self._iter_count += 1
Example #44
def write_model_config(model_dir: str, config_str: str):
    """Writes the config to the model's directory."""
    model_dir = root_dir(model_dir)
    model_config_path = get_model_config_path(model_dir)

    # if the model config file already exists, rename it
    if file_io.file_exists(model_config_path):
        prev_config_filename = '%s_%d.yaml' % (model_config_path.split('.')[0],
                                               int(time()))
        file_io.rename(model_config_path, prev_config_filename)
        logging.info('Previous model config file was renamed: %s' %
                     prev_config_filename)

    # save the config file to the model directory
    if not is_s3_path(model_dir):
        os.makedirs(model_dir, exist_ok=True)
    file_io.write_string_to_file(model_config_path, config_str)
Example #45
def save_once_or_wait_for_chief(write_fn,
                                metadata_dir,
                                is_chief,
                                timeout_secs=600):
    """Synchronizes saving data to disk across multiple tensorflow processes.

  This function can be used for synchronizing creation of data on disk that
  needs to be available to all processes in a Tensorflow cluster. Each process
  should call this function prior to using the data. The function makes the
  designated chief process write the data and every other process blocks until
  the data has been written.

  Args:
    write_fn: A function taking no arguments that executes the write to disk.
    metadata_dir: A path on the filesystem used for storing internal data used
      in this function (currently, a "done" sentinel file). If this directory
      doesn't exist it will be created; otherwise it should be writable.
    is_chief: Whether the current process is the designated chief. Only one
      process should pass this as "True".
    timeout_secs: The (approximate) time in seconds a non-chief process should
      wait for the data to be created.

  Raises:
    SaveOnceOrWaitTimeOutError if this is a non-chief process and the data has
      not been created by the chief after timeout_secs seconds.
  """
    done_file = os.path.join(metadata_dir, '__tensorflow_lattice__done')
    if not is_chief:
        _poll_for_file(done_file, timeout_secs)
        return

    if file_io.file_exists(done_file):
        return

    write_fn()

    # Create an empty done file.
    file_io.recursive_create_dir(metadata_dir)
    file_io.write_string_to_file(
        done_file, 'Time created [UTC]: %s'
        '\nHostname: %s'
        '\nProcess id: %s'
        '\nTraceback:\n%s' %
        (datetime.datetime.utcnow(), socket.gethostname(), os.getpid(),
         '\n'.join(traceback.format_stack())))
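
A minimal sketch of how callers might use this helper across a cluster; the writer function, paths, and `task_index` are hypothetical, and `file_io` is assumed to be the same module used throughout these examples.

def _write_shared_vocab():
  # Hypothetical writer; only the chief ends up executing this.
  file_io.write_string_to_file('/tmp/shared/vocab.txt', 'foo\nbar\nbaz')

# Every process calls this before reading the vocabulary; the chief writes it,
# everyone else blocks until the done file appears (or times out).
save_once_or_wait_for_chief(
    write_fn=_write_shared_vocab,
    metadata_dir='/tmp/shared/metadata',
    is_chief=(task_index == 0),  # task_index is assumed to come from the cluster spec
    timeout_secs=600)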
Example #46
  def _create_schema_features(self, problem_type, with_image=False):
    features = {
        'num_id': {'transform': 'identity'},
        'num_scale': {'transform': 'scale', 'value': 4},
        'str_one_hot': {'transform': 'one_hot'},
        'str_embedding': {'transform': 'embedding', 'embedding_dim': 3},
        'str_bow': {'transform': 'bag_of_words'},
        'str_tfidf': {'transform': 'tfidf'},
        'target': {'transform': 'target'},
        'key': {'transform': 'key'}}
    if with_image:
      # Download inception checkpoint. Note that gs url doesn't work because
      # we may not have gcloud signed in when running the test.
      url = ('https://storage.googleapis.com/cloud-ml-data/img/' +
             'flower_photos/inception_v3_2016_08_28.ckpt')
      checkpoint_path = os.path.join(self._test_dir, "checkpoint")
      response = urlopen(url)
      with open(checkpoint_path, 'wb') as f:
        f.write(response.read())

      features['image'] = {'transform': 'image_to_vec', 'checkpoint': checkpoint_path}

    schema = [
        {'name': 'key', 'type': 'integer'},
        {'name': 'target', 'type': 'string' if problem_type == 'classification' else 'float'},
        {'name': 'num_id', 'type': 'integer'},
        {'name': 'num_scale', 'type': 'float'},
        {'name': 'str_one_hot', 'type': 'string'},
        {'name': 'str_embedding', 'type': 'string'},
        {'name': 'str_bow', 'type': 'string'},
        {'name': 'str_tfidf', 'type': 'string'}]
    if with_image:
      schema.append({'name': 'image', 'type': 'string'})

    self._schema = schema

    file_io.write_string_to_file(self._schema_filename, json.dumps(schema, indent=2))
    file_io.write_string_to_file(self._features_filename, json.dumps(features, indent=2))

    if with_image:
      self.make_image_files()

    self.make_csv_data(self._csv_train_filename, 50, problem_type, True, with_image)
    self.make_csv_data(self._csv_eval_filename, 30, problem_type, True, with_image)
    self.make_csv_data(self._csv_predict_filename, 10, problem_type, False, with_image)
Example #47
  def test_stale_asset_collections_are_cleaned(self):
    vocabulary_file = os.path.join(
        compat.as_bytes(test.get_temp_dir()), compat.as_bytes('asset'))
    file_io.write_string_to_file(vocabulary_file, 'foo bar baz')

    export_path = os.path.join(tempfile.mkdtemp(), 'export')

    # create a SavedModel including assets
    with tf.Graph().as_default():
      with tf.Session().as_default() as session:
        input_string = tf.placeholder(tf.string)
        # Map string through a table loaded from an asset file
        table = lookup.string_to_index_table_from_file(
            vocabulary_file, num_oov_buckets=12,
            default_value=12)
        output = table.lookup(input_string)
        inputs = {'input': input_string}
        outputs = {'output': output}
        saved_transform_io.write_saved_transform_from_session(
            session, inputs, outputs, export_path)

    # Load it and save it again repeatedly, verifying that the asset collections
    # remain valid.
    for _ in [1, 2, 3]:
      with tf.Graph().as_default() as g:
        with tf.Session().as_default() as session:
          input_string = tf.constant('dog')
          inputs = {'input': input_string}
          outputs = saved_transform_io.apply_saved_transform(export_path,
                                                             inputs)

          self.assertEqual(
              1, len(g.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
          self.assertEqual(
              0, len(g.get_collection(tf.saved_model.constants.ASSETS_KEY)))

          # Check that every ASSET_FILEPATHS refers to a Tensor in the graph.
          # If not, get_tensor_by_name() raises KeyError.
          for asset_path in g.get_collection(ops.GraphKeys.ASSET_FILEPATHS):
            tensor_name = asset_path.name
            g.get_tensor_by_name(tensor_name)

          export_path = os.path.join(tempfile.mkdtemp(), 'export')
          saved_transform_io.write_saved_transform_from_session(
              session, inputs, outputs, export_path)
Example #48
0
    def _train_step(self, sess, train_op, global_step, train_step_kwargs):
        """Function that takes a gradient step and specifies whether to stop.

        Args:
            sess: The current session.
            train_op: An `Operation` that evaluates the gradients and returns the
                total loss.
            global_step: A `Tensor` representing the global training step.
            train_step_kwargs: A dictionary of keyword arguments.

        Returns:
            The total loss and a boolean indicating whether or not to stop training.

        Raises:
            ValueError: if 'should_trace' is in `train_step_kwargs` but `logdir` is not.
        """
        start_time = time.time()
        trace_run_options = None
        run_metadata = None
        if 'should_trace' in train_step_kwargs:
            if 'logdir' not in train_step_kwargs:
                raise ValueError('logdir must be present in train_step_kwargs '
                                 'when should_trace is present')
            if sess.run(train_step_kwargs['should_trace']):
                trace_run_options = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE)
                run_metadata = config_pb2.RunMetadata()
        total_loss, np_global_step = sess.run([train_op, global_step],
                                              options=trace_run_options,
                                              run_metadata=run_metadata)
        time_elapsed = time.time() - start_time
        if run_metadata is not None:
            tl = timeline.Timeline(run_metadata.step_stats)
            trace = tl.generate_chrome_trace_format()
            trace_filename = os.path.join(train_step_kwargs['logdir'], 'tf_trace-%d.json' % np_global_step)
            logging.info('Writing trace to %s', trace_filename)
            file_io.write_string_to_file(trace_filename, trace)
            if 'summary_writer' in train_step_kwargs:
                train_step_kwargs['summary_writer'].add_run_metadata(run_metadata, 'run_metadata-%d' % np_global_step)
        if 'should_log' in train_step_kwargs:
            if sess.run(train_step_kwargs['should_log']):
                logging.info('global step %d: loss = %.4f (%.2f sec/step)', np_global_step, total_loss, time_elapsed)
        if 'should_stop' in train_step_kwargs:
            should_stop = sess.run(train_step_kwargs['should_stop'])
        else:
            should_stop = False
        return total_loss, should_stop
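The _train_step helper above is driven entirely by the train_step_kwargs dictionary. Below is a minimal sketch, assuming TF1-style training code, of how that dictionary might be assembled; the log directory, step thresholds, and intervals are hypothetical:

import tensorflow as tf

# Hypothetical wiring for train_step_kwargs (all values below are illustrative).
global_step = tf.train.get_or_create_global_step()
train_step_kwargs = {
    # Required whenever 'should_trace' is supplied.
    'logdir': '/tmp/train_logs',
    # Stop after 1000 steps.
    'should_stop': tf.greater_equal(global_step, 1000),
    # Log the loss every 100 steps.
    'should_log': tf.equal(tf.mod(global_step, 100), 0),
    # Capture a full trace once, at step 50.
    'should_trace': tf.equal(global_step, 50),
    # Optional: lets the helper attach run metadata to TensorBoard.
    'summary_writer': tf.summary.FileWriter('/tmp/train_logs'),
}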
def _analyze(output_dir, dataset, cloud=False, project_id=None):
    import google.datalab.ml as ml
    from . import preprocess

    if not isinstance(dataset, ml.CsvDataSet):
        raise ValueError('Only CsvDataSet is supported')

    if len(dataset.input_files) != 1:
        raise ValueError(
            'CsvDataSet should be built with a file pattern, not a '
            'list of files.')

    if project_id and not cloud:
        raise ValueError('project_id only needed if cloud is True')

    if cloud:
        _assert_gcs_files([output_dir, dataset.input_files[0]])

    tmp_dir = tempfile.mkdtemp()
    try:
        # write the schema file.
        _, schema_file_path = tempfile.mkstemp(dir=tmp_dir,
                                               suffix='.json',
                                               prefix='schema')
        file_io.write_string_to_file(schema_file_path,
                                     json.dumps(dataset.schema))

        # TODO(brandondutra) use project_id in the local preprocess function.
        args = [
            'preprocess',
            '--input-file-pattern=%s' % dataset.input_files[0],
            '--output-dir=%s' % output_dir,
            '--schema-file=%s' % schema_file_path
        ]

        if cloud:
            if not project_id:
                project_id = _default_project()
            print('Track BigQuery status at')
            print('https://bigquery.cloud.google.com/queries/%s' % project_id)
            preprocess.cloud_preprocess.main(args)
        else:
            preprocess.local_preprocess.main(args)
    finally:
        shutil.rmtree(tmp_dir)
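A hypothetical invocation of _analyze, assuming google.datalab.ml.CsvDataSet can be built from a single file pattern and a schema string; the bucket, project, and column names below are illustrative:

import google.datalab.ml as ml

# Illustrative dataset; _analyze requires exactly one file pattern.
dataset = ml.CsvDataSet(
    file_pattern='gs://my-bucket/data/train-*.csv',
    schema='key:INTEGER,target:FLOAT,text:STRING')

_analyze(output_dir='gs://my-bucket/analysis',
         dataset=dataset,
         cloud=True,
         project_id='my-project')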
Example #50
0
def validate_stats(stats_path, schema_path, anomalies_path):
    """Validates the statistics against the schema and materializes anomalies.

    Args:
        stats_path: Location of the statistics to validate against the schema.
        schema_path: Location of the schema to be used for validation.
        anomalies_path: Location where the detected anomalies are materialized.
    """
    print('Validating schema against the computed statistics.')
    schema = tfdv.load_schema_text(schema_path)
    stats = tfdv.load_statistics(stats_path)
    anomalies = tfdv.validate_statistics(stats, schema)
    print('Detected following anomalies:')
    print(text_format.MessageToString(anomalies))

    print('Writing anomalies to anomalies path.')
    file_io.write_string_to_file(anomalies_path,
                                 text_format.MessageToString(anomalies))
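A minimal usage sketch for validate_stats; the paths are hypothetical and assume the statistics and schema were written earlier (for example with tfdv.generate_statistics_from_csv and tfdv.write_schema_text):

validate_stats(stats_path='/tmp/eval_stats.tfrecord',
               schema_path='/tmp/schema.pbtxt',
               anomalies_path='/tmp/anomalies.pbtxt')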
Example #51
0
def write_file_contents(file_path: Text, content: Text):
    """
    Writes contents of file.

    Args:
        file_path (str): Path to file.
        content (str): Contents of file.
    """
    return file_io.write_string_to_file(file_path, content)
def _write_mlpipeline_ui_metadata(stats_viz_output_path, inferred_schema):
    metadata_as_json = json.dumps({
        'outputs': [
            {
                'type': 'web-app',
                'storage': 'gcs',
                'source': stats_viz_output_path,
            },
            {
                'storage': 'inline',
                'source':
                _render_inferred_schema_summary_markdown(inferred_schema),
                'type': 'markdown',
            },
        ]
    })
    file_io.write_string_to_file('/tmp/mlpipeline-ui-metadata.json',
                                 metadata_as_json)
def _write_assets(assets_directory, assets_filename):
    """Writes asset files to be used with SavedModel for half plus two.

    Args:
        assets_directory: The directory to which the assets should be written.
        assets_filename: Name of the file to which the asset contents should be
            written.

    Returns:
        The path to which the assets file was written.
    """
    if not file_io.file_exists(assets_directory):
        file_io.recursive_create_dir(assets_directory)

    path = os.path.join(tf.compat.as_bytes(assets_directory),
                        tf.compat.as_bytes(assets_filename))
    file_io.write_string_to_file(path, "asset-file-contents")
    return path
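A sketch of how the path returned by _write_assets is typically registered in a TF1 graph so that the SavedModel builder copies the file into the exported assets directory; the directory and filename are hypothetical:

import tensorflow as tf

asset_path = _write_assets('/tmp/half_plus_two_assets', 'foo.txt')
# Registering the path under ASSET_FILEPATHS lets
# SavedModelBuilder.add_meta_graph_and_variables(..., assets_collection=...)
# pick it up and copy the file into the SavedModel's assets directory.
asset_tensor = tf.constant(asset_path, name='asset_filepath')
tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, asset_tensor)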
Example #54
0
def datahtml(bucket_name, commit_sha, train_file_path):
    import json
    import seaborn as sns
    import matplotlib.pyplot as plt
    import os
    image_path = os.path.join(bucket_name, commit_sha, 'visualization.png')
    # str.lstrip strips characters, not a prefix, so drop the 'gs://' scheme explicitly.
    bucket = (bucket_name[len('gs://'):]
              if bucket_name.startswith('gs://') else bucket_name)
    image_url = os.path.join('https://storage.googleapis.com',
                             bucket, commit_sha, 'visualization.png')
    html_path = os.path.join(bucket_name, 'kaggle.html')
    # output the visualization to a file

    import pandas as pd
    df_train = pd.read_csv(train_file_path)
    sns.set()
    cols = [
        'SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF',
        'FullBath', 'YearBuilt'
    ]
    sns.pairplot(df_train[cols], size=3)
    plt.savefig('visualization.png')
    from tensorflow.python.lib.io import file_io
    file_io.copy('visualization.png', image_path)
    rendered_template = """
    <html>
        <head>
            <title>correlation image</title>
        </head>
        <body>
            <img src="{}">
        </body>
    </html>""".format(image_url)
    file_io.write_string_to_file(html_path, rendered_template)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'gcs',
            'source': html_path,
        }]
    }
    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)
def update_renames_v2(output_file_path):
  """Writes a Python dictionary mapping deprecated to canonical API names.

  Args:
    output_file_path: File path to write output to. Any existing contents
      would be replaced.
  """
  function_renames = collect_function_renames()
  constant_renames = collect_constant_renames()
  all_renames = function_renames.union(constant_renames)

  # List of rename lines to write to output file in the form:
  #   'tf.deprecated_name': 'tf.canonical_name'
  rename_lines = [
      get_rename_line(name, canonical_name)
      for name, canonical_name in all_renames]
  renames_file_text = '%srenames = {\n%s\n}\n' % (
      _FILE_HEADER, ',\n'.join(sorted(rename_lines)))
  file_io.write_string_to_file(output_file_path, renames_file_text)
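For illustration, the file written above ends up holding a single dictionary literal of roughly this shape (the entries are placeholders, not the real rename list):

renames = {
    'tf.deprecated_name': 'tf.canonical_name',
    'tf.other_deprecated_name': 'tf.other_canonical_name'
}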
Example #56
0
    def test_run(self, mock_publisher):
        mock_publisher.return_value.publish_execution.return_value = {}

        test_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)

        connection_config = metadata_store_pb2.ConnectionConfig()
        connection_config.sqlite.SetInParent()

        pipeline_root = os.path.join(test_dir, 'Test')
        input_path = os.path.join(test_dir, 'input')
        tf.gfile.MakeDirs(os.path.dirname(input_path))
        file_io.write_string_to_file(input_path, 'test')

        input_artifact = types.TfxArtifact(type_name='InputPath')
        input_artifact.uri = input_path

        component = _FakeComponent(name='FakeComponent',
                                   input_channel=channel.as_channel(
                                       [input_artifact]))

        pipeline_info = data_types.PipelineInfo(pipeline_name='Test',
                                                pipeline_root=pipeline_root,
                                                run_id=123)

        driver_args = data_types.DriverArgs(worker_name=component.component_id,
                                            base_output_dir=os.path.join(
                                                pipeline_root,
                                                component.component_id),
                                            enable_cache=True)

        component_launcher.ComponentLauncher(
            component=component,
            pipeline_info=pipeline_info,
            driver_args=driver_args,
            metadata_connection_config=connection_config,
            additional_pipeline_args={}).run()

        output_path = os.path.join(pipeline_root, 'output')
        self.assertTrue(tf.gfile.Exists(output_path))
        contents = file_io.read_file_to_string(output_path)
        self.assertEqual('test', contents)
Example #57
0
    def test_numerics(self):
        test_folder = os.path.join(self._bucket_root, 'test_numerics')
        input_file_path = os.path.join(test_folder, 'input.csv')
        output_folder = os.path.join(test_folder, 'test_output')
        file_io.recursive_create_dir(output_folder)

        file_io.write_string_to_file(
            input_file_path,
            '\n'.join(['%s,%s' % (i, 10 * i + 0.5) for i in range(100)]))

        analyze_data.run_cloud_analysis(output_dir=output_folder,
                                        csv_file_pattern=input_file_path,
                                        bigquery_table=None,
                                        schema=[{
                                            'name': 'col1',
                                            'type': 'INTEGER'
                                        }, {
                                            'name': 'col2',
                                            'type': 'FLOAT'
                                        }],
                                        features={
                                            'col1': {
                                                'transform': 'scale'
                                            },
                                            'col2': {
                                                'transform': 'identity'
                                            }
                                        })
        stats = json.loads(
            file_io.read_file_to_string(
                os.path.join(output_folder, analyze_data.STATS_FILE)).decode())

        self.assertEqual(stats['num_examples'], 100)
        col = stats['column_stats']['col1']
        self.assertAlmostEqual(col['max'], 99.0)
        self.assertAlmostEqual(col['min'], 0.0)
        self.assertAlmostEqual(col['mean'], 49.5)

        col = stats['column_stats']['col2']
        self.assertAlmostEqual(col['max'], 990.5)
        self.assertAlmostEqual(col['min'], 0.5)
        self.assertAlmostEqual(col['mean'], 495.5)
Example #58
0
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.csv_schema_file:
    schema = json.loads(
        file_io.read_file_to_string(args.csv_schema_file).decode())
  else:
    import google.datalab.bigquery as bq
    schema = bq.Table(args.bigquery_table).schema._bq_schema
  features = json.loads(
      file_io.read_file_to_string(args.features_file).decode())

  expand_defaults(schema, features)  # features are updated.
  check_schema_transforms_match(schema, features)

  file_io.recursive_create_dir(args.output_dir)

  if args.cloud:
    run_cloud_analysis(
        output_dir=args.output_dir,
        csv_file_pattern=args.csv_file_pattern,
        bigquery_table=args.bigquery_table,
        schema=schema,
        features=features)
  else:
    run_local_analysis(
        output_dir=args.output_dir,
        csv_file_pattern=args.csv_file_pattern,
        schema=schema,
        features=features)

  # Also writes the transform fn and tft metadata.
  make_transform_graph(args.output_dir, schema, features)

  # Save a copy of the schema and features in the output folder.
  file_io.write_string_to_file(
      os.path.join(args.output_dir, SCHEMA_FILE),
      json.dumps(schema, indent=2))

  file_io.write_string_to_file(
      os.path.join(args.output_dir, FEATURES_FILE),
      json.dumps(features, indent=2))
Example #59
0
def _write_object_graph(root, export_dir, asset_file_def_index):
  """Save a SavedObjectGraph proto for `root`."""
  # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
  # checkpoint. It will eventually go into the SavedModel.
  proto = saved_object_graph_pb2.SavedObjectGraph()

  checkpointable_objects, node_ids, slot_variables = util.find_objects(root)
  util.fill_object_graph_proto(checkpointable_objects, node_ids, slot_variables,
                               proto)

  for obj, obj_proto in zip(checkpointable_objects, proto.nodes):
    _write_object_proto(obj, obj_proto, asset_file_def_index)

  extra_asset_dir = os.path.join(
      compat.as_bytes(export_dir),
      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY))
  file_io.recursive_create_dir(extra_asset_dir)
  object_graph_filename = os.path.join(
      extra_asset_dir, compat.as_bytes("object_graph.pb"))
  file_io.write_string_to_file(object_graph_filename, proto.SerializeToString())
def update_reorders_v2(output_file_path):
  """Writes a Python dictionary mapping function name to argument order.

  Args:
    output_file_path: File path to write output to. Any existing contents
      would be replaced.
  """
  reordered_function_names = (
      tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)

  all_reorders = collect_function_arg_names(reordered_function_names)

  # List of reorder lines to write to output file in the form:
  #   'tf.function_name': ['arg1', 'arg2', ...]
  rename_lines = [
      get_reorder_line(name, arg_names)
      for name, arg_names in all_reorders.items()]
  renames_file_text = '%sreorders = {\n%s\n}\n' % (
      _FILE_HEADER, ',\n'.join(sorted(rename_lines)))
  file_io.write_string_to_file(output_file_path, renames_file_text)
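Similarly, for illustration, the reorders file written above maps each reordered function to its full argument order (placeholder entries shown; the real list comes from TFAPIChangeSpec):

reorders = {
    'tf.some_reordered_function': ['input', 'axis', 'name'],
    'tf.another_reordered_function': ['tensor', 'dtype', 'name']
}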