Example #1
def train(_):
    training_dir = pjoin(FLAGS.training_set, FLAGS.train_subdir)
    feature_context = Datasets.get_context(training_dir)

    (feature_names, label_names) = feature_context.multispec_feature_groups

    training_dataset = Datasets.dict.read_dataset(training_dir)
    (feature_train_data,
     labels_train_data) = transform_dataset(feature_context, training_dataset)

    params = {
        'objective': 'multi:softprob',
        'verbose': False,
        'num_class': len(label_names),
        'max_depth': 6,
        'nthread': 4,
        'silent': 1
    }

    xg_train = xgb.DMatrix(feature_train_data, label=labels_train_data)
    xg_model = xgb.train(params, xg_train, FLAGS.rounds)

    model_path = pjoin(FLAGS.local_dir, "iterator.model")
    xg_model.save_model(model_path)

    output_path = pjoin(FLAGS.training_set, "xgboost/iterator.model")
    file_io.copy(model_path, output_path, overwrite=True)
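
Before the longer excerpts below, here is a minimal, self-contained sketch of the same publish step, assuming hypothetical /tmp paths: write a file locally, then hand it to file_io.copy. The call accepts local paths as well as gs:// destinations (when TensorFlow is built with GCS support) and only replaces an existing destination when overwrite=True is passed.

import os

from tensorflow.python.lib.io import file_io

# Hypothetical paths, for illustration only.
local_model = os.path.join("/tmp", "iterator.model")
published_model = os.path.join("/tmp", "published", "iterator.model")

# Write a placeholder artifact locally.
with file_io.FileIO(local_model, mode="w") as f:
    f.write("model bytes")

# file_io.copy expects a full destination file path (not a directory),
# and for local destinations the parent directory must already exist.
file_io.recursive_create_dir(os.path.dirname(published_model))
file_io.copy(local_model, published_model, overwrite=True)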
Example #2
  def _save_and_write_assets(self, assets_collection_to_add=None):
    """Saves asset to the meta graph and writes asset files to disk.

    Args:
      assets_collection_to_add: The collection where the asset paths are setup.
    """
    asset_source_filepath_list = _maybe_save_assets(assets_collection_to_add)

    # Return if there are no assets to write.
    if len(asset_source_filepath_list) == 0:
      tf_logging.info("No assets to write.")
      return

    assets_destination_dir = os.path.join(
        compat.as_bytes(self._export_dir),
        compat.as_bytes(constants.ASSETS_DIRECTORY))

    if not file_io.file_exists(assets_destination_dir):
      file_io.recursive_create_dir(assets_destination_dir)

    # Copy each asset from source path to destination path.
    for asset_source_filepath in asset_source_filepath_list:
      asset_source_filename = os.path.basename(asset_source_filepath)

      asset_destination_filepath = os.path.join(
          compat.as_bytes(assets_destination_dir),
          compat.as_bytes(asset_source_filename))

      # Only copy the asset file to the destination if it does not already
      # exist. This is to ensure that an asset with the same name defined as
      # part of multiple graphs is only copied the first time.
      if not file_io.file_exists(asset_destination_filepath):
        file_io.copy(asset_source_filepath, asset_destination_filepath)

    tf_logging.info("Assets written to: %s", assets_destination_dir)
Example #3
 def testCopyOverwriteFalse(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.FileIO(file_path, mode="w").write("testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.FileIO(copy_path, mode="w").write("copy")
   with self.assertRaises(errors.AlreadyExistsError):
     file_io.copy(file_path, copy_path, overwrite=False)
Example #4
    def _save_and_write_assets(self, assets_collection_to_add=None):
        """Saves asset to the meta graph and writes asset files to disk.

        Args:
          assets_collection_to_add: The collection where the asset paths are setup.
        """
        asset_filename_map = _maybe_save_assets(assets_collection_to_add)

        # Return if there are no assets to write.
        if not asset_filename_map:
            tf_logging.info("No assets to write.")
            return

        assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
            self._export_dir)

        # Copy each asset from source path to destination path.
        for asset_basename, asset_source_filepath in asset_filename_map.items(
        ):
            asset_destination_filepath = os.path.join(
                compat.as_bytes(assets_destination_dir),
                compat.as_bytes(asset_basename))

            # Only copy the asset file to the destination if it does not already
            # exist. This is to ensure that an asset with the same name defined as
            # part of multiple graphs is only copied the first time.
            if not file_io.file_exists(asset_destination_filepath):
                file_io.copy(asset_source_filepath, asset_destination_filepath)

        tf_logging.info("Assets written to: %s",
                        compat.as_text(assets_destination_dir))
Example #5
def down(id, cloud_path=None):
    shard = get_shard(id)
    to_path = "%s/%d" % (emb_path, shard)
    if not os.path.exists(to_path):
        try:
            os.mkdir(to_path)
        except OSError as e:
            if e.errno != 17:  # File exists
                raise e
    to_filepath = "%s/%d.emb" % (to_path, id)
    url = 'http://ml.daangn.com/articles/image_embeddings/%s' % id_to_path(id)
    logging.info('down: %s', url)
    result = call(
        ['curl', '-f', '--connect-timeout', '2', '-o', to_filepath, url])
    if not os.path.exists(to_filepath):
        return 0
    if os.stat(to_filepath).st_size < 1:
        os.remove(to_filepath)
        return 0

    if cloud_path:
        to_gs_filepath = '%s/%s' % (cloud_path, to_filepath)
        if file_io.file_exists(to_gs_filepath):
            return 0

        to_gs_path = '%s/%s' % (cloud_path, to_path)
        if not file_io.is_directory(to_gs_path):
            file_io.create_dir(to_gs_path)
        file_io.copy(to_filepath, to_gs_filepath)
    return 1
Example #6
    def _save_and_write_assets(self, assets_collection_to_add=None):
        """Saves asset to the meta graph and writes asset files to disk.

        Args:
          assets_collection_to_add: The collection where the asset paths are setup.
        """
        asset_source_filepath_list = self._save_assets(
            assets_collection_to_add)

        # Return if there are no assets to write.
        if len(asset_source_filepath_list) == 0:
            tf_logging.info("No assets to write.")
            return

        assets_destination_dir = os.path.join(
            compat.as_bytes(self._export_dir),
            compat.as_bytes(constants.ASSETS_DIRECTORY))

        if not file_io.file_exists(assets_destination_dir):
            file_io.recursive_create_dir(assets_destination_dir)

        # Copy each asset from source path to destination path.
        for asset_source_filepath in asset_source_filepath_list:
            asset_source_filename = os.path.basename(asset_source_filepath)

            asset_destination_filepath = os.path.join(
                compat.as_bytes(assets_destination_dir),
                compat.as_bytes(asset_source_filename))
            file_io.copy(asset_source_filepath,
                         asset_destination_filepath,
                         overwrite=True)

        tf_logging.info("Assets written to: %s", assets_destination_dir)
Example #7
 def testCopy(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.FileIO(file_path, mode="w").write("testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.copy(file_path, copy_path)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
Example #8
def copy_file(src, dest):
    if not file_io.file_exists(src):
        raise Exception("Src file doesn't exist at %s" % src)
    if file_io.is_directory(src):
        copy_dir(src, dest)
        return
    file_io.copy(src, dest, overwrite=True)
Example #9
  def _save_and_write_assets(self, assets_collection_to_add=None):
    """Saves asset to the meta graph and writes asset files to disk.

    Args:
      assets_collection_to_add: The collection where the asset paths are setup.
    """
    asset_source_filepath_list = self._save_assets(assets_collection_to_add)

    # Return if there are no assets to write.
    if len(asset_source_filepath_list) == 0:
      tf_logging.info("No assets to write.")
      return

    assets_destination_dir = os.path.join(
        compat.as_bytes(self._export_dir),
        compat.as_bytes(constants.ASSETS_DIRECTORY))

    if not file_io.file_exists(assets_destination_dir):
      file_io.recursive_create_dir(assets_destination_dir)

    # Copy each asset from source path to destination path.
    for asset_source_filepath in asset_source_filepath_list:
      asset_source_filename = os.path.basename(asset_source_filepath)

      asset_destination_filepath = os.path.join(
          compat.as_bytes(assets_destination_dir),
          compat.as_bytes(asset_source_filename))
      file_io.copy(
          asset_source_filepath, asset_destination_filepath, overwrite=True)

    tf_logging.info("Assets written to: %s", assets_destination_dir)
Example #10
def _load_tf_custom_op(model_path):
    """Loads a custom TF OP (in .so format) from /assets.extra directory."""
    assets_dir = os.path.join(model_path, _CUSTOM_OP_DIRECTORY_NAME)
    if file_io.is_directory(assets_dir):
        custom_ops_pattern = os.path.join(assets_dir, _CUSTOM_OP_SUFFIX)
        for custom_op_path_original in file_io.get_matching_files(
                custom_ops_pattern):
            logging.info("Found custom op file: %s", custom_op_path_original)
            if custom_op_path_original.startswith("gs://"):
                if not os.path.isdir(_CUSTOM_OP_LOCAL_DIR):
                    os.makedirs(_CUSTOM_OP_LOCAL_DIR)
                custom_op_path_local = os.path.join(
                    _CUSTOM_OP_LOCAL_DIR,
                    os.path.basename(custom_op_path_original))
                logging.info("Copying custom op from: %s to: %s",
                             custom_op_path_original, custom_op_path_local)
                file_io.copy(custom_op_path_original, custom_op_path_local,
                             True)
            else:
                custom_op_path_local = custom_op_path_original
            try:
                import tensorflow as tf  # pylint: disable=g-import-not-at-top
                logging.info("Loading custom op: %s", custom_op_path_local)
                logging.info("TF Version: %s", tf.__version__)
                tf.load_op_library(custom_op_path_local)
            except RuntimeError as e:
                logging.exception(
                    "Failed to load custom op: %s with error: %s. Prediction "
                    "will likely fail due to missing operations.",
                    custom_op_path_local, e)
Example #11
  def _save_and_write_assets(self, assets_collection_to_add=None):
    """Saves asset to the meta graph and writes asset files to disk.

    Args:
      assets_collection_to_add: The collection where the asset paths are setup.
    """
    asset_filename_map = _maybe_save_assets(assets_collection_to_add)

    # Return if there are no assets to write.
    if not asset_filename_map:
      tf_logging.info("No assets to write.")
      return

    assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
        self._export_dir)

    # Copy each asset from source path to destination path.
    for asset_basename, asset_source_filepath in asset_filename_map.items():
      asset_destination_filepath = os.path.join(
          compat.as_bytes(assets_destination_dir),
          compat.as_bytes(asset_basename))

      # Only copy the asset file to the destination if it does not already
      # exist. This is to ensure that an asset with the same name defined as
      # part of multiple graphs is only copied the first time.
      if not file_io.file_exists(asset_destination_filepath):
        file_io.copy(asset_source_filepath, asset_destination_filepath)

    tf_logging.info("Assets written to: %s",
                    compat.as_text(assets_destination_dir))
Example #12
 def testCopyOverwriteFalse(self):
     file_path = os.path.join(self._base_dir, "temp_file")
     file_io.write_string_to_file(file_path, "testing")
     copy_path = os.path.join(self._base_dir, "copy_file")
     file_io.write_string_to_file(copy_path, "copy")
     with self.assertRaises(errors.AlreadyExistsError):
         file_io.copy(file_path, copy_path, overwrite=False)
Example #13
 def testCopyOverwriteFalse(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.write_string_to_file(copy_path, "copy")
   with self.assertRaises(errors.AlreadyExistsError):
     file_io.copy(file_path, copy_path, overwrite=False)
Example #14
 def testCopyOverwriteFalse(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.FileIO(file_path, mode="w").write("testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.FileIO(copy_path, mode="w").write("copy")
   with self.assertRaises(errors.AlreadyExistsError):
     file_io.copy(file_path, copy_path, overwrite=False)
Example #15
  def _save_and_write_assets(self, assets_collection_to_add=None):
    """Saves asset to the meta graph and writes asset files to disk.

    Args:
      assets_collection_to_add: The collection where the asset paths are setup.
    """
    asset_source_filepath_list = _maybe_save_assets(assets_collection_to_add)

    # Return if there are no assets to write.
    if len(asset_source_filepath_list) == 0:
      tf_logging.info("No assets to write.")
      return

    assets_destination_dir = os.path.join(
        compat.as_bytes(self._export_dir),
        compat.as_bytes(constants.ASSETS_DIRECTORY))

    if not file_io.file_exists(assets_destination_dir):
      file_io.recursive_create_dir(assets_destination_dir)

    # Copy each asset from source path to destination path.
    for asset_source_filepath in asset_source_filepath_list:
      asset_source_filename = os.path.basename(asset_source_filepath)

      asset_destination_filepath = os.path.join(
          compat.as_bytes(assets_destination_dir),
          compat.as_bytes(asset_source_filename))

      # Only copy the asset file to the destination if it does not already
      # exist. This is to ensure that an asset with the same name defined as
      # part of multiple graphs is only copied the first time.
      if not file_io.file_exists(asset_destination_filepath):
        file_io.copy(asset_source_filepath, asset_destination_filepath)

    tf_logging.info("Assets written to: %s", assets_destination_dir)
Example #16
 def testCopyOverwrite(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.FileIO(file_path, mode="w").write("testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.FileIO(copy_path, mode="w").write("copy")
   file_io.copy(file_path, copy_path, overwrite=True)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual("testing", file_io.FileIO(file_path, mode="r").read())
Example #17
 def testCopyOverwrite(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.write_string_to_file(copy_path, "copy")
   file_io.copy(file_path, copy_path, overwrite=True)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
Example #18
 def testCopyOverwrite(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.FileIO(file_path, mode="w").write("testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.FileIO(copy_path, mode="w").write("copy")
   file_io.copy(file_path, copy_path, overwrite=True)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual(b"testing", file_io.FileIO(file_path, mode="r").read())
Example #19
 def testCopyOverwrite(self):
     file_path = os.path.join(self._base_dir, "temp_file")
     file_io.write_string_to_file(file_path, "testing")
     copy_path = os.path.join(self._base_dir, "copy_file")
     file_io.write_string_to_file(copy_path, "copy")
     file_io.copy(file_path, copy_path, overwrite=True)
     self.assertTrue(file_io.file_exists(copy_path))
     self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
Example #20
 def testCopy(self):
   file_path = os.path.join(self.get_temp_dir(), "temp_file")
   file_io.write_string_to_file(file_path, "testing")
   copy_path = os.path.join(self.get_temp_dir(), "copy_file")
   file_io.copy(file_path, copy_path)
   self.assertTrue(file_io.file_exists(copy_path))
   self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
   file_io.delete_file(file_path)
   file_io.delete_file(copy_path)
Example #21
 def testCopy(self):
   file_path = os.path.join(self._base_dir, "temp_file")
   file_io.FileIO(file_path, mode="w").write("testing")
   copy_path = os.path.join(self._base_dir, "copy_file")
   file_io.copy(file_path, copy_path)
   self.assertTrue(file_io.file_exists(copy_path))
   f = file_io.FileIO(file_path, mode="r")
   self.assertEqual("testing", f.read())
   self.assertEqual(7, f.tell())
Example #22
  def preprocess(train_dataset, output_dir, eval_dataset, checkpoint, pipeline_option):
    """Preprocess data in Cloud with DataFlow."""

    import apache_beam as beam
    import google.datalab.utils
    from . import _preprocess

    if checkpoint is None:
      checkpoint = _util._DEFAULT_CHECKPOINT_GSURL

    job_name = ('preprocess-image-classification-' +
                datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

    staging_package_url = _util.repackage_to_staging(output_dir)
    tmpdir = tempfile.mkdtemp()
    # suppress DataFlow warnings about wheel package as extra package.
    original_level = logging.getLogger().getEffectiveLevel()
    logging.getLogger().setLevel(logging.ERROR)
    try:
      # Workaround for DataFlow 2.0, which doesn't work well with extra packages in GCS.
      # Remove when the issue is fixed and new version of DataFlow is included in Datalab.
      extra_packages = [staging_package_url, _TF_GS_URL, _PROTOBUF_GS_URL]
      local_packages = [os.path.join(tmpdir, os.path.basename(p))
                        for p in extra_packages]
      for source, dest in zip(extra_packages, local_packages):
        file_io.copy(source, dest, overwrite=True)

      options = {
          'staging_location': os.path.join(output_dir, 'tmp', 'staging'),
          'temp_location': os.path.join(output_dir, 'tmp'),
          'job_name': job_name,
          'project': _util.default_project(),
          'extra_packages': local_packages,
          'teardown_policy': 'TEARDOWN_ALWAYS',
          'no_save_main_session': True
      }
      if pipeline_option is not None:
        options.update(pipeline_option)

      opts = beam.pipeline.PipelineOptions(flags=[], **options)
      p = beam.Pipeline('DataflowRunner', options=opts)
      _preprocess.configure_pipeline(p, train_dataset, eval_dataset,
                                     checkpoint, output_dir, job_name)
      job_results = p.run()
    finally:
      shutil.rmtree(tmpdir)
      logging.getLogger().setLevel(original_level)

    if (_util.is_in_IPython()):
      import IPython
      dataflow_url = 'https://console.developers.google.com/dataflow?project=%s' % \
                     _util.default_project()
      html = 'Job "%s" submitted.' % job_name
      html += '<p>Click <a href="%s" target="_blank">here</a> to track preprocessing job. <br/>' \
          % dataflow_url
      IPython.display.display_html(html, raw=True)
    return google.datalab.utils.DataflowJob(job_results)
Example #23
 def testCopy(self, join):
     file_path = join(self._base_dir, "temp_file")
     file_io.FileIO(file_path, mode="w").write("testing")
     copy_path = join(self._base_dir, "copy_file")
     file_io.copy(file_path, copy_path)
     self.assertTrue(file_io.file_exists(copy_path))
     f = file_io.FileIO(file_path, mode="r")
     self.assertEqual("testing", f.read())
     self.assertEqual(7, f.tell())
Example #24
 def testCopy(self):
     file_path = os.path.join(self.get_temp_dir(), "temp_file")
     file_io.write_string_to_file(file_path, "testing")
     copy_path = os.path.join(self.get_temp_dir(), "copy_file")
     file_io.copy(file_path, copy_path)
     self.assertTrue(file_io.file_exists(copy_path))
     self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
     file_io.delete_file(file_path)
     file_io.delete_file(copy_path)
Example #25
  def batch_predict(dataset, model_dir, output_csv, output_bq_table, pipeline_option):
    """Batch predict running in cloud."""

    import apache_beam as beam
    import google.datalab.utils
    from . import _predictor

    if output_csv is None and output_bq_table is None:
      raise ValueError('output_csv and output_bq_table cannot both be None.')
    if 'temp_location' not in pipeline_option:
      raise ValueError('"temp_location" is not set in cloud.')

    job_name = ('batch-predict-image-classification-' +
                datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
    staging_package_url = _util.repackage_to_staging(pipeline_option['temp_location'])
    tmpdir = tempfile.mkdtemp()
    # suppress DataFlow warnings about wheel package as extra package.
    original_level = logging.getLogger().getEffectiveLevel()
    logging.getLogger().setLevel(logging.ERROR)
    try:
      # Workaround for DataFlow 2.0, which doesn't work well with extra packages in GCS.
      # Remove when the issue is fixed and new version of DataFlow is included in Datalab.
      extra_packages = [staging_package_url, _TF_GS_URL, _PROTOBUF_GS_URL]
      local_packages = [os.path.join(tmpdir, os.path.basename(p))
                        for p in extra_packages]
      for source, dest in zip(extra_packages, local_packages):
        file_io.copy(source, dest, overwrite=True)

      options = {
          'staging_location': os.path.join(pipeline_option['temp_location'], 'staging'),
          'job_name': job_name,
          'project': _util.default_project(),
          'extra_packages': local_packages,
          'teardown_policy': 'TEARDOWN_ALWAYS',
          'no_save_main_session': True
      }
      options.update(pipeline_option)

      opts = beam.pipeline.PipelineOptions(flags=[], **options)
      p = beam.Pipeline('DataflowRunner', options=opts)
      _predictor.configure_pipeline(p, dataset, model_dir, output_csv, output_bq_table)
      job_results = p.run()
    finally:
      shutil.rmtree(tmpdir)
      logging.getLogger().setLevel(original_level)

    if (_util.is_in_IPython()):
      import IPython
      dataflow_url = ('https://console.developers.google.com/dataflow?project=%s' %
                      _util.default_project())
      html = 'Job "%s" submitted.' % job_name
      html += ('<p>Click <a href="%s" target="_blank">here</a> to track batch prediction job. <br/>'
               % dataflow_url)
      IPython.display.display_html(html, raw=True)
    return google.datalab.utils.DataflowJob(job_results)
Example #26
def run_analysis(args):
    """Builds analysis files for training."""

    # Read the schema and input feature types
    schema_list = json.loads(file_io.read_file_to_string(args.schema_file))

    run_numerical_categorical_analysis(args, schema_list)

    # Also save a copy of the schema in the output folder.
    file_io.copy(args.schema_file,
                 os.path.join(args.output_dir, SCHEMA_FILE),
                 overwrite=True)
Example #27
    def session(
            cls,
            session,  # type: tf.Session
            path,  # type: str
            network  # type: Union[tf.Tensor, str, List[str]]
    ):
        # type: (...) -> str
        """
        Freeze a graph by taking a session and a network and storing
        the results into a pb file at the given path. This function will convert
        variables to constants which is necessary for JVM serving.

        :param session: TF Session
        :param path: Where the graph will be written, this can be local filesystem or GCS
        :param network: Tensor, Operation name, or list of Operation names
        :return: Path to the written graph
        """
        input_graph_def = tf.get_default_graph().as_graph_def()

        time = timeit.default_timer()
        logger.info("Freezing model at {}".format(time))

        if isinstance(network, tf.Tensor):
            output_node_names = [t.op.name for t in [network]]
        elif isinstance(network, str):
            output_node_names = [network]
        elif isinstance(network, list):
            output_node_names = network
        else:
            raise ValueError(
                "Network must be a Tensor, String or List of Strings")

        output_graph_def = tf.graph_util.convert_variables_to_constants(
            session,
            input_graph_def,
            output_node_names,
            variable_names_blacklist=["global_step"])

        if FreezeGraph.__is_gcs(path):
            import tempfile
            local_path = tempfile.mktemp("local_temp_graph")
            file_io.write_string_to_file(local_path,
                                         output_graph_def.SerializeToString())
            file_io.copy(local_path, path, overwrite=True)
        else:
            file_io.write_string_to_file(path,
                                         output_graph_def.SerializeToString())

        logger.info("Froze graph in %4d seconds" %
                    (timeit.default_timer() - time))

        return path
Example #28
def run_analysis(args):
  """Builds analysis files for training."""

  # Read the schema and input feature types
  schema_list = json.loads(
      file_io.read_file_to_string(args.schema_file))

  run_numerical_categorical_analysis(args, schema_list)

  # Also save a copy of the schema in the output folder.
  file_io.copy(args.schema_file,
               os.path.join(args.output_dir, SCHEMA_FILE),
               overwrite=True)
Example #29
 def open(self, filename: str, mode: str):
     if is_external_location(filename):
         # there seems to be an issue with GzipFile and fileobj
         with tempfile.TemporaryDirectory(suffix='-gzip') as gzip_dir:
             local_gzip_file = os.path.join(gzip_dir,
                                            os.path.basename(filename))
             with ClosingGzipFile(filename=local_gzip_file,
                                  mode=mode) as local_fp:
                 yield local_fp
             tf_file_io.copy(local_gzip_file, filename, overwrite=True)
     else:
         with ClosingGzipFile(filename=filename, mode=mode) as local_fp:
             yield local_fp
Example #30
def main(_):
  input_url = 's3://' + args.inputbucket + "/"
  output_url = 's3://' + args.outputbucket + "/"
  
  os.makedirs(args.datadir)

  # first, we copy files from pachyderm into a convenient
  # local directory for processing.  
  input_uri = os.path.join(input_url, args.trainingdata)
  training_data_path = os.path.join(args.datadir, args.trainingdata)
  print("copying {} to {}".format(input_uri, training_data_path))
  file_io.copy(input_uri, training_data_path, True)
  
  (train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data(path=training_data_path)
  train_labels = train_labels[:1000]
  test_labels = test_labels[:1000]

  train_images = train_images[:1000].reshape(-1, 28 * 28) / 255.0
  test_images = test_images[:1000].reshape(-1, 28 * 28) / 255.0

  # Returns a short sequential model
  def create_model():
    model = tf.keras.models.Sequential([
      keras.layers.Dense(512, activation=tf.keras.activations.relu, input_shape=(784,)),
      keras.layers.Dropout(0.2),
      keras.layers.Dense(10, activation=tf.keras.activations.softmax)
      ])

    model.compile(optimizer=tf.keras.optimizers.Adam(),
      loss=tf.keras.losses.sparse_categorical_crossentropy,
      metrics=['accuracy'])

    return model

  # Create a basic model instance
  model = create_model()
  model.summary()
  

  model.fit(train_images, train_labels, batch_size=32, epochs=5,
            validation_data=(test_images, test_labels))

  # Save the entire model to an HDF5 file
  model_file = os.path.join(args.datadir, args.modelfile)
  model.save(model_file)
  # Copy file over to Pachyderm
  output_uri = os.path.join(output_url,args.modelfile)
  print("copying {} to {}".format(model_file, output_uri))
  file_io.copy(model_file, output_uri, True)
Example #31
def recursive_copy(src_dir, dest_dir):
  """Copy the contents of src_dir into the folder dest_dir.
  Args:
    src_dir: gcs or local path.
    dest_dir: gcs or local path.
  """

  file_io.recursive_create_dir(dest_dir)
  for file_name in file_io.list_directory(src_dir):
    old_path = os.path.join(src_dir, file_name)
    new_path = os.path.join(dest_dir, file_name)

    if file_io.is_directory(old_path):
      recursive_copy(old_path, new_path)
    else:
      file_io.copy(old_path, new_path, overwrite=True)
Example #32
def recursive_copy(src_dir, dest_dir):
    """Copy the contents of src_dir into the folder dest_dir.
    Args:
      src_dir: gcs or local path.
      dest_dir: gcs or local path.
    """

    file_io.recursive_create_dir(dest_dir)
    for file_name in file_io.list_directory(src_dir):
        old_path = os.path.join(src_dir, file_name)
        new_path = os.path.join(dest_dir, file_name)

        if file_io.is_directory(old_path):
            recursive_copy(old_path, new_path)
        else:
            file_io.copy(old_path, new_path, overwrite=True)
Example #33
def _recursive_copy(src_dir, dest_dir):
    """Copy the contents of src_dir into the folder dest_dir.
    Args:
      src_dir: gcs or local path.
      dest_dir: gcs or local path.
    When called, dest_dir should exist.
    """
    src_dir = python_portable_string(src_dir)
    dest_dir = python_portable_string(dest_dir)

    file_io.recursive_create_dir(dest_dir)
    for file_name in file_io.list_directory(src_dir):
        old_path = os.path.join(src_dir, file_name)
        new_path = os.path.join(dest_dir, file_name)

        if file_io.is_directory(old_path):
            _recursive_copy(old_path, new_path)
        else:
            file_io.copy(old_path, new_path, overwrite=True)
Example #34
  def _copy_assets_to_destination_dir(self, asset_filename_map):
    """Copy all assets from source path to destination path."""
    assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
        self._export_dir)

    # Copy each asset from source path to destination path.
    for asset_basename, asset_source_filepath in asset_filename_map.items():
      asset_destination_filepath = os.path.join(
          compat.as_bytes(assets_destination_dir),
          compat.as_bytes(asset_basename))

      # Only copy the asset file to the destination if it does not already
      # exist. This is to ensure that an asset with the same name defined as
      # part of multiple graphs is only copied the first time.
      if not file_io.file_exists(asset_destination_filepath):
        file_io.copy(asset_source_filepath, asset_destination_filepath)

    tf_logging.info("Assets written to: %s",
                    compat.as_text(assets_destination_dir))
Example #35
def _recursive_copy(src_dir, dest_dir):
  """Copy the contents of src_dir into the folder dest_dir.
  Args:
    src_dir: gcs or local path.
    dest_dir: gcs or local path.
  When called, dest_dir should exist.
  """
  src_dir = python_portable_string(src_dir)
  dest_dir = python_portable_string(dest_dir)

  file_io.recursive_create_dir(dest_dir)
  for file_name in file_io.list_directory(src_dir):
    old_path = os.path.join(src_dir, file_name)
    new_path = os.path.join(dest_dir, file_name)

    if file_io.is_directory(old_path):
      _recursive_copy(old_path, new_path)
    else:
      file_io.copy(old_path, new_path, overwrite=True)
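
For orientation, a hypothetical call of the helper above might look like the sketch below (the directory names are invented); it mirrors the pattern the export_fn examples further down this page use to copy a timestamped export directory into a final model folder.

# Hypothetical usage of _recursive_copy; the paths below are invented.
export_dir = '/tmp/export/1234567890'  # e.g. a timestamped SavedModel export
final_dir = '/tmp/model'

if file_io.is_directory(final_dir):
  file_io.delete_recursively(final_dir)
file_io.recursive_create_dir(final_dir)
_recursive_copy(export_dir, final_dir)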
Example #36
def copy_assets_to_destination_dir(asset_filename_map, destination_dir):
  """Copy all assets from source path to destination path."""
  assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
      destination_dir)

  # Copy each asset from source path to destination path.
  for asset_basename, asset_source_filepath in asset_filename_map.items():
    asset_destination_filepath = os.path.join(
        compat.as_bytes(assets_destination_dir),
        compat.as_bytes(asset_basename))

    # Only copy the asset file to the destination if it does not already
    # exist. This is to ensure that an asset with the same name defined as
    # part of multiple graphs is only copied the first time.
    if not file_io.file_exists(asset_destination_filepath):
      file_io.copy(asset_source_filepath, asset_destination_filepath)

  tf_logging.info("Assets written to: %s",
                  compat.as_text(assets_destination_dir))
Example #37
def datahtml(bucket_name, commit_sha, train_file_path):
    import json
    import seaborn as sns
    import matplotlib.pyplot as plt
    import os
    image_path = os.path.join(bucket_name, commit_sha, 'visualization.png')
    image_url = os.path.join('https://storage.googleapis.com',
                             bucket_name.replace('gs://', '', 1), commit_sha,
                             'visualization.png')
    html_path = os.path.join(bucket_name, 'kaggle.html')
    # output visualization to a file

    import pandas as pd
    df_train = pd.read_csv(train_file_path)
    sns.set()
    cols = [
        'SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF',
        'FullBath', 'YearBuilt'
    ]
    sns.pairplot(df_train[cols], size=3)
    plt.savefig('visualization.png')
    from tensorflow.python.lib.io import file_io
    file_io.copy('visualization.png', image_path)
    rendered_template = """
    <html>
        <head>
            <title>correlation image</title>
        </head>
        <body>
            <img src={}>
        </body>
    </html>""".format(image_url)
    file_io.write_string_to_file(html_path, rendered_template)

    metadata = {
        'outputs': [{
            'type': 'web-app',
            'storage': 'gcs',
            'source': html_path,
        }]
    }
    with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
        json.dump(metadata, f)
Example #38
    def load(self) -> T5ForConditionalGeneration:
        try:
            if not self.flush_cache:
                return self._fix_t5_model(
                    T5ForConditionalGeneration.from_pretrained(
                        str(self.model_cache_dir),
                        from_tf=True,
                        force_download=False))
        except (RuntimeError, OSError):
            logging.info('T5 model weights not in cache.')
        m = re.search(r'model_checkpoint_path: "(.+?)"', self.ckpt_prefix)
        assert m is not None, 'checkpoint file malformed'

        # Copy over checkpoint data
        ckpt_patt = re.compile(
            rf'^{m.group(1)}\.(data-\d+-of-\d+|index|meta)$')
        for name in file_io.list_directory(self.url):
            if not ckpt_patt.match(name):
                continue
            url = os.path.join(self.url, name)
            url_stat = file_io.stat(url)
            cache_file_path = self.model_cache_dir / ckpt_patt.sub(
                rf'{TRANSFO_PREFIX}.\1', name)
            try:
                cs = os.stat(str(cache_file_path))
                if cs.st_size == url_stat.length and cs.st_mtime_ns > url_stat.mtime_nsec and not self.flush_cache:
                    logging.info(f'Skipping {name}...')
                    continue
            except FileNotFoundError:
                pass
            logging.info(f'Caching {name}...')
            file_io.copy(url, str(cache_file_path), overwrite=True)

        # Transformers expects a model config.json
        config = T5Config.from_pretrained(self.model_type)
        with open(str(self.model_cache_dir / 'config.json'), 'w') as f:
            json.dump(config.__dict__, f, indent=4)
        return self._fix_t5_model(
            T5ForConditionalGeneration.from_pretrained(str(
                self.model_cache_dir),
                                                       from_tf=True,
                                                       force_download=False))
Example #39
def main(argv=None):
    args = parse_arguments(sys.argv if argv is None else argv)

    if args.cloud:
        tmpdir = tempfile.mkdtemp()
        try:
            local_packages = [
                os.path.join(tmpdir, os.path.basename(p))
                for p in args.extra_package
            ]
            for source, dest in zip(args.extra_package, local_packages):
                file_io.copy(source, dest, overwrite=True)

            options = {
                'staging_location': os.path.join(args.output_dir, 'tmp',
                                                 'staging'),
                'temp_location': os.path.join(args.output_dir, 'tmp',
                                              'staging'),
                'job_name': args.job_name,
                'project': args.project_id,
                'no_save_main_session': True,
                'extra_packages': local_packages,
                'teardown_policy': 'TEARDOWN_ALWAYS',
            }
            opts = beam.pipeline.PipelineOptions(flags=[], **options)
            # Or use BlockingDataflowPipelineRunner
            p = beam.Pipeline('DataflowRunner', options=opts)
            make_prediction_pipeline(p, args)
            print(
                ('Dataflow Job submitted, see Job %s at '
                 'https://console.developers.google.com/dataflow?project=%s') %
                (options['job_name'], args.project_id))
            sys.stdout.flush()
            runner_results = p.run()
        finally:
            shutil.rmtree(tmpdir)
    else:
        p = beam.Pipeline('DirectRunner')
        make_prediction_pipeline(p, args)
        runner_results = p.run()

    return runner_results
Example #40
def main(_):

    # The Tensorflow file_io.walk() function has an issue
    # with iterating over the top level of a bucket.
    # It requires a directory within the bucket.
    # So, we give it one.
    input_url = 's3://' + args.inputbucket + "/data/"
    output_url = 's3://' + args.outputbucket + "/data/"

    os.makedirs(args.datadir)

    # first, we copy files from pachyderm into a convenient
    # local directory for processing.  The files have been
    # placed into the inputpath directory in the s3path bucket.
    print("walking {} for copying files".format(input_url))
    for dirpath, dirs, files in file_io.walk(input_url, True):
        for file in files:
            uri = os.path.join(dirpath, file)
            newpath = os.path.join(args.datadir, file)
            print("copying {} to {}".format(uri, newpath))
            file_io.copy(uri, newpath, True)


    # here is where you would apply your training to the data in args.datadir
    # it might operate on the data directly, or place additional
    # data in the same directory

    # finally, we copy the output from those operations to
    # another pachyderm repo
    print("walking {} for copying to {}".format(args.datadir, output_url))
    for dirpath, dirs, files in os.walk(args.datadir, topdown=True):   
      for file in files:
        uri = os.path.join(dirpath, file)
        newpath = output_url + file
        print("copying {} to {}".format(uri, newpath))
        file_io.copy(uri, newpath, True)
Example #41
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.cloud:
    tmpdir = tempfile.mkdtemp()
    try:
      local_packages = [os.path.join(tmpdir, os.path.basename(p)) for p in args.extra_package]
      for source, dest in zip(args.extra_package, local_packages):
        file_io.copy(source, dest, overwrite=True)

      options = {
          'staging_location': os.path.join(args.output_dir, 'tmp', 'staging'),
          'temp_location': os.path.join(args.output_dir, 'tmp', 'staging'),
          'job_name': args.job_name,
          'project': args.project_id,
          'no_save_main_session': True,
          'extra_packages': local_packages,
          'teardown_policy': 'TEARDOWN_ALWAYS',
      }
      opts = beam.pipeline.PipelineOptions(flags=[], **options)
      # Or use BlockingDataflowPipelineRunner
      p = beam.Pipeline('DataflowRunner', options=opts)
      make_prediction_pipeline(p, args)
      print(('Dataflow Job submitted, see Job %s at '
             'https://console.developers.google.com/dataflow?project=%s') %
            (options['job_name'], args.project_id))
      sys.stdout.flush()
      runner_results = p.run()
    finally:
      shutil.rmtree(tmpdir)
  else:
    p = beam.Pipeline('DirectRunner')
    make_prediction_pipeline(p, args)
    runner_results = p.run()

  return runner_results
Example #42
def _copy_all(src_files, dest_dir):
  # file_io.copy does not copy files into folders directly.
  for src_file in src_files:
    file_name = os.path.basename(src_file)
    new_file_location = os.path.join(dest_dir, file_name)
    file_io.copy(src_file, new_file_location)
Example #43
  def test_local_bigquery_transform(self):
    """Test transform locally, but the data comes from bigquery."""

    # Make a BQ table, and insert 1 row.
    try:
      bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
      bucket_root = 'gs://%s' % bucket_name
      bucket = storage.Bucket(bucket_name)
      bucket.create()

      project_id = dl.Context.default().project_id

      dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
      table_name = 'tmp_table'

      dataset = bq.Dataset((project_id, dataset_name)).create()
      table = bq.Table((project_id, dataset_name, table_name))
      table.create([{'name': 'key_col', 'type': 'INTEGER'},
                    {'name': 'target_col', 'type': 'FLOAT'},
                    {'name': 'cat_col', 'type': 'STRING'},
                    {'name': 'num_col', 'type': 'FLOAT'},
                    {'name': 'img_col', 'type': 'STRING'}])

      img1_file = os.path.join(self.source_dir, 'img1.jpg')
      dest_file = os.path.join(bucket_root, 'img1.jpg')
      file_io.copy(img1_file, dest_file)

      data = [
          {
           'key_col': 1,
           'target_col': 1.0,
           'cat_col': 'Monday',
           'num_col': 23.0,
           'img_col': dest_file,
          },
      ]
      table.insert(data=data)

      cmd = ['python ' + os.path.join(CODE_PATH, 'transform.py'),
             '--bigquery=%s.%s.%s' % (project_id, dataset_name, table_name),
             '--analysis=' + self.analysis_dir,
             '--prefix=features',
             '--project-id=' + project_id,
             '--output=' + self.output_dir]
      print('cmd ', ' '.join(cmd))
      subprocess.check_call(' '.join(cmd), shell=True)

      # Read the tf record file. There should only be one file.
      record_filepath = os.path.join(self.output_dir,
                                     'features-00000-of-00001.tfrecord.gz')
      options = tf.python_io.TFRecordOptions(
          compression_type=tf.python_io.TFRecordCompressionType.GZIP)
      serialized_examples = list(tf.python_io.tf_record_iterator(record_filepath, options=options))
      self.assertEqual(len(serialized_examples), 1)

      example = tf.train.Example()
      example.ParseFromString(serialized_examples[0])

      transformed_number = example.features.feature['num_col'].float_list.value[0]
      self.assertAlmostEqual(transformed_number, 23.0)
      transformed_category = example.features.feature['cat_col'].int64_list.value[0]
      self.assertEqual(transformed_category, 2)
      image_bytes = example.features.feature['img_col'].float_list.value
      self.assertEqual(len(image_bytes), 2048)
      self.assertTrue(any(x != 0.0 for x in image_bytes))
    finally:
      dataset.delete(delete_contents=True)

      for obj in bucket.objects():
        obj.delete()
      bucket.delete()
Example #44
    def export_fn(estimator,
                  export_dir_base,
                  checkpoint_path=None,
                  eval_result=None):
        with ops.Graph().as_default() as g:
            contrib_variables.create_global_step(g)

            input_ops = feature_transforms.build_csv_serving_tensors(
                args.output_dir_from_analysis_step, features, schema, stats,
                keep_target)
            model_fn_ops = estimator._call_model_fn(
                input_ops.features, None, model_fn_lib.ModeKeys.INFER)
            output_fetch_tensors = make_prediction_output_tensors(
                args=args,
                features=features,
                input_ops=input_ops,
                model_fn_ops=model_fn_ops,
                keep_target=keep_target)

            # Don't use signature_def_utils.predict_signature_def as that renames
            # tensor names if there is only 1 input/output tensor!
            signature_inputs = {
                key: tf.saved_model.utils.build_tensor_info(tensor)
                for key, tensor in six.iteritems(input_ops.default_inputs)
            }
            signature_outputs = {
                key: tf.saved_model.utils.build_tensor_info(tensor)
                for key, tensor in six.iteritems(output_fetch_tensors)
            }
            signature_def_map = {
                'serving_default':
                signature_def_utils.build_signature_def(
                    signature_inputs, signature_outputs,
                    tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
            }

            if not checkpoint_path:
                # Locate the latest checkpoint
                checkpoint_path = saver.latest_checkpoint(estimator._model_dir)
            if not checkpoint_path:
                raise ValueError("Couldn't find trained model at %s." %
                                 estimator._model_dir)

            export_dir = saved_model_export_utils.get_timestamped_export_dir(
                export_dir_base)

            with tf_session.Session('') as session:
                variables.local_variables_initializer()
                data_flow_ops.tables_initializer()
                saver_for_restore = saver.Saver(variables.global_variables(),
                                                sharded=True)
                saver_for_restore.restore(session, checkpoint_path)

                init_op = control_flow_ops.group(
                    variables.local_variables_initializer(),
                    data_flow_ops.tables_initializer())

                # Perform the export
                builder = saved_model_builder.SavedModelBuilder(export_dir)
                builder.add_meta_graph_and_variables(
                    session, [tag_constants.SERVING],
                    signature_def_map=signature_def_map,
                    assets_collection=ops.get_collection(
                        ops.GraphKeys.ASSET_FILEPATHS),
                    legacy_init_op=init_op)
                builder.save(False)

            # Add the extra assets
            if assets_extra:
                assets_extra_path = os.path.join(
                    compat.as_bytes(export_dir),
                    compat.as_bytes('assets.extra'))
                for dest_relative, source in assets_extra.items():
                    dest_absolute = os.path.join(
                        compat.as_bytes(assets_extra_path),
                        compat.as_bytes(dest_relative))
                    dest_path = os.path.dirname(dest_absolute)
                    file_io.recursive_create_dir(dest_path)
                    file_io.copy(source, dest_absolute)

        # only keep the last 3 models
        saved_model_export_utils.garbage_collect_exports(export_dir_base,
                                                         exports_to_keep=3)

        # save the last model to the model folder.
        # export_dir_base = A/B/intermediate_models/
        if keep_target:
            final_dir = os.path.join(args.job_dir, 'evaluation_model')
        else:
            final_dir = os.path.join(args.job_dir, 'model')
        if file_io.is_directory(final_dir):
            file_io.delete_recursively(final_dir)
        file_io.recursive_create_dir(final_dir)
        recursive_copy(export_dir, final_dir)

        return export_dir
Example #45
 def copy(cls, oldpath, newpath, overwrite=False):
     file_io.copy(oldpath, newpath, overwrite)
Example #46
 def copy(cls, oldpath, newpath, overwrite=False):
     file_io.copy(oldpath, newpath, overwrite)
Example #47
    def batch_predict(dataset, model_dir, output_csv, output_bq_table,
                      pipeline_option):
        """Batch predict running in cloud."""

        import apache_beam as beam
        import google.datalab.utils
        from . import _predictor

        if output_csv is None and output_bq_table is None:
            raise ValueError(
                'output_csv and output_bq_table cannot both be None.')
        if 'temp_location' not in pipeline_option:
            raise ValueError('"temp_location" is not set in cloud.')

        job_name = ('batch-predict-image-classification-' +
                    datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
        staging_package_url = _util.repackage_to_staging(
            pipeline_option['temp_location'])
        tmpdir = tempfile.mkdtemp()
        # suppress DataFlow warnings about wheel package as extra package.
        original_level = logging.getLogger().getEffectiveLevel()
        logging.getLogger().setLevel(logging.ERROR)
        try:
            # Workaround for DataFlow 2.0, which doesn't work well with extra packages in GCS.
            # Remove when the issue is fixed and new version of DataFlow is included in Datalab.
            extra_packages = [
                staging_package_url, _TF_GS_URL, _PROTOBUF_GS_URL
            ]
            local_packages = [
                os.path.join(tmpdir, os.path.basename(p))
                for p in extra_packages
            ]
            for source, dest in zip(extra_packages, local_packages):
                file_io.copy(source, dest, overwrite=True)

            options = {
                'staging_location':
                os.path.join(pipeline_option['temp_location'], 'staging'),
                'job_name':
                job_name,
                'project':
                _util.default_project(),
                'extra_packages':
                local_packages,
                'teardown_policy':
                'TEARDOWN_ALWAYS',
                'no_save_main_session':
                True
            }
            options.update(pipeline_option)

            opts = beam.pipeline.PipelineOptions(flags=[], **options)
            p = beam.Pipeline('DataflowRunner', options=opts)
            _predictor.configure_pipeline(p, dataset, model_dir, output_csv,
                                          output_bq_table)
            job_results = p.run()
        finally:
            shutil.rmtree(tmpdir)
            logging.getLogger().setLevel(original_level)

        if (_util.is_in_IPython()):
            import IPython
            dataflow_url = (
                'https://console.developers.google.com/dataflow?project=%s' %
                _util.default_project())
            html = 'Job "%s" submitted.' % job_name
            html += (
                '<p>Click <a href="%s" target="_blank">here</a> to track batch prediction job. <br/>'
                % dataflow_url)
            IPython.display.display_html(html, raw=True)
        return google.datalab.utils.DataflowJob(job_results)
Example #48
    def preprocess(train_dataset, output_dir, eval_dataset, checkpoint,
                   pipeline_option):
        """Preprocess data in Cloud with DataFlow."""

        import apache_beam as beam
        import google.datalab.utils
        from . import _preprocess

        if checkpoint is None:
            checkpoint = _util._DEFAULT_CHECKPOINT_GSURL

        job_name = ('preprocess-image-classification-' +
                    datetime.datetime.now().strftime('%y%m%d-%H%M%S'))

        staging_package_url = _util.repackage_to_staging(output_dir)
        tmpdir = tempfile.mkdtemp()
        # suppress DataFlow warnings about wheel package as extra package.
        original_level = logging.getLogger().getEffectiveLevel()
        logging.getLogger().setLevel(logging.ERROR)
        try:
            # Workaround for DataFlow 2.0, which doesn't work well with extra packages in GCS.
            # Remove when the issue is fixed and new version of DataFlow is included in Datalab.
            extra_packages = [
                staging_package_url, _TF_GS_URL, _PROTOBUF_GS_URL
            ]
            local_packages = [
                os.path.join(tmpdir, os.path.basename(p))
                for p in extra_packages
            ]
            for source, dest in zip(extra_packages, local_packages):
                file_io.copy(source, dest, overwrite=True)

            options = {
                'staging_location': os.path.join(output_dir, 'tmp', 'staging'),
                'temp_location': os.path.join(output_dir, 'tmp'),
                'job_name': job_name,
                'project': _util.default_project(),
                'extra_packages': local_packages,
                'teardown_policy': 'TEARDOWN_ALWAYS',
                'no_save_main_session': True
            }
            if pipeline_option is not None:
                options.update(pipeline_option)

            opts = beam.pipeline.PipelineOptions(flags=[], **options)
            p = beam.Pipeline('DataflowRunner', options=opts)
            _preprocess.configure_pipeline(p, train_dataset, eval_dataset,
                                           checkpoint, output_dir, job_name)
            job_results = p.run()
        finally:
            shutil.rmtree(tmpdir)
            logging.getLogger().setLevel(original_level)

        if (_util.is_in_IPython()):
            import IPython
            dataflow_url = 'https://console.developers.google.com/dataflow?project=%s' % \
                           _util.default_project()
            html = 'Job "%s" submitted.' % job_name
            html += '<p>Click <a href="%s" target="_blank">here</a> to track preprocessing job. <br/>' \
                % dataflow_url
            IPython.display.display_html(html, raw=True)
        return google.datalab.utils.DataflowJob(job_results)
Example #49
  def test_local_bigquery_transform(self):
    """Test transform locally, but the data comes from bigquery."""

    # Make a BQ table, and insert 1 row.
    try:
      bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
      bucket_root = 'gs://%s' % bucket_name
      bucket = storage.Bucket(bucket_name)
      bucket.create()

      project_id = dl.Context.default().project_id

      dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
      table_name = 'tmp_table'

      dataset = bq.Dataset((project_id, dataset_name)).create()
      table = bq.Table((project_id, dataset_name, table_name))
      table.create([{'name': 'key_col', 'type': 'INTEGER'},
                    {'name': 'target_col', 'type': 'FLOAT'},
                    {'name': 'cat_col', 'type': 'STRING'},
                    {'name': 'num_col', 'type': 'FLOAT'},
                    {'name': 'img_col', 'type': 'STRING'}])

      img1_file = os.path.join(self.source_dir, 'img1.jpg')
      dest_file = os.path.join(bucket_root, 'img1.jpg')
      file_io.copy(img1_file, dest_file)

      data = [
          {
           'key_col': 1,
           'target_col': 1.0,
           'cat_col': 'Monday',
           'num_col': 23.0,
           'img_col': dest_file,
          },
      ]
      table.insert(data=data)

      cmd = ['python ' + os.path.join(CODE_PATH, 'transform.py'),
             '--bigquery=%s.%s.%s' % (project_id, dataset_name, table_name),
             '--analysis=' + self.analysis_dir,
             '--prefix=features',
             '--project-id=' + project_id,
             '--output=' + self.output_dir]
      print('cmd ', ' '.join(cmd))
      subprocess.check_call(' '.join(cmd), shell=True)

      # Read the tf record file. There should only be one file.
      record_filepath = os.path.join(self.output_dir,
                                     'features-00000-of-00001.tfrecord.gz')
      options = tf.python_io.TFRecordOptions(
          compression_type=tf.python_io.TFRecordCompressionType.GZIP)
      serialized_examples = list(tf.python_io.tf_record_iterator(record_filepath, options=options))
      self.assertEqual(len(serialized_examples), 1)

      example = tf.train.Example()
      example.ParseFromString(serialized_examples[0])

      transformed_number = example.features.feature['num_col'].float_list.value[0]
      self.assertAlmostEqual(transformed_number, 23.0)
      transformed_category = example.features.feature['cat_col'].int64_list.value[0]
      self.assertEqual(transformed_category, 2)
      image_bytes = example.features.feature['img_col'].float_list.value
      self.assertEqual(len(image_bytes), 2048)
      self.assertTrue(any(x != 0.0 for x in image_bytes))
    finally:
      dataset.delete(delete_contents=True)

      for obj in bucket.objects():
        obj.delete()
      bucket.delete()
Example #50
  def export_fn(estimator, export_dir_base, checkpoint_path=None, eval_result=None):
    with ops.Graph().as_default() as g:
      contrib_variables.create_global_step(g)

      input_ops = feature_transforms.build_csv_serving_tensors_for_training_step(
          args.analysis, features, schema, stats, keep_target)
      model_fn_ops = estimator._call_model_fn(input_ops.features,
                                              None,
                                              model_fn_lib.ModeKeys.INFER)
      output_fetch_tensors = make_prediction_output_tensors(
          args=args,
          features=features,
          input_ops=input_ops,
          model_fn_ops=model_fn_ops,
          keep_target=keep_target)

      # Don't use signature_def_utils.predict_signature_def as that renames
      # tensor names if there is only 1 input/output tensor!
      signature_inputs = {key: tf.saved_model.utils.build_tensor_info(tensor)
                          for key, tensor in six.iteritems(input_ops.default_inputs)}
      signature_outputs = {key: tf.saved_model.utils.build_tensor_info(tensor)
                           for key, tensor in six.iteritems(output_fetch_tensors)}
      signature_def_map = {
          'serving_default':
              signature_def_utils.build_signature_def(
                  signature_inputs,
                  signature_outputs,
                  tf.saved_model.signature_constants.PREDICT_METHOD_NAME)}

      if not checkpoint_path:
        # Locate the latest checkpoint
        checkpoint_path = saver.latest_checkpoint(estimator._model_dir)
      if not checkpoint_path:
        raise ValueError("Couldn't find trained model at %s."
                         % estimator._model_dir)

      export_dir = saved_model_export_utils.get_timestamped_export_dir(
          export_dir_base)

      if (model_fn_ops.scaffold is not None and
         model_fn_ops.scaffold.saver is not None):
        saver_for_restore = model_fn_ops.scaffold.saver
      else:
        saver_for_restore = saver.Saver(sharded=True)

      with tf_session.Session('') as session:
        saver_for_restore.restore(session, checkpoint_path)
        init_op = control_flow_ops.group(
            variables.local_variables_initializer(),
            resources.initialize_resources(resources.shared_resources()),
            tf.tables_initializer())

        # Perform the export
        builder = saved_model_builder.SavedModelBuilder(export_dir)
        builder.add_meta_graph_and_variables(
            session, [tag_constants.SERVING],
            signature_def_map=signature_def_map,
            assets_collection=ops.get_collection(
                ops.GraphKeys.ASSET_FILEPATHS),
            legacy_init_op=init_op)
        builder.save(False)

      # Add the extra assets
      if assets_extra:
        assets_extra_path = os.path.join(compat.as_bytes(export_dir),
                                         compat.as_bytes('assets.extra'))
        for dest_relative, source in assets_extra.items():
          dest_absolute = os.path.join(compat.as_bytes(assets_extra_path),
                                       compat.as_bytes(dest_relative))
          dest_path = os.path.dirname(dest_absolute)
          file_io.recursive_create_dir(dest_path)
          file_io.copy(source, dest_absolute)

    # only keep the last 3 models
    saved_model_export_utils.garbage_collect_exports(
        export_dir_base,
        exports_to_keep=3)

    # save the last model to the model folder.
    # export_dir_base = A/B/intermediate_models/
    if keep_target:
      final_dir = os.path.join(args.job_dir, 'evaluation_model')
    else:
      final_dir = os.path.join(args.job_dir, 'model')
    if file_io.is_directory(final_dir):
      file_io.delete_recursively(final_dir)
    file_io.recursive_create_dir(final_dir)
    recursive_copy(export_dir, final_dir)

    return export_dir