def make_estimator_distributed_runconfig(FLAGS,
                                         estimator,
                                         is_distributed,
                                         save_checkpoints_steps=100):
    if is_distributed:
        cluster, task_type, task_index = make_distributed_info_without_evaluator(
            FLAGS)
        dump_into_tf_config(cluster, task_type, task_index)
        device_filters = None
        if estimator in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
            # TFBT doesn't work with tf.contrib.distribute at the moment.
            # Use estimator distributed training instead, see
            # https://github.com/tensorflow/tensorflow/issues/32081
            dist_strategy = None
            if task_type != 'ps':
                # Disable communication between workers, see
                # https://github.com/tensorflow/tensorflow/issues/21745
                device_filters = [
                    '/job:ps',
                    '/job:%s/task:%d' % (task_type, task_index)
                ]
        else:
            dist_strategy = tf.contrib.distribute.ParameterServerStrategy()
        run_config = tf.estimator.RunConfig(
            tf_random_seed=get_tf_random_seed(),
            save_checkpoints_steps=save_checkpoints_steps,
            train_distribute=dist_strategy,
            session_config=tf.ConfigProto(log_device_placement=True,
                                          device_filters=device_filters))
    else:
        run_config = tf.estimator.RunConfig(
            tf_random_seed=get_tf_random_seed(),
            save_checkpoints_steps=save_checkpoints_steps)
    return run_config
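
# `dump_into_tf_config` used above is defined elsewhere in this package. For
# orientation, here is a minimal sketch of what such a helper is assumed to
# do: serialize the cluster spec into the standard TF_CONFIG environment
# variable that tf.estimator reads for distributed training. The dict layout
# follows TensorFlow's documented TF_CONFIG format; the helper name and the
# sample cluster contents are illustrative only.
def _example_dump_into_tf_config(cluster, task_type, task_index):
    import json
    import os

    # `cluster` is expected to look like:
    # {"ps": ["ps0:2222"], "worker": ["worker0:2222", "worker1:2222"]}
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": cluster,
        "task": {
            "type": task_type,  # e.g. "chief", "worker", or "ps"
            "index": task_index,
        },
    })
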
def estimator_train_and_save(estimator, model_params, save, train_dataset_fn,
                             val_dataset_fn, train_max_steps,
                             eval_start_delay_secs, eval_throttle_secs,
                             save_checkpoints_steps, metric_names,
                             load_pretrained_model, model_meta):
    print("Start training using estimator model...")
    model_params["model_dir"] = save
    model_params["config"] = tf.estimator.RunConfig(
        tf_random_seed=get_tf_random_seed(),
        save_checkpoints_steps=save_checkpoints_steps)
    warm_start_from = save if load_pretrained_model else None
    if warm_start_from:
        load_pretrained_model_estimator(estimator, model_params)
    classifier = init_model(estimator, model_params)

    # Do not add the default Accuracy metric when training with an estimator:
    # it fails when the estimator is a regressor, and estimators add some
    # metrics automatically anyway. Only add additional metrics when the user
    # specifies them with `WITH`.
    if tf_is_version2() and metric_names != ["Accuracy"]:
        classifier = tf.estimator.add_metrics(classifier,
                                              get_tf_metrics(metric_names))

    estimator_train_compiled(classifier, train_dataset_fn, val_dataset_fn,
                             train_max_steps, eval_start_delay_secs,
                             eval_throttle_secs)
    estimator_save(classifier, save, model_params, model_meta)
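
# For reference, tf.estimator.add_metrics (used above) expects a metric_fn
# like the sketch below; get_tf_metrics presumably builds an equivalent
# callable from the metric names. The function body and the "class_ids"
# prediction key are a hand-written example, not the actual get_tf_metrics
# implementation.
def _example_metric_fn(labels, predictions):
    # Return a dict mapping metric names to (value_op, update_op) pairs.
    return {
        "recall": tf.compat.v1.metrics.recall(
            labels=labels, predictions=predictions["class_ids"]),
    }

# Usage: classifier = tf.estimator.add_metrics(classifier, _example_metric_fn)
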
def keras_train_distributed(classifier,
                            model_params,
                            save,
                            model_meta,
                            FLAGS,
                            train_dataset_fn,
                            val_dataset_fn,
                            is_pai=True):
    # Train the Keras model distributed by wrapping it into an Estimator.
    cluster, task_type, task_index = make_distributed_info_without_evaluator(
        FLAGS)
    dump_into_tf_config(cluster, task_type, task_index)
    dist_strategy = tf.contrib.distribute.ParameterServerStrategy()
    run_config = tf.estimator.RunConfig(tf_random_seed=get_tf_random_seed(),
                                        save_checkpoints_steps=100,
                                        train_distribute=dist_strategy,
                                        session_config=tf.ConfigProto(
                                            log_device_placement=True,
                                            device_filters=None))
    model_dir = FLAGS.checkpointDir
    keras_estimator = tf.keras.estimator.model_to_estimator(
        classifier, model_dir=model_dir, config=run_config)
    estimator_train_compiled(
        keras_estimator,
        train_dataset_fn,
        val_dataset_fn,
        # TODO(typhoonzero): pass real train settings instead of the
        # hard-coded defaults below.
        100,
        None,
        60,
        120)
    # FIXME(typhoonzero): predicting with a distributed Keras model should
    # also call model_to_estimator.

    # Export a saved model for prediction.
    if "feature_columns" in model_params:
        all_feature_columns = model_params["feature_columns"]
    elif "linear_feature_columns" in model_params \
            and "dnn_feature_columns" in model_params:
        import copy
        all_feature_columns = copy.copy(model_params["linear_feature_columns"])
        all_feature_columns.extend(model_params["dnn_feature_columns"])
    else:
        raise Exception("No expected feature columns in model params")
    serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(  # noqa: E501
        tf.feature_column.make_parse_example_spec(all_feature_columns))
    export_path = keras_estimator.export_saved_model(save, serving_input_fn)

    # Write the exported path under the current directory.
    export_path_str = export_path.decode("utf-8")
    with open("exported_path", "w") as fn:
        fn.write(export_path_str)
    # Write model metadata to model_meta.json.
    save_model_metadata("model_meta.json", model_meta)
    print("Done training, model exported to: %s" % export_path_str)
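
# make_parse_example_spec (used above) converts a list of feature columns
# into the tf.Example parsing spec that the serving input receiver needs.
# A small illustration with made-up column names:
def _example_parse_spec():
    columns = [
        tf.feature_column.numeric_column("petal_length"),
        tf.feature_column.numeric_column("petal_width"),
    ]
    # Returns {"petal_length": FixedLenFeature((1,), tf.float32),
    #          "petal_width": FixedLenFeature((1,), tf.float32)}
    return tf.feature_column.make_parse_example_spec(columns)
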
def estimator_train_and_save_legacy(estimator, model_params, save, FLAGS,
                                    train_dataset_fn, val_dataset_fn,
                                    train_max_steps, eval_start_delay_secs,
                                    eval_throttle_secs, save_checkpoints_steps,
                                    metric_names, load_pretrained_model,
                                    model_meta):
    print("Start training using estimator model...")
    is_distributed = False
    if len(FLAGS.worker_hosts.split(",")) > 1:
        is_distributed = True
    model_params["config"] = make_estimator_distributed_runconfig(
        FLAGS,
        estimator,
        is_distributed,
        save_checkpoints_steps=save_checkpoints_steps)
    ckpt_dir = FLAGS.checkpointDir if FLAGS.checkpointDir else save
    print("Using checkpoint path: %s" % ckpt_dir)
    model_params["model_dir"] = ckpt_dir

    warm_start_from = save if load_pretrained_model else None
    if warm_start_from:
        load_pretrained_model_estimator(estimator, model_params)
    classifier = init_model(estimator, model_params)

    # Do not add the default Accuracy metric when training with an estimator:
    # it fails when the estimator is a regressor, and estimators add some
    # metrics automatically anyway. Only add additional metrics when the user
    # specifies them with `WITH`.
    if tf_is_version2() and metric_names != ["Accuracy"]:
        classifier = tf.estimator.add_metrics(classifier,
                                              get_tf_metrics(metric_names))

    estimator_train_compiled(classifier, train_dataset_fn, val_dataset_fn,
                             train_max_steps, eval_start_delay_secs,
                             eval_throttle_secs)
    if FLAGS.task_index != 0:
        print("skip exporting model on worker != 0")
        return
    estimator_save(classifier, save, model_params, model_meta)
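
# The legacy entry point assumes FLAGS carries PAI-style cluster flags.
# A minimal sketch of defining them with absl.flags: worker_hosts,
# task_index, and checkpointDir appear in the usages above, while ps_hosts
# is a guess at what make_distributed_info_without_evaluator reads; the
# defaults and help strings are illustrative only.
def _example_define_cluster_flags():
    from absl import flags

    flags.DEFINE_string("worker_hosts", "localhost:2222",
                        "comma-separated list of worker host:port pairs")
    flags.DEFINE_string("ps_hosts", "",
                        "comma-separated list of parameter server hosts")
    flags.DEFINE_integer("task_index", 0, "index of this task in its job")
    flags.DEFINE_string("checkpointDir", "", "directory to save checkpoints")
    return flags.FLAGS
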
def keras_train_and_save(estimator, model_params, save, is_pai, FLAGS,
                         train_dataset_fn, val_dataset_fn, label_meta, epochs,
                         verbose, metric_names, validation_steps,
                         load_pretrained_model, model_meta):
    print("Start training using keras model...")
    # Remove the optimizer and loss params from model_params; they are used
    # when calling "compile()" instead.
    optimizer = None
    loss = None
    if "optimizer" in model_params:
        optimizer = model_params["optimizer"]
        del model_params["optimizer"]
    if "loss" in model_params:
        loss = model_params["loss"]
        del model_params["loss"]
    classifier_pkg = sys.modules[estimator.__module__]

    # Set up training metrics.
    model_metrics = []
    if hasattr(classifier_pkg, "eval_metrics_fn"):
        metrics_functions = classifier_pkg.eval_metrics_fn()
        for key, func in metrics_functions.items():
            func.__name__ = key
            model_metrics.append(func)
    # Use the metrics specified in WITH if they are not the default.
    if metric_names != ["Accuracy"]:
        keras_metrics = metrics.get_keras_metrics(metric_names)
    elif len(model_metrics) > 0:
        keras_metrics = model_metrics
    else:
        # Default metric.
        keras_metrics = metrics.get_keras_metrics(["Accuracy"])

    # Set up the optimizer.
    has_none_optimizer = False
    if optimizer is None:
        # Use the Keras model's default optimizer if none is specified in
        # the WITH clause.
        members = inspect.getmembers(classifier_pkg)
        # Fallback default optimizer.
        optimizer = tf.keras.optimizers.Adagrad(lr=0.001)
        for m, func in members:
            if m == "optimizer":
                optimizer = classifier_pkg.optimizer()
        if optimizer is None:
            has_none_optimizer = True
            warnings.warn('optimizer() returns None')
    if loss is None:
        members = inspect.getmembers(classifier_pkg)
        # FIXME(typhoonzero): the default loss may cause an error if the
        # model's output shape does not fit it.
        loss = "sparse_categorical_crossentropy"
        for m, func in members:
            if m == "loss":
                loss = classifier_pkg.loss

    # Set up the datasets.
    train_dataset = train_dataset_fn()
    if val_dataset_fn is not None:
        validate_dataset = val_dataset_fn()
    else:
        validate_dataset = None

    classifier = init_model_with_feature_column(
        estimator, model_params, has_none_optimizer=has_none_optimizer)
    # FIXME(sneaxiy): models defined by frameworks other than TensorFlow or
    # XGBoost may return a None optimizer, for example:
    # https://github.com/sql-machine-learning/models/blob/ce970d14a524e20de10a645c99b6bf8724be17d9/sqlflow_models/arima_with_stl_decomposition.py#L123
    if has_none_optimizer:
        assert hasattr(
            classifier,
            "sqlflow_train_loop"), "optimizer() should not return None"
    else:
        classifier.compile(optimizer=optimizer,
                           loss=loss,
                           metrics=keras_metrics)
    if load_pretrained_model:
        # Must run one batch to initialize parameters before load_weights.
        inputs, targets = next(iter(train_dataset.take(1)))
        classifier.evaluate(inputs, targets)
        # NOTE(sneaxiy): should we save/load optimizer info for incremental
        # training, or let users write the same WITH statements in SQL?
        classifier.load_weights(save)

    if is_pai and len(FLAGS.worker_hosts.split(",")) > 1:
        # Train the Keras model distributed.
        cluster, task_type, task_index = make_distributed_info_without_evaluator(
            FLAGS)
        dump_into_tf_config(cluster, task_type, task_index)
        dist_strategy = tf.contrib.distribute.ParameterServerStrategy()
        run_config = tf.estimator.RunConfig(
            tf_random_seed=get_tf_random_seed(),
            save_checkpoints_steps=100,
            train_distribute=dist_strategy,
            session_config=tf.ConfigProto(log_device_placement=True,
                                          device_filters=None))
        model_dir = FLAGS.checkpointDir
        keras_estimator = tf.keras.estimator.model_to_estimator(
            classifier, model_dir=model_dir, config=run_config)
        estimator_train_compiled(
            keras_estimator,
            is_pai,
            FLAGS,
            train_dataset_fn,
            val_dataset_fn,
            # TODO(typhoonzero): pass real train settings instead of the
            # hard-coded defaults below.
            100,
            None,
            60,
            120)
        # FIXME(typhoonzero): predicting with a distributed Keras model
        # should also call model_to_estimator.

        # Export a saved model for prediction.
        if "feature_columns" in model_params:
            all_feature_columns = model_params["feature_columns"]
        elif "linear_feature_columns" in model_params \
                and "dnn_feature_columns" in model_params:
            import copy
            all_feature_columns = copy.copy(
                model_params["linear_feature_columns"])
            all_feature_columns.extend(model_params["dnn_feature_columns"])
        else:
            raise Exception("No expected feature columns in model params")
        serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(  # noqa: E501
            tf.feature_column.make_parse_example_spec(all_feature_columns))
        export_path = keras_estimator.export_saved_model(save, serving_input_fn)

        # Write the exported path under the current directory.
        export_path_str = export_path.decode("utf-8")
        with open("exported_path", "w") as fn:
            fn.write(export_path_str)
        # Write model metadata to model_meta.json.
        save_model_metadata("model_meta.json", model_meta)
        print("Done training, model exported to: %s" % export_path_str)
        return

    if hasattr(classifier, 'sqlflow_train_loop'):
        classifier.sqlflow_train_loop(train_dataset)
    else:
        if label_meta["feature_name"] != "":
            # FIXME(typhoonzero): setting validation_steps is needed because
            # of https://github.com/tensorflow/tensorflow/issues/29743#issuecomment-502028891
            # remove this argument when PAI fixes this.
            if tf_is_version2():
                validation_steps = None
            elif validate_dataset is None:
                validation_steps = None
            history = classifier.fit(train_dataset,
                                     validation_steps=validation_steps,
                                     epochs=epochs if epochs else
                                     classifier.default_training_epochs(),
                                     validation_data=validate_dataset,
                                     verbose=verbose)
        else:
            history = classifier.fit(train_dataset,
                                     validation_steps=validation_steps,
                                     epochs=epochs if epochs else
                                     classifier.default_training_epochs(),
                                     verbose=verbose)
        train_metrics = dict()
        val_metrics = dict()
        for k in history.history.keys():
            if k.startswith("val_"):
                val_metrics[k] = float(history.history[k][-1])
            else:
                train_metrics[k] = float(history.history[k][-1])
        print("====== Result for training set: ======")
        for k, v in train_metrics.items():
            print("%s: %s" % (k, v))
        print("====== Result for validation set: ======")
        for k, v in val_metrics.items():
            print("%s: %s" % (k, v))
        model_meta["evaluation"] = val_metrics

    try:
        classifier.save_weights(save, save_format="h5")
        # Write model metadata to model_meta.json.
        save_model_metadata("model_meta.json", model_meta)
        if is_pai:
            print("saving keras model to: %s" % FLAGS.sqlflow_oss_modeldir)
            model.save_file(FLAGS.sqlflow_oss_modeldir, save)
            model.save_file(FLAGS.sqlflow_oss_modeldir, "model_meta.json")
    except:  # noqa: E722
        if has_none_optimizer:
            warnings.warn("Saving model with None optimizer fails")
        else:
            six.reraise(*sys.exc_info())
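
# keras_train_and_save discovers optional hooks on the model's module via
# hasattr/inspect.getmembers: eval_metrics_fn() returning {name: callable},
# an optimizer() factory, and a module-level `loss`. The sketch below is a
# hypothetical module member satisfying the eval_metrics_fn contract, not an
# actual sqlflow_models implementation.
def _example_eval_metrics_fn():
    def auc(y_true, y_pred):
        # Functional Keras metric: called per batch with labels and
        # predictions, returns a scalar tensor.
        metric = tf.keras.metrics.AUC()
        metric.update_state(y_true, y_pred)
        return metric.result()

    # keras_train_and_save sets func.__name__ to the dict key before
    # passing these callables to Model.compile(metrics=...).
    return {"auc": auc}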