def main(_):
  """Runs SQuAD export, training, and/or prediction according to FLAGS.mode."""
  # Users should always run this script under TF 2.x.
  assert tf.version.VERSION.startswith('2.')

  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))

  # Export is a standalone mode: write the SavedModel and stop.
  if FLAGS.mode == 'export_only':
    export_squad(FLAGS.model_export_path, input_meta_data)
    return

  strategy_type = FLAGS.strategy_type
  if strategy_type == 'mirror':
    strategy = tf.distribute.MirroredStrategy()
  elif strategy_type == 'multi_worker_mirror':
    strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
  elif strategy_type == 'tpu':
    # Resolve the TPU cluster before constructing the strategy.
    resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
  else:
    raise ValueError('The distribution strategy type is not supported: %s' %
                     FLAGS.strategy_type)

  # 'train_and_predict' runs both phases back to back.
  if FLAGS.mode in ('train', 'train_and_predict'):
    train_squad(strategy, input_meta_data)
  if FLAGS.mode in ('predict', 'train_and_predict'):
    predict_squad(strategy, input_meta_data)
def main(_):
  """Runs the BERT classifier task under the FLAGS-selected strategy."""
  # Users should always run this script under TF 2.x.
  assert tf.version.VERSION.startswith('2.')

  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))

  # Fall back to a scratch directory when the caller gave none.
  if not FLAGS.model_dir:
    FLAGS.model_dir = '/tmp/bert20/'

  # Only 'mirror' and 'tpu' are supported here; reject anything else early.
  if FLAGS.strategy_type not in ('mirror', 'tpu'):
    raise ValueError('The distribution strategy type is not supported: %s' %
                     FLAGS.strategy_type)

  if FLAGS.strategy_type == 'tpu':
    # Resolve the TPU cluster before constructing the strategy.
    resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
  else:
    strategy = tf.distribute.MirroredStrategy()

  run_bert(strategy, input_meta_data)
def main(_):
  """Runs BERT pretraining under the FLAGS-selected distribution strategy."""
  # Users should always run this script under TF 2.x.
  assert tf.version.VERSION.startswith('2.')

  # Fall back to a scratch directory when the caller gave none.
  if not FLAGS.model_dir:
    FLAGS.model_dir = '/tmp/bert20/'

  strategy_type = FLAGS.strategy_type
  if strategy_type == 'mirror':
    strategy = tf.distribute.MirroredStrategy()
  elif strategy_type == 'tpu':
    # Resolve the TPU cluster before constructing the strategy.
    resolver = tpu_lib.tpu_initialize(FLAGS.tpu)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)
  else:
    raise ValueError(
        'The distribution strategy type is not supported: %s' % strategy_type)

  if strategy:
    print('***** Number of cores used : ', strategy.num_replicas_in_sync)

  run_bert_pretrain(strategy)
def get_distribution_strategy(distribution_strategy="default",
                              num_gpus=0,
                              num_workers=1,
                              all_reduce_alg=None,
                              num_packs=1,
                              tpu_address=None):
  """Return a DistributionStrategy for running the model.

  Args:
    distribution_strategy: a string specifying which distribution strategy to
      use. Accepted values are 'off', 'default', 'one_device', 'mirrored',
      'parameter_server', 'multi_worker_mirrored', and 'tpu' -- case
      insensitive. 'off' means not to use Distribution Strategy; 'default'
      means to choose from `MirroredStrategy`, `MultiWorkerMirroredStrategy`,
      or `OneDeviceStrategy` according to the number of GPUs and number of
      workers. 'tpu' means to use TPUStrategy using `tpu_address`.
    num_gpus: Number of GPUs to run this model.
    num_workers: Number of workers to run this model.
    all_reduce_alg: Optional. Specifies which algorithm to use when performing
      all-reduce. For `MirroredStrategy`, valid values are "nccl" and
      "hierarchical_copy". For `MultiWorkerMirroredStrategy`, valid values are
      "ring" and "nccl". If None, DistributionStrategy will choose based on
      device topology.
    num_packs: Optional. Sets the `num_packs` in `tf.distribute.NcclAllReduce`
      or `tf.distribute.HierarchicalCopyAllReduce` for `MirroredStrategy`.
    tpu_address: Optional. String that represents TPU to connect to. Must not
      be None if `distribution_strategy` is set to `tpu`.

  Returns:
    tf.distribute.DistibutionStrategy object.

  Raises:
    ValueError: if `distribution_strategy` is 'off' or 'one_device' and
      `num_gpus` is larger than 1; or `num_gpus` is negative; or if
      `distribution_strategy` is 'off' and more than one worker is specified;
      or if `distribution_strategy` is `tpu` but `tpu_address` is not
      specified.
  """
  if num_gpus < 0:
    raise ValueError("`num_gpus` can not be negative.")

  distribution_strategy = distribution_strategy.lower()

  if distribution_strategy == "off":
    # Fix: the message below already reports both GPUs and workers, so 'off'
    # must also be rejected for multi-worker jobs, not only multi-GPU ones.
    if num_gpus > 1 or num_workers > 1:
      raise ValueError(
          "When {} GPUs and {} workers are specified, distribution_strategy "
          "flag cannot be set to 'off'.".format(num_gpus, num_workers))
    return None

  if distribution_strategy == "tpu":
    # Fix: enforce the documented contract that `tpu_address` must be given.
    # When tpu_address is an empty string, we communicate with local TPUs.
    if tpu_address is None:
      raise ValueError("`tpu_address` must not be None when "
                       "`distribution_strategy` is 'tpu'.")
    cluster_resolver = tpu_lib.tpu_initialize(tpu_address)
    return tf.distribute.experimental.TPUStrategy(cluster_resolver)

  if distribution_strategy == "multi_worker_mirrored":
    return tf.distribute.experimental.MultiWorkerMirroredStrategy(
        communication=_collective_communication(all_reduce_alg))

  # 'default' with at most one GPU degenerates to OneDeviceStrategy.
  if (distribution_strategy == "one_device" or
      (distribution_strategy == "default" and num_gpus <= 1)):
    if num_gpus == 0:
      return tf.distribute.OneDeviceStrategy("device:CPU:0")
    if num_gpus > 1:
      raise ValueError("`OneDeviceStrategy` can not be used for more than "
                       "one device.")
    return tf.distribute.OneDeviceStrategy("device:GPU:0")

  if distribution_strategy in ("mirrored", "default"):
    if num_gpus == 0:
      # 'default' with zero GPUs was handled above, so only an explicit
      # 'mirrored' request can reach here without GPUs; mirror over CPU.
      assert distribution_strategy == "mirrored"
      devices = ["device:CPU:0"]
    else:
      devices = ["device:GPU:%d" % i for i in range(num_gpus)]
    return tf.distribute.MirroredStrategy(
        devices=devices,
        cross_device_ops=_mirrored_cross_device_ops(all_reduce_alg, num_packs))

  if distribution_strategy == "parameter_server":
    return tf.distribute.experimental.ParameterServerStrategy()

  raise ValueError("Unrecognized Distribution Strategy: %r" %
                   distribution_strategy)