# NOTE(review): this chunk was collapsed onto one physical line in the source;
# reformatted here with comments only — no code tokens changed.

# Initialize the SageMaker model-parallel (smp) runtime with the given config.
smp.init(cfg)

# Ensure the Keras dataset cache directory exists before downloading MNIST.
cache_dir = os.path.join(os.path.expanduser("~"), ".keras", "datasets")
if not os.path.exists(cache_dir):
    try:
        os.mkdir(cache_dir)
    except OSError as e:
        # Benign race: another process may create the directory between the
        # exists() check and mkdir(); re-raise anything else.
        if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
            pass
        else:
            raise

# Download and load MNIST dataset.
# Per-rank filename keeps concurrent ranks from clobbering one shared cache entry.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(
    "MNIST-data-%d" % smp.rank()
)
# Scale pixel values from [0, 255] to [0, 1].
x_train, x_test = x_train / 255.0, x_test / 255.0

# Add a channels dimension
x_train = x_train[..., tf.newaxis]
x_test = x_test[..., tf.newaxis]

# Fixed seed (123): every rank shuffles identically — TODO confirm intended;
# the model-parallel variant of this script seeds with smp.dp_rank() instead.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000, seed=123).batch(32)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)


class MyModel(smp.DistributedModel):
    # NOTE(review): class body appears truncated at this chunk boundary —
    # only the __init__ super() call is visible; layer definitions presumably
    # follow outside the visible source.
    def __init__(self):
        super(MyModel, self).__init__()
# Rubik: Initialize smp.init() cache_dir = os.path.join(os.path.expanduser("~"), ".keras", "datasets") if not os.path.exists(cache_dir): try: os.mkdir(cache_dir) except OSError as e: if e.errno == errno.EEXIST and os.path.isdir(cache_dir): pass else: raise # Download and load MNIST dataset. (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data( "MNIST-data-%d" % smp.rank() ) x_train, x_test = x_train / 255.0, x_test / 255.0 # Add a channels dimension x_train = x_train[..., tf.newaxis] x_test = x_test[..., tf.newaxis] # Rubik: Seed the shuffle with smp.dp_rank(), and drop_remainder # in batching to make sure batch size is always divisible by number of microbatches train_ds = ( tf.data.Dataset.from_tensor_slices((x_train, y_train)) .shuffle(10000, seed=smp.dp_rank()) .batch(256, drop_remainder=True) ) test_ds = (
""" generate big binary file with the specified size in bytes :param filename: the filename :param size: the size in bytes :return:void """ with open("%s" % filename, "wb+") as fout: fout.write(os.urandom(size)) start_time = time.time() src_root_dir = "./send_receive_checkpoint_test" dst_root_dir = "./send_receive_checkpoint_result" filename = "data.bin" if smp.rank() != 0: file_path = os.path.join(src_root_dir, "mp_rank_" + str(smp.rank())) if os.path.exists(file_path): shutil.rmtree(file_path) os.makedirs(file_path, exist_ok=True) # creating a 1MB file. generate_big_random_bin_file(os.path.join(file_path, filename), 1024 * 1024) # sending to rank 0 utils.send_checkpoint_files(src_root_dir, 0) else: # receving from rank 1 utils.receive_checkpoint_files(dst_root_dir, 1)