# Initialize SageMaker model parallelism from the supplied config.
smp.init(cfg)

# Keras downloads datasets under ~/.keras/datasets; make sure the directory exists.
cache_dir = os.path.join(os.path.expanduser("~"), ".keras", "datasets")
# exist_ok=True is race-free when several ranks start concurrently, and replaces
# the verbose pre-3.2 mkdir + errno.EEXIST handling. Same idiom this file
# already uses elsewhere (os.makedirs(..., exist_ok=True)).
os.makedirs(cache_dir, exist_ok=True)

# Download and load MNIST dataset.
# A per-rank cache file name keeps concurrent ranks from clobbering each
# other's download.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(
    "MNIST-data-%d" % smp.rank()
)
# Scale pixel values from [0, 255] to [0, 1].
x_train, x_test = x_train / 255.0, x_test / 255.0

# Add a channels dimension
x_train = x_train[..., tf.newaxis]
x_test = x_test[..., tf.newaxis]

# Fixed seed keeps the shuffle order deterministic across runs.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000, seed=123).batch(32)

test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)

class MyModel(smp.DistributedModel):
    # NOTE(review): this class definition is truncated in this chunk — the
    # remainder of __init__ (layer definitions, etc.) is not visible here,
    # so only the visible lines are documented.
    def __init__(self):
        super(MyModel, self).__init__()
# --- Esempio n. 2 (second example) ---
# Rubik: Initialize
smp.init()

# Keras downloads datasets under ~/.keras/datasets; make sure the directory exists.
cache_dir = os.path.join(os.path.expanduser("~"), ".keras", "datasets")
# exist_ok=True is race-free when several ranks start concurrently, and replaces
# the verbose pre-3.2 mkdir + errno.EEXIST handling. Same idiom this file
# already uses elsewhere (os.makedirs(..., exist_ok=True)).
os.makedirs(cache_dir, exist_ok=True)

# Download and load MNIST dataset.
# A per-rank cache file name keeps concurrent ranks from clobbering each
# other's download.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data(
    "MNIST-data-%d" % smp.rank()
)
# Scale pixel values from [0, 255] to [0, 1].
x_train, x_test = x_train / 255.0, x_test / 255.0

# Add a channels dimension
x_train = x_train[..., tf.newaxis]
x_test = x_test[..., tf.newaxis]

# Rubik: Seed the shuffle with smp.dp_rank(), and drop_remainder
# in batching to make sure batch size is always divisible by number of microbatches
train_ds = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(10000, seed=smp.dp_rank())
    .batch(256, drop_remainder=True)
)
# NOTE(review): the right-hand side of this assignment was lost when the file
# was truncated; reconstructed to mirror the train_ds pipeline above
# (drop_remainder keeps the batch size divisible by the microbatch count) —
# confirm against the original example.
test_ds = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .batch(256, drop_remainder=True)
)
    """
    generate big binary file with the specified size in bytes
    :param filename: the filename
    :param size: the size in bytes
    :return:void
    """
    with open("%s" % filename, "wb+") as fout:
        fout.write(os.urandom(size))


# Timestamp the start of the checkpoint round-trip test.
start_time = time.time()
src_root_dir = "./send_receive_checkpoint_test"
dst_root_dir = "./send_receive_checkpoint_result"
filename = "data.bin"

if smp.rank() == 0:
    # Rank 0 is the receiver: collect the checkpoint files shipped by rank 1.
    utils.receive_checkpoint_files(dst_root_dir, 1)
else:
    # Every other rank is a sender: start from a clean per-rank directory ...
    rank_dir = os.path.join(src_root_dir, "mp_rank_" + str(smp.rank()))
    if os.path.exists(rank_dir):
        shutil.rmtree(rank_dir)
    os.makedirs(rank_dir, exist_ok=True)

    # ... fill it with a 1 MB random payload ...
    generate_big_random_bin_file(os.path.join(rank_dir, filename), 1024 * 1024)

    # ... and ship the checkpoint tree to rank 0.
    utils.send_checkpoint_files(src_root_dir, 0)