Example #1
    def get_dataset_from_directory(ds_path: str, split: str, seed=42):

        if not os.path.exists(ds_path):
            DataGenerator.logger.error(f'{ds_path} does not exist')
            raise NameError(f'Directory {ds_path} does not exist')

        builder = tfds.folder_dataset.ImageFolder(ds_path)

        info_ds = builder.info

        ds = builder.as_dataset(as_supervised=True, split=split)

        if not isinstance(ds, tf.data.Dataset):
            raise UnsupportedFormat(
                f'Type of ds is not the one expected (tf.data.Dataset) {type(ds)}'
            )

        num_examples = DataGenerator.evaluate_size_dataset(ds)

        iterator = iter(ds)
        first_elem = iterator.get_next()

        if len(first_elem[0].shape) != 3:
            raise DimensionError(
                f'Dataset input feature should have 3 dimensions (h, w, c) but it has {len(first_elem[0].shape)}'
            )

        img_shape = first_elem[0].shape

        num_classes = -1

        if len(info_ds.supervised_keys) == 2:
            label = info_ds.supervised_keys[1]
            num_classes = info_ds.features[label].num_classes
        else:
            raise UnsupportedFormat(
                f'This function only handles datasets like (features, labels) not {info_ds.supervised_keys}'
            )

        print(
            f'img shape {img_shape} number of examples {num_examples} number of classes {num_classes}'
        )

        if popdist.getNumInstances() > 1:
            ds = ds.shard(num_shards=popdist.getNumInstances(),
                          index=popdist.getInstanceIndex())

        return ds, img_shape, num_examples, num_classes
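
A minimal, self-contained sketch of the sharding pattern used at the end of this example (assuming TensorFlow and popdist are installed; the range dataset is a stand-in for the real ImageFolder dataset):

import popdist
import tensorflow as tf

# Stand-in dataset; in the example above this is the ImageFolder dataset.
ds = tf.data.Dataset.range(1000)

if popdist.isPopdistEnvSet() and popdist.getNumInstances() > 1:
    # Each poprun instance keeps every N-th element, so together the
    # instances cover the dataset exactly once per epoch.
    ds = ds.shard(num_shards=popdist.getNumInstances(),
                  index=popdist.getInstanceIndex())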
Example #2
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replication_factor:
        print(f"The number of replicas is overridden by PopRun. "
              f"The new value is {popdist.getNumTotalReplicas()}.")
    args.replication_factor = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
Example #3
def init_popdist(args):
    hvd.init()
    args.use_popdist = True
    if popdist.getNumTotalReplicas() != args.replicas:
        logging.warning(f"The number of replicas is overridden by poprun. The new value is {popdist.getNumTotalReplicas()}.")
    args.replicas = int(popdist.getNumLocalReplicas())
    args.popdist_rank = popdist.getInstanceIndex()
    args.popdist_size = popdist.getNumInstances()
    args.popdist_local_rank = hvd.local_rank()
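
A minimal sketch of how init_popdist is typically invoked, following the popdist.isPopdistEnvSet() guard used in Examples #9, #11 and #13 (the args namespace here is a hypothetical stand-in for the script's parsed arguments):

import argparse
import popdist

args = argparse.Namespace(replicas=4, use_popdist=False)  # hypothetical arguments

if popdist.isPopdistEnvSet():
    # Launched by poprun: let popdist/Horovod override the replica settings.
    init_popdist(args)
else:
    args.use_popdist = False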
Example #4
def set_distribution_defaults(opts):
    if opts['use_popdist']:
        opts['distributed_worker_count'] = popdist.getNumInstances()
        opts['distributed_worker_index'] = popdist.getInstanceIndex()
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0

    if opts['distributed_worker_index'] != 0 and not opts['log_all_workers']:
        logger.setLevel(logging.ERROR)
Example #5
def set_distribution_defaults(opts):
    if opts['use_popdist']:
        opts['distributed_worker_count'] = popdist.getNumInstances()
        opts['distributed_worker_index'] = popdist.getInstanceIndex()
        opts['distributed_cluster'] = None

        opts['summary_str'] += 'Popdist\n'
        opts['summary_str'] += ' Process count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Process index: {distributed_worker_index}\n'
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0
        opts['distributed_cluster'] = None
Example #6
def set_distribution_defaults(opts):

    if opts['use_popdist']:
        opts['distributed_worker_count'] = popdist.getNumInstances()
        opts['distributed_worker_index'] = popdist.getInstanceIndex()
        opts['summary_str'] += 'Popdist\n'
        opts['summary_str'] += ' Process count: {distributed_worker_count}\n'
        opts['summary_str'] += ' Process index: {distributed_worker_index}\n'
    else:
        opts['distributed_worker_count'] = 1
        opts['distributed_worker_index'] = 0

    if opts['distributed_worker_index'] != 0 and not opts['log_all_workers']:
        logger.setLevel(logging.ERROR)
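
A minimal sketch of the instance-0-only pattern that the final logger.setLevel guard implements (the write_checkpoints flag is hypothetical; compare Example #9, which disables checkpointing and wandb logging on non-zero instances):

import logging
import popdist

logger = logging.getLogger(__name__)

is_instance_zero = (not popdist.isPopdistEnvSet()) or popdist.getInstanceIndex() == 0

write_checkpoints = is_instance_zero  # hypothetical flag: only instance 0 writes files
if not is_instance_zero:
    logger.setLevel(logging.ERROR)    # silence all but errors on the other instances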
Example #7
    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is not None:
            if popdist.isPopdistEnvSet():
                self.worker_id = (worker_info.id +
                                  worker_info.num_workers * popdist.getInstanceIndex())
                self.shard = (worker_info.id +
                              worker_info.num_workers * popdist.getInstanceIndex(),
                              worker_info.num_workers * popdist.getNumInstances())
            else:
                self.worker_id = worker_info.id
                self.shard = worker_info.id, worker_info.num_workers
        else:
            self.shard = None
        self.reset()
        if self.shuffle:
            np.random.shuffle(self.files)
        return self
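
The shard arithmetic above can be read in isolation: the global worker id is the local DataLoader worker id offset by num_workers * instance_index, and the number of shards is num_workers * num_instances. A minimal sketch with hypothetical values standing in for worker_info:

import popdist

num_workers = 4       # hypothetical worker_info.num_workers
local_worker_id = 2   # hypothetical worker_info.id

if popdist.isPopdistEnvSet():
    shard_index = local_worker_id + num_workers * popdist.getInstanceIndex()
    num_shards = num_workers * popdist.getNumInstances()
else:
    shard_index, num_shards = local_worker_id, num_workers

# Every (shard_index, num_shards) pair is distinct across all workers of all
# instances, so each slice of the data is read by exactly one worker.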
Example #8
    def get_imagenet(path: str,
                     split: str,
                     cycle_length: int = 4,
                     block_length: int = 4):

        # The path is the one of dataset under TFRecord format
        if not os.path.exists(path):
            DataGenerator.logger.error(f'{path} does not exist')
            raise NameError(f'Directory {path} does not exist')

        if split == 'train':
            filenames = glob.glob1(path, 'train*')
            if len(filenames) != 1024:
                DataGenerator.logger.error(
                    f'train directory should contain 1024 tf-record files but it contains {len(filenames)} instead'
                )
                raise ValueError(
                    f'train directory should contain 1024 files but it contains {len(filenames)} instead'
                )

        else:
            filenames = glob.glob1(path, 'validation*')
            if len(filenames) != 128:
                DataGenerator.logger.error(
                    f'validation directory should contain 128 tf-record files but it contains {len(filenames)} instead'
                )
                raise ValueError(
                    f'validation directory should contain 128 tf-record files but it contains {len(filenames)} instead'
                )

        num_files = len(filenames)

        filenames = list(
            map(lambda filename: os.path.join(path, filename), filenames))
        DataGenerator.logger.debug(f'filenames = {filenames}')
        ds = tf.data.Dataset.from_tensor_slices(filenames)

        if split == 'train':
            # Shuffle the input files
            ds = ds.shuffle(buffer_size=num_files)

        if popdist.getNumInstances() > 1:
            ds = ds.shard(num_shards=popdist.getNumInstances(),
                          index=popdist.getInstanceIndex())

        ds = ds.interleave(tf.data.TFRecordDataset,
                           cycle_length=cycle_length,
                           block_length=block_length,
                           num_parallel_calls=cycle_length)

        DataGenerator.logger.info(f'dataset = {ds}')

        num_examples = IMAGENET_DS_SIZE[split]

        DataGenerator.logger.info(f'number of examples {num_examples}')

        iterator = iter(ds)
        first_elem = iterator.get_next()

        feature, _ = imagenet_processing.parse_record(first_elem, True,
                                                      tf.float32)

        if len(feature.shape) != 3:
            raise DimensionError(
                f'Dataset input feature should have 3 dimensions (h, w, c) but it has {len(feature.shape)}'
            )

        num_classes = 1000
        ds = ds.cache()

        return ds, feature.shape, num_examples, num_classes
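
Note the ordering above: the dataset of file names is sharded across instances before interleave, so each instance opens a disjoint subset of the TFRecord files. A minimal sketch of that ordering with placeholder file names (the pipeline is only constructed here, not iterated):

import popdist
import tensorflow as tf

# Placeholder file names; the real ones come from glob.glob1(path, 'train*').
filenames = [f'train-{i:05d}-of-01024' for i in range(1024)]
ds = tf.data.Dataset.from_tensor_slices(filenames)

if popdist.getNumInstances() > 1:
    # Shard at the file level first ...
    ds = ds.shard(num_shards=popdist.getNumInstances(),
                  index=popdist.getInstanceIndex())

# ... then interleave record readers over this instance's files only.
ds = ds.interleave(tf.data.TFRecordDataset, cycle_length=4, block_length=4)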
Example #9
    max_reduce_many_buffer_size = args.max_reduce_many_buffer_size
    gather_conv_output = args.gather_conv_output
    pipeline_num_parallel = args.pipeline_num_parallel

    # check if the script has been called by poprun
    distributed_training = popdist.isPopdistEnvSet()

    if distributed_training:
        if num_replicas != popdist.getNumTotalReplicas():
            logging.warning(
                f'Replication factor given to poprun (=={popdist.getNumTotalReplicas()}) '
                f'does not match the config (=={num_replicas}). Poprun will override the config.'
            )
            num_replicas = popdist.getNumTotalReplicas()

        max_threads_per_instance = os.cpu_count() // popdist.getNumInstances()
        if pipeline_num_parallel > max_threads_per_instance:
            logging.warning(
                f'The number of chosen threads {pipeline_num_parallel} is bigger than the total number of physical threads '
                f'divided by the number of instances. Poprun will override the config.'
            )
            # Limit the maximal number of threads to the total of physical threads divided by the number of instances
            pipeline_num_parallel = max_threads_per_instance

        if popdist.getInstanceIndex() != 0:
            checkpoints = False
            log_to_wandb = False

    # when neither option is specified, assume gradient accumulation count 1
    if gradient_accumulation_count is None and global_batch_size is None:
        gradient_accumulation_count = 1
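
The thread cap in the distributed branch is simple arithmetic: each instance gets at most os.cpu_count() // num_instances host threads for its input pipeline. A minimal sketch of that calculation on its own (the requested thread count is hypothetical):

import os
import popdist

requested_threads = 48  # hypothetical configured pipeline_num_parallel

if popdist.isPopdistEnvSet():
    max_threads_per_instance = os.cpu_count() // popdist.getNumInstances()
    pipeline_num_parallel = min(requested_threads, max_threads_per_instance)
else:
    pipeline_num_parallel = requested_threads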
Example #10
def replicated_tensor_sharding_core():
    parser = argparse.ArgumentParser(description="Parse launch parameters.")
    parser.add_argument("--tensors", nargs="*")
    parser.add_argument("--optim", nargs="?")
    parser.add_argument("--tmpdir", nargs="?")
    parser.add_argument("--filename", nargs="?")
    parser.add_argument("--compute_batch", nargs="?")
    args = parser.parse_args(sys.argv[2:])

    ipus_per_replica = 1

    batches_per_step = 10
    accumulation_factor = 4
    compute_batch = int(args.compute_batch)
    hidden_size = 4
    reduction = popart.ReductionType.Sum

    deviceInfo = popdist.popart.getDevice(ipus_per_replica)
    num_local_replicas = popdist.getNumLocalReplicas()
    num_total_replicas = popdist.getNumTotalReplicas()

    builder = popart.Builder()

    np.random.seed(12321)
    weight_data = np.random.rand(hidden_size, hidden_size).astype(np.float32)

    input_data = []
    label_data = []

    for i in range(
            0, batches_per_step * num_local_replicas * accumulation_factor *
            compute_batch):
        np.random.seed(popdist.getInstanceIndex() +
                       i * popdist.getNumInstances())
        input_data += [np.random.rand(hidden_size).astype(np.float32)]
        label_data += [np.random.randint(0, hidden_size, size=1)]

    input_data = np.concatenate(input_data)
    label_data = np.concatenate(label_data)

    builder = popart.Builder()

    d0 = builder.addInputTensor(
        popart.TensorInfo("FLOAT", (compute_batch, hidden_size)), "d0")
    l0 = builder.addInputTensor(popart.TensorInfo("UINT32", (compute_batch, )),
                                "l0")

    data = {}

    data[d0] = input_data.reshape((batches_per_step, num_local_replicas,
                                   accumulation_factor, compute_batch, -1))

    w0 = builder.addInitializedInputTensor(weight_data, 'weight0')
    x = builder.aiOnnx.matmul([d0, w0])

    x = builder.aiOnnx.softmax([x])

    data[l0] = label_data.reshape(
        (batches_per_step, num_local_replicas, accumulation_factor,
         compute_batch, -1)).astype(np.uint32)
    loss = builder.aiGraphcore.nllloss([x, l0],
                                       reduction=reduction,
                                       debugContext='loss')

    proto = builder.getModelProto()

    dataFlow = popart.DataFlow(
        batches_per_step,
        {av: popart.AnchorReturnType("ALL")
         for av in [x, loss]})

    opts = popart.SessionOptions()
    if accumulation_factor > 1:
        opts.enableGradientAccumulation = True
        opts.accumulationFactor = accumulation_factor
    opts.explicitRecomputation = True
    opts.enableExplicitMainLoops = True
    opts.useHostCopyOps = True
    # Let popdist handle distributed settings, such as:
    # opts.enableDistributedReplicatedGraphs
    # opts.globalReplicaOffset
    # opts.globalReplicationFactor
    popdist.popart.configureSessionOptions(opts)

    for tensor in ["weight", "optimizerState", "accumulator"]:
        userOption = tensor + "TensorLocationSettings"
        print(
            f"Setting RTS: {userOption}, num_total_replicas: {num_total_replicas} num_local_replicas: {num_local_replicas}"
        )
        locationSetting = getattr(opts, userOption)
        locationSetting.minElementsForOffChip = 0
        locationSetting.minElementsForReplicatedTensorSharding = num_total_replicas
        if tensor in args.tensors:
            locationSetting.location.replicatedTensorSharding = popart.ReplicatedTensorSharding.On
        if num_total_replicas > num_local_replicas:
            locationSetting.location.shardingDomain = popart.CommGroup(
                popart.CommGroupType.Consecutive, num_local_replicas)
        setattr(opts, userOption, locationSetting)

    if args.optim == "Adam":
        optimizer = popart.Adam(
            {
                "defaultLearningRate": (0.01, False),
                "defaultBeta1": (0.9, False),
                "defaultBeta2": (0.999, False),
                "defaultEps": (1e-06, False),
                "defaultWeightDecay": (0.1, False),
                "lossScaling": (10, False),
            },
            weight_decay_mode=popart.WeightDecayMode.Decay,
            mode=popart.AdamMode.LambNoBias)
    if args.optim == "SGD":
        optimizer = popart.ConstSGD(0.01)

    session = popart.TrainingSession(fnModel=proto,
                                     dataFlow=dataFlow,
                                     deviceInfo=deviceInfo,
                                     userOptions=opts,
                                     loss=loss,
                                     optimizer=optimizer)

    session.prepareDevice()

    session.weightsFromHost()

    anchors = session.initAnchorArrays()

    stepio = popart.PyStepIO(data, anchors)

    session.run(stepio)

    tmp_path = Path(args.tmpdir)
    tmp_path.mkdir(parents=True, exist_ok=True)
    file_path = str(tmp_path / args.filename)
    session.modelToHost(file_path)
    post_proto = onnx.load(file_path)
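
The popdist calls in this example can be isolated: popdist.popart.getDevice acquires the device for this instance's replicas, and popdist.popart.configureSessionOptions fills in the distributed replication options (enableDistributedReplicatedGraphs, globalReplicaOffset, globalReplicationFactor) so they do not have to be set by hand. A minimal sketch, assuming the process was launched by poprun:

import popart
import popdist
import popdist.popart

ipus_per_replica = 1  # hypothetical value, as in the example above

# Device selection and distributed options come from the poprun environment.
device_info = popdist.popart.getDevice(ipus_per_replica)

opts = popart.SessionOptions()
popdist.popart.configureSessionOptions(opts)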
Example #11
    def ipu_prog(num_replicas, gradient_accumulation):
        import logging
        import sys
        logging.basicConfig(stream=sys.stdout, level=logging.INFO)
        popdist_on = popdist.isPopdistEnvSet()

        num_global_replicas = (popdist.getNumTotalReplicas()
                               if popdist_on else num_replicas)
        num_instances = popdist.getNumInstances() if popdist_on else 1

        dataset_size = global_batch_size = 16
        micro_batch_size = int(global_batch_size / num_global_replicas /
                               gradient_accumulation)

        X = np.arange(1, dataset_size + 1, 1, dtype=float)
        Y = [0] * dataset_size
        ds = tf.data.Dataset.from_tensor_slices((X, Y))
        if popdist_on:
            ds = ds.shard(num_instances, index=popdist.getInstanceIndex())
        ds = ds.batch(micro_batch_size, drop_remainder=True)
        ds = ds.repeat()

        cfg = ipu.config.IPUConfig()
        if popdist_on:
            cfg = popdist.tensorflow.set_ipu_config(
                cfg,
                ipus_per_replica=popdist.getNumIpusPerReplica(),
                configure_device=True)
            hvd.init()
        else:
            cfg.auto_select_ipus = num_global_replicas
        cfg.configure_ipu_system()

        strategy = (popdist_strategy.PopDistStrategy()
                    if popdist_on else ipu.ipu_strategy.IPUStrategy())

        with strategy.scope():

            def get_model():
                input_layer = tf.keras.Input(shape=1)
                kernel_initializer = tf.keras.initializers.Constant(1)
                x = tf.keras.layers.Dense(
                    1, use_bias=False,
                    kernel_initializer=kernel_initializer)(input_layer)
                return tf.keras.Model(input_layer, x)

            model = get_model()
            model.set_gradient_accumulation_options(
                gradient_accumulation_steps_per_replica=gradient_accumulation)
            model.build(input_shape=(micro_batch_size, 1))

            if popdist_on:

                def gradient_normalizer(grads_and_vars):
                    return [(grad / gradient_accumulation, var)
                            for grad, var in grads_and_vars]
            else:

                def gradient_normalizer(grads_and_vars):
                    return [
                        (grad / num_global_replicas / gradient_accumulation,
                         var) for grad, var in grads_and_vars
                    ]

            optimizer = tf.keras.optimizers.SGD(
                learning_rate=1.0, gradient_transformers=[gradient_normalizer])

            loss_class = tf.keras.losses.MeanSquaredError
            loss_outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
            loss_class = wrap_loss_in_enqueuer(loss_class, loss_outfeed_queue)
            loss = loss_class()

            micro_batches_per_weight_update = num_global_replicas * gradient_accumulation
            steps_per_execution = dataset_size // (
                micro_batch_size * micro_batches_per_weight_update
            ) * micro_batches_per_weight_update

            model.compile(optimizer=optimizer,
                          loss=loss,
                          metrics=[tf.keras.losses.MSE],
                          steps_per_execution=steps_per_execution)

            callbacks = [
                OutFeedQueueCallback(queue=loss_outfeed_queue,
                                     name='average_loss')
            ]
            if num_instances > 1:
                callbacks += [AllReduceMetricsCallback()]
            callbacks += [LoggingCallback(1)]

            model.fit(ds,
                      steps_per_epoch=steps_per_execution,
                      callbacks=callbacks)

            return model.get_weights()[0][0][0]
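
The batch-size bookkeeping in ipu_prog is worth spelling out: the micro batch size is the global batch size divided by the total replica count and the gradient accumulation factor, and one weight update consumes num_global_replicas * gradient_accumulation micro batches. A minimal sketch of that arithmetic (the concrete values are hypothetical):

import popdist

global_batch_size = 16     # hypothetical, as in the example
gradient_accumulation = 2  # hypothetical

popdist_on = popdist.isPopdistEnvSet()
num_global_replicas = popdist.getNumTotalReplicas() if popdist_on else 1

micro_batch_size = global_batch_size // (num_global_replicas * gradient_accumulation)
micro_batches_per_weight_update = num_global_replicas * gradient_accumulation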
Example #12
    def get_dataset(dataset_name: str,
                    dataset_path: str,
                    split: str,
                    img_datatype: tf.dtypes.DType,
                    micro_batch_size: int,
                    shuffle: bool = False,
                    accelerator_side_preprocess: bool = True,
                    eight_bit_transfer: Optional[EightBitTransfer] = None,
                    apply_preprocessing: bool = True,
                    pipeline_num_parallel: int = 48,
                    seed: Optional[int] = None):

        logging.info(f'dataset_name = {dataset_name}')

        if popdist.getNumInstances() == 1:
            logging.info(f'Since the training is run in a single process, setting dataset pipeline threading '
                         f'and prefetching buffer size to tf.data.AUTOTUNE.')
            pipeline_num_parallel = prefetch_size = tf.data.AUTOTUNE
        else:
            prefetch_size = PREFETCH_BUFFER_SIZE
            logging.info(f'Setting number of threads for the dataset pipeline to {pipeline_num_parallel}, '
                         f'and the prefetching buffer size to {prefetch_size}.')

        ds, img_shape, dataset_size, num_classes = DataGenerator.get_dataset_from_name(
            ds_name=dataset_name, ds_path=dataset_path, split=split)

        preprocess_fn = None
        if apply_preprocessing:
            if dataset_name == 'cifar10':
                ds, preprocess_fn = DataTransformer.cifar_preprocess(
                    ds,
                    buffer_size=dataset_size,
                    img_type=img_datatype,
                    is_training=(split == 'train'),
                    accelerator_side_preprocess=accelerator_side_preprocess,
                    pipeline_num_parallel=pipeline_num_parallel,
                )
            elif dataset_name == 'imagenet':
                ds, preprocess_fn = DataTransformer.imagenet_preprocessing(
                    ds,
                    img_type=img_datatype,
                    is_training=(split == 'train'),
                    accelerator_side_preprocess=accelerator_side_preprocess,
                    pipeline_num_parallel=pipeline_num_parallel,
                    seed=seed
                )
                if shuffle:
                    # Shuffle the input files
                    ds = ds.shuffle(buffer_size=IMAGENET_SHUFFLE_BUFFER_SIZE)
            else:
                ds = DataTransformer.cache_shuffle(ds, buffer_size=dataset_size, shuffle=(split == 'train'))
                ds = DataTransformer.normalization(ds, img_type=img_datatype)
                preprocess_fn = None


            if eight_bit_transfer is not None:
                ds = ds.map(lambda x, y: (eight_bit_transfer.compress(x), y), num_parallel_calls=pipeline_num_parallel)


            ds = ds.batch(batch_size=micro_batch_size, drop_remainder=True)
            ds = ds.repeat().prefetch(prefetch_size)

            cpu_memory_usage = psutil.virtual_memory().percent

            if cpu_memory_usage > 100:
                logging.warning(f'cpu_memory_usage is {cpu_memory_usage} > 100% so your program is likely to crash')

        return ds, img_shape, dataset_size, num_classes, preprocess_fn
Example #13
    return args


def benchmark_throughput(dataloader, iteration=2):
    for _ in range(iteration):
        total_sample_size = 0
        start_time = time.perf_counter()
        for input_data, _ in tqdm(dataloader, total=len(dataloader)):
            total_sample_size += input_data.size()[0]
        elapsed_time = time.perf_counter() - start_time

        if popdist.isPopdistEnvSet():
            elapsed_time, total_sample_size = utils.synchronize_throughput_values(
                elapsed_time,
                total_sample_size,
            )

        iteration_throughput = total_sample_size / elapsed_time
        print(f"Throughput of the iteration:{iteration_throughput:0.1f} img/sec")


if __name__ == '__main__':
    args = get_args()
    opts = poptorch.Options()
    if popdist.isPopdistEnvSet():
        hvd.init()
        opts.Distributed.configureProcessId(popdist.getInstanceIndex(), popdist.getNumInstances())
    opts.randomSeed(0)
    dataloader = get_data(args, opts, train=True, async_dataloader=not(args.disable_async_loading))
    benchmark_throughput(dataloader)