Example #1
def validation_graph(model, opts):
    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=opts['replicas'] * opts['shards'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(
                        model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['replicas'] > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=1,
        number_of_replicas=opts['replicas'] * opts['shards'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"])
    ipu.utils.configure_ipu_system(ipu_options)

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver, None)
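
A minimal driver sketch for the function above. The checkpoint path and the GraphOps attribute names (session, init, ops, iterator, saver) are assumptions read off the order of the return statement; adjust them to the real train.GraphOps definition.

def run_validation(model, opts, ckpt_path):
    valid = validation_graph(model, opts)
    valid.session.run(valid.init)
    valid.saver.restore(valid.session, ckpt_path)   # load trained weights
    valid.session.run(valid.iterator.initializer)   # prime the IPU infeed
    # one call runs validation_batches_per_step batches on the device
    (accuracy,) = valid.session.run(valid.ops)
    print(f"validation accuracy: {accuracy:.2f}%")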
Example #2
def training_graph(model, opts, iterations_per_step=1):

    train_graph = tf.Graph()
    with train_graph.as_default():
        placeholders = dict()
        # "precision" is a string such as "16.32"; compare its first field —
        # comparing the list returned by split() to '16' is always False
        datatype = tf.float16 if opts["precision"].split(
            '.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver)
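
A hedged sketch of the outer loop this function is built for. The constant learning rate and the GraphOps attribute names are assumptions; the 'learning_rate' feed key comes from the placeholders dict created above.

def train_loop(model, opts, num_steps, learning_rate=1e-3):
    ops = training_graph(model, opts, iterations_per_step=100)
    ops.session.run(ops.init)
    ops.session.run(ops.iterator.initializer)
    for _ in range(num_steps):
        # each run executes iterations_per_step iterations on the IPU
        ops.session.run(ops.ops, feed_dict={
            ops.placeholders['learning_rate']: learning_rate})
        outputs = ops.session.run(ops.outfeed)  # drain the accumulated outfeed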
Example #3
def validation_graph(model, opts):
    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=False),
            feed_name='validation_feed',
            replication_factor=opts['replicas'] * opts['shards'])

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, image, label):
                    accuracy = validation_graph_builder(
                        model, image, label, opts)
                    return total_accuracy + (
                        tf.cast(accuracy, tf.float32) /
                        opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['replicas'] > 1:
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    return train.GraphOps(valid_graph, valid_sess, valid_init, [accuracy],
                          None, valid_iterator, None, valid_saver)
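
Unlike Example #1, this variant never builds an IPU options structure or calls configure_ipu_system, so the caller must configure the device before using the returned session. A minimal sketch with the TF1-era Graphcore utilities (newer SDK versions would use ipu.config.IPUConfig instead):

from tensorflow.python import ipu

# opts is the same dictionary later passed to validation_graph
cfg = ipu.utils.create_ipu_config()
cfg = ipu.utils.auto_select_ipus(cfg, opts['replicas'] * opts['shards'])
ipu.utils.configure_ipu_system(cfg)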
Example #4
def inference_run(exec_filename, ckpt_name, iteration, epoch, first_run, opts):
    """Run inference for multiple iterations and collect latency values."""
    logging.mlperf_logging(key="EVAL_START",
                           log_type="start",
                           metadata={"epoch_num": round(epoch)})
    engine_name = "my_engine"
    ctx = embedded_runtime.embedded_runtime_start(exec_filename, [],
                                                  engine_name,
                                                  timeout=1000)

    input_placeholder = tf.placeholder(
        tf.uint8,
        (opts['micro_batch_size'], opts['image_size'], opts['image_size'], 3))

    num_iters = opts['iterations']
    if opts['generated_data']:
        placeholders = [input_placeholder]
        # synthetic inputs; the pixel values are irrelevant for a timing run
        images = np.random.normal(size=(opts['micro_batch_size'],
                                        opts['image_size'], opts['image_size'],
                                        3)).astype(np.uint8)
        labels = None
    else:
        # note the trailing comma: the shape must be a tuple, not a bare int
        label_placeholder = tf.placeholder(tf.int32,
                                           (opts['micro_batch_size'],))
        placeholders = [input_placeholder, label_placeholder]

        with tf.Graph().as_default():
            inference_dataset = dataset.data(
                opts, is_training=False).map(lambda x: {'data_dict': x})
            images, labels = dataset_to_list(
                inference_dataset, num_iters * opts['micro_batch_size'])

    call_result = embedded_runtime.embedded_runtime_call(placeholders, ctx)

    ipu.config.reset_ipu_configuration()
    gc.collect()

    thread_queue = Queue()
    with tf.Session() as session:
        # do not include time of the first iteration in stats
        initial_feed_dict = prepare_feed_dict(placeholders, images, labels,
                                              opts['micro_batch_size'],
                                              opts['generated_data'], 0)
        session.run(call_result, initial_feed_dict)

        def runner(session, thread_idx):
            thread_channel = pvti.createTraceChannel(f"Thread {thread_idx}")
            latencies = []
            accuracies = []
            for iter_idx in range(num_iters):
                feed_dict = prepare_feed_dict(placeholders, images, labels,
                                              opts['micro_batch_size'],
                                              opts['generated_data'], iter_idx)
                with pvti.Tracepoint(thread_channel, f"Iteration {iter_idx}"):
                    start_iter = time.time()
                    _, predictions = session.run(call_result, feed_dict)
                    end_iter = time.time()
                latencies.append(end_iter - start_iter)
                if not opts['generated_data']:
                    expected = feed_dict[label_placeholder]
                    accuracy = np.mean(
                        np.equal(predictions, expected).astype(np.float32))
                    accuracies.append(accuracy)
            thread_queue.put((latencies, accuracies), timeout=10)

        thp = [
            Thread(target=runner, args=(session, thread_idx))
            for thread_idx in range(opts['num_inference_thread'])
        ]
        inference_start = time.time()
        for idx, _thread in enumerate(thp):
            _thread.start()
            print(f"Thread {idx} started")

        for idx, _thread in enumerate(thp):
            _thread.join()
            print(f"Thread {idx} joined")
        val_time = time.time() - inference_start

    latencies, accuracies = [], []
    while not thread_queue.empty():
        lat_acc = thread_queue.get()
        latencies.extend(lat_acc[0])
        accuracies.extend(lat_acc[1])

    if opts['generated_data']:
        total_accuracy = -1
    else:
        total_accuracy = sum(accuracies) / len(accuracies)
        total_accuracy *= 100

    # convert latencies to milliseconds
    latencies = [1000 * latency_s for latency_s in latencies]

    max_latency = max(latencies)
    mean_latency = np.mean(latencies)
    perc_99 = np.percentile(latencies, 99)
    perc_99_9 = np.percentile(latencies, 99.9)

    print(
        f"Latencies - avg: {mean_latency:8.4f}, 99th percentile: {perc_99:8.4f}, "
        f"99.9th percentile: {perc_99_9:8.4f}, max: {max_latency:8.4f}")

    valid_format = (
        "Validation top-1 accuracy [{name}] (iteration: {iteration:6d}, epoch: {epoch:6.2f}, "
        "img/sec: {img_per_sec:6.2f}, time: {val_time:8.6f}, "
        "latency (ms): {latency:8.4f}): {val_acc:6.3f}%")

    val_size = (num_iters * opts['num_inference_thread'] *
                opts['validation_total_batch_size'])

    stats = OrderedDict([
        ('name', ckpt_name),
        ('iteration', iteration),
        ('epoch', epoch),
        ('val_acc', total_accuracy),
        ('val_time', val_time),
        ('val_size', val_size),
        ('img_per_sec', val_size / val_time),
        ('latency', mean_latency),
    ])
    logging.print_to_file_and_screen(valid_format.format(**stats), opts)
    logging.write_to_csv(stats, first_run, False, opts)
    if opts['wandb'] and opts['distributed_worker_index'] == 0:
        logging.log_to_wandb(stats)
    logging.mlperf_logging(key="EVAL_STOP",
                           log_type="stop",
                           metadata={"epoch_num": round(epoch)})
    logging.mlperf_logging(key="EVAL_ACCURACY",
                           value=float(stats['val_acc']) / 100,
                           metadata={"epoch_num": round(epoch)})
    return stats
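
The example depends on a prepare_feed_dict helper that is not shown. A plausible implementation, inferred purely from the call sites above; the name and signature come from those calls, while the slicing logic is an assumption.

def prepare_feed_dict(placeholders, images, labels, micro_batch_size,
                      generated_data, iter_idx):
    if generated_data:
        # a single synthetic batch is reused for every iteration
        return {placeholders[0]: images}
    start = iter_idx * micro_batch_size
    stop = start + micro_batch_size
    return {placeholders[0]: images[start:stop],
            placeholders[1]: labels[start:stop]}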
Example #5
def create_poplar_exec(model, opts, poplar_exec_path):
    """Create graph and save it to the file."""
    valid_graph = tf.Graph()

    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        if opts['generated_data']:
            # create dummy dataset with images only
            dummy_image = np.zeros((opts['micro_batch_size'],
                                    opts['image_size'], opts['image_size'], 3),
                                   dtype=np.uint8)
            inference_dataset = tf.data.Dataset.from_tensors(
                {"image": dummy_image})
        else:
            # create dataset with images and labels
            inference_dataset = dataset.data(opts, is_training=False)
        inference_dataset = inference_dataset.map(lambda x: {'data_dict': x})

        inference_infeed_iterator = \
            ipu_infeed_queue.IPUInfeedQueue(inference_dataset,
                                            prefetch_depth=opts['prefetch_depth'])

        acc_queue = ipu_outfeed_queue.IPUOutfeedQueue()
        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    accuracy_enqueue = acc_queue.enqueue(accuracy)
                    return accuracy_enqueue

                accuracy = loops.repeat(
                    int(opts['validation_batches_per_step']), body, [],
                    inference_infeed_iterator)
                return accuracy

        filenames, _ = get_ckpt_filenames(opts)

        compile_op = application_compile_op.experimental_application_compile_op(
            comp_fn, output_path=poplar_exec_path, freeze_variables=True)

        outfeed = acc_queue.dequeue()
        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()

    with tf.Session(graph=valid_graph, config=tf.ConfigProto()) as sess:
        if len(filenames) == 1:
            print("Restoring from a snapshot: ", filenames[0])
            sess.run(inference_infeed_iterator.initializer)
            init = tf.global_variables_initializer()
            sess.run(init)
            valid_saver.restore(sess, filenames[0])
        else:
            print(
                "Warning: no restore point found - randomly initialising weights instead"
            )
            init = tf.global_variables_initializer()
            sess.run(init)

        path = sess.run(compile_op)
        print(f"Poplar executable: {path}")

    valid_graph.finalize()
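
Once saved, the executable can be loaded without rebuilding the graph, exactly as Example #4 does. A condensed sketch reusing the calls shown there; the module path and the placeholder/feed-dict construction are assumed to match that example.

from tensorflow.python.ipu import embedded_runtime

ctx = embedded_runtime.embedded_runtime_start(
    poplar_exec_path, [], "my_engine", timeout=1000)
# placeholders and feed_dict are built as in Example #4
call_result = embedded_runtime.embedded_runtime_call(placeholders, ctx)
with tf.Session() as session:
    accuracy = session.run(call_result, feed_dict)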
Example #6
def build_graph(bert_config,
                opts,
                iterations_per_step=1,
                is_training=True,
                feed_name=None):
    """Build the graph for training.

    Args:
        bert_config: configuration for the BERT model.
        opts: a dictionary containing all global options.
        iterations_per_step: number of iterations per step
        is_training (bool): if true return a graph with trainable variables.
        feed_name: name of the IPU infeed.

    Returns:
        a GraphOps containing a BERT graph and session prepared for inference or training.
    """
    train_graph = tf.Graph()
    with train_graph.as_default():

        placeholders = dict()
        placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                       shape=[])
        learning_rate = placeholders['learning_rate']

        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])

        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out", replication_factor=opts['replicas'])

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                bert_config,
                train_iterator,
                outfeed_queue,
                opts,
                learning_rate,
                iterations_per_step,
                is_training=is_training)

        outfeed = outfeed_queue.dequeue()

        bert_logging.print_trainable_variables(opts['logs_path'])

        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        model_and_optimiser_variables = tf.global_variables()

        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_ckpt'] else model_variables)

        # We store two savers: one for the standard training and another one for the best checkpoint
        savers = {
            "train_saver":
            tf.train.Saver(var_list=model_variables if opts['ckpt_model_only']
                           else model_and_optimiser_variables,
                           name='latest',
                           max_to_keep=5),
            "best_saver":
            tf.train.Saver(var_list=model_variables if opts['ckpt_model_only']
                           else model_and_optimiser_variables,
                           name='best',
                           max_to_keep=1)
        }

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate number of IPUs required for pretraining pipeline.
    num_embedding_ipu = {
        'two_ipus': 2,
        'same_ipu': 1,
        'same_as_hidden_layers': 0
    }[opts['embeddings_placement']]

    num_hidden_layer_stages = len(bert_config.hidden_layers_per_stage)
    num_ipus_required = opts['replicas'] * next_power_of_two(
        num_hidden_layer_stages + num_embedding_ipu)

    # Configure the IPU options.
    ipu_options = get_ipu_config(
        fp_exceptions=opts["fp_exceptions"],
        stochastic_rounding=opts['stochastic_rounding'],
        xla_recompute=opts["xla_recompute"],
        available_memory_proportion=opts['available_memory_proportion'],
        disable_graph_outlining=opts["disable_graph_outlining"],
        num_ipus_required=num_ipus_required,
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        partials_type=opts['partials_type'])
    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, savers, restore, tvars)
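
A sketch of one training step with the returned GraphOps. The attribute names follow the order of the return statement (the savers dict occupies the saver slot), and 'init_checkpoint' is a hypothetical opts key used only for illustration.

graph_ops = build_graph(bert_config, opts, iterations_per_step=100,
                        is_training=True, feed_name='bert')
graph_ops.session.run(graph_ops.init)
graph_ops.session.run(graph_ops.iterator.initializer)
if opts.get('init_checkpoint'):
    # restores model-only or model+optimiser variables, as selected above
    graph_ops.restore.restore(graph_ops.session, opts['init_checkpoint'])
graph_ops.session.run(graph_ops.ops, feed_dict={
    graph_ops.placeholders['learning_rate']: 1e-4})
losses = graph_ops.session.run(graph_ops.outfeed)
graph_ops.saver['train_saver'].save(graph_ops.session, 'ckpts/latest')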
Example #7
def training_graph(model, opts, iterations_per_step=1):
    train_graph = tf.Graph()
    sess_config = tf.ConfigProto()
    sess_target = None
    strategy = None

    if opts['distributed_cluster']:
        strategy, sess_target, sess_config = configure_distribution(
            opts, sess_config)

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        placeholders = dict()
        # "precision" is a string such as "16.32"; compare its first field —
        # comparing the list returned by split() to '16' is always False
        datatype = tf.float16 if opts["precision"].split(
            '.')[0] == '16' else tf.float32
        placeholders['learning_rate'] = tf.placeholder(datatype, shape=[])
        learning_rate = placeholders['learning_rate']

        # datasets must be defined outside the ipu device scope
        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=True),
            feed_name='training_feed',
            replication_factor=opts['replicas'])
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name="outfeed", replication_factor=opts['replicas'])

        with ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, model, opts, learning_rate,
                iterations_per_step)

        outfeed = outfeed_queue.dequeue()
        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [
                strategy.reduce(tf.distribute.ReduceOp.MEAN, v)
                for v in outfeed
            ]

        logging.print_trainable_variables(opts)

        train_saver = tf.train.Saver(max_to_keep=999999)
        with tf.device('cpu'):
            profile_report = gen_ipu_ops.ipu_event_trace()
        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=not opts["no_stochastic_rounding"],
        shards=opts["shards"],
        number_of_replicas=opts['replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        xla_recompute=opts["xla_recompute"],
        seed=opts["seed"],
        profile=opts['profile'],
        availableMemoryProportion=globalAMP)

    ipu.utils.configure_ipu_system(ipu_options)
    train_sess = tf.Session(graph=train_graph,
                            config=sess_config,
                            target=sess_target)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, profile_report)
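
When opts['profile'] is set, the ipu_event_trace op created above can be drained after a step. A brief sketch; the 'profile' attribute name on GraphOps is an assumption based on the return order.

ops = training_graph(model, opts, iterations_per_step=100)
ops.session.run(ops.init)
ops.session.run(ops.iterator.initializer)
ops.session.run(ops.ops,
                feed_dict={ops.placeholders['learning_rate']: 1e-3})
if opts['profile']:
    # returns the serialized IPU trace events collected so far
    raw_events = ops.session.run(ops.profile)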
Example #8
def validation_graph(model, opts):
    reconfigure = not opts.get('reuse_IPUs', False)
    if opts['use_popdist'] and reconfigure:
        hvd.init()

    valid_graph = tf.Graph()
    with valid_graph.as_default():
        # datasets must be defined outside the ipu device scope
        valid_dataset = dataset.data(
            opts, is_training=False).map(lambda x: {'data_dict': x})

        valid_iterator = ipu_infeed_queue.IPUInfeedQueue(
            valid_dataset, prefetch_depth=opts['prefetch_depth'])

        if opts['latency']:
            timestamp_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        with ipu_scope('/device:IPU:0'):

            def comp_fn():
                def body(total_accuracy, data_dict):
                    accuracy = validation_graph_builder(model, data_dict, opts)
                    if opts['latency']:
                        timestamp_enqueue = timestamp_queue.enqueue(
                            data_dict['timestamp'])
                        return (total_accuracy +
                                (tf.cast(accuracy, tf.float32) /
                                 opts["validation_batches_per_step"]),
                                timestamp_enqueue)
                    else:
                        return total_accuracy + (
                            tf.cast(accuracy, tf.float32) /
                            opts["validation_batches_per_step"])

                accuracy = loops.repeat(
                    int(opts["validation_batches_per_step"]), body,
                    [tf.constant(0, tf.float32)], valid_iterator)
                if opts['total_replicas'] * opts['shards'] > 1 and not opts.get(
                        'inference', False):
                    accuracy = cross_replica_ops.cross_replica_sum(
                        accuracy) / (opts['total_replicas'] * opts['shards'])
                return accuracy

            (accuracy, ) = xla.compile(comp_fn, [])

        accuracy = 100 * accuracy

        if opts['latency']:
            print(f'relative_timer start {relative_timer.get_start()}')
            timestamp = tf.cast(tf.timestamp() - relative_timer.get_start(),
                                tf.float32)
            latency_per_batch = tf.reshape(
                timestamp - timestamp_queue.dequeue(), [-1])
        else:
            latency_per_batch = None

        valid_saver = tf.train.Saver()

        ipu.utils.move_variable_initialization_to_cpu()
        valid_init = tf.global_variables_initializer()

        if opts['use_popdist']:
            broadcast_weights = []
            for var in tf.global_variables():
                broadcast_weights.append(
                    var.assign(hvd.broadcast(var, root_rank=0)))
            global_batch_size_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_global_batch_size = hvd.broadcast(global_batch_size_ph,
                                                        root_rank=0)
            num_files_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_num_files = hvd.broadcast(num_files_ph, root_rank=0)
            iteration_ph = tf.placeholder(dtype=tf.int32, shape=())
            broadcast_iteration = hvd.broadcast(iteration_ph, root_rank=0)
        else:
            broadcast_weights = None
            broadcast_global_batch_size, global_batch_size_ph = None, None
            broadcast_num_files, num_files_ph = None, None
            broadcast_iteration, iteration_ph = None, None

    globalAMP = None
    if opts["available_memory_proportion"] and len(
            opts["available_memory_proportion"]) == 1:
        globalAMP = opts["available_memory_proportion"][0]

    ipu_options = get_config(
        ipu_id=opts["select_ipu"],
        prng=False,  # disable Stochastic Rounding for validation
        shards=opts['shards'],
        number_of_replicas=opts['total_replicas'],
        max_cross_replica_buffer_size=opts["max_cross_replica_buffer_size"],
        fp_exceptions=opts["fp_exceptions"],
        half_partials=opts["enable_half_partials"],
        conv_dithering=opts["enable_conv_dithering"],
        enable_recomputation=opts["enable_recomputation"],
        seed=opts["seed"],
        availableMemoryProportion=globalAMP,
        stable_norm=opts["stable_norm"],
        compile_only=opts["compile_only"],
        internalExchangeOptimisationTarget=opts[
            "internal_exchange_optimisation_target"],
        num_io_tiles=opts["num_io_tiles"],
        number_of_distributed_batch_norm_replicas=opts.get("BN_span", 1),
        nanoo=not opts["saturate_on_overflow"],
    )

    if opts['use_popdist'] and reconfigure:
        ipu_options = popdist.tensorflow.set_ipu_config(ipu_options,
                                                        opts['shards'],
                                                        configure_device=False)

    if opts['on_demand'] and reconfigure:
        ipu_options.device_connection.enable_remote_buffers = True
        ipu_options.device_connection.type = ipu.utils.DeviceConnectionType.ON_DEMAND

    if reconfigure:
        ipu_options.configure_ipu_system()

    valid_sess = tf.Session(graph=valid_graph, config=tf.ConfigProto())

    ops = {
        'accuracy': accuracy,
        'broadcast_weights': broadcast_weights,
        'broadcast_global_batch_size': broadcast_global_batch_size,
        'broadcast_num_files': broadcast_num_files,
        'broadcast_iteration': broadcast_iteration,
        'latency_per_batch': latency_per_batch
    }

    placeholders = {
        'global_batch_size': global_batch_size_ph,
        'num_files': num_files_ph,
        'iteration': iteration_ph
    }

    valid_graph.finalize()

    return train.GraphOps(valid_graph, valid_sess, valid_init, ops,
                          placeholders, valid_iterator, None, valid_saver)
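
In a popdist run, each instance builds this graph and then synchronises with rank 0 through the broadcast ops before validating. A hedged sketch of that handshake, using the ops and placeholders dicts returned above; the GraphOps attribute names are assumed from the return order.

valid = validation_graph(model, opts)
valid.session.run(valid.init)
if opts['use_popdist']:
    # overwrite locally initialised weights with rank 0's copy
    valid.session.run(valid.ops['broadcast_weights'])
    iteration = valid.session.run(
        valid.ops['broadcast_iteration'],
        feed_dict={valid.placeholders['iteration']: 0})
valid.session.run(valid.iterator.initializer)
accuracy = valid.session.run(valid.ops['accuracy'])
print(f"top-1 accuracy: {accuracy:.2f}%")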