Example 1
def main():
    #    if tf.__version__.split('.')[0] != "1":
    #        raise Exception("Tensorflow version 1 required")

    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    examples = load_examples()

    model = create_model(examples.inputs, examples.targets)

    # encoding images for saving
    with tf.name_scope("encode_images"):
        display_fetches = {}
        for name, value in examples._asdict().items():
            if "path" in name:
                display_fetches[name] = value
            elif tf.is_numeric_tensor(value):
                display_fetches[name] = tf.map_fn(tf.image.encode_png,
                                                  deprocess(value),
                                                  dtype=tf.string,
                                                  name=name + "_pngs")
        for name, value in model._asdict().items():
            if tf.is_numeric_tensor(value) and "predict_" not in name:
                display_fetches[name] = tf.map_fn(tf.image.encode_png,
                                                  deprocess(value),
                                                  dtype=tf.string,
                                                  name=name + "_pngs")
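        # At this point every display fetch is either a path tensor or a batch
        # of PNG-encoded byte strings that can be written straight to disk.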

    # progress report for all losses
    with tf.name_scope("progress_summary"):
        progress_fetches = {}
        for name, value in model._asdict().items():
            if (not tf.is_numeric_tensor(value)
                    and "grads_and_vars" not in name and name != "train"):
                progress_fetches[name] = value

    # summaries for model: images, scalars, histograms
    for name, value in examples._asdict().items():
        if tf.is_numeric_tensor(value):
            with tf.name_scope(name + "_summary"):
                tf.summary.image(name, deprocess(value))
    for name, value in model._asdict().items():
        if tf.is_numeric_tensor(value):
            with tf.name_scope(name + "_summary"):
                if "predict_" in name:  # discriminators produce values in [0, 1]
                    tf.summary.image(
                        name,
                        tf.image.convert_image_dtype(value, dtype=tf.uint8))
                else:  # generators produce values in [-1, 1]
                    tf.summary.image(name, deprocess(value))
        elif "grads_and_vars" in name:
            for grad, var in value:
                tf.summary.histogram(var.op.name + "/gradients", grad)
        elif name != "train":
            tf.summary.scalar(name, value)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    logdir = a.output_dir if (a.trace_freq > 0 or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    with sv.managed_session() as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            saver.restore(sess, checkpoint)

        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images(results)
                for fileset in filesets:
                    print("evaluated image", fileset["name"])
                index_path = append_index(filesets)

            print("wrote index at %s" % index_path)

        if a.mode == "predict":
            # predicting
            # at most, process the test data once
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                fileset = save_predicted_images(results)
                for filename in fileset:
                    print("predicted image", filename)
            print("wrote predicted labels at %s" % a.output_dir)

        if a.mode == "train":
            # training
            start = time.time()

            for step in range(max_steps):

                def should(freq):
                    return freq > 0 and ((step + 1) % freq == 0
                                         or step == max_steps - 1)
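                # should(freq) is True every freq-th step ((step + 1) % freq == 0)
                # and on the final step; freq <= 0 disables that action entirely.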

                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                if should(a.progress_freq):
                    fetches["progress"] = progress_fetches

                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches,
                                   options=options,
                                   run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"],
                                                  results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"],
                                           step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(
                        run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] /
                                            examples.steps_per_epoch)
                    train_step = (results["global_step"] -
                                  1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print(
                        "progress  epoch %d  step %d  image/sec %0.1f  remaining %d min"
                        % (train_epoch, train_step, rate, remaining / 60))
                    for name, value in results["progress"].items():
                        print(name, value)

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess,
                               os.path.join(a.output_dir, "model"),
                               global_step=sv.global_step)

                if sv.should_stop():
                    break
Example 2
def main():
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode == "test":
        if a.checkpoint is None:
            raise Exception("checkpoint required for test mode")

        # load some options from the checkpoint
        options = {"which_direction", "ngf", "ndf", "lab_colorization"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            for key, val in json.loads(f.read()).items():
                if key in options:
                    print("loaded", key, "=", val)
                    setattr(a, key, val)
        # disable these features in test mode
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    if a.mode == "test":
        patch_h_cnt, padding_h = find_patch_and_padding(
            IMAGE_HEIGHT, CROP_SIZE)
        patch_w_cnt, padding_w = find_patch_and_padding(IMAGE_WIDTH, CROP_SIZE)
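        # find_patch_and_padding is defined elsewhere; presumably it returns
        # how many half-overlapping CROP_SIZE patches fit along one dimension
        # and the reflect padding needed for them to tile the image exactly.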

        paddings = [[0, 0], [padding_h, padding_h], [padding_w, padding_w],
                    [0, 0]]
        inputs_pad = tf.pad(examples.inputs, paddings, "REFLECT")
        targets_pad = tf.pad(examples.targets, paddings, "REFLECT")

        IMAGE_PADDING_HEIGHT = IMAGE_HEIGHT + 2 * padding_h
        IMAGE_PADDING_WIDTH = IMAGE_WIDTH + 2 * padding_w
        outputs = tf.zeros([1, IMAGE_PADDING_HEIGHT, IMAGE_PADDING_WIDTH, 1],
                           dtype=tf.float32)

        first = True
        # combine overlapping patches into full images
        for row in range(patch_h_cnt):
            for col in range(patch_w_cnt):
                row_index = int(row * CROP_SIZE / 2)
                col_index = int(col * CROP_SIZE / 2)
                if first:
                    with tf.variable_scope("create_model"):
                        model = create_model(
                            tf.slice(inputs_pad, [0, row_index, col_index, 0],
                                     [1, CROP_SIZE, CROP_SIZE, 1]),
                            tf.slice(targets_pad, [0, row_index, col_index, 0],
                                     [1, CROP_SIZE, CROP_SIZE, 1]))
                    first = False
                else:
                    with tf.variable_scope("create_model", reuse=True):
                        model = create_model(
                            tf.slice(inputs_pad, [0, row_index, col_index, 0],
                                     [1, CROP_SIZE, CROP_SIZE, 1]),
                            tf.slice(targets_pad, [0, row_index, col_index, 0],
                                     [1, CROP_SIZE, CROP_SIZE, 1]))
                paddings = [
                    [0, 0],
                    [row_index, IMAGE_PADDING_HEIGHT - CROP_SIZE - row_index],
                    [col_index, IMAGE_PADDING_WIDTH - CROP_SIZE - col_index],
                    [0, 0]
                ]
                outputs = outputs + tf.pad(model.outputs, paddings, "CONSTANT")

        CROP_HALF = int(CROP_SIZE / 2)
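        # The patches above were placed at a CROP_SIZE/2 stride, so each pixel
        # of `outputs` has been covered 1, 2, or 4 times: the four corner
        # blocks (o_1*) once, the border strips (o_2*) twice, and the interior
        # (o_4) four times. The weighted sum below divides each region by its
        # coverage count, turning the accumulated sum into an average.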
        o_11 = tf.pad(
            tf.slice(outputs, [0, 0, 0, 0], [1, CROP_HALF, CROP_HALF, 1]),
            [[0, 0], [0, IMAGE_PADDING_HEIGHT - CROP_HALF],
             [0, IMAGE_PADDING_WIDTH - CROP_HALF], [0, 0]], "CONSTANT")
        o_12 = tf.pad(
            tf.slice(outputs, [0, 0, IMAGE_PADDING_WIDTH - CROP_HALF, 0],
                     [1, CROP_HALF, CROP_HALF, 1]),
            [[0, 0], [0, IMAGE_PADDING_HEIGHT - CROP_HALF],
             [IMAGE_PADDING_WIDTH - CROP_HALF, 0], [0, 0]], "CONSTANT")
        o_13 = tf.pad(
            tf.slice(outputs, [0, IMAGE_PADDING_HEIGHT - CROP_HALF, 0, 0],
                     [1, CROP_HALF, CROP_HALF, 1]),
            [[0, 0], [IMAGE_PADDING_HEIGHT - CROP_HALF, 0],
             [0, IMAGE_PADDING_WIDTH - CROP_HALF], [0, 0]], "CONSTANT")
        o_14 = tf.pad(
            tf.slice(outputs, [
                0, IMAGE_PADDING_HEIGHT - CROP_HALF,
                IMAGE_PADDING_WIDTH - CROP_HALF, 0
            ], [1, CROP_HALF, CROP_HALF, 1]),
            [[0, 0], [IMAGE_PADDING_HEIGHT - CROP_HALF, 0],
             [IMAGE_PADDING_WIDTH - CROP_HALF, 0], [0, 0]], "CONSTANT")

        o_21 = tf.pad(
            tf.slice(outputs, [0, 0, CROP_HALF, 0],
                     [1, CROP_HALF, IMAGE_PADDING_WIDTH - 2 * CROP_HALF, 1]),
            [[0, 0], [0, IMAGE_PADDING_HEIGHT - CROP_HALF],
             [CROP_HALF, CROP_HALF], [0, 0]], "CONSTANT")
        o_22 = tf.pad(
            tf.slice(outputs, [0, CROP_HALF, 0, 0],
                     [1, IMAGE_PADDING_HEIGHT - 2 * CROP_HALF, CROP_HALF, 1]),
            [[0, 0], [CROP_HALF, CROP_HALF],
             [0, IMAGE_PADDING_WIDTH - CROP_HALF], [0, 0]], "CONSTANT")
        o_23 = tf.pad(
            tf.slice(outputs,
                     [0, IMAGE_PADDING_HEIGHT - CROP_HALF, CROP_HALF, 0],
                     [1, CROP_HALF, IMAGE_PADDING_WIDTH - 2 * CROP_HALF, 1]),
            [[0, 0], [IMAGE_PADDING_HEIGHT - CROP_HALF, 0],
             [CROP_HALF, CROP_HALF], [0, 0]], "CONSTANT")
        o_24 = tf.pad(
            tf.slice(outputs,
                     [0, CROP_HALF, IMAGE_PADDING_WIDTH - CROP_HALF, 0],
                     [1, IMAGE_PADDING_HEIGHT - 2 * CROP_HALF, CROP_HALF, 1]),
            [[0, 0], [CROP_HALF, CROP_HALF],
             [IMAGE_PADDING_WIDTH - CROP_HALF, 0], [0, 0]], "CONSTANT")
        o_4 = tf.pad(
            tf.slice(outputs, [0, CROP_HALF, CROP_HALF, 0], [
                1, IMAGE_PADDING_HEIGHT - 2 * CROP_HALF,
                IMAGE_PADDING_WIDTH - 2 * CROP_HALF, 1
            ]),
            [[0, 0], [CROP_HALF, CROP_HALF], [CROP_HALF, CROP_HALF], [0, 0]],
            "CONSTANT")

        outputs = o_11 + o_12 + o_13 + o_14 + (o_21 + o_22 + o_23 +
                                               o_24) / 2 + o_4 / 4
        outputs = tf.slice(outputs, [0, padding_h, padding_w, 0],
                           [1, IMAGE_HEIGHT, IMAGE_WIDTH, 1])
        outputs = deprocess(outputs)
    else:
        with tf.variable_scope("create_model"):
            model = create_model(examples.inputs, examples.targets)
        outputs = deprocess(model.outputs)

    inputs = deprocess(examples.inputs)
    targets = deprocess(examples.targets)

    def convert(image):
        return tf.image.convert_image_dtype(image,
                                            dtype=tf.uint8,
                                            saturate=True)

    # reverse any processing on images so they can be written to disk or displayed to user
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)

    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)

    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    with tf.name_scope("encode_images"):
        display_fetches = {
            "paths":
            examples.paths,
            "inputs":
            tf.map_fn(tf.image.encode_png,
                      converted_inputs,
                      dtype=tf.string,
                      name="input_pngs"),
            "targets":
            tf.map_fn(tf.image.encode_png,
                      converted_targets,
                      dtype=tf.string,
                      name="target_pngs"),
            "outputs":
            tf.map_fn(tf.image.encode_png,
                      converted_outputs,
                      dtype=tf.string,
                      name="output_pngs"),
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)

    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)

    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)

    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)
    if a.mode == "train":
        for grad, var in model.gen_grads_and_vars:
            tf.summary.histogram(var.op.name + "/gradients", grad)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    logdir = a.output_dir if (a.trace_freq > 0 or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    with sv.managed_session() as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            saver.restore(sess, checkpoint)

        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            start = time.time()
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images_test(results, step)
                for f in filesets:
                    print("evaluated image", f["name"])
                #index_path = append_index(filesets)
            #print("wrote index at", index_path)
            print("rate", (time.time() - start) / max_steps)
        else:
            # training
            start = time.time()

            for step in range(max_steps):

                def should(freq):
                    return freq > 0 and ((step + 1) % freq == 0
                                         or step == max_steps - 1)

                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                if should(a.progress_freq):
                    fetches["gen_loss_L1"] = model.gen_loss_L1

                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches,
                                   options=options,
                                   run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"],
                                                  results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"],
                                           step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(
                        run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] /
                                            examples.steps_per_epoch)
                    train_step = (results["global_step"] -
                                  1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print(
                        "progress  epoch %d  step %d  image/sec %0.1f  remaining %dm"
                        % (train_epoch, train_step, rate, remaining / 60))
                    print("gen_loss_L1", results["gen_loss_L1"])

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess,
                               os.path.join(a.output_dir, "model"),
                               global_step=sv.global_step)

                if sv.should_stop():
                    break
Example 3
    def benchmark_model(self,
                        warmup_runs,
                        bm_runs,
                        num_threads,
                        trace_filename=None):
        """Benchmark model."""
        if self.tensorrt:
            print('Using tensorrt ', self.tensorrt)
            self.build_and_save_model()
            graphdef = self.freeze_model()

        if num_threads > 0:
            print('num_threads for benchmarking: {}'.format(num_threads))
            sess_config = tf.ConfigProto(
                intra_op_parallelism_threads=num_threads,
                inter_op_parallelism_threads=1)
        else:
            sess_config = tf.ConfigProto()

        # 2 == rewriter_config_pb2.RewriterConfig.OFF: turn off dependency optimization.
        sess_config.graph_options.rewrite_options.dependency_optimization = 2
        if self.use_xla:
            sess_config.graph_options.optimizer_options.global_jit_level = (
                tf.OptimizerOptions.ON_2)

        with tf.Graph().as_default(), tf.Session(config=sess_config) as sess:
            inputs = tf.placeholder(tf.float32,
                                    name='input',
                                    shape=self.inputs_shape)
            output = self.build_model(inputs, is_training=False)

            img = np.random.uniform(size=self.inputs_shape)

            sess.run(tf.global_variables_initializer())
            if self.tensorrt:
                fetches = [inputs.name] + [i.name for i in output]
                goutput = self.convert_tr(graphdef, fetches)
                inputs, output = goutput[0], goutput[1:]

            if not self.use_xla:
                # Don't use tf.group because XLA removes the whole graph for tf.group.
                output = tf.group(*output)
            for i in range(warmup_runs):
                start_time = time.time()
                sess.run(output, feed_dict={inputs: img})
                print('Warm up: {} {:.4f}s'.format(i,
                                                   time.time() - start_time))
            print('Start benchmark runs total={}'.format(bm_runs))
            timev = []
            for i in range(bm_runs):
                if trace_filename and i == (bm_runs // 2):
                    run_options = tf.RunOptions()
                    run_options.trace_level = tf.RunOptions.FULL_TRACE
                    run_metadata = tf.RunMetadata()
                    sess.run(output,
                             feed_dict={inputs: img},
                             options=run_options,
                             run_metadata=run_metadata)
                    logging.info('Dumping trace to %s', trace_filename)
                    trace_dir = os.path.dirname(trace_filename)
                    if not tf.io.gfile.exists(trace_dir):
                        tf.io.gfile.makedirs(trace_dir)
                    with tf.io.gfile.GFile(trace_filename, 'w') as trace_file:
                        from tensorflow.python.client import timeline  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
                        trace = timeline.Timeline(
                            step_stats=run_metadata.step_stats)
                        trace_file.write(
                            trace.generate_chrome_trace_format(
                                show_memory=True))
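                    # The dumped JSON is a Chrome trace; open it via
                    # chrome://tracing (or Perfetto) to inspect per-op timings.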

                start_time = time.time()
                sess.run(output, feed_dict={inputs: img})
                timev.append(time.time() - start_time)

            timev.sort()
            # Drop the two fastest and two slowest runs to trim outliers.
            timev = timev[2:bm_runs - 2]
            print(
                '{} {}runs {}threads: mean {:.4f} std {:.4f} min {:.4f} max {:.4f}'
                .format(self.model_name, len(timev), num_threads,
                        np.mean(timev), np.std(timev), np.min(timev),
                        np.max(timev)))
            print('Images per second FPS = {:.1f}'.format(
                self.batch_size / float(np.mean(timev))))
Example 4
def train(train_phases, model, minibatch,
          sess, train_stat, ph_misc_stat, summary_writer):
    import time
    BYPASS = False  # set True to time data preparation only (skips all training)
    # saver = tf.train.Saver(var_list=tf.trainable_variables())
    saver = tf.train.Saver()

    epoch_ph_start = 0
    f1mic_best, e_best = 0, 0
    time_calc_f1, time_train, time_prepare = 0, 0, 0
    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
                            report_tensor_allocations_upon_oom=True)
    run_metadata = tf.RunMetadata()
    many_runs_timeline = []  # only used when TF timeline is enabled
    for ip, phase in enumerate(train_phases):
        # We normally only have a single phase of training (see README for defn of 'phase').
        # On the other hand, our implementation does support multi-phase training.
        # e.g., you can use smaller subgraphs during initial epochs and larger subgraphs
        #       when closer to convergence. -- This might speed up convergence.
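        # e.g., hypothetically:
        #   train_phases = [{'end': 20, ...small-subgraph sampler params...},
        #                   {'end': 50, ...larger-subgraph sampler params...}]
        # (only the 'end' key is read directly below; the remaining keys
        #  depend on the GraphSAINT sampler configuration format)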
        minibatch.set_sampler(phase)
        num_batches = minibatch.num_training_batches()
        #printf('START PHASE {:4d}'.format(ip),style='underline')
        for e in range(epoch_ph_start, int(phase['end'])):
            #printf('Epoch {:4d}'.format(e),style='bold')
            minibatch.shuffle()
            l_loss_tr, l_f1mic_tr, l_f1mac_tr, l_size_subg = [], [], [], []
            time_train_ep, time_prepare_ep = 0, 0
            while not minibatch.end():
                t0 = time.time()
                feed_dict, labels = minibatch.feed_dict(mode='train')
                t1 = time.time()
                if BYPASS:
                    continue
                if args_global.timeline:  # profile the code with Tensorflow Timeline
                    summary_train, _, loss_train, pred_train = sess.run(
                        [train_stat[0], model.opt_op, model.loss, model.preds],
                        feed_dict=feed_dict,
                        options=options, run_metadata=run_metadata)
                    fetched_timeline = timeline.Timeline(
                        run_metadata.step_stats)
                    chrome_trace = fetched_timeline.generate_chrome_trace_format(
                    )
                    many_runs_timeline.append(chrome_trace)
                else:
                    summary_train, _, loss_train, pred_train = sess.run(
                        [train_stat[0], model.opt_op, model.loss, model.preds],
                        feed_dict=feed_dict,
                        options=tf.RunOptions(report_tensor_allocations_upon_oom=True))
                t2 = time.time()
                time_train_ep += t2 - t1
                time_prepare_ep += t1 - t0
                if minibatch.batch_num % args_global.eval_train_every == 0:
                    f1_mic, f1_mac = calc_f1(labels, pred_train,
                                             model.sigmoid_loss)
                    l_loss_tr.append(loss_train)
                    l_f1mic_tr.append(f1_mic)
                    l_f1mac_tr.append(f1_mac)
                    l_size_subg.append(minibatch.size_subgraph)
            if BYPASS:
                continue
            time_train += time_train_ep
            time_prepare += time_prepare_ep
            if args_global.cpu_eval:  # Full batch evaluation using CPU
                # we have to start a new session so that CPU can perform full-batch eval.
                # current model params are communicated to the new session via tmp.chkpt
                saver.save(sess, './tmp.chkpt')
                with tf.device('/cpu:0'):
                    sess_cpu = tf.Session(config=tf.ConfigProto(
                        device_count={'GPU': 0}))
                    sess_cpu.run(tf.global_variables_initializer())
                    saver = tf.train.Saver()
                    saver.restore(sess_cpu, './tmp.chkpt')
                    sess_eval = sess_cpu
            else:
                sess_eval = sess
            loss_val, f1mic_val, f1mac_val, time_eval = evaluate_full_batch(
                sess_eval, model, minibatch, many_runs_timeline, mode='val')
            #printf(' TRAIN (Ep avg): loss = {:.4f}\tmic = {:.4f}\tmac = {:.4f}\ttrain time = {:.4f} sec'.format(f_mean(l_loss_tr),f_mean(l_f1mic_tr),f_mean(l_f1mac_tr),time_train_ep))
            #printf(' VALIDATION:     loss = {:.4f}\tmic = {:.4f}\tmac = {:.4f}'.format(loss_val,f1mic_val,f1mac_val),style='yellow')
            if f1mic_val > f1mic_best:
                f1mic_best, e_best = f1mic_val, e
                if not os.path.exists(args_global.dir_log + '/models'):
                    os.makedirs(args_global.dir_log + '/models')
                #print('  Saving models ...')
                savepath = saver.save(sess,
                                      '{}/models/saved_model_{}.chkpt'.format(
                                          args_global.dir_log,
                                          timestamp).replace(' ', '_'),
                                      write_meta_graph=False,
                                      write_state=False)

            if args_global.tensorboard:
                misc_stat = sess.run([train_stat[1]],feed_dict={\
                                        ph_misc_stat['val_f1_micro']: f1mic_val,
                                        ph_misc_stat['val_f1_macro']: f1mac_val,
                                        ph_misc_stat['train_f1_micro']: f_mean(l_f1mic_tr),
                                        ph_misc_stat['train_f1_macro']: f_mean(l_f1mac_tr),
                                        ph_misc_stat['time_per_epoch']: time_train_ep+time_prepare_ep,
                                        ph_misc_stat['size_subgraph']: f_mean(l_size_subg)})
                # tensorboard visualization
                summary_writer.add_summary(summary_train, e)
                summary_writer.add_summary(misc_stat[0], e)
        epoch_ph_start = int(phase['end'])
    printf("Optimization Finished!", style='yellow')
    timelines = TimeLiner()
    for tl in many_runs_timeline:
        timelines.update_timeline(tl)
    timelines.save('timeline.json')
    '''
    saver.restore(sess_eval, '{}/models/saved_model_{}.chkpt'.format(args_global.dir_log,timestamp).replace(' ','_'))
    loss_val, f1mic_val, f1mac_val, duration = evaluate_full_batch(sess_eval,model,minibatch,many_runs_timeline,mode='val')
    printf("Full validation (Epoch {:4d}): \n  F1_Micro = {:.4f}\tF1_Macro = {:.4f}".format(e_best,f1mic_val,f1mac_val),style='red')
    loss_test, f1mic_test, f1mac_test, duration = evaluate_full_batch(sess_eval,model,minibatch,many_runs_timeline,mode='test')
    printf("Full test stats: \n  F1_Micro = {:.4f}\tF1_Macro = {:.4f}".format(f1mic_test,f1mac_test),style='red')
    '''
    printf('Total training time: {:6.2f} sec'.format(time_train), style='red')
    #ret = {'loss_val_opt':loss_val,'f1mic_val_opt':f1mic_val,'f1mac_val_opt':f1mac_val,\
    #        'loss_test_opt':loss_test,'f1mic_test_opt':f1mic_test,'f1mac_test_opt':f1mac_test,\
    #        'epoch_best':e_best,
    #        'time_train': time_train}
    print("sampling_time (graphsaint)", minibatch.sampling_time)
    print("training_time:", time_train)
    return  # everything is logged by TF. no need to return anything
Example 5
def word2vec_run(config: Config.ConfigCls, identifier: str, download: bool, sim_chk: bool):
	#global data_index
	out_dir = os.path.join(config.OutDirGet(), identifier)
	config.OutDirSet(out_dir)
	Base.SaveDir(out_dir)

	data_index = 0
	LogWrite(config, """\n\n\n\nWord to Vec """ + identifier)
	LogWrite(config, """Example of building, training and visualizing a word2vec model.""")

	###########################################################################
	LogWrite(config,'\n\nStep 1: Download the data.')
	url = config.DownloadUrl()
	dir = config.DownloadDir()
	file = config.DownloadFile()
	size = config.DownloadSize()
	if download:
		filename = Base.maybe_download(dir, url, file, size)
	else:
		filename = os.path.join(dir, file)

	LogWrite(config,'Read the data into a list of strings.')
	vocabulary = Base.read_data(filename)
	LogWrite(config, 'Data size', len(vocabulary))



	###########################################################################
	LogWrite(config,'\n\nStep 2: Build the dictionary and replace rare words with UNK token.')
	#vocabulary_size = 50000
	vocabulary_size = config.SessionVocSizeGet()
	data, count, unused_dictionary, reverse_dictionary = DataSet.build_dataset(vocabulary, vocabulary_size)
	del vocabulary  # Hint to reduce memory.
	LogWrite(config,'Most common words (+UNK)', count[:5])
	LogWrite(config,'Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])



	###########################################################################
	LogWrite(config,'\n\nStep 3: Function to generate a training batch for the skip-gram model.')
	batch, labels, data_index = DataSet.generate_batch(data=data, data_index=data_index,batch_size=8, num_skips=2, skip_window=1)
	for i in range(8):
		LogWrite(config,batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0],
		      reverse_dictionary[labels[i, 0]])
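	# Each printed line is one skip-gram training pair (center word -> context
	# word); with num_skips=2 and skip_window=1 every center word contributes
	# two pairs drawn from its immediate neighbors.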



	###########################################################################
	LogWrite(config,'\n\nStep 4: Build and train a skip-gram model.')
	batch_size = config.ModelBatchSizeGet()      #128
	embedding_size = config.ModelEmbedSizeGet()  #128  # Dimension of the embedding vector.
	skip_window = config.ModelSkipWindowGet()    #1    # How many words to consider left and right.
	num_skips = config.ModelNumSkipsGet()        #2    # How many times to reuse an input to generate a label.
	num_sampled =config.ModelNumSampledGet()     #64   # Number of negative examples to sample.

	# We pick a random validation set to sample nearest neighbors. Here we limit
	# the validation samples to the words that have a low numeric ID, which by
	# construction are also the most frequent. These 3 variables are used only for
	# displaying model accuracy, they don't affect calculation.
	valid_size = config.ValidationSize()      #16   # Random set of words to evaluate similarity on.
	valid_window = config.ValidationWindow()  #100  # Only pick dev samples in the head of the distribution.
	valid_examples = np.random.choice(valid_window, valid_size, replace=False)

	graph = tf.Graph()

	with graph.as_default():

		# Input data.
		with tf.name_scope('inputs'):
			train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
			train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
			valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

		# Ops and variables pinned to the CPU because of missing GPU implementation
		with tf.device('/cpu:0'):
			# Look up embeddings for inputs.
			with tf.name_scope('embeddings'):
				embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
				embed = tf.nn.embedding_lookup(embeddings, train_inputs)

			# Construct the variables for the NCE loss
			with tf.name_scope('weights'):
				nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))

			with tf.name_scope('biases'):
				nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

		# Compute the average NCE loss for the batch.
		# tf.nce_loss automatically draws a new sample of the negative labels each
		# time we evaluate the loss.
		# Explanation of the meaning of NCE loss:
		#   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
		with tf.name_scope('loss'):
			loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights, biases=nce_biases, labels=train_labels,
					inputs=embed, num_sampled=num_sampled,	num_classes=vocabulary_size))
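		# tf.nn.nce_loss trains a logistic classifier to separate the true
		# (center, context) pair from num_sampled randomly drawn noise words,
		# which avoids computing a full softmax over the whole vocabulary.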

		# Add the loss value as a scalar to summary.
		tf.summary.scalar('loss', loss)

		# Construct the SGD optimizer using a learning rate of 1.0.
		with tf.name_scope('optimizer'):
			optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

		# Compute the cosine similarity between minibatch examples and all
		# embeddings.
		norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
		normalized_embeddings = embeddings / norm
		valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
		similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
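		# Since the rows of normalized_embeddings are unit vectors, this matmul
		# yields the cosine similarity between each validation word and every
		# word in the vocabulary.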

		# Merge all summaries.
		merged = tf.summary.merge_all()

		# Add variable initializer.
		init = tf.global_variables_initializer()

		# Create a saver.
		saver = tf.train.Saver()



	###########################################################################
	LogWrite(config,'\n\nStep 5: Begin training.')
	#num_steps = 100001

	with tf.Session(graph=graph) as session:
		# Open a writer to write summaries.
		writer = tf.summary.FileWriter(out_dir, session.graph)

		# We must initialize all variables before we use them.
		init.run()
		LogWrite(config,'Initialized')

		average_loss = 0
		num_steps = config.SessionStepsGet()
		for step in xrange(num_steps):
			batch_inputs, batch_labels, data_index = DataSet.generate_batch(data, data_index, batch_size, num_skips, skip_window)
			feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

			# Define metadata variable.
			run_metadata = tf.RunMetadata()

			# We perform one update step by evaluating the optimizer op (including it
			# in the list of returned values for session.run()
			# Also, evaluate the merged op to get all summaries from the returned
			# "summary" variable. Feed metadata variable to session for visualizing
			# the graph in TensorBoard.
			_, summary, loss_val = session.run([optimizer, merged, loss], feed_dict=feed_dict, run_metadata=run_metadata)
			average_loss += loss_val

			# Add returned summaries to writer in each step.
			writer.add_summary(summary, step)
			# Add metadata to visualize the graph for the last run.
			if step == (num_steps - 1):
				writer.add_run_metadata(run_metadata, 'step%d' % step)

			loss_step = config.RepLossStep()
			if step % loss_step == 0:  # originally: step % 2000 == 0
				if step > 0:
					average_loss /= loss_step  #2000
				# The average loss is an estimate of the loss over the last 2000 batches.
				LogWrite(config,'Average loss at step ', step, ': ', average_loss)
				average_loss = 0

			# Note that this is expensive (~20% slowdown if computed every 500 steps)
			if sim_chk:
				sim_eval_step = config.RepSimStep()
				if step % sim_eval_step == 0:  # originally: step % 10000 == 0
					sim = similarity.eval()
					for i in xrange(valid_size):
						valid_word = reverse_dictionary[valid_examples[i]]
						top_k = 8  # number of nearest neighbors
						nearest = (-sim[i, :]).argsort()[1:top_k + 1]
						log_str = 'Nearest to %s:' % valid_word
						for k in xrange(top_k):
							close_word = reverse_dictionary[nearest[k]]
							log_str = '%s %s,' % (log_str, close_word)
						LogWrite(config,log_str)
		final_embeddings = normalized_embeddings.eval()

		# Write corresponding labels for the embeddings.
		meta_file = config.OutMetaFile()  # e.g. out_dir + '/metadata.tsv'
		with open(meta_file, 'w') as f:
			for i in xrange(vocabulary_size):
				f.write(reverse_dictionary[i] + '\n')

		# Save the model for checkpoints.
		model_file = config.OutModelFile()  # e.g. os.path.join(out_dir, 'model.ckpt')
		saver.save(session, model_file)

		# Create a configuration for visualizing embeddings with the labels in TensorBoard.
		vis_config = projector.ProjectorConfig()
		embedding_conf = vis_config.embeddings.add()
		embedding_conf.tensor_name = embeddings.name
		embedding_conf.metadata_path = meta_file     #os.path.join(out_dir, 'metadata.tsv')
		projector.visualize_embeddings(writer, vis_config)

	writer.close()






	###########################################################################
	LogWrite(config,'\n\nStep 6: Visualize the embeddings.')
	# pylint: disable=missing-docstring
	# Function to draw visualization of distance between embeddings.
	# def plot_with_labels(low_dim_embs, labels, filename):
	# 	assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
	# 	plt.figure(figsize=(18, 18))  # in inches
	# 	for i, label in enumerate(labels):
	# 		x, y = low_dim_embs[i, :]
	# 		plt.scatter(x, y)
	# 		plt.annotate(
	# 			label,
	# 			xy=(x, y),
	# 			xytext=(5, 2),
	# 			textcoords='offset points',
	# 			ha='right',
	# 			va='bottom')
	#
	# 	plt.savefig(filename)

	# try:
	# 	# pylint: disable=g-import-not-at-top
	# 	from sklearn.manifold import TSNE
	# 	import matplotlib.pyplot as plt
	#
	# 	tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
	# 	plot_only = 500
	# 	low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
	# 	labels = [reverse_dictionary[i] for i in xrange(plot_only)]
	# 	Plotter.plot_with_labels(low_dim_embs, labels, os.path.join(gettempdir(), 'tsne.png'))
	#
	# except ImportError as ex:
	# 	LogWrite(config,'Please install sklearn, matplotlib, and scipy to show embeddings.')
	# 	LogWrite(config,ex)
	plot_file = config.OutPlotFile()  # e.g. 'tsne.png'
	Plotter.PlotGraph(final_embeddings, reverse_dictionary, plot_file)
Example 6
def main():
    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode == "test" or a.mode == "export":
        if a.checkpoint is None:
            raise Exception("checkpoint required for test or export mode")

        # load some options from the checkpoint
        options = {"which_direction", "ngf", "ndf", "lab_colorization"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            for key, val in json.loads(f.read()).items():
                if key in options:
                    print("loaded", key, "=", val)
                    setattr(a, key, val)
        # disable these features in test mode
        a.scale_size = CROP_SIZE
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    if a.mode == "export":
        # export the generator to a meta graph that can be imported later for standalone generation
        if a.lab_colorization:
            raise Exception("export not supported for lab_colorization")

        input = tf.placeholder(tf.string, shape=[1])
        input_data = tf.decode_base64(input[0])
        input_image = tf.image.decode_png(input_data)

        # remove alpha channel if present
        input_image = tf.cond(tf.equal(tf.shape(input_image)[2], 4), lambda: input_image[:,:,:3], lambda: input_image)
        # convert grayscale to RGB
        input_image = tf.cond(tf.equal(tf.shape(input_image)[2], 1), lambda: tf.image.grayscale_to_rgb(input_image), lambda: input_image)

        input_image = tf.image.convert_image_dtype(input_image, dtype=tf.float32)
        input_image.set_shape([CROP_SIZE, CROP_SIZE, 3])
        batch_input = tf.expand_dims(input_image, axis=0)
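        # The exported graph therefore maps a single base64-encoded PNG string
        # to a [1, CROP_SIZE, CROP_SIZE, 3] float batch fed to the generator.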

        with tf.variable_scope("generator"):
            batch_output = deprocess(create_generator(preprocess(batch_input), 3))

        output_image = tf.image.convert_image_dtype(batch_output, dtype=tf.uint8)[0]
        if a.output_filetype == "png":
            output_data = tf.image.encode_png(output_image)
        elif a.output_filetype == "jpeg":
            output_data = tf.image.encode_jpeg(output_image, quality=80)
        else:
            raise Exception("invalid filetype")
        output = tf.convert_to_tensor([tf.encode_base64(output_data)])

        key = tf.placeholder(tf.string, shape=[1])
        inputs = {
            "key": key.name,
            "input": input.name
        }
        tf.add_to_collection("inputs", json.dumps(inputs))
        outputs = {
            "key":  tf.identity(key).name,
            "output": output.name,
        }
        tf.add_to_collection("outputs", json.dumps(outputs))

        init_op = tf.global_variables_initializer()
        restore_saver = tf.train.Saver()
        export_saver = tf.train.Saver()
        
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        
        with tf.Session(config=config) as sess:
            sess.run(init_op)
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            restore_saver.restore(sess, checkpoint)
            print("exporting model")
            export_saver.export_meta_graph(filename=os.path.join(a.output_dir, "export.meta"))
            export_saver.save(sess, os.path.join(a.output_dir, "export"), write_meta_graph=False)

        return

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    model = create_model(examples.inputs, examples.targets)

    # undo colorization splitting on images that we use for display/output
    if a.lab_colorization:
        if a.which_direction == "AtoB":
            # inputs is brightness, this will be handled fine as a grayscale image
            # need to augment targets and outputs with brightness
            targets = augment(examples.targets, examples.inputs)
            outputs = augment(model.outputs, examples.inputs)
            # inputs can be deprocessed normally and handled as if they are single channel
            # grayscale images
            inputs = deprocess(examples.inputs)
        elif a.which_direction == "BtoA":
            # inputs will be color channels only, get brightness from targets
            inputs = augment(examples.inputs, examples.targets)
            targets = deprocess(examples.targets)
            outputs = deprocess(model.outputs)
        else:
            raise Exception("invalid direction")
    else:
        inputs = deprocess(examples.inputs)
        targets = deprocess(examples.targets)
        outputs = deprocess(model.outputs)

    def convert(image):
        if a.aspect_ratio != 1.0:
            # upscale to correct aspect ratio
            size = [CROP_SIZE, int(round(CROP_SIZE * a.aspect_ratio))]
            image = tf.image.resize_images(image, size=size, method=tf.image.ResizeMethod.BICUBIC)

        return tf.image.convert_image_dtype(image, dtype=tf.uint8, saturate=True)

    # reverse any processing on images so they can be written to disk or displayed to user
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)

    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)

    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    with tf.name_scope("encode_images"):
        display_fetches = {
            "paths": examples.paths,
            "inputs": tf.map_fn(tf.image.encode_png, converted_inputs, dtype=tf.string, name="input_pngs"),
            "targets": tf.map_fn(tf.image.encode_png, converted_targets, dtype=tf.string, name="target_pngs"),
            "outputs": tf.map_fn(tf.image.encode_png, converted_outputs, dtype=tf.string, name="output_pngs"),
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)

    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)

    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)

    with tf.name_scope("predict_real_summary"):
        tf.summary.image("predict_real", tf.image.convert_image_dtype(model.predict_real, dtype=tf.uint8))

    with tf.name_scope("predict_fake_summary"):
        tf.summary.image("predict_fake", tf.image.convert_image_dtype(model.predict_fake, dtype=tf.uint8))

    tf.summary.scalar("discriminator_loss", model.discrim_loss)
    tf.summary.scalar("generator_loss_GAN", model.gen_loss_GAN)
    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)
    tf.summary.scalar("generator_loss_L1_1", model.gen_loss_L1_1)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    for grad, var in model.discrim_grads_and_vars + model.gen_grads_and_vars:
        tf.summary.histogram(var.op.name + "/gradients", grad)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    logdir = a.output_dir if (a.trace_freq > 0 or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    #os.environ["CUDA_VISIBLE_DEVICES"] = "2"
    with sv.managed_session(config=config) as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            saver.restore(sess, checkpoint)

        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            start = time.time()
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images(results)
                for f in filesets:
                    print("evaluated image", f["name"])
                index_path = append_index(filesets)
            print("wrote index at", index_path)
            print("rate", (time.time() - start) / max_steps)
        else:
            # training
            start = time.time()

            for step in range(max_steps):
                def should(freq):
                    return freq > 0 and ((step + 1) % freq == 0 or step == max_steps - 1)

                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                if should(a.progress_freq):
                    fetches["discrim_loss"] = model.discrim_loss
                    fetches["gen_loss_GAN"] = model.gen_loss_GAN
                    fetches["gen_loss_L1"] = model.gen_loss_L1
                    fetches["gen_loss_L1_1"]= model.gen_loss_L1_1

                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches, options=options, run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"], results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"], step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] / examples.steps_per_epoch)
                    train_step = (results["global_step"] - 1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print("progress  epoch %d  step %d  image/sec %0.1f  remaining %dm" % (train_epoch, train_step, rate, remaining / 60))
                    print("discrim_loss", results["discrim_loss"])
                    print("gen_loss_GAN", results["gen_loss_GAN"])
                    print("gen_loss_L1", results["gen_loss_L1"])
                    print("gen_loss_L1_1", results["gen_loss_L1_1"])

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess, os.path.join(a.output_dir, "model"), global_step=sv.global_step)

                if sv.should_stop():
                    break
Example 7
    def benchmark(self, ckpt_dir, outer_steps=100, inner_steps=1000):
        """Run repeatedly on dummy data to benchmark inference."""
        # Turn off Grappler optimizations.
        options = {"disable_meta_optimizer": True}
        tf.config.optimizer.set_experimental_options(options)

        # Create the model outside the loop body.
        hparams = registry.hparams(self.hparams_set)
        hparams_lib.add_problem_hparams(hparams, self.problem_name)
        model_cls = registry.model(self.model_name)
        model = model_cls(hparams, tf.estimator.ModeKeys.EVAL)

        # Run only the model body (no data pipeline) on device.
        feature_shape = [
            hparams.batch_size, 3 * self.image_size * self.image_size
        ]
        features = {"targets": tf.zeros(feature_shape, dtype=tf.int32)}

        # Build the model once here so its variables are created; this initial
        # graph exists only for variable creation and is never actually run.
        with tf.variable_scope(self.model_name) as vso:
            transformed_features = model.bottom(features)
            with tf.variable_scope("body") as vsi:
                body_out = model.body(transformed_features)
            logits = model.top(body_out, features)
            model.loss(logits, features)

        def call_model(features):
            with tf.variable_scope(vso, reuse=tf.AUTO_REUSE):
                transformed_features = model.bottom(features)
                with tf.variable_scope(vsi, reuse=tf.AUTO_REUSE):
                    body_out = model.body(transformed_features)
                logits = model.top(body_out, features)
                return model.loss(logits, features)

        # Run the function body in a loop to amortize session overhead.
        loop_index = tf.zeros([], dtype=tf.int32)
        initial_loss = (tf.zeros([]), tf.zeros([]))

        def loop_cond(idx, _):
            return tf.less(idx, tf.constant(inner_steps, dtype=tf.int32))

        def loop_body(idx, _):
            return idx + 1, call_model(features)

        benchmark_op = tf.while_loop(loop_cond,
                                     loop_body, [loop_index, initial_loss],
                                     parallel_iterations=1,
                                     back_prop=False)
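        # parallel_iterations=1 makes the while_loop run its body sequentially,
        # so a single sess.run call amortizes session/dispatch overhead across
        # inner_steps model evaluations (as the comment above intends).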

        session_config = tf.ConfigProto(gpu_options=tf.GPUOptions(
            allow_growth=False, per_process_gpu_memory_fraction=0.95))
        run_metadata = tf.RunMetadata()
        with tf.Session(config=session_config) as sess:
            self.restore_model(sess, ckpt_dir)
            tps = []
            for idx in range(outer_steps):
                start_time = time.time()
                sess.run(benchmark_op, run_metadata=run_metadata)
                elapsed_time = time.time() - start_time
                tps.append(inner_steps * hparams.batch_size *
                           (3 * self.image_size * self.image_size) /
                           elapsed_time)
                logging.info("Iteration %d processed %f TPS.", idx, tps[-1])
            # Skip the first iteration, where all the setup and allocation happens.
            tps = np.asarray(tps[1:])
            logging.info("Mean/Std/Max/Min throughput = %f / %f / %f / %f",
                         np.mean(tps), np.std(tps), tps.max(), tps.min())
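
The trick above is worth isolating: tf.while_loop lets a single Session.run() execute inner_steps iterations, so the per-call launch overhead is amortized away. Below is a minimal sketch of the same pattern under stated assumptions; the helper name benchmark_op_in_loop and the dummy op are illustrative, not taken from the example.

import time
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

def benchmark_op_in_loop(make_op, inner_steps=1000):
    """Runs make_op() inner_steps times inside one Session.run()."""
    i0 = tf.zeros([], dtype=tf.int32)
    cond = lambda i, _: i < inner_steps
    body = lambda i, _: (i + 1, make_op())
    loop = tf.while_loop(cond, body, [i0, tf.zeros([])],
                         parallel_iterations=1, back_prop=False)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        start = time.time()
        sess.run(loop)
        # iterations per second, with session overhead amortized
        return inner_steps / (time.time() - start)

print(benchmark_op_in_loop(lambda: tf.reduce_sum(tf.random_normal([256, 256]))))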
Example n. 8
    def _train_step(self,
                    learning_rate,
                    cliprange,
                    obs,
                    returns,
                    masks,
                    actions,
                    values,
                    neglogpacs,
                    update,
                    writer,
                    states=None,
                    cliprange_vf=None):
        """
        Training of PPO2 Algorithm

        :param learning_rate: (float) learning rate
        :param cliprange: (float) Clipping factor
        :param obs: (np.ndarray) The current observation of the environment
        :param returns: (np.ndarray) the rewards
        :param masks: (np.ndarray) The last masks for done episodes (used in recurent policies)
        :param actions: (np.ndarray) the actions
        :param values: (np.ndarray) the values
        :param neglogpacs: (np.ndarray) Negative Log-likelihood probability of Actions
        :param update: (int) the current step iteration
        :param writer: (TensorFlow Summary.writer) the writer for tensorboard
        :param states: (np.ndarray) For recurrent policies, the internal state of the recurrent model
        :return: policy gradient loss, value function loss, policy entropy,
                approximation of kl divergence, updated clipping range, training update operation
        :param cliprange_vf: (float) Clipping factor for the value function
        """
        advs = returns - values
        advs = (advs - advs.mean()) / (advs.std() + 1e-8)
        td_map = {
            self.train_model.obs_ph: obs,
            self.action_ph: actions,
            self.advs_ph: advs,
            self.rewards_ph: returns,
            self.learning_rate_ph: learning_rate,
            self.clip_range_ph: cliprange,
            self.old_neglog_pac_ph: neglogpacs,
            self.old_vpred_ph: values
        }
        if states is not None:
            td_map[self.train_model.states_ph] = states
            td_map[self.train_model.dones_ph] = masks

        if cliprange_vf is not None and cliprange_vf >= 0:
            td_map[self.clip_range_vf_ph] = cliprange_vf

        if states is None:
            update_fac = max(
                self.n_batch // self.nminibatches // self.noptepochs, 1)
        else:
            update_fac = max(
                self.n_batch // self.nminibatches // self.noptepochs //
                self.n_steps, 1)

        if writer is not None:
            # run the training step with summaries; every 10 updates also record run metadata (memory usage, compute time, ...)
            if self.full_tensorboard_log and (1 + update) % 10 == 0:
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run(
                    [
                        self.summary, self.pg_loss, self.vf_loss, self.entropy,
                        self.approxkl, self.clipfrac, self._train
                    ],
                    td_map,
                    options=run_options,
                    run_metadata=run_metadata)
                writer.add_run_metadata(run_metadata,
                                        'step%d' % (update * update_fac))
            else:
                summary, policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run(
                    [
                        self.summary, self.pg_loss, self.vf_loss, self.entropy,
                        self.approxkl, self.clipfrac, self._train
                    ], td_map)
            writer.add_summary(summary, (update * update_fac))
        else:
            policy_loss, value_loss, policy_entropy, approxkl, clipfrac, _ = self.sess.run(
                [
                    self.pg_loss, self.vf_loss, self.entropy, self.approxkl,
                    self.clipfrac, self._train
                ], td_map)

        return policy_loss, value_loss, policy_entropy, approxkl, clipfrac
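
The first two lines of _train_step are the standard PPO advantage normalization: advantages are the returns minus the value estimates, shifted to zero mean and scaled to unit standard deviation, with a small epsilon guarding against division by zero. A standalone NumPy sketch (names illustrative):

import numpy as np

def normalize_advantages(returns, values, eps=1e-8):
    """Zero-mean, unit-variance advantages; eps guards against a zero std."""
    advs = returns - values
    return (advs - advs.mean()) / (advs.std() + eps)

# Example: constant advantages would divide by zero without eps.
print(normalize_advantages(np.array([1.0, 2.0, 3.0]),
                           np.array([0.5, 1.5, 2.5])))  # -> [0. 0. 0.]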
Example n. 9
def train(args, data, show_loss, show_topk):
    n_user, n_item, n_entity, n_relation = data[0], data[1], data[2], data[3]
    train_data, eval_data, test_data = data[4], data[5], data[6]
    adj_entity, adj_relation = data[7], data[8]

    model = KGCN(args, n_user, n_entity, n_relation, adj_entity, adj_relation)

    # top-K evaluation settings
    user_list, train_record, test_record, item_set, k_list = topk_settings(
        show_topk, train_data, test_data, n_item)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # monitor the usage of memory while training the model
        profiler = model_analyzer.Profiler(graph=sess.graph)
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        # tensor-board
        writer = tf.summary.FileWriter('../data/' + args.dataset + '/logs',
                                       tf.get_default_graph())

        for step in range(args.n_epochs):
            # training
            t = time.time()
            np.random.shuffle(train_data)
            start = 0
            i = 0
            # skip the last incomplete minibatch if its size < batch size
            while start + args.batch_size <= train_data.shape[0]:
                _, loss = model.train(
                    sess,
                    get_feed_dict(model, train_data, start,
                                  start + args.batch_size), run_options,
                    run_metadata)
                # add the data into tfprofiler
                profiler.add_step(step=step, run_meta=run_metadata)
                if i == 0:
                    writer.add_run_metadata(run_metadata, 'step %d' % step)
                i += 1
                start += args.batch_size
                if show_loss:
                    print(start, loss)

            # CTR evaluation
            train_auc, train_f1 = ctr_eval(sess, model, train_data,
                                           args.batch_size)
            eval_auc, eval_f1 = ctr_eval(sess, model, eval_data,
                                         args.batch_size)
            test_auc, test_f1 = ctr_eval(sess, model, test_data,
                                         args.batch_size)

            # values = ps.virtual_memory()
            # used_memory = values.used / (1024.0 ** 3)
            train_time = time.time() - t

            print(
                'epoch %d   training time: %.5f    train auc: %.4f  f1: %.4f    eval auc: %.4f  f1: %.4f    test auc: %.4f  f1: %.4f'
                % (step, train_time, train_auc, train_f1, eval_auc, eval_f1,
                   test_auc, test_f1))

        # Report the model's parameter / memory usage
        profile_scope_opt_builder = option_builder.ProfileOptionBuilder(
            option_builder.ProfileOptionBuilder.trainable_variables_parameter(
            ))
        # The displayed field is 'params', i.e. the parameter count
        profile_scope_opt_builder.select(['params'])
        # Sort the displayed results by the number of params
        profile_scope_opt_builder.order_by('params')
        # Display as a scope view
        profiler.profile_name_scope(profile_scope_opt_builder.build())

        # ------------------------------------
        # The most time-consuming ops
        profile_op_opt_builder = option_builder.ProfileOptionBuilder()

        # Displayed fields: op execution time and the number of nodes using
        # each op. Note: an op's execution time is the sum of the execution
        # times of all nodes that use that op.
        profile_op_opt_builder.select(['micros', 'occurrence'])
        # Sort the displayed results by op execution time
        profile_op_opt_builder.order_by('micros')
        # Limit the displayed results to the top entries (max depth 6)
        profile_op_opt_builder.with_max_depth(6)

        # Display as an op view
        profiler.profile_operations(profile_op_opt_builder.build())

        # ------------------------------------
        writer.close()
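
The ProfileOptionBuilder API shown above supports other views as well. As a companion sketch (not part of this training script), the same profiler machinery can estimate the total floating-point operations in the default graph:

import tensorflow.compat.v1 as tf
from tensorflow.python.profiler import option_builder

# Assumes a graph has already been built in the default graph.
flops = tf.profiler.profile(
    tf.get_default_graph(),
    options=option_builder.ProfileOptionBuilder.float_operation())
print('total float ops: %d' % flops.total_float_ops)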
Example n. 10
test_writer = tf.summary.FileWriter(log_dir + '/test')

# Run the initializer for all variables
tf.global_variables_initializer().run()

#### How the merged summaries are used:

for i in range(max_steps):
    if i % 10 == 0:  # record summaries and accuracy on the test set
        summary, acc = sess.run([merged, accuracy], feed_dict=feed_dict(False))
        test_writer.add_summary(summary, i)
        print('Accuracy at step %s: %s' % (i, acc))
    else:  # record summaries on the training set
        if i % 100 == 99:  # record execution stats
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
            summary, _ = sess.run([merged, train_step],
                                  feed_dict=feed_dict(True),
                                  options=run_options,
                                  run_metadata=run_metadata)
            train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
            train_writer.add_summary(summary, i)
            print('Adding run metadata for', i)
        else:  # record a summary
            summary, _ = sess.run([merged, train_step], feed_dict=feed_dict(True))
            train_writer.add_summary(summary, i)
train_writer.close()
test_writer.close()

# Quantities to be displayed as scalars:
#
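
The fragment above assumes that merged, train_writer, sess, feed_dict, log_dir and max_steps were defined earlier. A minimal sketch of the missing setup (illustrative, not recovered from the original source):

merged = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(log_dir + '/train', sess.graph)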
Example n. 11
def train(sv,
          sess,
          data,
          max_steps,
          display_fetches,
          display_fetches_test,
          dataTest,
          saver,
          loss,
          output_dir=a.output_dir):
    sess.run(data.iterator.initializer)
    try:
        # training
        start_time = time.time()

        for step in range(max_steps):
            options = None
            run_metadata = None
            if helpers.should(a.trace_freq, max_steps, step):
                options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()

            fetches = {
                "train": loss.trainOp,
                "global_step": sv.global_step,
            }

            if helpers.should(a.progress_freq, max_steps, step) or step <= 1:
                fetches["loss_value"] = loss.lossValue

            if helpers.should(a.summary_freq, max_steps, step):
                fetches["summary"] = sv.summary_op

            fetches["display"] = display_fetches
            try:
                currentLrValue = a.lr
                if a.checkpoint is None and step < 500:
                    # ramp up linearly to a.lr over the first 500 steps so the
                    # noisy initial gradients don't have too much impact
                    currentLrValue = step * 0.002 * a.lr

                results = sess.run(fetches,
                                   feed_dict={loss.lr: currentLrValue},
                                   options=options,
                                   run_metadata=run_metadata)
            except tf.errors.OutOfRangeError:
                print(
                    "training failed with an OutOfRangeError, probably a problem with the iterator"
                )
                continue

            global_step = results["global_step"]

            #helpers.saveInputs(a.output_dir, results["display"], step)

            if helpers.should(a.summary_freq, max_steps, step):
                sv.summary_writer.add_summary(results["summary"], global_step)

            if helpers.should(a.trace_freq, max_steps, step):
                print("recording trace")
                sv.summary_writer.add_run_metadata(run_metadata,
                                                   "step_%d" % global_step)

            if helpers.should(a.progress_freq, max_steps, step):
                # global_step will have the correct step count if we resume from a checkpoint
                train_epoch = math.ceil(global_step / data.stepsPerEpoch)
                train_step = global_step - (train_epoch -
                                            1) * data.stepsPerEpoch
                imagesPerSecond = global_step * a.batch_size / (time.time() -
                                                                start_time)
                remainingMinutes = ((max_steps - global_step) *
                                    a.batch_size) / (imagesPerSecond * 60)
                print("progress  epoch %d  step %d  image/sec %0.1f" %
                      (train_epoch, global_step, imagesPerSecond))
                print("Remaining %0.1f minutes" % (remainingMinutes))
                print("loss_value", results["loss_value"])

            if helpers.should(a.save_freq, max_steps, step):
                print("saving model")
                try:
                    saver.save(sess,
                               os.path.join(output_dir, "model"),
                               global_step=sv.global_step)
                except Exception as e:
                    print(
                        "Didn't manage to save the model (training continues): "
                        + str(e))

            if helpers.should(a.test_freq, max_steps,
                              step) or global_step == 1:
                outputTestDir = os.path.join(a.output_dir, str(global_step))
                try:
                    test(sess, dataTest, max_steps, display_fetches_test,
                         outputTestDir)
                except Exception as e:
                    print(
                        "Didn't manage to run the periodic test (training continues): "
                        + str(e))

            if sv.should_stop():
                break
    finally:
        saver.save(sess,
                   os.path.join(output_dir, "model"),
                   global_step=sv.global_step
                   )  # does the saver still save everything here?
        sess.run(data.iterator.initializer)
        outputTestDir = os.path.join(a.output_dir, "final")
        test(sess, dataTest, max_steps, display_fetches_test, outputTestDir)
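
The learning-rate ramp in the snippet above is a plain linear warm-up: the rate grows from 0 to a.lr over the first 500 steps, then holds. The same schedule as a standalone sketch (names illustrative):

def warmup_lr(step, base_lr, warmup_steps=500):
    """Linearly ramps from 0 to base_lr over warmup_steps, then holds."""
    if step < warmup_steps:
        return base_lr * step / warmup_steps
    return base_lr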
Example n. 12
 def run(self, sess):
     import time
     # restore from checkpoint
     if self.restore and os.path.exists(os.path.join(self.train_dir, 'checkpoint')):
         latest_ckpt = tf.train.latest_checkpoint(self.train_dir, 'checkpoint')
         self.saver_ckpt.restore(sess, latest_ckpt)
     # otherwise, initialize from start
     else:
         initializers = (tf.initializers.global_variables(),
             tf.initializers.local_variables())
         sess.run(initializers)
     # restore pre-trained model
     if self.pretrain_dir:
         latest_ckpt = tf.train.latest_checkpoint(self.pretrain_dir, 'checkpoint')
         self.saver_pt.restore(sess, latest_ckpt)
     # profiler
     # profile_offset = -1
     profile_offset = 100 + self.log_frequency // 2
     profile_step = 10000
     builder = tf.profiler.ProfileOptionBuilder
     profiler = tf.profiler.Profiler(sess.graph)
     # initialization
     self.log_last = time.time()
     ckpt_last = time.time()
     # dataset generator
     global_step = tf.train.global_step(sess, self.global_step)
     data_gen = self.data.gen_main(global_step)
     # run training session
     while True:
         # global step
         global_step = tf.train.global_step(sess, self.global_step)
         if global_step >= self.max_steps:
             eprint('Training finished at step={}'.format(global_step))
             break
         # run session
         if global_step % profile_step == profile_offset:
             # profiling every few steps
             options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
             run_meta = tf.RunMetadata()
             self.run_sess(sess, global_step, data_gen, options, run_meta)
             profiler.add_step(global_step, run_meta)
             # profile the parameters
             if global_step == profile_offset:
                 ofile = os.path.join(self.train_dir, 'parameters.log')
                 profiler.profile_name_scope(
                     builder(builder.trainable_variables_parameter())
                     .with_file_output(ofile).build())
             # profile the timing of model operations
             ofile = os.path.join(self.train_dir,
                 'time_and_memory_{:0>7}.log'.format(global_step))
             profiler.profile_operations(builder(builder.time_and_memory())
                 .with_file_output(ofile).build())
             # generate a timeline
             timeline = os.path.join(self.train_dir, 'timeline')
             profiler.profile_graph(builder(builder.time_and_memory())
                 .with_step(global_step).with_timeline_output(timeline).build())
         else:
             self.run_sess(sess, global_step, data_gen)
         # save checkpoints periodically or when training finished
         if self.ckpt_period > 0:
             time_current = time.time()
             if time_current - ckpt_last >= self.ckpt_period or global_step + 1 >= self.max_steps:
                 ckpt_last = time_current
                 self.saver_ckpt.save(sess, os.path.join(self.train_dir, 'model.ckpt'),
                     global_step, 'checkpoint')
         # save model every few steps
         if self.save_steps > 0 and global_step % self.save_steps == 0:
             self.saver.save(sess, os.path.join(self.train_dir,
                 'model_{:0>7}'.format(global_step)),
                 write_meta_graph=False, write_state=False)
     # auto detect problems and generate advice
     ALL_ADVICE = {
         'ExpensiveOperationChecker': {},
         'AcceleratorUtilizationChecker': {},
         'JobChecker': {},
         'OperationChecker': {}
     }
     profiler.advise(ALL_ADVICE)
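
profile_graph(...).with_timeline_output(...) above writes timeline files through the profiler. Another common route is converting the collected RunMetadata directly into a Chrome trace; a minimal sketch (the actual sess.run call is elided):

import tensorflow.compat.v1 as tf
from tensorflow.python.client import timeline

run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
# sess.run(fetches, options=run_options, run_metadata=run_metadata)
trace = timeline.Timeline(step_stats=run_metadata.step_stats)
with open('timeline.json', 'w') as f:
    # open the resulting file in chrome://tracing
    f.write(trace.generate_chrome_trace_format())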
Example n. 13
def train(flags):
    """Training entry point."""
    log_dir = flags.log_dir
    flags.pretrained_model_dir = log_dir
    log_dir = os.path.join(log_dir, 'train')
    flags.eval_interval_secs = 0
    with tf.Graph().as_default():
        global_step = tf.Variable(0,
                                  trainable=False,
                                  name='global_step',
                                  dtype=tf.int64)
        global_step_confidence = tf.Variable(0,
                                             trainable=False,
                                             name='global_step_confidence',
                                             dtype=tf.int64)

        model = build_model(flags)
        images_query_pl, labels_query_pl, \
        images_support_pl, labels_support_pl = \
          build_episode_placeholder(flags)

        # Augments the input.
        if flags.dataset == 'cifar10' or flags.dataset == 'cifar100':
            images_query_pl_aug = data_loader.augment_cifar(images_query_pl,
                                                            is_training=True)
            images_support_pl_aug = data_loader.augment_cifar(
                images_support_pl, is_training=True)
        elif flags.dataset == 'tinyimagenet':
            images_query_pl_aug = data_loader.augment_tinyimagenet(
                images_query_pl, is_training=True)
            images_support_pl_aug = data_loader.augment_tinyimagenet(
                images_support_pl, is_training=True)
        else:
            raise ValueError('Unsupported dataset: %s' % flags.dataset)

        logits, logits_z = build_proto_train_graph(
            images_query=images_query_pl_aug,
            images_support=images_support_pl_aug,
            flags=flags,
            is_training=True,
            model=model)
        # Losses and optimizer
        ## Classification loss
        loss_classification = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=logits,
                labels=tf.one_hot(labels_query_pl, flags.num_classes_train)))

        # Confidence loss
        _, top_k_indices = tf.nn.top_k(logits, k=1)
        pred = tf.squeeze(top_k_indices)
        incorrect_mask = tf.math.logical_not(
            tf.math.equal(pred, labels_query_pl))
        incorrect_logits_z = tf.boolean_mask(logits_z, incorrect_mask)
        incorrect_labels_z = tf.boolean_mask(labels_query_pl, incorrect_mask)
        signal_variance = tf.math.reduce_sum(tf.cast(incorrect_mask, tf.int32))
        loss_variance_incorrect = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                logits=incorrect_logits_z,
                labels=tf.one_hot(incorrect_labels_z,
                                  flags.num_classes_train)))
        loss_variance_zero = 0.0
        loss_confidence = tf.cond(tf.greater(signal_variance, 0),
                                  lambda: loss_variance_incorrect,
                                  lambda: loss_variance_zero)

        regu_losses = tf.losses.get_regularization_losses()
        loss = tf.add_n([loss_classification] + regu_losses)

        # Learning rate
        if flags.lr_anneal == 'const':
            learning_rate = flags.init_learning_rate
        elif flags.lr_anneal == 'pwc':
            learning_rate = get_pwc_learning_rate(global_step, flags)
        elif flags.lr_anneal == 'exp':
            lr_decay_step = flags.number_of_steps // flags.n_lr_decay
            learning_rate = tf.train.exponential_decay(
                flags.init_learning_rate,
                global_step,
                lr_decay_step,
                1.0 / flags.lr_decay_rate,
                staircase=True)
        else:
            raise Exception('Not implemented')

        # Optimizer
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=0.9)
        optimizer_confidence = tf.train.MomentumOptimizer(
            learning_rate=learning_rate, momentum=0.9)

        train_op = contrib_slim.learning.create_train_op(
            total_loss=loss,
            optimizer=optimizer,
            global_step=global_step,
            clip_gradient_norm=flags.clip_gradient_norm)
        variable_variance = []
        for v in tf.trainable_variables():
            if 'fc_variance' in v.name:
                variable_variance.append(v)
        train_op_confidence = contrib_slim.learning.create_train_op(
            total_loss=loss_confidence,
            optimizer=optimizer_confidence,
            global_step=global_step_confidence,
            clip_gradient_norm=flags.clip_gradient_norm,
            variables_to_train=variable_variance)

        tf.summary.scalar('loss', loss)
        tf.summary.scalar('loss_classification', loss_classification)
        tf.summary.scalar('loss_variance', loss_confidence)
        tf.summary.scalar('regu_loss', tf.add_n(regu_losses))
        tf.summary.scalar('learning_rate', learning_rate)
        # Merges all summaries except for pretrain
        summary = tf.summary.merge(
            tf.get_collection('summaries', scope='(?!pretrain).*'))

        # Gets datasets
        few_shot_data_train, test_dataset, train_dataset = get_train_datasets(
            flags)
        # Defines session and logging
        summary_writer_train = tf.summary.FileWriter(log_dir, flush_secs=1)
        saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
        print(saver.saver_def.filename_tensor_name)
        print(saver.saver_def.restore_op_name)
        # pylint: disable=unused-variable
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        supervisor = tf.train.Supervisor(
            logdir=log_dir,
            init_feed_dict=None,
            summary_op=None,
            init_op=tf.global_variables_initializer(),
            summary_writer=summary_writer_train,
            saver=saver,
            global_step=global_step,
            save_summaries_secs=flags.save_summaries_secs,
            save_model_secs=0)

        with supervisor.managed_session() as sess:
            checkpoint_step = sess.run(global_step)
            if checkpoint_step > 0:
                checkpoint_step += 1
            eval_interval_steps = flags.eval_interval_steps
            for step in range(checkpoint_step, flags.number_of_steps):
                # Computes the classification loss using a batch of data.
                t_batch = time.time()
                images_query, labels_query,\
                images_support, labels_support = \
                  few_shot_data_train.next_few_shot_batch(
                      query_batch_size_per_task=flags.train_batch_size,
                      num_classes_per_task=flags.num_classes_train,
                      num_supports_per_class=flags.num_shots_train,
                      num_tasks=flags.num_tasks_per_batch)
                dt_batch = time.time() - t_batch

                feed_dict = {
                    images_query_pl: images_query.astype(dtype=np.float32),
                    labels_query_pl: labels_query,
                    images_support_pl: images_support.astype(dtype=np.float32),
                    labels_support_pl: labels_support
                }

                t_train = time.time()
                loss, loss_confidence = sess.run(
                    [train_op, train_op_confidence], feed_dict=feed_dict)
                dt_train = time.time() - t_train

                if step % 100 == 0:
                    summary_str = sess.run(summary, feed_dict=feed_dict)
                    summary_writer_train.add_summary(summary_str, step)
                    summary_writer_train.flush()
                    logging.info(
                        'step %d, loss : %.4g, dt: %.3gs, dt_batch: %.3gs',
                        step, loss, dt_train, dt_batch)

                if float(step) / flags.number_of_steps > 0.5:
                    eval_interval_steps = flags.eval_interval_fine_steps

                if eval_interval_steps > 0 and step % eval_interval_steps == 0:
                    saver.save(sess,
                               os.path.join(log_dir, 'model'),
                               global_step=step)
                    eval(flags=flags,
                         train_dataset=train_dataset,
                         test_dataset=test_dataset)

                if float(
                        step
                ) > 0.5 * flags.number_of_steps + flags.number_of_steps_to_early_stop:
                    break
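
The loss_confidence construction above uses tf.cond to fall back to a zero loss when no prediction is incorrect, because reducing over an empty boolean mask would yield NaN. The same guard as a reusable sketch (names illustrative):

import tensorflow.compat.v1 as tf

def masked_mean_or_zero(values, mask):
    """Mean of values[mask], or 0.0 when the mask selects nothing."""
    count = tf.reduce_sum(tf.cast(mask, tf.int32))
    return tf.cond(tf.greater(count, 0),
                   lambda: tf.reduce_mean(tf.boolean_mask(values, mask)),
                   lambda: tf.constant(0.0))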
Example n. 14
def main():
    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode == "test" or a.mode == "export":
        if a.checkpoint is None:
            raise Exception("checkpoint required for test mode")

        # load some options from the checkpoint
        options = {"which_direction", "ngf", "ndf", "lab_colorization"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            for key, val in json.loads(f.read()).items():
                if key in options:
                    print("loaded", key, "=", val)
                    setattr(a, key, val)
        # disable these features in test mode
        a.scale_size = CROP_SIZE
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    model = create_model(examples.inputs, examples.targets)

    # undo colorization splitting on images that we use for display/output
    if a.lab_colorization:
        if a.which_direction == "AtoB":
            # inputs is brightness, this will be handled fine as a grayscale image
            # need to augment targets and outputs with brightness
            targets = augment(examples.targets, examples.inputs)
            outputs = augment(model.outputs, examples.inputs)
            # inputs can be deprocessed normally and handled as if they are single channel
            # grayscale images
            inputs = deprocess(examples.inputs)
        elif a.which_direction == "BtoA":
            # inputs will be color channels only, get brightness from targets
            inputs = augment(examples.inputs, examples.targets)
            targets = deprocess(examples.targets)
            outputs = deprocess(model.outputs)
        else:
            raise Exception("invalid direction")
    else:
        inputs = deprocess(examples.inputs)
        targets = deprocess(examples.targets)
        outputs = deprocess(model.outputs)

    def convert(image):
        if a.aspect_ratio != 1.0:
            # upscale to correct aspect ratio
            size = [CROP_SIZE, int(round(CROP_SIZE * a.aspect_ratio))]
            image = tf.image.resize_images(
                image, size=size, method=tf.image.ResizeMethod.BICUBIC)

        return tf.image.convert_image_dtype(image,
                                            dtype=tf.uint8,
                                            saturate=True)

    # reverse any processing on images so they can be written to disk or displayed to user
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)

    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)

    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    with tf.name_scope("encode_images"):
        display_fetches = {
            "paths":
            examples.paths,
            "inputs":
            tf.map_fn(tf.image.encode_png,
                      converted_inputs,
                      dtype=tf.string,
                      name="input_pngs"),
            "targets":
            tf.map_fn(tf.image.encode_png,
                      converted_targets,
                      dtype=tf.string,
                      name="target_pngs"),
            "outputs":
            tf.map_fn(tf.image.encode_png,
                      converted_outputs,
                      dtype=tf.string,
                      name="output_pngs"),
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)

    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)

    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)

    with tf.name_scope("predict_real_summary"):
        tf.summary.image(
            "predict_real",
            tf.image.convert_image_dtype(model.predict_real, dtype=tf.uint8))

    with tf.name_scope("predict_fake_summary"):
        tf.summary.image(
            "predict_fake",
            tf.image.convert_image_dtype(model.predict_fake, dtype=tf.uint8))

    tf.summary.scalar("discriminator_loss", model.discrim_loss)
    tf.summary.scalar("generator_loss_GAN", model.gen_loss_GAN)
    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    for grad, var in model.discrim_grads_and_vars + model.gen_grads_and_vars:
        tf.summary.histogram(var.op.name + "/gradients", grad)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    logdir = a.output_dir if (a.trace_freq > 0 or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)
    with sv.managed_session() as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("#############################")
            print(
                "loading model from checkpoint to continue training or run the test phase"
            )
            print("#############################")
            try:
                checkpoint = tf.train.latest_checkpoint(a.checkpoint)
                saver.restore(sess, checkpoint)
            except Exception:
                print("loading was unsuccessful; training will start from scratch")
                print("#############################")

        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            start = time.time()
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images(results)
                for f in filesets:
                    print("evaluated image", f["name"])
                index_path = append_index(filesets)
            print("wrote index at", index_path)
            print("rate", (time.time() - start) / max_steps)
        else:
            # training
            start = time.time()

            discrim_loss_pre = 10000
            gan_loss_pre = 10000
            patience_counter = 0

            for step in range(max_steps):

                def should(freq):
                    return freq > 0 and ((step + 1) % freq == 0
                                         or step == max_steps - 1)

                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                if should(a.progress_freq):
                    fetches["discrim_loss"] = model.discrim_loss
                    fetches["gen_loss_GAN"] = model.gen_loss_GAN
                    fetches["gen_loss_L1"] = model.gen_loss_L1

                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches,
                                   options=options,
                                   run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"],
                                                  results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"],
                                           step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(
                        run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] /
                                            examples.steps_per_epoch)
                    train_step = (results["global_step"] -
                                  1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print(
                        "progress  epoch %d  step %d  image/sec %0.1f  remaining %dm"
                        % (train_epoch, train_step, rate, remaining / 60))
                    print("discrim_loss", results["discrim_loss"])
                    print("gen_loss_GAN", results["gen_loss_GAN"])
                    print("gen_loss_L1", results["gen_loss_L1"])

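                    # count one "patience" strike when the discriminator keeps
                    # improving while the generator's GAN loss does not
                    # (note that patience_counter is never reset)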
                    if discrim_loss_pre >= results["discrim_loss"]:
                        if gan_loss_pre <= results["gen_loss_GAN"]:
                            patience_counter = patience_counter + 1

                    discrim_loss_pre = results["discrim_loss"]
                    gan_loss_pre = results["gen_loss_GAN"]

                    if patience_counter >= float(a.patience_epochs):
                        print("###################")
                        print("early stop, disc is winning")
                        print(
                            "progress  epoch %d  step %d  image/sec %0.1f  remaining %dm"
                            % (train_epoch, train_step, rate, remaining / 60))
                        print("saving model")
                        saver.save(sess,
                                   os.path.join(a.output_dir, "model"),
                                   global_step=sv.global_step)
                        break

                    if results["gen_loss_L1"] < float(a.desired_l1_loss):
                        print("###################")
                        print("Reached desired error")
                        print(
                            "progress  epoch %d  step %d  image/sec %0.1f  remaining %dm"
                            % (train_epoch, train_step, rate, remaining / 60))
                        print("saving model")
                        saver.save(sess,
                                   os.path.join(a.output_dir, "model"),
                                   global_step=sv.global_step)
                        break

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess,
                               os.path.join(a.output_dir, "model"),
                               global_step=sv.global_step)

                if sv.should_stop():
                    break
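
The encode_images scope in the example above batches PNG encoding with tf.map_fn, because tf.image.encode_png operates on a single image. The pattern in isolation (the shape is illustrative):

import tensorflow.compat.v1 as tf

images = tf.placeholder(tf.uint8, [None, 256, 256, 3])  # batch of uint8 images
# dtype=tf.string is required because the output dtype differs from the input.
pngs = tf.map_fn(tf.image.encode_png, images, dtype=tf.string)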
Example n. 15
    def log_to_tensorboard(self, test_filename, psnr, save_meta_data=True):

        if self.enable_log is False:
            return

        # TODO: saving run metadata is currently disabled
        save_meta_data = False

        org_image = util.set_image_alignment(
            util.load_image(test_filename, print_console=False), self.scale)

        if len(org_image.shape
               ) >= 3 and org_image.shape[2] == 3 and self.channels == 1:
            org_image = util.convert_rgb_to_y(org_image)

        input_image = util.resize_image_by_pil(
            org_image,
            1.0 / self.scale,
            resampling_method=self.resampling_method)
        bicubic_image = util.resize_image_by_pil(
            input_image, self.scale, resampling_method=self.resampling_method)

        if self.max_value != 255.0:
            input_image = np.multiply(input_image, self.max_value /
                                      255.0)  # type: np.ndarray
            bicubic_image = np.multiply(bicubic_image, self.max_value /
                                        255.0)  # type: np.ndarray
            org_image = np.multiply(org_image,
                                    self.max_value / 255.0)  # type: np.ndarray

        feed_dict = {
            self.x:
            input_image.reshape([
                1, input_image.shape[0], input_image.shape[1],
                input_image.shape[2]
            ]),
            self.x2:
            bicubic_image.reshape([
                1, bicubic_image.shape[0], bicubic_image.shape[1],
                bicubic_image.shape[2]
            ]),
            self.y:
            org_image.reshape([
                1, org_image.shape[0], org_image.shape[1], org_image.shape[2]
            ]),
            self.dropout:
            1.0,
            self.is_training:
            0
        }

        if save_meta_data:
            # profiler = tf.profiler.Profile(self.sess.graph)

            run_metadata = tf.RunMetadata()
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            summary_str, _ = self.sess.run([self.summary_op, self.loss],
                                           feed_dict=feed_dict,
                                           options=run_options,
                                           run_metadata=run_metadata)
            self.test_writer.add_run_metadata(run_metadata,
                                              "step%d" % self.epochs_completed)

            filename = self.checkpoint_dir + "/" + self.name + "_metadata.txt"
            with open(filename, "w") as out:
                out.write(str(run_metadata))

            # filename = self.checkpoint_dir + "/" + self.name + "_memory.txt"
            # tf.profiler.write_op_log(
            # 	tf.get_default_graph(),
            # 	log_dir=self.checkpoint_dir,
            # 	#op_log=op_log,
            # 	run_meta=run_metadata)

            tf.contrib.tfprof.model_analyzer.print_model_analysis(
                tf.get_default_graph(),
                run_meta=run_metadata,
                tfprof_options=tf.contrib.tfprof.model_analyzer.
                PRINT_ALL_TIMING_MEMORY)

        else:
            summary_str, _ = self.sess.run([self.summary_op, self.loss],
                                           feed_dict=feed_dict)

        self.train_writer.add_summary(summary_str, self.epochs_completed)
        if not self.use_l1_loss:
            if self.training_step != 0:
                util.log_scalar_value(
                    self.train_writer, 'PSNR',
                    self.training_psnr_sum / self.training_step,
                    self.epochs_completed)
        util.log_scalar_value(self.train_writer, 'LR', self.lr,
                              self.epochs_completed)
        self.train_writer.flush()

        util.log_scalar_value(self.test_writer, 'PSNR', psnr,
                              self.epochs_completed)
        self.test_writer.flush()
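
util.log_scalar_value above logs scalars (PSNR, LR) without graph-side summary ops. Such a helper is typically implemented by building a Summary proto by hand; a plausible sketch (hypothetical, the real util module is not shown):

import tensorflow.compat.v1 as tf

def log_scalar_value(writer, tag, value, step):
    """Writes a scalar to TensorBoard without a graph-side summary op."""
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])
    writer.add_summary(summary, step)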
Example n. 16
def train(config):
    Model_cls = HandwritingVRNNGmmModel
    Dataset_cls = HandWritingDatasetConditionalTF

    # Dataset
    training_dataset = Dataset_cls(config['training_data'],
                                   use_bow_labels=config['use_bow_labels'])

    num_training_iterations = int(training_dataset.num_samples /
                                  config['batch_size'])
    print("# training steps per epoch: " + str(num_training_iterations))

    # Create a tensorflow sub-graph that loads batches of samples.
    if config.get('use_bucket_feeder', True) and training_dataset.is_dynamic:
        bucket_edges = training_dataset.get_seq_len_histogram(
            num_bins=15, collapse_first_and_last_bins=[2, -2])
        data_feeder = DataFeederTF(training_dataset,
                                   config['num_epochs'],
                                   config['batch_size'],
                                   queue_capacity=1024)

        sequence_length, inputs, targets = data_feeder.batch_queue_bucket(
            bucket_edges,
            dynamic_pad=training_dataset.is_dynamic,
            queue_capacity=300,
            queue_threads=4)
    else:
        # Training data
        data_feeder = DataFeederTF(training_dataset,
                                   config['num_epochs'],
                                   config['batch_size'],
                                   queue_capacity=1024)
        sequence_length, inputs, targets = data_feeder.batch_queue(
            dynamic_pad=training_dataset.is_dynamic,
            queue_capacity=512,
            queue_threads=4)

    if config.get('use_staging_area', False):
        staging_area = TFStagingArea([sequence_length, inputs, targets],
                                     device_name="/gpu:0")
        sequence_length, inputs, targets = staging_area.tensors

    # Create step counter (used by optimization routine and learning rate function.)
    global_step = tf.compat.v1.get_variable(name='global_step',
                                            trainable=False,
                                            initializer=1)

    # Annealing KL-divergence loss.
    kld_loss_weight_backup = config['loss_weights']['kld_loss']
    if type(config['loss_weights']['kld_loss']) == np.ndarray:
        # Create a piecewise increasing kld loss weight.
        num_steps = len(config['loss_weights']['kld_loss'])
        values = np.linspace(0, 1, num_steps + 1).tolist()
        boundaries = (config['loss_weights']['kld_loss'] *
                      num_training_iterations).tolist()

        config['loss_weights']['kld_loss'] = tf.train.piecewise_constant(
            global_step, boundaries=boundaries, values=values)
        tf.summary.scalar('training/kld_loss_weight',
                          config['loss_weights']['kld_loss'],
                          collections=["training_status"])

    # Create training graph.
    with tf.name_scope("training"):
        model = Model_cls(config,
                          reuse=False,
                          input_op=inputs,
                          target_op=targets,
                          input_seq_length_op=sequence_length,
                          input_dims=training_dataset.input_dims,
                          target_dims=training_dataset.target_dims,
                          mode="training",
                          data_processor=training_dataset)

        model.build_graph()
        model.create_image_summary(training_dataset.prepare_for_visualization)

    # Create sampling graph.
    with tf.name_scope("sampling"):
        sampling_input_op = tf.compat.v1.placeholder(
            tf.float32,
            shape=[
                1, training_dataset.sequence_length,
                sum(training_dataset.input_dims)
            ])
        sampling_sequence_length_op = tf.compat.v1.placeholder(tf.int32,
                                                               shape=[1])
        sampling_model = Model_cls(
            config,
            reuse=True,
            input_op=sampling_input_op,
            target_op=None,
            input_seq_length_op=sampling_sequence_length_op,
            input_dims=training_dataset.input_dims,
            target_dims=training_dataset.target_dims,
            batch_size=1,
            mode="sampling",
            data_processor=training_dataset)
        sampling_model.build_graph()
        sampling_model.create_image_summary(
            training_dataset.prepare_for_visualization)

    # Validation model.
    if config.get('validate_model', False):
        validation_dataset = Dataset_cls(
            config['validation_data'], use_bow_labels=config['use_bow_labels'])

        num_validation_iterations = int(validation_dataset.num_samples /
                                        config['batch_size'])
        print("# validation steps per epoch: " +
              str(num_validation_iterations))

        valid_data_feeder = DataFeederTF(validation_dataset,
                                         config['num_epochs'],
                                         config['batch_size'],
                                         queue_capacity=1024,
                                         shuffle=False)
        valid_sequence_length, valid_inputs, valid_targets = valid_data_feeder.batch_queue(
            dynamic_pad=validation_dataset.is_dynamic,
            queue_capacity=512,
            queue_threads=4)

        if 'use_staging_area' in config and config['use_staging_area']:
            valid_staging_area = TFStagingArea(
                [valid_sequence_length, valid_inputs, valid_targets],
                device_name="/gpu:0")
            valid_sequence_length, valid_inputs, valid_targets = valid_staging_area.tensors

        with tf.name_scope("validation"):
            valid_model = Model_cls(config,
                                    reuse=True,
                                    input_op=valid_inputs,
                                    target_op=valid_targets,
                                    input_seq_length_op=valid_sequence_length,
                                    input_dims=validation_dataset.input_dims,
                                    target_dims=validation_dataset.target_dims,
                                    mode="training",
                                    data_processor=validation_dataset)
            valid_model.build_graph()

    # Create a session object and initialize parameters.
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,
                                            allow_soft_placement=True))

    if config['learning_rate_type'] == 'exponential':
        learning_rate = tf.train.exponential_decay(
            config['learning_rate'],
            global_step=global_step,
            decay_steps=config['learning_rate_decay_steps'],
            decay_rate=config['learning_rate_decay_rate'],
            staircase=False)
        tf.summary.scalar('training/learning_rate',
                          learning_rate,
                          collections=["training_status"])
    elif config['learning_rate_type'] == 'fixed':
        learning_rate = config['learning_rate']
    else:
        raise Exception("Invalid learning rate type")

    optimizer = tf.train.AdamOptimizer(learning_rate)
    # Gradient clipping and a sanity check.
    grads = list(
        zip(tf.gradients(model.loss, tf.trainable_variables()),
            tf.trainable_variables()))
    grads_clipped = []
    with tf.name_scope("grad_clipping"):
        for grad, var in grads:
            if grad is not None:
                if config['grad_clip_by_norm'] > 0:
                    grads_clipped.append(
                        (tf.clip_by_norm(grad,
                                         config['grad_clip_by_norm']), var))
                elif config['grad_clip_by_value'] > 0:
                    grads_clipped.append(
                        (tf.clip_by_value(grad, -config['grad_clip_by_value'],
                                          config['grad_clip_by_value']), var))
                else:
                    grads_clipped.append((grad, var))
    train_op = optimizer.apply_gradients(grads_and_vars=grads_clipped,
                                         global_step=global_step)

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess.run(init_op)

    run_opts = None
    run_opts_metadata = None
    if config.get('create_timeline', False):
        run_opts = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE,
                                 timeout_in_ms=100000)
        run_opts_metadata = tf.RunMetadata()

    # Create a saver for writing training checkpoints.
    saver = tf.train.Saver(max_to_keep=1, save_relative_paths=True)
    if config['model_dir']:
        # If model directory already exists, continue training by restoring computation graph.
        # Restore variables.
        if config['checkpoint_id'] is None:
            checkpoint_path = tf.train.latest_checkpoint(config['model_dir'])
        else:
            checkpoint_path = os.path.join(config['model_dir'],
                                           config['checkpoint_id'])

        print("Continue training with model " + checkpoint_path)
        saver.restore(sess, checkpoint_path)

        step = tf.train.global_step(sess, global_step)
        start_epoch = round(
            step / (training_dataset.num_samples / config['batch_size']))
    else:
        # Fresh start
        # Create a unique output directory for this experiment.
        config['model_dir'] = get_model_dir_timestamp(
            base_path=config['model_save_dir'],
            prefix="tf",
            suffix=config['experiment_name'],
            connector="-")
        print("Saving to {}\n".format(config['model_dir']))
        start_epoch = 1
        step = 1

    coord = tf.train.Coordinator()
    data_feeder.init(
        sess, coord
    )  # Enqueue threads must be initialized after definition of train_op.
    if config.get('validate_model', False):
        valid_data_feeder.init(sess, coord)
    queue_threads = tf.train.start_queue_runners(coord=coord, sess=sess)
    queue_threads.append(data_feeder.enqueue_threads)

    # Register and create summary ops.
    summary_dir = os.path.join(config['model_dir'], "summary")
    summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    # Create summaries to visualize weights and gradients.
    if config['tensorboard_verbose'] > 1:
        for grad, var in grads:
            tf.summary.histogram(var.name,
                                 var,
                                 collections=["training_status"])
            tf.summary.histogram(var.name + '/gradient',
                                 grad,
                                 collections=["training_status"])

    if config['tensorboard_verbose'] > 1:
        tf.summary.scalar(
            "training/queue",
            math_ops.cast(data_feeder.input_queue.size(), dtypes.float32) *
            (1. / data_feeder.queue_capacity),
            collections=["training_status"])

    # Save configuration
    config['loss_weights']['kld_loss'] = kld_loss_weight_backup
    try:
        # Pickle and json dump.
        pickle.dump(
            config, open(os.path.join(config['model_dir'], 'config.pkl'),
                         'wb'))
        json.dump(config,
                  open(os.path.join(config['model_dir'], 'config.json'), 'w'),
                  indent=4,
                  sort_keys=True)
    except Exception:
        # the config may contain entries that cannot be serialized
        pass

    training_summary = tf.compat.v1.summary.merge_all('training_status')
    training_run_ops = [
        model.loss_summary, training_summary, model.ops_loss, train_op
    ]
    training_run_ops_with_img_summary = [
        model.loss_summary, training_summary, model.ops_loss,
        model.ops_img_summary, train_op
    ]

    if config.get('validate_model', False):
        validation_run_ops = [valid_model.ops_loss]

    if config['use_staging_area']:
        training_run_ops.append(staging_area.preload_op)
        training_run_ops_with_img_summary.append(staging_area.preload_op)
        # Fill staging area first.
        for i in range(256):
            _ = sess.run(staging_area.preload_op,
                         feed_dict={},
                         options=run_opts,
                         run_metadata=run_opts_metadata)

        if config.get('validate_model', False):
            validation_run_ops.append(valid_staging_area.preload_op)
            # Fill staging area first.
            for i in range(256):
                _ = sess.run(valid_staging_area.preload_op,
                             feed_dict={},
                             options=run_opts,
                             run_metadata=run_opts_metadata)

    for epoch in range(start_epoch, config['num_epochs'] + 1):
        for epoch_step in range(num_training_iterations):
            start_time = time.perf_counter()
            step = tf.train.global_step(sess, global_step)

            if (step % config['checkpoint_every_step']) == 0:
                ckpt_save_path = saver.save(
                    sess, os.path.join(config['model_dir'], 'model'),
                    global_step)
                print("Model saved in file: %s" % ckpt_save_path)

            if config['img_summary_every_step'] > 0 and step % config[
                    'img_summary_every_step'] == 0:
                run_training_output = sess.run(
                    training_run_ops_with_img_summary,
                    feed_dict={},
                    options=run_opts,
                    run_metadata=run_opts_metadata)

                img_summary = model.get_image_summary(
                    sess,
                    ops_img_summary_evaluated=run_training_output[3],
                    seq_len=500)
                summary_writer.add_summary(img_summary, step)
            else:
                run_training_output = sess.run(training_run_ops,
                                               feed_dict={},
                                               options=run_opts,
                                               run_metadata=run_opts_metadata)

            summary_writer.add_summary(run_training_output[0],
                                       step)  # Loss summary
            summary_writer.add_summary(run_training_output[1],
                                       step)  # Training status summary.

            if step % config['print_every_step'] == 0:
                time_elapsed = (time.perf_counter() -
                                start_time) / config['print_every_step']
                model.log_loss(run_training_output[2], step, epoch,
                               time_elapsed)

            if config['img_summary_every_step'] > 0 and step % config[
                    'img_summary_every_step'] == 0:
                sampling_img_summary = sampling_model.get_image_summary(
                    sess, ops_img_summary_evaluated=None, seq_len=500)
                summary_writer.add_summary(sampling_img_summary, step)

            if (config.get('validate_model', False)
                    and step % config['validate_every_step'] == 0):
                start_time = time.perf_counter()
                for i in range(num_validation_iterations):
                    run_validation_output = sess.run(
                        validation_run_ops,
                        feed_dict={},
                        options=run_opts,
                        run_metadata=run_opts_metadata)
                    valid_model.update_validation_loss(
                        run_validation_output[0])

                valid_summary, valid_eval_loss = valid_model.get_validation_summary(
                    session=sess)
                summary_writer.add_summary(valid_summary,
                                           step)  # Validation loss summary

                time_elapsed = (time.perf_counter() -
                                start_time) / num_validation_iterations
                valid_model.log_loss(valid_eval_loss,
                                     step,
                                     epoch,
                                     time_elapsed,
                                     prefix="VALID: ")
                valid_model.reset_validation_loss()

            if config.get('create_timeline', False):
                create_tf_timeline(config['model_dir'], run_opts_metadata)

    print("End-of-Training.")
    ckpt_save_path = saver.save(sess, os.path.join(config['model_dir'],
                                                   'model'), global_step)
    print("Model saved in file: %s" % ckpt_save_path)
    print('Model trained for %d epochs (%d steps).' %
          (config['num_epochs'], step))

    try:
        sess.run(data_feeder.input_queue.close(cancel_pending_enqueues=True))
        coord.request_stop()
        coord.join(queue_threads, stop_grace_period_secs=5)
    except Exception:
        # The queue may already be closed; shut down quietly.
        pass

    sess.close()
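The staging-area pattern in the snippet above (a preload op appended to the training fetches, plus an initial fill loop) is a form of device-side double buffering: each step consumes the batch staged previously while copying the next one in. Below is a minimal, self-contained TF 1.x sketch of the same idea; the input pipeline, shapes, and loss are placeholder assumptions, not parts of the code above.

import tensorflow as tf  # TF 1.x

# Stand-ins for a real input pipeline (assumptions for illustration).
features = tf.random_normal([32, 64])
labels = tf.random_uniform([32], maxval=10, dtype=tf.int32)

# The StagingArea buffers one (features, labels) pair on the device.
area = tf.contrib.staging.StagingArea(dtypes=[tf.float32, tf.int32],
                                      shapes=[[32, 64], [32]])
preload_op = area.put([features, labels])  # stage the *next* batch
staged_x, staged_y = area.get()            # consume the previously staged batch

w = tf.Variable(tf.zeros([64, 10]))
logits = tf.matmul(staged_x, w)
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=staged_y,
                                                   logits=logits))
train_op = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(preload_op)  # fill the staging area once before training
    for _ in range(100):
        # Each step trains on the staged batch while preloading the next,
        # overlapping the input transfer with compute.
        sess.run([train_op, preload_op])

A single put before the first training step is the minimum needed for the first get to succeed; the code above fills the area 256 times to build up a deeper buffer.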
Example no. 17
  def benchmark_model(self, warmup_runs, bm_runs, num_threads,
                      trace_filename=None):
    """Benchmark model."""
    if self.tensorrt:
      print('Using tensorrt ', self.tensorrt)
      graphdef = self.freeze_model()

    if num_threads > 0:
      print('num_threads for benchmarking: {}'.format(num_threads))
      sess_config = tf.ConfigProto(
          intra_op_parallelism_threads=num_threads,
          inter_op_parallelism_threads=1)
    else:
      sess_config = tf.ConfigProto()

    # 2 == rewriter_config_pb2.RewriterConfig.OFF: disable dependency
    # optimization so the benchmark graph is left as built.
    sess_config.graph_options.rewrite_options.dependency_optimization = 2
    if self.use_xla:
      sess_config.graph_options.optimizer_options.global_jit_level = (
          tf.OptimizerOptions.ON_2)

    with tf.Graph().as_default(), tf.Session(config=sess_config) as sess:
      inputs = tf.placeholder(tf.float32, name='input', shape=self.inputs_shape)
      output = self.build_model(inputs, is_training=False)

      img = np.random.uniform(size=self.inputs_shape)

      sess.run(tf.global_variables_initializer())
      if self.tensorrt:
        fetches = [inputs.name] + [i.name for i in output]
        goutput = self.convert_tr(graphdef, fetches)
        inputs, output = goutput[0], goutput[1:]

      if not self.use_xla:
        # Don't use tf.group because XLA removes the whole graph for tf.group.
        output = tf.group(*output)
      else:
        output = tf.add_n([tf.reduce_sum(x) for x in output])

      output_name = [output.name]
      input_name = inputs.name
      graphdef = tf.graph_util.convert_variables_to_constants(
          sess, sess.graph_def, output_name)

    with tf.Graph().as_default(), tf.Session(config=sess_config) as sess:
      tf.import_graph_def(graphdef, name='')

      for i in range(warmup_runs):
        start_time = time.perf_counter()
        sess.run(output_name, feed_dict={input_name: img})
        print('Warm up: {} {:.4f}s'.format(i, time.perf_counter() - start_time))

      print('Start benchmark runs total={}'.format(bm_runs))
      start = time.perf_counter()
      for _ in range(bm_runs):
        sess.run(output_name, feed_dict={input_name: img})
      end = time.perf_counter()
      inference_time = (end - start) / bm_runs  # average seconds per batch
      print('Per batch inference time: ', inference_time)
      print('FPS: ', self.batch_size / inference_time)

      if trace_filename:
        run_options = tf.RunOptions()
        run_options.trace_level = tf.RunOptions.FULL_TRACE
        run_metadata = tf.RunMetadata()
        sess.run(output_name, feed_dict={input_name: img},
                 options=run_options, run_metadata=run_metadata)
        logging.info('Dumping trace to %s', trace_filename)
        trace_dir = os.path.dirname(trace_filename)
        if not tf.io.gfile.exists(trace_dir):
          tf.io.gfile.makedirs(trace_dir)
        with tf.io.gfile.GFile(trace_filename, 'w') as trace_file:
          from tensorflow.python.client import timeline  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
          trace = timeline.Timeline(step_stats=run_metadata.step_stats)
          trace_file.write(
              trace.generate_chrome_trace_format(show_memory=True))
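benchmark_model ends by dumping a Chrome trace via tensorflow.python.client.timeline. Here is a minimal standalone TF 1.x sketch of that tracing step, with a throwaway graph standing in for the model (the placeholder, op, and file path are assumptions for illustration).

import tensorflow as tf  # TF 1.x
from tensorflow.python.client import timeline

x = tf.placeholder(tf.float32, shape=[None, 4], name='input')
y = tf.reduce_sum(tf.square(x))

# FULL_TRACE collects per-op timing into run_metadata during this run.
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
with tf.Session() as sess:
    sess.run(y, feed_dict={x: [[1., 2., 3., 4.]]},
             options=run_options, run_metadata=run_metadata)

# Convert the collected step stats into a Chrome trace and write it out;
# load the file via chrome://tracing to inspect per-op timings.
trace = timeline.Timeline(step_stats=run_metadata.step_stats)
with open('trace.json', 'w') as f:
    f.write(trace.generate_chrome_trace_format(show_memory=True))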
Example no. 18
    def train(self,
              log_dir=None,
              max_epoch=10000,
              learning_rate=0.001,
              batch_size=None,
              interval_sec=300,
              restore_step=None,
              run_metadata=False):
        """Train model.

        Args:
            log_dir (str): Log directory where log and model is saved.
            max_epoch (int): Size of epoch
            learning_rate (float): Learning rate
            batch_size (int): Batch size when using mini-batch descent method.
                If specifying a size larger then learning data or `None`,
                using batch descent.
            interfal_sec (float): Specify logging time interval in seconds.
                Default by 300.
            restore_step (int): When you specify this argument, this mixin
                resotres model for specified step.
            run_metadata (bool): If true, run metadata and write logs.
        """
        if log_dir is None:
            log_dir = os.path.join(os.path.dirname(__file__), 'tf_logs',
                                   datetime.utcnow().strftime('%Y%m%d%H%M%S'))
        if batch_size is None:
            batch_size = 1
        n_batches = len(self.corpus) // (batch_size * self.time_size)
        jump = (len(self.corpus) - 1) // batch_size
        if run_metadata:
            options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            metadata = tf.RunMetadata()
        else:
            options = None
            metadata = None
        with self.open_writer(log_dir) as writer:
            with self.open_session(interval_sec=interval_sec,
                                   per_step=n_batches,
                                   restore_step=restore_step) as sess:
                incomes = np.empty([batch_size, self.time_size], dtype=int)
                labels = np.empty([batch_size, self.time_size], dtype=int)
                for b in range(batch_size):
                    incomes[b] = self.corpus[b * jump:b * jump +
                                             self.time_size]
                    labels[b] = self.corpus[b * jump + 1:b * jump +
                                            self.time_size + 1]
                step = restore_step or 0
                next_h = np.zeros([batch_size, self.hidden_size])
                next_c = np.zeros([batch_size, self.hidden_size])
                if restore_step is None:
                    for summary in sess.run(
                            self.los_summaries,
                            feed_dict={
                                self.incomes: incomes[:batch_size],
                                self.labels: labels[:batch_size],
                                self.prev_h: next_h,
                                self.prev_c: next_c
                            },
                    ):
                        writer.add_summary(summary, step)
                for epoch_i in range(step // self.data_size, max_epoch):
                    for batch_i in range(n_batches):
                        inc, lab = self.fetch_batch(epoch_i, batch_i,
                                                    batch_size, jump, incomes,
                                                    labels)
                        fd = {
                            self.incomes: inc,
                            self.labels: lab,
                            self.prev_h: next_h,
                            self.prev_c: next_c,
                            self.learning_rate: learning_rate,
                        }
                        _, next_h, next_c = sess.run(
                            [self.training_op, self.next_h, self.next_c],
                            feed_dict=fd,
                            options=options,
                            run_metadata=metadata,
                        )
                        step += 1
                        if run_metadata:
                            writer.add_run_metadata(metadata, f'step: {step}')
                        self.record(sess, writer, step, feed_dict=fd)
                    print(f'epoch {epoch_i}: finished.')
                self.record(sess, writer, step, feed_dict=fd, force_write=True)
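The train() mixin above is driven entirely by keyword arguments. A hypothetical call might look like the following; the class name LstmLanguageModel, the corpus preparation, and all hyperparameter values are illustrative assumptions, not part of the snippet.

# Hypothetical host class mixing in the train() method above.
model = LstmLanguageModel(corpus=corpus, time_size=35, hidden_size=100)
model.train(
    log_dir='tf_logs/run1',  # summaries and checkpoints are written here
    max_epoch=100,
    learning_rate=0.001,
    batch_size=20,           # None falls back to full-batch descent
    interval_sec=60,         # log every 60 seconds
    restore_step=None,       # or an int to resume from a saved step
    run_metadata=False,
)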