Example #1
    def load_eager(name, model, optimizer=None):
        checkpoint_dir = "./data/" + name
        checkpoint_prefix = os.path.join(checkpoint_dir, name + "_ckpt")
        if optimizer:
            root = tfe.Checkpoint(optimizer=optimizer, model=model)
        else:
            root = tfe.Checkpoint(model=model)

        root.restore(tf.train.latest_checkpoint(checkpoint_dir))
Example #2
    def save_eager(name, model, optimizer=None):
        checkpoint_dir = "./data/" + name
        os.makedirs(checkpoint_dir, exist_ok=True)
        checkpoint_prefix = os.path.join(checkpoint_dir, name + "_ckpt")
        if optimizer:
            root = tfe.Checkpoint(optimizer=optimizer, model=model)
        else:
            root = tfe.Checkpoint(model=model)

        root.save(file_prefix=checkpoint_prefix)
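
Examples #1 and #2 split the restore and save halves of the same pattern. For orientation, here is a minimal, self-contained sketch of the full round trip under the same TF 1.x eager API; the model, optimizer and paths are placeholders invented for illustration, not taken from the examples above.

import os
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

# Placeholder model and optimizer, purely for illustration.
model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
model(tf.zeros([1, 10]))  # build the variables before saving

checkpoint_dir = "./data/demo"
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_prefix = os.path.join(checkpoint_dir, "demo_ckpt")

# tfe.Checkpoint tracks every object passed as a keyword argument.
root = tfe.Checkpoint(optimizer=optimizer, model=model)
root.save(file_prefix=checkpoint_prefix)                  # writes demo_ckpt-1.*
root.restore(tf.train.latest_checkpoint(checkpoint_dir))  # loads the newest save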
Example #3
def main(_):
    tfe.enable_eager_execution()

    # Automatically determine device and data_format
    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    # If data_format is defined in FLAGS, overwrite automatically set value.
    if FLAGS.data_format is not None:
        data_format = FLAGS.data_format
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    train_ds = mnist_dataset.train(FLAGS.data_dir).shuffle(60000).batch(
        FLAGS.batch_size)
    test_ds = mnist_dataset.test(FLAGS.data_dir).batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = mnist.Model(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    # Create file writers for writing TensorBoard summaries.
    if FLAGS.output_dir:
        # Create directories to which summaries will be written
        # tensorboard --logdir=<output_dir>
        # can then be used to see the recorded summaries.
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)
    else:
        train_dir = None
        test_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(train_dir,
                                                           flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(FLAGS.model_dir, 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tfe.Checkpoint(model=model,
                                optimizer=optimizer,
                                step_counter=step_counter)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.model_dir))

    # Train and evaluate for a set number of epochs.
    with tf.device(device):
        for _ in range(FLAGS.train_epochs):
            start = time.time()
            with summary_writer.as_default():
                train(model, optimizer, train_ds, step_counter,
                      FLAGS.log_interval)
            end = time.time()
            print('\nTrain time for epoch #%d (%d total steps): %f' %
                  (checkpoint.save_counter.numpy() + 1, step_counter.numpy(),
                   end - start))
            with test_summary_writer.as_default():
                test(model, test_ds)
            checkpoint.save(checkpoint_prefix)
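
Example #3 prints checkpoint.save_counter and step_counter after each epoch. save_counter is a variable that tfe.Checkpoint creates itself and increments on every save(); because it is tracked like any other dependency, it is restored together with the rest of the objects. A minimal sketch with a placeholder model (not the MNIST model above):

import os
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
checkpoint = tfe.Checkpoint(model=model)

os.makedirs("/tmp/demo", exist_ok=True)
checkpoint.save("/tmp/demo/ckpt")
checkpoint.save("/tmp/demo/ckpt")
print(checkpoint.save_counter.numpy())  # 2: incremented on each save()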
Example #4
    def save_checkpoint(self, epoch, model):
        """
        Create and save a checkpoint.
        """
        # Create the checkpoint directory if required
        self.ckpt_dir = os.path.join(self.session_dir, 'checkpoints')
        if not os.path.exists(self.ckpt_dir):
            os.makedirs(self.ckpt_dir)
            self.checkpoint = tfe.Checkpoint(optimizer=self.optimizer,
                                             model=model.arch)

            # Note: This allows the user to specify how many checkpoints should be saved.
            # Tensorflow does not expose the parameter in tfe.Checkpoint for max_to_keep,
            # however under the hood it uses a Saver object so we can hack around this.
            from tensorflow.python.training.saver import Saver
            default_args = list(Saver.__init__.__code__.co_varnames)
            default_values = list(Saver.__init__.__defaults__)
            if 'self' in default_args:
                # Subtract one since default_values has no value for 'self'
                idx = default_args.index('max_to_keep') - 1
                default_values[idx] = self.p.max_num_ckpts_to_keep
                Saver.__init__.__defaults__ = tuple(default_values)
            else:
                assert (False)

        # Save the checkpoint
        if epoch % self.p.ckpt_save_frequency == 0:
            self.checkpoint.save(os.path.join(self.ckpt_dir, 'ckpt'))
        else:
            return
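
The Saver.__defaults__ patch above exists because tfe.Checkpoint does not expose a max_to_keep parameter. In newer TensorFlow releases (roughly 1.13 onward, and in 2.x) the same limit is available directly through tf.train.CheckpointManager, which makes the hack unnecessary; a minimal sketch, with a placeholder model and optimizer rather than the ones from this example:

import tensorflow as tf

tf.enable_eager_execution()

# Placeholder objects for illustration only.
model = tf.keras.Sequential([tf.keras.layers.Dense(4)])
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)

checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)
manager = tf.train.CheckpointManager(checkpoint, directory="./checkpoints",
                                     max_to_keep=5)

manager.save()                                 # prunes to the 5 most recent saves
checkpoint.restore(manager.latest_checkpoint)  # newest path, or None if none exist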
Example #5
def checkpoint_load(_checkpoint_path, neural_kb, optimizer):
    logger.info('Loading model...')

    logger.info('   neural kb and optimizer')
    checkpoint_model_prefix = os.path.join(_checkpoint_path, "model/")
    model_saver_path = tf.train.latest_checkpoint(checkpoint_model_prefix)

    # old format compatibility
    if os.path.exists(os.path.join(_checkpoint_path, "optim/")):
        import tensorflow.contrib.eager as tfe
        checkpoint_optim_prefix = os.path.join(_checkpoint_path, "optim/")
        optim_checkpoint_path = tf.train.latest_checkpoint(
            checkpoint_optim_prefix)
        if optim_checkpoint_path is not None:
            optim_checkpoint = tfe.Checkpoint(
                optimizer=optimizer,
                optimizer_step=tf.train.get_or_create_global_step())
            optim_checkpoint.restore(optim_checkpoint_path)
            logger.info('   optimiser')
        else:
            logger.info(
                "   ....couldn't find optim/, ignoring it (loading old model)."
            )

        model_saver = tfe.Saver(neural_kb.variables)
        model_saver.restore(model_saver_path)

    else:
        model_saver = tf.train.Saver(neural_kb.variables +
                                     optimizer.variables() +
                                     [tf.train.get_or_create_global_step()])
        model_saver.restore(None, model_saver_path)

    logger.info('... loading done.')
Example #6
    def load(self, model, filename):
        model_objects = {'model': model}
        print("=> loading checkpoint '{}'".format(filename))
        ckpt = tfe.Checkpoint(**model_objects)
        ckpt.restore(filename)

        return model_objects['model']
Example #7
    def __init__(self, cfg, net, trainingset, valset, resume=False):
        self.cfg = cfg
        self.net = net

        # Datasets
        self.trainingset = trainingset
        self.valset = valset

        # Using Adam optimizer
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.cfg.LEARNING_RATE)

        # Create global step
        self.global_step = tf.train.get_or_create_global_step()

        # Create checkpoint directory and save checkpoints
        self.epoch = tfe.Variable(0,
                                  name='epoch',
                                  dtype=tf.float32,
                                  trainable=False)
        self.checkpoint_dir = self.cfg.CKPT_PATH
        self.checkpoint_encoder = os.path.join(self.checkpoint_dir, 'model')
        self.root1 = tfe.Checkpoint(
            optimizer=self.optimizer,
            model=self.net,
            optimizer_step=tf.train.get_or_create_global_step())

        # If resume is true continue from saved checkpoint
        if resume:
            self.root1.restore(tf.train.latest_checkpoint(self.checkpoint_dir))
Example #8
def main(args):
    xr, log_igfr_r, labels_r = loadData('NEW_GFR_TRAIN')
    xe, log_igfr_e, labels_e = loadData('NEW_GFR_TEST')

    train_ds = tf.data.Dataset.from_tensor_slices((xr, log_igfr_r, labels_r))
    test_ds = tf.data.Dataset.from_tensor_slices((xe, log_igfr_e, labels_e))

    train_ds = train_ds.shuffle(xr.shape[0]).batch(batch_size)
    # test_ds = test_ds.batch(batch_size)
    test_ds = test_ds.batch(1)

    model = KidneyModel(n_cat)
    init_lr, momentum = args.learning_rate, 0.9
    lr = tfe.Variable(init_lr, name="learning_rate")
    optimizer = tf.train.AdamOptimizer(lr)

    with tf.device('/cpu:0'):
        lr = tfe.Variable(init_lr, name="learning_rate")
        optimizer = tf.train.AdamOptimizer(lr)
        for epoch in range(args.epochs):
            print('epoch', epoch)
            train_acc = tfe.metrics.Accuracy('train_accuracy')
            total_loss, total_batch = 0.0, 0.0
            for (batch, (x, log_igfr,
                         labels)) in enumerate(tfe.Iterator(train_ds)):
                with tf.GradientTape() as tape:
                    mean, var, logits, igfr = model(x)
                    loss_value = loss(mean, var, logits, igfr, labels,
                                      log_igfr, args.enlarge, args.w_div,
                                      args.w_l2)
                total_loss += loss_value.cpu().numpy()
                total_batch += 1
                train_acc(tf.argmax(logits, axis=1, output_type=tf.int32),
                          tf.argmax(labels, axis=1, output_type=tf.int32))
                grads = tape.gradient(loss_value, model.variables)
                optimizer.apply_gradients(
                    zip(grads, model.variables),
                    global_step=tf.train.get_or_create_global_step())
            print('Learning Rate', lr.numpy())
            if (epoch + 1) % 50 == 0:
                lr.assign(lr.numpy() / 2)

            print('Training acc {}'.format(100 * train_acc.result()))
            print('train_acc', 100 * train_acc.result().cpu().numpy())
            test_acc = test(model, test_ds)
            test2_acc, reses, test3_acc, reses3 = test23(model, test_ds)
            print('test_acc1', test_acc)
            print('avg_loss ', total_loss / total_batch)
            print('test_acc2', test2_acc)
            print('test_acc3', test3_acc)
            for i in range(reses.shape[0]):
                print('Cate %d ' % i, reses[i])
    checkpoint_dir = './saved_models/'
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    root = tfe.Checkpoint(optimizer=optimizer,
                          model=model,
                          optimizer_step=tf.train.get_or_create_global_step())

    root.save(file_prefix=checkpoint_prefix)
Example #9
    def run(self):
        T = 0  # Worker step counter.
        env = make_atari(self.config.env)
        model = ActorCritic(env.action_space.n, policy=self.config.policy, device=self.config.device)
        loader = tfe.Checkpoint(model=model.policy)
        obs, act, rew = [], [], []
        ob = env.reset()
        done = False
        s = model.policy.s0
        cum_reward = 0
        ep_len = 0

        while T < self.steps:
            try:
                loader.restore(tf.train.latest_checkpoint(self.config.save_dir))
            except:
                continue
            t = 0  # Batch counter.
            s_init = s
            epsilon = 0.6 - (T / self.steps) * 0.5

            while not done and t < self.batch_size:
                logits, v, s = model.forward([ob], s)
                probs = tf.nn.softmax(logits)
                a = greedy(probs, env.action_space.n, epsilon=epsilon)
                next_ob, r, done, _ = env.step(a)
                obs.append(ob)
                act.append(a)
                rew.append(r)
                ob = next_ob
                t += 1
                T += 1
                cum_reward += r
                ep_len += 1

            d_rew = discount(rew, self.config.gamma)
            d_rew = (d_rew - np.mean(d_rew)) / (np.std(d_rew) + 1e-6)  # Stability constant.
            grads, loss = model.gradient(obs, d_rew, act, s_init)
            grads, _ = tf.clip_by_global_norm(grads, self.config.max_norm)

            if done:
                print(f"Step: {T}, Len: {ep_len}, BR: {cum_reward}, TL: {loss:.4f}, Epsilon: {epsilon:.2f}")
                s = model.policy.s0
                done = False
                ob = env.reset()
                cum_reward = 0
                ep_len = 0

            obs.clear()
            act.clear()
            rew.clear()

            for i in range(len(grads)):
                grads[i] = grads[i].numpy()

            self.queue.put(grads)
Example #10
def restore_model(model, optimizer):
    # `model` here is a freshly re-initialized model to restore into
    # Specify the checkpoint directory
    checkpoint_directory = 'models_checkpoints/SimpleNN/'
    # Create model checkpoint
    checkpoint = tfe.Checkpoint(
        optimizer=optimizer,
        model=model,
        optimizer_step=tf.train.get_or_create_global_step())
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))
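
restore() in the snippet above is fire-and-forget: it returns immediately, matches variables lazily as they are created, and silently does nothing if latest_checkpoint() found no file. The status object that restore() returns can be used to verify the load; a minimal sketch with a placeholder model and path (not the SimpleNN setup from this example):

import tensorflow as tf
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

model = tf.keras.Sequential([tf.keras.layers.Dense(4)])
model(tf.zeros([1, 4]))  # create the variables so the match can be checked

checkpoint = tfe.Checkpoint(model=model)
status = checkpoint.restore(tf.train.latest_checkpoint("models_checkpoints/SimpleNN/"))

# Raises AssertionError if no checkpoint was found or if it did not fully
# match the tracked objects, instead of failing silently later.
status.assert_consumed()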
Example #11
    def restore_checkpoint(self, model):
        """
        Load a given checkpoint.
        """
        # Create a checkpoint
        self.checkpoint = tfe.Checkpoint(optimizer=self.create_optimizer(),
                                         model=model.arch)

        # Restore the checkpoint
        self.checkpoint.restore(self.p.ckpt_path)
Example #12
def initialize_process():
    global env, model, loader, config, logger
    config = Config("config/config.json")
    env = make_atari(config.env)
    # env = gym.make(config.env)
    model = ActorCritic(env.action_space.n, policy=config.policy)
    loader = tfe.Checkpoint(
        model=model.policy,
        optimizer_step=tf.train.get_or_create_global_step())
    logger = Logger("Worker_{}".format(os.getpid()))
Example #13
def sample(queue, env_name, steps):
    env = make_atari(env_name)
    model = CRPolicy(env.action_space.n)
    loader = tfe.Checkpoint(model=model)

    for roll in range(steps):
        # TODO: REMOVE THIS CRAP WHEN YOU FIX IT
        try:
            loader.restore(tf.train.latest_checkpoint(CKPT_DIR))
        except:
            continue

        obs, act, rews = [], [], []
        ob = env.reset()
        done = False
        s = model.s0

        while not done:
            logits, v, s = model([ob], s)
            probs = tf.nn.softmax(logits)
            a = boltzmann(probs, env.action_space.n)
            next_ob, r, done, _ = env.step(a)
            obs.append(ob)
            act.append(a)
            rews.append(r)
            ob = next_ob

        d_rews = discount(rews, GAMMA)
        d_rew = (d_rews - np.mean(d_rews)) / (np.std(d_rews) + 1e-6)

        with tf.GradientTape() as tape:
            logits, values, _ = model(obs, model.s0)
            values = tf.squeeze(values)
            advs = tf.constant(d_rew, dtype=tf.float32) - values
            policy = tf.nn.softmax(logits)

            xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=act, logits=logits)
            p_loss = xentropy * tf.stop_gradient(advs)
            v_loss = tf.square(advs)
            e_loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=policy,
                                                                logits=logits)

            loss = tf.reduce_mean(p_loss + 0.5 * v_loss - 0.01 * e_loss)

        grads = tape.gradient(loss, model.trainable_weights)
        grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)

        print("Step: {0}, Len: {1} BR: {2}, TL: {3:.4f}".format(
            roll, len(obs), np.sum(rews), loss))

        for i in range(len(grads)):
            grads[i] = grads[i].numpy()

        queue.put(grads)
Example #14
def fit_and_save(model, optimizer, input_data, target):
    model.fit(input_data, target, optimizer, num_epochs=500, verbose=50)
    # Specify checkpoint directory
    checkpoint_directory = 'models_checkpoints/SimpleNN/'
    # Create model checkpoint
    checkpoint = tfe.Checkpoint(
        optimizer=optimizer,
        model=model,
        optimizer_step=tf.train.get_or_create_global_step())
    # Save trained model
    checkpoint.save(file_prefix=checkpoint_directory)
Example #15
    def build_model(self, initializer=tf.zeros):
        self.model = AtariModel(self.obs_spec["screen"][1],
                                self.obs_spec["minimap"][1],
                                possible_action_num)

        # TODO: Training
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        self.root = tfe.Checkpoint(
            optimizer=optimizer,
            model=self.model,
            optimizer_step=tf.train.get_or_create_global_step())
Example #16
 def create_checkpoint(self, name):
     if name:
         checkpoint = tfe.Checkpoint(**self.get_str2weights())
         checkpoint_dir = "./model/" + name
         checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
         try:
             checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
         except Exception as e:
             print(e)
         return checkpoint, checkpoint_prefix
     else:
         return None, None
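
Example #16 builds the checkpoint from a dictionary of named weights unpacked as keyword arguments, a pattern several later examples repeat. Since tfe.Checkpoint accepts arbitrary keyword names for trackable objects, any dict whose keys are valid identifiers can be checkpointed this way; a minimal sketch with made-up variables:

import os
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

# Hypothetical named weights; the keys become the names under which the
# values are stored in the checkpoint.
str2weights = {
    "w1": tfe.Variable(tf.random_normal([3, 3]), name="w1"),
    "w2": tfe.Variable(tf.zeros([3]), name="w2"),
}
checkpoint = tfe.Checkpoint(**str2weights)

checkpoint_dir = "./model/demo"
os.makedirs(checkpoint_dir, exist_ok=True)
save_path = checkpoint.save(os.path.join(checkpoint_dir, "ckpt"))
checkpoint.restore(save_path)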
Example #17
def main(_):

    pp = pprint.PrettyPrinter()
    pp.pprint(flags.FLAGS.__flags)

    filenames = glob.glob(data_dir)

    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)
    if not os.path.exists(FLAGS.sample_dir):
        os.makedirs(FLAGS.sample_dir)

    model_objects = {
        'generator': Generator(data_format),
        'discriminator': Discriminator(data_format),
        'generator_optimizer': tf.train.AdamOptimizer(FLAGS.generator_learning_rate, FLAGS.beta1, FLAGS.beta2),
        'discriminator_optimizer': tf.train.AdamOptimizer(FLAGS.discriminator_learning_rate, FLAGS.beta1, FLAGS.beta2),
        'step_counter': tf.train.get_or_create_global_step()
    }

    summary_writer = tf.contrib.summary.create_file_writer(FLAGS.summary_dir,
                                                           flush_millis=1000)

    checkpoint = tfe.Checkpoint(**model_objects)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)
    checkpoint.restore(latest_cpkt)

    dataset = tf.data.TFRecordDataset(
        filenames).map(read_and_decode_with_labels)
    dataset = dataset.shuffle(10000).apply(
        tf.contrib.data.batch_and_drop_remainder(FLAGS.batch_size))

    with tf.device(device):
        for epoch in range(FLAGS.epoch):
            start = time.time()
            with summary_writer.as_default():
                train_one_epoch(dataset=dataset, batch_size=FLAGS.batch_size, log_interval=FLAGS.log_interval,
                                z_dim=FLAGS.z_dim, device=device, epoch=epoch, **model_objects)
            end = time.time()
            checkpoint.save(checkpoint_prefix)
            print('\nTrain time for epoch #%d (step %d): %f' %
                  (checkpoint.save_counter.numpy(),
                   checkpoint.step_counter.numpy(),
                   end - start))
Example #18
    def __init__(self, cfg, net, testset):

        self.cfg = cfg
        self.net = net
        self.testset = testset

        # Restore the model
        self.optimizer = tf.train.MomentumOptimizer(
            learning_rate=self.cfg.LEARNING_RATE, momentum=self.cfg.MOMENTUM)
        self.checkpoint_dir = self.cfg.CKPT_PATH
        self.checkpoint_encoder = os.path.join(self.checkpoint_dir, 'Model')
        self.root1 = tfe.Checkpoint(
            optimizer=self.optimizer,
            model=self.net,
            optimizer_step=tf.train.get_or_create_global_step())
        self.root1.restore(tf.train.latest_checkpoint(self.checkpoint_dir))
Example #19
def main():
    args = setup_args()
    log_msg(args)

    vocab_table = lookup_ops.index_table_from_file(args.vocab, default_value=args.unk_index)
    train_dataset = create_dataset(args.train, vocab_table, args.bs, args.eos, args.t)
    valid_dataset = create_dataset(args.valid, vocab_table, args.bs, args.eos, args.t)

    loss_and_grads_fun = tfe.implicit_value_and_gradients(train_loss)
    lm = LanguageModel(int(vocab_table.size()), d=args.nd, h=args.nh, cell=args.cell)

    log_msg('Model built!')
    best_valid_ppl = compute_ppl(lm, valid_dataset)
    log_msg(f'Start ppl: {best_valid_ppl: 0.4f}')

    if args.opt == 'adam':
        opt = tf.train.AdamOptimizer(args.lr)
    else:
        opt = tf.train.GradientDescentOptimizer(args.lr)

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    ckpt_prefix = os.path.join(args.save_dir, args.ckpt_prefix)
    root = tfe.Checkpoint(optimizer=opt, model=lm, optimizer_step=tf.train.get_or_create_global_step())
    for epoch_num in range(args.num_epochs):
        log_msg(f'Epoch: {epoch_num} START')
        batch_loss = []
        for step_num, train_datum in enumerate(train_dataset, start=1):
            loss_value, gradients = loss_and_grads_fun(lm, train_datum)
            batch_loss.append(loss_value)

            if step_num % args.stats_step == 0:
                log_msg(f'Epoch: {epoch_num} Step: {step_num} Avg Loss: {np.average(np.asarray(batch_loss)): 0.4f}')
                batch_loss = []

            if step_num % args.eval_step == 0:
                better, ppl = check_if_ppl_better(best_valid_ppl, lm, valid_dataset, root, ckpt_prefix, epoch_num, step_num)
                if better:
                    best_valid_ppl = ppl

            opt.apply_gradients(clip_gradients(gradients, args.clip_ratio))
        log_msg(f'Epoch: {epoch_num} END')
        better, ppl = check_if_ppl_better(best_valid_ppl, lm, valid_dataset, root, ckpt_prefix, epoch_num, step_num=-1)
        if better:
            best_valid_ppl = ppl
Example #20
def test():
    config = Config("config/config.json")
    env = make_atari(config.env)
    model = ActorCritic(env.action_space.n, policy=config.policy)
    saver = tfe.Checkpoint(model=model.policy)
    saver.restore(tf.train.latest_checkpoint(config.save_dir))
    ob = env.reset()
    s = model.policy.s0

    while True:
        env.render()
        logits, _, s = model.forward([ob], s)
        probs = tf.nn.softmax(logits)
        a = greedy(probs)
        ob, _, done, _ = env.step(a)

        if done:
            ob = env.reset()
Example #21
def main(_):
    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    data = input_data.read_data_sets(FLAGS.data_dir)
    dataset = (tf.data.Dataset.from_tensor_slices(
        data.train.images).shuffle(60000).batch(FLAGS.batch_size))

    # Create the models and optimizers.
    model_objects = {
        'generator': Generator(data_format),
        'discriminator': Discriminator(data_format),
        'generator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
        'discriminator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
        'step_counter': tf.train.get_or_create_global_step(),
    }

    # Prepare summary writer and checkpoint info
    summary_writer = tf.contrib.summary.create_summary_file_writer(
        FLAGS.output_dir, flush_millis=1000)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)
    checkpoint = tfe.Checkpoint(**model_objects)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(latest_cpkt)

    with tf.device(device):
        for _ in range(100):
            start = time.time()
            with summary_writer.as_default():
                train_one_epoch(dataset=dataset,
                                log_interval=FLAGS.log_interval,
                                noise_dim=FLAGS.noise,
                                **model_objects)
            end = time.time()
            checkpoint.save(checkpoint_prefix)
            print('\nTrain time for epoch #%d (step %d): %f' %
                  (checkpoint.save_counter.numpy(),
                   checkpoint.step_counter.numpy(), end - start))
Example #22
def freeze_model():
    x = tf.random_normal((1, 300, 400, 3))
    model = ConvModule(64, kernel_size=(3, 3))
    adam = tf.train.AdamOptimizer()
    checkpoint_prefix = os.path.join(flags.model_dir, 'ckpt')
    global_step = tf.train.get_or_create_global_step()

    y = model(x)

    print("y:", y.shape)

    checkpoint = tfe.Checkpoint(model=model,
                                optimizer=adam,
                                step_counter=global_step)
    checkpoint.restore(tf.train.latest_checkpoint(flags.model_dir))

    print("Global_step:", global_step)

    checkpoint.save(checkpoint_prefix)
Example #23
def train_model(train_file, validation_file, validation_interval, width, height, batch_size, n_epochs,
                checkpoint_folder, training_device):
    checkpoint_folder = os.path.join(checkpoint_folder, f'{width}x{height}')
    training_generator = LSUNGenerator(train_file)
    transform = LSUNTransform(image_dimensions=(height, width, 3))
    encoder = Encoder()
    decoder = Decoder()
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    checkpointer = tfe.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)
    best_loss = 1e10
    for epoch in range(n_epochs):
        iteration = 0
        dataset = tf.data.Dataset.from_generator(generator=lambda: training_generator, output_types=tf.string) \
            .map(transform).batch(batch_size)
        for batch in dataset:
            with tf.device(training_device):
                loss, grads_and_vars = calculate_gradients(batch, encoder, decoder)
                optimizer.apply_gradients(grads_and_vars)
            iteration += 1
            training_logger.info(f'Epoch = {epoch}, Iteration = {iteration}, Loss = {loss}')
            if iteration % validation_interval == 0:
                validation_logger.info(f'Epoch: {epoch}, Iteration: {iteration}. Beginning validation pass...')
                validation_generator = LSUNGenerator(validation_file)
                validation_dataset = tf.data.Dataset.from_generator(generator=lambda: validation_generator,
                                                                    output_types=tf.string) \
                    .map(transform).batch(batch_size)
                losses = list()
                for val_batch in validation_dataset:
                    with tf.device(training_device):
                        val_batch = tf.constant(val_batch)
                        loss = evaluate(val_batch, encoder, decoder)
                        losses.append(loss)
                losses = np.array(losses)
                avg_loss = np.mean(losses)
                min_loss = np.min(losses)
                max_loss = np.max(losses)
                std_loss = np.std(losses)
                validation_logger.info(f'avg: {avg_loss}, std: {std_loss}, min: {min_loss}, max: {max_loss}')
                if avg_loss < best_loss:
                    best_loss = avg_loss
                    validation_logger.info(
                        f'Validation loss is best seen so far. Checkpointing to {checkpoint_folder}...')
                    checkpointer.save(checkpoint_folder)
Example #24
    def train(self, steps=300, name=None):
        """
        :param steps:
        :param name:
        :return: the loss history
        """
        str2weights = {
            str(key): value
            for key, value in self.rule_weights.items()
        }
        if name:
            checkpoint = tfe.Checkpoint(**str2weights)
            checkpoint_dir = "./model/" + name
            checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
            try:
                checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
            except Exception as e:
                print(e)

        losses = []
        optimizer = tf.train.RMSPropOptimizer(learning_rate=0.5)

        for i in range(steps):
            grads = self.grad()
            optimizer.apply_gradients(
                zip(grads, self.__all_variables()),
                global_step=tf.train.get_or_create_global_step())
            loss_avg = float(self.loss().numpy())
            losses.append(loss_avg)
            print("-" * 20)
            print("step " + str(i) + " loss is " + str(loss_avg))
            if i % 5 == 0:
                self.show_definition()
                valuation_dict = self.valuation2atoms(self.deduction()).items()
                for atom, value in valuation_dict:
                    print(str(atom) + ": " + str(value))
                if name:
                    checkpoint.save(checkpoint_prefix)
                    pd.Series(np.array(losses)).to_csv(name + ".csv")
            print("-" * 20 + "\n")
        return losses
Example #25
def train():
    mp.set_start_method('spawn', force=True)

    config = Config("config/config.json")
    env = make_atari(config.env)
    # env = gym.make(config.env)
    step = tf.train.get_or_create_global_step()
    optimizer = tf.train.AdamOptimizer(learning_rate=config.lr)
    model = ActorCritic(env.action_space.n, policy=config.policy)
    saver = tfe.Checkpoint(optimizer=optimizer,
                           model=model.policy,
                           optimizer_step=step)
    pool = mp.Pool(processes=config.processes, initializer=initialize_process)
    saver.restore(tf.train.latest_checkpoint(config.save_dir))
    logger = Logger('global')

    #   Initialize model.
    model.forward([env.reset()])
    ts = time.time()

    for t in range(config.steps):
        gradients = []
        roll = pool.map(generate_gradients, [t] * config.processes)

        for tup in zip(*roll):
            averaged = np.mean(tup, axis=0)
            gradients.append(tf.constant(averaged, dtype=tf.float32))

        clipped, _ = tf.clip_by_global_norm(gradients, config.max_norm)
        gnorms = [tf.norm(grad) for grad in clipped]
        logger.log_gradients(gnorms)
        logger.log_weights(model.policy.trainable_variables)
        optimizer.apply_gradients(zip(clipped, model.policy.trainable_weights),
                                  global_step=step)
        saver.save(file_prefix=config.file_prefix)

        print("Epoch took: {}".format(time.time() - ts))
        ts = time.time()
Example #26
    def train(self, steps=6000, name="test"):
        str2weights = {str(key):value for key,value in self.rule_weights.items()}
        checkpoint = tfe.Checkpoint(**str2weights)
        optimizer = tf.train.RMSPropOptimizer(learning_rate=0.5)
        checkpoint_dir = "./model/"+name
        checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

        try:
            checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        except Exception as e:
            print(e)
        for i in range(steps):
            grads = self.grad()
            optimizer.apply_gradients(zip(grads, self.rule_weights.values()),
                                      global_step=tf.train.get_or_create_global_step())
            loss_avg = self.loss()
            print("-"*20)
            print("step "+str(i)+" loss is "+str(loss_avg))
            if i%5==0:
                self.show_definition()
                for atom, value in self.valuation2atoms(self.deduction()).items():
                    print(str(atom)+": "+str(value))
                checkpoint.save(checkpoint_prefix)
            print("-"*20+"\n")
Example #27
def call_umm_segmentation(features, pad, contiguous, random_wins):
    '''
    Parameters
    ----------
    list of features in size (128,201)
    length of padding
    number of contiguous segments
    [(start,end)] for all the random windows
    '''

    model = CRNN.Model(utils.hidden_dim, utils.num_layers, utils.input_dim)
    # load checkpoint
    checkpoint_prefix = os.path.join(utils.model_dir, utils.model_name)

    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tfe.Checkpoint(model=model, step_counter=step_counter)

    if tf.train.checkpoint_exists(checkpoint_prefix):
        checkpoint.restore(checkpoint_prefix)
    norm_feats = normalization(tf.convert_to_tensor(features))
    logit = model(norm_feats, training=False)
    time_segments = compute_timeline(logit, pad, contiguous, random_wins)

    return time_segments
Example #28
def train_or_infer_spinn(embed,
                         word2index,
                         train_data,
                         dev_data,
                         test_data,
                         config):
  """Perform Training or Inference on a SPINN model.

  Args:
    embed: The embedding matrix as a float32 numpy array with shape
      [vocabulary_size, word_vector_len]. word_vector_len is the length of a
      word embedding vector.
    word2index: A `dict` mapping word to word index.
    train_data: An instance of `data.SnliData`, for the train split.
    dev_data: Same as above, for the dev split.
    test_data: Same as above, for the test split.
    config: A configuration object. See the argument to this Python binary for
      details.

  Returns:
    If `config.inference_premise ` and `config.inference_hypothesis` are not
      `None`, i.e., inference mode: the logits for the possible labels of the
      SNLI data set, as a `Tensor` of three floats.
    else:
      The trainer object.
  Raises:
    ValueError: if only one of config.inference_premise and
      config.inference_hypothesis is specified.
  """
  # TODO(cais): Refactor this function into separate one for training and
  #   inference.
  use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
  device = "gpu:0" if use_gpu else "cpu:0"
  print("Using device: %s" % device)

  if ((config.inference_premise and not config.inference_hypothesis) or
      (not config.inference_premise and config.inference_hypothesis)):
    raise ValueError(
        "--inference_premise and --inference_hypothesis must be both "
        "specified or both unspecified, but only one is specified.")

  if config.inference_premise:
    # Inference mode.
    inference_sentence_pair = [
        data.encode_sentence(config.inference_premise, word2index),
        data.encode_sentence(config.inference_hypothesis, word2index)]
  else:
    inference_sentence_pair = None

  log_header = (
      "  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss"
      "     Accuracy  Dev/Accuracy")
  log_template = (
      "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
      "{:12.4f} {}")
  dev_log_template = (
      "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
      "{:8.6f} {:12.4f} {:12.4f}")

  summary_writer = tf.contrib.summary.create_file_writer(
      config.logdir, flush_millis=10000)

  with tf.device(device), \
       summary_writer.as_default(), \
       tf.contrib.summary.always_record_summaries():
    model = SNLIClassifier(config, embed)
    global_step = tf.train.get_or_create_global_step()
    trainer = SNLIClassifierTrainer(model, config.lr)
    checkpoint = tfe.Checkpoint(trainer=trainer, global_step=global_step)
    checkpoint.restore(tf.train.latest_checkpoint(config.logdir))

    if inference_sentence_pair:
      # Inference mode.
      prem, prem_trans = inference_sentence_pair[0]
      hypo, hypo_trans = inference_sentence_pair[1]
      hypo_trans = inference_sentence_pair[1][1]
      inference_logits = model(
          tf.constant(prem), tf.constant(prem_trans),
          tf.constant(hypo), tf.constant(hypo_trans), training=False)
      inference_logits = inference_logits[0][1:]
      max_index = tf.argmax(inference_logits)
      print("\nInference logits:")
      for i, (label, logit) in enumerate(
          zip(data.POSSIBLE_LABELS, inference_logits)):
        winner_tag = " (winner)" if max_index == i else ""
        print("  {0:<16}{1:.6f}{2}".format(label + ":", logit, winner_tag))
      return inference_logits

    train_len = train_data.num_batches(config.batch_size)
    start = time.time()
    iterations = 0
    mean_loss = tfe.metrics.Mean()
    accuracy = tfe.metrics.Accuracy()
    print(log_header)
    for epoch in xrange(config.epochs):
      batch_idx = 0
      for label, prem, prem_trans, hypo, hypo_trans in _get_dataset_iterator(
          train_data, config.batch_size):
        if use_gpu:
          label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
          # prem_trans and hypo_trans are used for dynamic control flow and can
          # remain on CPU. Same in _evaluate_on_dataset().

        iterations += 1
        batch_train_loss, batch_train_logits = trainer.train_batch(
            label, prem, prem_trans, hypo, hypo_trans)
        batch_size = tf.shape(label)[0]
        mean_loss(batch_train_loss.numpy(),
                  weights=batch_size.gpu() if use_gpu else batch_size)
        accuracy(tf.argmax(batch_train_logits, axis=1), label)

        if iterations % config.save_every == 0:
          checkpoint.save(os.path.join(config.logdir, "ckpt"))

        if iterations % config.dev_every == 0:
          dev_loss, dev_frac_correct = _evaluate_on_dataset(
              dev_data, config.batch_size, trainer, use_gpu)
          print(dev_log_template.format(
              time.time() - start,
              epoch, iterations, 1 + batch_idx, train_len,
              100.0 * (1 + batch_idx) / train_len,
              mean_loss.result(), dev_loss,
              accuracy.result() * 100.0, dev_frac_correct * 100.0))
          tf.contrib.summary.scalar("dev/loss", dev_loss)
          tf.contrib.summary.scalar("dev/accuracy", dev_frac_correct)
        elif iterations % config.log_every == 0:
          mean_loss_val = mean_loss.result()
          accuracy_val = accuracy.result()
          print(log_template.format(
              time.time() - start,
              epoch, iterations, 1 + batch_idx, train_len,
              100.0 * (1 + batch_idx) / train_len,
              mean_loss_val, " " * 8, accuracy_val * 100.0, " " * 12))
          tf.contrib.summary.scalar("train/loss", mean_loss_val)
          tf.contrib.summary.scalar("train/accuracy", accuracy_val)
          # Reset metrics.
          mean_loss = tfe.metrics.Mean()
          accuracy = tfe.metrics.Accuracy()

        batch_idx += 1
      if (epoch + 1) % config.lr_decay_every == 0:
        trainer.decay_learning_rate(config.lr_decay_by)

    test_loss, test_frac_correct = _evaluate_on_dataset(
        test_data, config.batch_size, trainer, use_gpu)
    print("Final test loss: %g; accuracy: %g%%" %
          (test_loss, test_frac_correct * 100.0))

  return trainer
Example #29
from __future__ import absolute_import, division, print_function

import os
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

from iris import Iris

iris = Iris()
model = iris.model
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
checkpoint_dir = './iris_model'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
root = tfe.Checkpoint(optimizer=optimizer,
                      model=model,
                      optimizer_step=tf.train.get_or_create_global_step())

root.restore(tf.train.latest_checkpoint(checkpoint_dir))

iris.test()
Example #30
def main():
    open('output_summary.csv', 'w').close()
    # Constants variables
    NUM_TRAIN_SAMPLES = 72485
    NUM_TEST_SAMPLES = 26528

    # Editable variables
    num_labeled_samples = 5126
    num_validation_samples = 0
    batch_size = 25
    epochs = 200
    max_learning_rate = 0.003
    initial_beta1 = 0.9
    final_beta1 = 0.5
    checkpoint_directory = './checkpoints/PiModel'
    tensorboard_logs_directory = './logs/PiModel'

    # Assign it as tfe.variable since we will change it across epochs
    learning_rate = tfe.Variable(max_learning_rate)
    beta_1 = tfe.Variable(initial_beta1)
    outputArr = np.array([])
    # Download and Save Dataset in Tfrecords
    #loader = SvnhLoader('./data', NUM_TRAIN_SAMPLES,
    #                    num_validation_samples, num_labeled_samples)
    #loader.download_images_and_generate_tf_record()
    loader = FnLoader('./fn_data', NUM_TRAIN_SAMPLES, num_validation_samples,
                      num_labeled_samples)
    #    print ("hello")
    loader.download_images_and_generate_tf_record()
    #sys.exit()
    # Generate data loaders
    train_labeled_iterator, train_unlabeled_iterator, validation_iterator, test_iterator = loader.load_dataset(
        batch_size, epochs)
    #print (train_labeled_iterator)
    batches_per_epoch = int(num_labeled_samples / batch_size)
    batches_per_epoch_val = int(num_validation_samples / batch_size)
    #    sys.exit()
    model = PiModel()
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=beta_1,
                                       beta2=0.999)
    max_unsupervised_weight = 100 * num_labeled_samples / \
        (NUM_TRAIN_SAMPLES - num_validation_samples)
    best_val_accuracy = 0
    global_step = tf.train.get_or_create_global_step()
    writer = tf.contrib.summary.create_file_writer(tensorboard_logs_directory)
    writer.set_as_default()
    #sys.exit()
    for epoch in range(epochs):

        rampdown_value = ramp_down_function(epoch, epochs)
        rampup_value = ramp_up_function(epoch)

        if epoch == 0:
            unsupervised_weight = 0
        else:
            unsupervised_weight = max_unsupervised_weight * \
                rampup_value

        learning_rate.assign(rampup_value * rampdown_value * max_learning_rate)
        beta_1.assign(rampdown_value * initial_beta1 +
                      (1.0 - rampdown_value) * final_beta1)
        epoch_loss_avg = tfe.metrics.Mean()
        epoch_accuracy = tfe.metrics.Accuracy()
        epoch_loss_avg_val = tfe.metrics.Mean()
        epoch_accuracy_val = tfe.metrics.Accuracy()
        for batch_nr in range(batches_per_epoch):
            X_labeled_train, y_labeled_train = train_labeled_iterator.get_next(
            )
            #print(y_labeled_train[0:20,0])
            #print(y_labeled_train[0:20,1])
            #print(y_labeled_train.shape)
            X_unlabeled_train, _ = train_unlabeled_iterator.get_next()

            loss_val, grads = pi_model_gradients(X_labeled_train,
                                                 y_labeled_train,
                                                 X_unlabeled_train, model,
                                                 unsupervised_weight)
            optimizer.apply_gradients(zip(grads, model.variables),
                                      global_step=global_step)
            #sys.exit()
            epoch_loss_avg(loss_val)
            #print(X_labeled_train)
            num_test_batches = int(NUM_TEST_SAMPLES / batch_size)
            pred = model(X_labeled_train)
            #sys.exit()
            outputArr = np.array([])
            epoch_accuracy(tf.argmax(pred, 1), tf.argmax(y_labeled_train, 1))
            if (batch_nr == batches_per_epoch - 1):
                for test_batch in range(num_test_batches):
                    X_val, y_val = test_iterator.get_next()
                    y_val_predictions = model(X_val, training=False)
                    y_pred = tf.argmax(y_val_predictions, 1)
                    y_true = tf.argmax(y_val, 1)
                    y_pred_epoch = np.asarray(y_pred)
                    y_true_epoch = np.asarray(y_true)
                    #print(y_pred, y_true)
                    prec_epch = sk.metrics.precision_score(
                        y_true_epoch, y_pred_epoch)
                    rec_epch = sk.metrics.recall_score(y_true_epoch,
                                                       y_pred_epoch)
                    f1_epch = sk.metrics.f1_score(y_true_epoch, y_pred_epoch)

                    epoch_loss_avg_val(
                        tf.losses.softmax_cross_entropy(
                            y_val, y_val_predictions))
                    epoch_accuracy_val(tf.argmax(y_val_predictions, 1),
                                       tf.argmax(y_val, 1))
        #value1 = epoch+1
        #value2 = epoch_accuracy.result()
        #value3 =
        #value4 =
        #value5 =
        #value6 =
        #arrResult = [epoch+1, epoch_accuracy.result(), epoch_accuracy_val, a, b, c ]
        arrResult = "{:03d}, {:02.6%}, {:02.6%}, {:.4%}, {:.4%}, {:.4%} ".format(
            epoch + 1, epoch_accuracy.result(), epoch_accuracy_val.result(),
            prec_epch, rec_epch, f1_epch)
        out = open('output_summary.csv', 'a+')
        out.write(arrResult + '\n')
        #writef = csv.writer(out, delimiter=' ')
        #writef.writerow(arrResult)

        #        print("Epoch {:03d}/{:03d}: Train Loss: {:9.7f}, Train Accuracy: {:02.6%}, Validation Loss: {:9.7f}, "
        #              "Validation Accuracy: {:02.6%}, lr={:.9f}, unsupervised weight={:5.3f}, beta1={:.9f}".format(epoch+1,
        #                                                                                                           epochs,
        #                                                                                                           epoch_loss_avg.result(),
        #                                                                                                           epoch_accuracy.result(),
        #                                                                                                           epoch_loss_avg_val.result(),
        #                                                                                                           epoch_accuracy_val.result(),
        #                                                                                                           learning_rate.numpy(),
        #                                                                                                           unsupervised_weight,
        #                                                                                                           beta_1.numpy()))
        print(
            "Epoch {:03d}/{:03d}: Train Loss: {:9.7f}, Train Accuracy: {:02.6%}, lr={:.9f}, unsupervised weight={:5.3f}, beta1={:.9f}"
            .format(epoch + 1, epochs, epoch_loss_avg.result(),
                    epoch_accuracy.result(), learning_rate.numpy(),
                    unsupervised_weight, beta_1.numpy()))
        print(epoch_accuracy_val)
        #print (epoch_accuracy.result())
        # If the accuracy of validation improves save a checkpoint Best 85%
        if best_val_accuracy < epoch_accuracy.result():
            best_val_accuracy = epoch_accuracy.result()
            checkpoint = tfe.Checkpoint(optimizer=optimizer,
                                        model=model,
                                        optimizer_step=global_step)
            checkpoint.save(file_prefix=checkpoint_directory)

        # Record summaries
        #with tf.contrib.summary.record_summaries_every_n_global_steps(1):
        #    tf.contrib.summary.scalar('Train Loss', epoch_loss_avg.result())
        #    tf.contrib.summary.scalar(
        #        'Train Accuracy', epoch_accuracy.result())
        #    tf.contrib.summary.scalar(
        #        'Validation Loss', epoch_loss_avg_val.result())
        #    tf.contrib.summary.scalar(
        #        'Validation Accuracy', epoch_accuracy_val.result())
        #    tf.contrib.summary.scalar(
        #        'Unsupervised Weight', unsupervised_weight)
        #    tf.contrib.summary.scalar('Learning Rate', learning_rate.numpy())
        #    tf.contrib.summary.scalar('Ramp Up Function', rampup_value)
        #    tf.contrib.summary.scalar('Ramp Down Function', rampdown_value)

    #print('\nTrain Ended! Best Validation accuracy = {}\n'.format(best_val_accuracy))
    #sys.exit()
    # Load the best model
    root = tfe.Checkpoint(optimizer=optimizer,
                          model=model,
                          optimizer_step=tf.train.get_or_create_global_step())
    root.restore(tf.train.latest_checkpoint(checkpoint_directory))

    # Evaluate on the final test set
    #num_test_batches = NUM_TEST_SAMPLES/batch_size
    test_accuracy = tfe.metrics.Accuracy()
    #recall_eval = tf.metrics.recall(y_test_predictions, y_test)
    #precision_eval = tf.metrics.precision(y_test_predictions, y_test)
    for test_batch in range(int(num_test_batches)):
        X_test, y_test = test_iterator.get_next()
        #print(y_test[0:20,1])

        y_test_predictions = model(X_test, training=False)
        test_accuracy(tf.argmax(y_test_predictions, 1), tf.argmax(y_test, 1))
        y_pred = tf.argmax(y_test_predictions, 1)
        y_true = tf.argmax(y_test, 1)
        y_pred = np.asarray(y_pred)
        y_true = np.asarray(y_true)
        #print(y_pred, y_true)
        a = sk.metrics.precision_score(y_true, y_pred)
        b = sk.metrics.recall_score(y_true, y_pred)
        c = sk.metrics.f1_score(y_true, y_pred)

    print("Precision", a)
    print("Recall", b)
    print("f1_score", c)
    #print ("confusion_matrix")
    #print (sk.metrics.confusion_matrix(y_true, y_pred))
    #fpr, tpr, tresholds = sk.metrics.roc_curve(y_true, y_pred)

    #precision_eval = tf.metrics.precision(y_test_predictions, y_test)
    #precision_eval = tf.contrib.metrics.precision_at_recall(tf.argmax(y_test_predictions, 1), tf.argmax(y_test, 1), 1)
    print(tf.argmax(y_test_predictions))
    print(tf.argmax(y_test))
    #f1_score(y_test_predictions, y_test, average='macro')
    print("Final Test Accuracy: {:.6%}".format(test_accuracy.result()))