def train(hyper_params):
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    # Log parameters to Comet.ml
    import os  # only needed for the environment-variable alternative below
    # The API key can be passed inline or read from an environment variable:
    exp = Experiment(
        api_key="<HIDDEN>",
        # or
        # api_key=os.environ.get("COMET_API_KEY"),
        project_name="prototype",
        workspace="jaimemarijke")
    exp.log_parameters(hyper_params)
    exp.log_dataset_hash(mnist)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        exp.set_model_graph(sess.graph)

        for i in range(hyper_params["steps"]):
            batch = mnist.train.next_batch(hyper_params["batch_size"])
            exp.set_step(i)
            # Compute train accuracy every 10 steps
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0],
                    y_: batch[1]
                })
                print('step %d, training accuracy %g' % (i, train_accuracy))
                exp.log_metric("acc", train_accuracy)

            # Update weights (back propagation). train_step.run() returns None,
            # so fetch the loss tensor alongside the op in a single sess.run call.
            _, loss = sess.run([train_step, cross_entropy],
                               feed_dict={x: batch[0], y_: batch[1]})
            exp.log_metric("loss", loss)

        ### Finished Training ###

        # Compute test accuracy
        acc = accuracy.eval(feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels
        })

        print('test accuracy %g' % acc)
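The helpers get_data() and build_model_graph() are not shown in this example. A minimal sketch of what they might look like, assuming the TF 1.x API used above and a plain softmax-regression MNIST model (the architecture and the "learning_rate" key are illustrative assumptions, not the original author's code):

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data


def get_data():
    # Downloads MNIST on first use; one-hot labels match the y_ placeholder
    return input_data.read_data_sets("MNIST_data/", one_hot=True)


def build_model_graph(hyper_params):
    x = tf.placeholder(tf.float32, [None, 784])  # flattened 28x28 images
    y_ = tf.placeholder(tf.float32, [None, 10])  # one-hot labels
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, W) + b  # logits
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    train_step = tf.train.GradientDescentOptimizer(
        hyper_params["learning_rate"]).minimize(cross_entropy)
    correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    return train_step, cross_entropy, accuracy, x, y, y_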

Example #2
def train(hyper_params):
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    experiment = Experiment(project_name="tf")
    experiment.log_parameters(hyper_params)
    experiment.log_dataset_hash(mnist)

    with tf.Session() as sess:
        with experiment.train():
            sess.run(tf.global_variables_initializer())
            experiment.set_model_graph(sess.graph)

            for i in range(hyper_params["steps"]):
                batch = mnist.train.next_batch(hyper_params["batch_size"])
                experiment.set_step(i)
                # Compute train accuracy every 10 steps
                if i % 10 == 0:
                    train_accuracy = accuracy.eval(feed_dict={
                        x: batch[0],
                        y_: batch[1]
                    })
                    print('step %d, training accuracy %g' %
                          (i, train_accuracy))
                    experiment.log_metric("accuracy", train_accuracy, step=i)

                # Update weights (back propagation)
                _, loss_val = sess.run([train_step, cross_entropy],
                                       feed_dict={
                                           x: batch[0],
                                           y_: batch[1]
                                       })

                experiment.log_metric("loss", loss_val, step=i)

        ### Finished Training ###

        with experiment.test():
            # Compute test accuracy
            acc = accuracy.eval(feed_dict={
                x: mnist.test.images,
                y_: mnist.test.labels
            })
            experiment.log_metric("accuracy", acc)
            print('test accuracy %g' % acc)
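Unlike the first example, this variant wraps the loops in experiment.train() and experiment.test(). These Comet context managers prefix metrics logged inside them, so the two log_metric("accuracy", ...) calls are recorded as separate train_accuracy and test_accuracy series instead of overwriting one another:

with experiment.train():
    experiment.log_metric("accuracy", 0.9)  # recorded as train_accuracy
with experiment.test():
    experiment.log_metric("accuracy", 0.8)  # recorded as test_accuracy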

Example #3
def train(hyper_params):
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(
        hyper_params)

    # Log parameters to Comet.ml
    exp = Experiment(api_key="YOUR-API-KEY",
                     project_name='tensorflow examples')
    exp.log_multiple_params(hyper_params)
    exp.log_dataset_hash(mnist)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        exp.set_model_graph(sess.graph)

        for i in range(hyper_params["steps"]):
            batch = mnist.train.next_batch(hyper_params["batch_size"])
            exp.set_step(i)
            # Compute train accuracy every 10 steps
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0],
                    y_: batch[1]
                })
                print('step %d, training accuracy %g' % (i, train_accuracy))
                exp.log_metric("acc", train_accuracy)

            # Update weights (back propagation). train_step.run() returns None,
            # so fetch the loss tensor alongside the op in a single sess.run call.
            _, loss = sess.run([train_step, cross_entropy],
                               feed_dict={x: batch[0], y_: batch[1]})
            exp.log_metric("loss", loss)

        ### Finished Training ###

        # Compute test accuracy
        acc = accuracy.eval(feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels
        })

        print('test accuracy %g' % acc)
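Note that this example calls exp.log_multiple_params(), the name this method had in early comet_ml releases; newer SDKs expose the same functionality as log_parameters(), used in the first two examples. The same renaming applies to the log_multiple_metrics()/log_metrics() pair seen in later examples.
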
Example #4
def run_main_loop(args, train_estimator, predict_estimator):
	total_steps = 0
	train_steps = math.ceil(args.train_examples / args._batch_size)
	eval_steps  = math.ceil(args.eval_examples  / args._batch_size)

	if args.use_comet:
		experiment = Experiment(api_key=comet_ml_api_key, project_name=comet_ml_project, workspace=comet_ml_workspace)
		experiment.log_parameters(vars(args))
		experiment.add_tags(args.tag)
		experiment.set_name(model_name(args))
	else:
		experiment = None

	prefetch_inception_model()

	with tf.gfile.Open(os.path.join(suffixed_folder(args, args.result_dir), "eval.txt"), "a") as eval_file:
		for epoch in range(0, args.epochs, args.predict_every):

			logger.info(f"Training epoch {epoch}")
			train_estimator.train(input_fn=train_input_fn, steps=train_steps * args.predict_every)
			total_steps += train_steps * args.predict_every

			if args.use_comet:
				experiment.set_step(epoch)

			# logger.info(f"Evaluate {epoch}")
			# evaluation = predict_estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
			# logger.info(evaluation)
			# save_evaluation(args, eval_file, evaluation, epoch, total_steps)
			
			# if args.use_comet:
			# 	experiment.log_metrics(evaluation)
			
			logger.info(f"Generate predictions {epoch}")
			predictions = predict_estimator.predict(input_fn=predict_input_fn)
			
			logger.info(f"Save predictions")
			save_predictions(args, suffixed_folder(args, args.result_dir), eval_file, predictions, epoch, total_steps, experiment)

	logger.info(f"Completed {args.epochs} epochs")
Example #5
            parameters['launch_epoch'] = epoch
            disable_flag = 1
            sample_count = len(train_batched)

    else:
        if save:
            torch.save(model.state_dict(), model_name)
            best_idx = epoch

    best_test_F, new_test_F, _ = evaluating_batch(model, test_batched,
                                                  best_test_F)

    all_F.append([0.0, new_dev_F, new_test_F])

    sys.stdout.flush()
    print('Epoch %d : train/dev/test : %.2f / %.2f / %.2f - %d' %
          (epoch, new_train_F, new_dev_F, new_test_F, best_idx))
    model.train(True)
    adjust_learning_rate(optimizer,
                         lr=learning_rate /
                         (1 + 0.05 * sample_count / len(train_data)))

    metrics['new_train_F'] = new_train_F
    metrics['new_test_F'] = new_test_F
    metrics['new_dev_F'] = new_dev_F

    experiment.log_metrics(metrics)
    experiment.set_step(epoch + 1)

print(time.time() - t)
Example #6
criterion = nn.BCELoss()
# Establish convention for real and fake labels during training
real_label = 1
fake_label = 0

# Setup Adam optimizers for both G and D
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))

steps = 0

for epoch in range(num_epochs):
    experiment.log_current_epoch(epoch)
    for i, data in enumerate(dataloader, 0):
        experiment.set_step(steps)

        ############################
        # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
        ###########################
        ## Train with all-real batch
        netD.zero_grad()
        # Format batch
        real_cpu = data[0].to(device)
        b_size = real_cpu.size(0)
        label = torch.full((b_size,), real_label, device=device)
        # Forward pass real batch through D
        output = netD(real_cpu).view(-1)
        # Calculate loss on all-real batch
        errD_real = criterion(output, label)
        # Calculate gradients for D in backward pass
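The snippet ends midway through the discriminator update. For context, a standard DCGAN discriminator step (this snippet closely follows the official PyTorch DCGAN tutorial) would continue roughly as below; nz, the latent vector size, is assumed to be defined with the other hyperparameters:

        errD_real.backward()

        ## Train with all-fake batch
        noise = torch.randn(b_size, nz, 1, 1, device=device)
        fake = netG(noise)
        label.fill_(fake_label)
        # Classify the fake batch with D; detach() keeps gradients out of G
        output = netD(fake.detach()).view(-1)
        errD_fake = criterion(output, label)
        errD_fake.backward()
        errD = errD_real + errD_fake
        optimizerD.step()
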
Example #7
class CometMLMonitor(MonitorBase):
    """
    Send scalar data and the graph to https://www.comet.ml.

    Note:
        1. comet_ml requires you to `import comet_ml` before importing tensorflow or tensorpack.
        2. The "automatic output logging" feature of comet_ml will make the training progress bar appear to freeze.
           Therefore the feature is disabled by default.
    """
    def __init__(self, experiment=None, tags=None, **kwargs):
        """
        Args:
            experiment (comet_ml.Experiment): if provided, invalidate all other arguments
            tags (list[str]): experiment tags
            kwargs: arguments used to initialize :class:`comet_ml.Experiment`,
                such as project name, API key, etc.
                Refer to its documentation for details.
        """
        if experiment is not None:
            self._exp = experiment
            assert tags is None and len(kwargs) == 0
        else:
            from comet_ml import Experiment
            kwargs.setdefault(
                'log_code', True
            )  # though it's not functioning, git patch logging requires it
            kwargs.setdefault('auto_output_logging', None)
            self._exp = Experiment(**kwargs)
            if tags is not None:
                self._exp.add_tags(tags)

        self._exp.set_code("Code logging is impossible ...")
        self._exp.log_dependency('tensorpack', __git_version__)

    @property
    def experiment(self):
        """
        The :class:`comet_ml.Experiment` instance.
        """
        return self._exp

    def _before_train(self):
        self._exp.set_model_graph(tf.get_default_graph())

    @HIDE_DOC
    def process_scalar(self, name, val):
        self._exp.log_metric(name, val, step=self.global_step)

    @HIDE_DOC
    def process_image(self, name, val):
        self._exp.set_step(self.global_step)
        for idx, v in enumerate(val):
            log_name = "{}_step{}{}".format(
                name, self.global_step, "_" + str(idx) if len(val) > 1 else "")

            self._exp.log_image(v,
                                image_format="jpeg",
                                name=log_name,
                                image_minmax=(0, 255))

    def _after_train(self):
        self._exp.end()

    def _after_epoch(self):
        self._exp.log_epoch_end(self.epoch_num)
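A hedged usage sketch: in tensorpack, a monitor like this is passed to the trainer through TrainConfig. The model, dataflow, and project name below are illustrative placeholders, not part of the original code:

import comet_ml  # must come before tensorflow/tensorpack, per the docstring above
from tensorpack import TrainConfig, SimpleTrainer, launch_train_with_config

config = TrainConfig(
    model=model,    # a tensorpack ModelDesc, assumed defined elsewhere
    dataflow=df,    # a tensorpack DataFlow, assumed defined elsewhere
    monitors=[CometMLMonitor(project_name="my-project", tags=["baseline"])],
    max_epoch=10,
)
launch_train_with_config(config, SimpleTrainer())
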
Example #8
    saver = tf.train.Saver(max_to_keep=1)
    # try:
    #     saver.restore(session, "./model.ckpt")
    #     print('Restored model.')
    # except ValueError:
    #     print('Initialized.')

    for epoch in range(epochs):
        session.run(iterator.initializer)
        experiment.log_current_epoch(epoch)

        try:

            for step in itertools.count(start=0, step=1):
                steps_per_epoch = max(steps_per_epoch, step)
                experiment.set_step(steps_per_epoch * epoch + step)
                total_images_looked_at = (steps_per_epoch * epoch + step) * (model.batch_size // 2)

                current_resolution_schedule_period_length = (0.3+current_resolution*0.003)*60*60

                #if abs(last_schedule_update-total_images_looked_at) > 5000 and not schedule_finalized:
                #if abs(time.time()-last_schedule_update_time) > 60*60*1.428 and not schedule_finalized:
                if abs(time.time() - last_schedule_update_time) > current_resolution_schedule_period_length and not schedule_finalized:
                    if current_mode == 'train':
                        current_mode = 'stabilize'
                        try:
                            current_resolution = sizes[sizes.index(current_resolution)+1]
                        except IndexError:
                            current_resolution = sizes[-1]
                            current_mode = 'train'
                            schedule_finalized = True
Example #9
    def train(self):

        # comet_ml
        # Create an experiment
        experiment = Experiment(api_key="B6hzNydshIpZSG2Xi9BDG9gdG",
                                project_name="glow-mnist", workspace="voletiv")
        hparams_dict = self.hparams_dict()
        experiment.log_parameters(hparams_dict)

        # set to training state
        self.graph.train()
        self.global_step = self.loaded_step

        # begin to train
        for epoch in range(self.n_epoches):
            print("epoch", epoch)
            progress = tqdm(self.data_loader)
            for i_batch, batch in enumerate(progress):

                experiment.set_step(self.global_step)

                # update learning rate
                lr = self.lrschedule["func"](global_step=self.global_step,
                                             **self.lrschedule["args"])
                for param_group in self.optim.param_groups:
                    param_group['lr'] = lr
                self.optim.zero_grad()

                # log
                if self.global_step % self.scalar_log_gaps == 0:
                    # self.writer.add_scalar("lr/lr", lr, self.global_step)
                    experiment.log_metrics({"lr": lr, "epoch": epoch+i_batch/len(self.data_loader)})

                # get batch data
                for k in batch:
                    batch[k] = batch[k].to(self.data_device)
                x = batch["x"]
                y = None
                y_onehot = None
                if self.y_condition:
                    if self.y_criterion == "multi-classes":
                        assert "y_onehot" in batch, "multi-classes ask for `y_onehot` (torch.FloatTensor onehot)"
                        y_onehot = batch["y_onehot"]
                    elif self.y_criterion == "single-class":
                        assert "y" in batch, "single-class ask for `y` (torch.LongTensor indexes)"
                        y = batch["y"]
                        y_onehot = thops.onehot(y, num_classes=self.y_classes)

                # at first time, initialize ActNorm
                if self.global_step == 0:
                    self.graph(x[:self.batch_size // len(self.devices), ...],
                               y_onehot[:self.batch_size // len(self.devices), ...] if y_onehot is not None else None)

                # parallel
                if len(self.devices) > 1 and not hasattr(self.graph, "module"):
                    print("[Parallel] move to {}".format(self.devices))
                    self.graph = torch.nn.parallel.DataParallel(self.graph, self.devices, self.devices[0])

                # forward phase
                z, nll, y_logits = self.graph(x=x, y_onehot=y_onehot)

                # loss_generative
                loss_generative = Glow.loss_generative(nll)

                # loss_classes
                loss_classes = 0
                if self.y_condition:
                    loss_classes = (Glow.loss_multi_classes(y_logits, y_onehot)
                                    if self.y_criterion == "multi-classes" else
                                    Glow.loss_class(y_logits, y))

                # total loss
                loss = loss_generative + loss_classes * self.weight_y

                # log
                if self.global_step % self.scalar_log_gaps == 0:
                    # self.writer.add_scalar("loss/loss_generative", loss_generative, self.global_step)
                    experiment.log_metrics({"loss_generative": loss_generative})
                    if self.y_condition:
                        # self.writer.add_scalar("loss/loss_classes", loss_classes, self.global_step)
                        experiment.log_metrics({"loss_classes": loss_classes, "total_loss": loss})

                # backward
                self.graph.zero_grad()
                self.optim.zero_grad()
                loss.backward()

                # operate grad
                if self.max_grad_clip is not None and self.max_grad_clip > 0:
                    torch.nn.utils.clip_grad_value_(self.graph.parameters(), self.max_grad_clip)
                if self.max_grad_norm is not None and self.max_grad_norm > 0:
                    grad_norm = torch.nn.utils.clip_grad_norm_(self.graph.parameters(), self.max_grad_norm)
                    if self.global_step % self.scalar_log_gaps == 0:
                        # self.writer.add_scalar("grad_norm/grad_norm", grad_norm, self.global_step)
                        experiment.log_metrics({"grad_norm": grad_norm})

                # step
                self.optim.step()

                # checkpoints
                if self.global_step % self.checkpoints_gap == 0 and self.global_step > 0:
                    save(global_step=self.global_step,
                         graph=self.graph,
                         optim=self.optim,
                         pkg_dir=self.checkpoints_dir,
                         is_best=True,
                         max_checkpoints=self.max_checkpoints)

                # plot images
                if self.global_step % self.plot_gaps == 0:
                    img = self.graph(z=z, y_onehot=y_onehot, reverse=True)
                    # img = torch.clamp(img, min=0, max=1.0)

                    if self.y_condition:
                        if self.y_criterion == "multi-classes":
                            y_pred = torch.sigmoid(y_logits)
                        elif self.y_criterion == "single-class":
                            y_pred = thops.onehot(torch.argmax(F.softmax(y_logits, dim=1), dim=1, keepdim=True),
                                                  self.y_classes)
                        y_true = y_onehot

                    # plot images
                    # self.writer.add_image("0_reverse/{}".format(bi), torch.cat((img[bi], batch["x"][bi]), dim=1), self.global_step)
                    vutils.save_image(torch.stack([torch.cat((img[bi], batch["x"][bi]), dim=1) for bi in range(min([len(img), self.n_image_samples]))]), '/tmp/vikramvoleti_rev.png', nrow=10)
                    experiment.log_image('/tmp/vikramvoleti_rev.png', file_name="0_reverse")

                    # plot preds
                    # for bi in range(min([len(img), self.n_image_samples])):
                    #     # wandb.log({"0_reverse_{}".format(bi): [wandb.Image(torch.cat((img[bi], batch["x"][bi]), dim=1), caption="0_reverse/{}".format(bi))]}, step=self.global_step)
                    #     if self.y_condition:
                    #         # self.writer.add_image("1_prob/{}".format(bi), plot_prob([y_pred[bi], y_true[bi]], ["pred", "true"]), self.global_step)
                    #         wandb.log({"1_prob_{}".format(bi): [wandb.Image(plot_prob([y_pred[bi], y_true[bi]], ["pred", "true"]))]}, step=self.global_step)

                # inference
                if hasattr(self, "inference_gap"):
                    if self.global_step % self.inference_gap == 0:
                        try:
                            img = self.graph(z=None, y_onehot=inference_y_onehot, eps_std=0.5, reverse=True)
                        except NameError:
                            inference_y_onehot = torch.zeros_like(y_onehot, device=torch.device('cpu'))
                            for i in range(inference_y_onehot.size(0)):
                                inference_y_onehot[i, (i % inference_y_onehot.size(1))] = 1.
                            # now
                            inference_y_onehot = inference_y_onehot.to(y_onehot.device)
                            img = self.graph(z=None, y_onehot=inference_y_onehot, eps_std=0.5, reverse=True)
                        # grid
                        vutils.save_image(img[:min([len(img), self.n_image_samples])], '/tmp/vikramvoleti_sam.png', nrow=10)
                        experiment.log_image('/tmp/vikramvoleti_sam.png', file_name="1_samples")
                        # img = torch.clamp(img, min=0, max=1.0)
                        # for bi in range(min([len(img), n_images])):
                        #     # self.writer.add_image("2_sample/{}".format(bi), img[bi], self.global_step)
                        #     wandb.log({"2_sample_{}".format(bi): [wandb.Image(img[bi])]}, step=self.global_step)

                if self.global_step == 0:
                    subprocess.run('nvidia-smi')

                # global step
                self.global_step += 1
Example #10
        labels = Variable(labels)

        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = rnn(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Compute train accuracy
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels.data).sum().item()

        # Log to Comet.ml
        experiment.set_step(i)
        experiment.log_metric("loss", loss.data[0])
        experiment.log_metric("accuracy", correct / total)

        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' %
                  (epoch + 1, hyper_params['num_epochs'], i + 1,
                   len(train_dataset) // hyper_params['batch_size'],
                   loss.item()))

# Test the Model
correct = 0
total = 0
for images, labels in test_loader:
    images = Variable(
        images.view(-1, hyper_params['sequence_length'],
Example #11
class Reptile(Task):
    """
    A meta-learning task that teaches an agent over a set of other tasks
    """
    def __init__(self,
                 data_handler,
                 load_key=None,
                 sender=True,
                 receiver=True,
                 image_captioner=True,
                 image_selector=False,
                 track_results=True):
        self.sess = Agent.sess
        self.N = 1  # number of steps taken for each task - should be > 1

        self.S = SenderAgent()
        self.R = ReceiverAgent(*self.S.get_output())
        self.IC = ImageCaptioner()
        # self.IS = ImageSelector()

        self.S.all_agents_initialized(load_key)
        self.R.all_agents_initialized(load_key)

        self.train_metrics = {}
        self.val_metrics = {}
        self.experiment = Experiment(api_key='1jl4lQOnJsVdZR6oekS6WO5FI',
                                     project_name='Reptile',
                                     auto_param_logging=False,
                                     auto_metric_logging=False,
                                     disabled=(not track_results))

        self.params = {}
        self.params.update(Agent.get_params())
        self.params.update(data_handler.get_params())
        self.experiment.log_parameters(self.params)

        self.T = {}
        if image_captioner:
            self.ic = ImageCaptioning(self.IC,
                                      experiment=self.experiment,
                                      track_results=False)
            self.T["Image Captioner"] = lambda img, capts: self.ic.train_batch(
                (img, capts), mode="train")
        if image_selector:
            self.is_ = ImageSelection(self.IS,
                                      experiment=self.experiment,
                                      track_results=False)
            self.T["Image Selector"] = lambda img, capts: self.is_.train_batch(
                (img, capts), mode="train")
        if sender or receiver:
            self.rg = ReferentialGame(self.S,
                                      self.R,
                                      experiment=self.experiment,
                                      track_results=False)
            if receiver:
                self.T["Receiver"] = lambda img, capts: self.rg.train_batch(
                    img, mode="receiver_train")
            if sender:
                self.T["Sender"] = lambda img, capts: self.rg.train_batch(
                    img, mode="sender_train")

        # Initialize TF
        variables_to_initialize = tf.global_variables()
        if load_key is not None:
            dont_initialize = []
            if SenderAgent.loaded:
                dont_initialize += SenderAgent.get_all_weights()
            if ReceiverAgent.loaded:
                dont_initialize += ReceiverAgent.get_all_weights()
            if ImageCaptioner.loaded:
                dont_initialize += ImageCaptioner.get_all_weights()
            variables_to_initialize = [
                v for v in tf.global_variables() if v not in dont_initialize
            ]
            # REMOVE LATER
            #variables_to_initialize += ImageCaptioner.optimizer.variables()
        Agent.sess.run(tf.variables_initializer(variables_to_initialize))

        self.sender_shared_state = VariableState(
            self.sess, SenderAgent.get_shared_weights())
        self.receiver_shared_state = VariableState(
            self.sess, ReceiverAgent.get_shared_weights())
        self.sender_own_state = VariableState(self.sess,
                                              SenderAgent.get_weights())
        self.receiver_own_state = VariableState(self.sess,
                                                ReceiverAgent.get_weights())

        # print(SenderAgent.get_shared_weights())
        # print(ReceiverAgent.get_shared_weights())
        # print(SenderAgent.get_weights())
        # print(ReceiverAgent.get_weights())
        # print(tf.trainable_variables())

        self.shared_states = {
            "shared_sender": self.sender_shared_state,
            "shared_receiver": self.receiver_shared_state
        }
        self.own_states = {
            "own_sender": self.sender_own_state,
            "own_receiver": self.receiver_own_state
        }

        shared_average = []
        for k, v in self.shared_states.items():
            shared_average.append(v.export_variables())

        shared_average = np.mean(shared_average, axis=0)
        self.set_weights(new_shared_weights=shared_average)

        self.dh = data_handler
        with open(
                "{}/data/csv_loss_{}.csv".format(project_path,
                                                 self.experiment.get_key()),
                'w+') as csv_loss_file:
            csv_loss_file.write(
                "Image Captioner Loss,Image Selector Loss,Sender Loss,Receiver Loss\n"
            )
        with open(
                "{}/data/csv_accuracy_{}.csv".format(
                    project_path, self.experiment.get_key()),
                'w+') as csv_acc_file:
            csv_acc_file.write(
                "Image Captioner Accuracy,Image Selector Accuracy,Sender Accuracy,Receiver Accuracy\n"
            )

        self.step = 0

    def get_diff(self, a, b):
        diff = 0.
        if isinstance(a, (np.ndarray, np.generic)):
            return np.sum(np.abs(a - b))

        elif isinstance(a, list):
            for i in range(len(a)):
                diff += self.get_diff(a[i], b[i])

        elif isinstance(a, dict):
            for k in a:
                diff += self.get_diff(a[k], b[k])

        return diff

    def set_weights(self, new_own_weights=None, new_shared_weights=None):
        if new_own_weights is not None:
            for k, s in self.own_states.items():
                s.import_variables(new_own_weights[k])
        if new_shared_weights is not None:
            for k, s in self.shared_states.items():
                s.import_variables(new_shared_weights)

    def train_epoch(self, e, mode=None):
        self.dh.set_params(distractors=0)
        image_gen = self.dh.get_images(return_captions=True, mode="train")
        # Get current variables
        start_vars = {
            k: s.export_variables()
            for k, s in self.own_states.items()
        }
        start_vars["shared"] = self.shared_states[
            "shared_sender"].export_variables()

        while True:
            try:

                # Save current variables
                old_own = {
                    k: s.export_variables()
                    for k, s in self.own_states.items()
                }
                new_own = {k: [] for k, s in self.own_states.items()}
                old_shared = self.shared_states[
                    "shared_sender"].export_variables()
                new_shared = []

                # For each task
                for task in ["Image Captioner", "Sender", "Receiver"]:
                    # parameter setup to not waste data
                    if task in ["Sender", "Receiver", "Image Selector"]:
                        self.dh.set_params(distractors=Agent.D)
                    else:
                        self.dh.set_params(distractors=0)
                    # Run task n times
                    for _ in range(self.N):
                        images, captions = next(image_gen)
                        acc, loss = self.T[task](images, captions)
                    self.train_metrics[task + " Accuracy"] = acc
                    self.train_metrics[task + " Loss"] = loss

                    # Store new variables
                    [
                        new_own[k].append(s.export_variables())
                        for k, s in self.own_states.items()
                    ]
                    [
                        new_shared.append(s.export_variables())
                        for k, s in self.shared_states.items()
                    ]

                    # Reset to old variables for next task
                    [
                        s.import_variables(old_own[k])
                        for k, s in self.own_states.items()
                    ]
                    [
                        s.import_variables(old_shared)
                        for k, s in self.shared_states.items()
                    ]

                self.step += 1
                self.experiment.set_step(self.step)
                self.experiment.log_metrics(self.train_metrics)
                # Average new variables
                new_own = {
                    k: interpolate_vars(old_own[k], average_vars(new_own[k]),
                                        0.2)
                    for k, s in self.own_states.items()
                }
                new_shared = interpolate_vars(old_shared,
                                              average_vars(new_shared), 0.2)
                # Set variables to new variables
                self.set_weights(new_own_weights=new_own,
                                 new_shared_weights=new_shared)

            except StopIteration:
                break

        # Get change in weights
        end_vars = {
            k: s.export_variables()
            for k, s in self.own_states.items()
        }
        end_vars["shared"] = self.shared_states[
            "shared_sender"].export_variables()
        weight_diff = self.get_diff(start_vars, end_vars)

        #self.experiment.set_step(e)
        self.val_metrics["Weight Change"] = weight_diff
        self.experiment.log_metrics(self.val_metrics)

        # Log data to a csv
        with open("{}/data/csv_loss_{}.csv".format(project_path, self.experiment.get_key()), 'a') as csv_loss_file, \
             open("{}/data/csv_accuracy_{}.csv".format(project_path, self.experiment.get_key()), 'a') as csv_acc_file:
            losses = []
            accs = []
            for task in ["Image Captioner", "Sender", "Receiver"]:
                losses.append(str(self.train_metrics[task + " Loss"]))
                accs.append(str(self.train_metrics[task + " Accuracy"]))

            csv_loss_file.write(",".join(losses))
            csv_loss_file.write("\n")

            csv_acc_file.write(",".join(accs))
            csv_acc_file.write("\n")

        return 0, weight_diff
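The meta-update above relies on average_vars() and interpolate_vars(), which are not shown. Minimal sketches consistent with how they are called (each weight moves a fraction epsilon = 0.2 of the way from its old value toward the task-averaged new value, i.e. the Reptile meta-step):

import numpy as np


def average_vars(var_lists):
    # Element-wise mean over a list of lists of weight arrays
    return [np.mean(vs, axis=0) for vs in zip(*var_lists)]


def interpolate_vars(old_vars, new_vars, epsilon):
    # Reptile meta-step: old + epsilon * (new - old)
    return [o + (n - o) * epsilon for o, n in zip(old_vars, new_vars)]
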
Example #12
def main(_):
    experiment = Experiment(api_key="xXtJguCo8yFdU7dpjEpo6YbHw",
                            project_name=args.experiment_name)
    hyper_params = {
        "learning_rate": args.lr,
        "num_epochs": args.max_epoch,
        "batch_size": args.single_batch_size,
        "alpha": args.alpha,
        "beta": args.beta,
        "gamma": args.gamma,
        "loss": args.loss
    }
    experiment.log_multiple_params(hyper_params)

    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
            log_device_placement=False,
        )
        with tf.Session(config=config) as sess:
            # sess=tf_debug.LocalCLIDebugWrapperSession(sess,ui_type='readline')
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          gamma=args.gamma,
                          loss_type=args.loss,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))
            # param init/restore
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess,
                                    tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False

            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            experiment.set_model_graph(sess.graph)

            # training
            with experiment.train():
                for epoch in range(start_epoch, args.max_epoch):
                    counter = 0
                    batch_time = time.time()
                    experiment.log_current_epoch(epoch)

                    for batch in iterate_data(
                            train_dir,
                            shuffle=True,
                            aug=True,
                            is_testset=False,
                            batch_size=args.single_batch_size *
                            cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT):

                        counter += 1
                        global_counter += 1
                        experiment.set_step(global_counter)
                        if counter % summary_interval == 0:
                            is_summary = True
                        else:
                            is_summary = False
                        epochs = args.max_epoch
                        start_time = time.time()
                        ret = model.train_step(sess,
                                               batch,
                                               train=True,
                                               summary=is_summary)
                        forward_time = time.time() - start_time
                        batch_time = time.time() - batch_time
                        param = ret
                        params = {
                            "loss": param[0],
                            "cls_loss": param[1],
                            "cls_pos_loss": param[2],
                            "cls_neg_loss": param[3]
                        }
                        experiment.log_multiple_metrics(params)
                        # print(ret)
                        print(
                            'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                            .format(counter, epoch, epochs, ret[0], ret[1],
                                    ret[2], ret[3], forward_time, batch_time))
                        # with open('log/train.txt', 'a') as f:
                        # f.write( 'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'.format(counter,epoch, epochs, ret[0], ret[1], ret[2], ret[3], forward_time, batch_time))

                        #print(counter, summary_interval, counter % summary_interval)
                        if counter % summary_interval == 0:
                            print("summary_interval now")
                            summary_writer.add_summary(ret[-1], global_counter)

                        #print(counter, summary_val_interval, counter % summary_val_interval)
                        if counter % summary_val_interval == 0:
                            print("summary_val_interval now")
                            batch = sample_test_data(
                                val_dir,
                                args.single_batch_size * cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT)

                            ret = model.validate_step(sess,
                                                      batch,
                                                      summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)

                            try:
                                ret = model.predict_step(sess,
                                                         batch,
                                                         summary=True)
                                summary_writer.add_summary(
                                    ret[-1], global_counter)
                            except Exception:
                                print("prediction skipped due to error")

                        if check_if_should_pause(args.tag):
                            model.saver.save(sess,
                                             os.path.join(
                                                 save_model_dir, 'checkpoint'),
                                             global_step=model.global_step)
                            print('pause and save model @ {} steps:{}'.format(
                                save_model_dir, model.global_step.eval()))
                            sys.exit(0)

                        batch_time = time.time()
                    experiment.log_epoch_end(epoch)
                    sess.run(model.epoch_add_op)

                    model.saver.save(sess,
                                     os.path.join(save_model_dir,
                                                  'checkpoint'),
                                     global_step=model.global_step)

                    # dump test data every 10 epochs
                    if (epoch + 1) % 10 == 0:
                        # create output folder
                        os.makedirs(os.path.join(args.output_path, str(epoch)),
                                    exist_ok=True)
                        os.makedirs(os.path.join(args.output_path, str(epoch),
                                                 'data'),
                                    exist_ok=True)
                        if args.vis:
                            os.makedirs(os.path.join(args.output_path,
                                                     str(epoch), 'vis'),
                                        exist_ok=True)

                        for batch in iterate_data(
                                val_dir,
                                shuffle=False,
                                aug=False,
                                is_testset=False,
                                batch_size=args.single_batch_size *
                                cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT):

                            if args.vis:
                                tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                    sess, batch, summary=False, vis=True)
                            else:
                                tags, results = model.predict_step(
                                    sess, batch, summary=False, vis=False)

                            for tag, result in zip(tags, results):
                                of_path = os.path.join(args.output_path,
                                                       str(epoch), 'data',
                                                       tag + '.txt')
                                with open(of_path, 'w+') as f:
                                    labels = box3d_to_label(
                                        [result[:, 1:8]], [result[:, 0]],
                                        [result[:, -1]],
                                        coordinate='lidar')[0]
                                    for line in labels:
                                        f.write(line)
                                    print('write out {} objects to {}'.format(
                                        len(labels), tag))
                            # dump visualizations
                            if args.vis:
                                for tag, front_image, bird_view, heatmap in zip(
                                        tags, front_images, bird_views,
                                        heatmaps):
                                    front_img_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_front.jpg')
                                    bird_view_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_bv.jpg')
                                    heatmap_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_heatmap.jpg')
                                    cv2.imwrite(front_img_path, front_image)
                                    cv2.imwrite(bird_view_path, bird_view)
                                    cv2.imwrite(heatmap_path, heatmap)

                        # execute evaluation code
                        cmd_1 = "./kitti_eval/launch_test.sh"
                        cmd_2 = os.path.join(args.output_path, str(epoch))
                        cmd_3 = os.path.join(args.output_path, str(epoch),
                                             'log')
                        os.system(" ".join([cmd_1, cmd_2, cmd_3]))

            print('train done. total epoch:{} iter:{}'.format(
                epoch, model.global_step.eval()))

            # finally, save the model
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
Example #13
def main():
    args = parse_args()
    if args is None:
        exit()

    setup_logging(args)

    gan = BigGAN_128(args)

    if args.use_tpu:
        cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu=args.tpu_name, zone=args.tpu_zone)
        master = cluster_resolver.get_master()
    else:
        master = ''

    tpu_run_config = tf.contrib.tpu.RunConfig(
        master=master,
        evaluation_master=master,
        model_dir=model_dir(args),
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=tf.contrib.tpu.TPUConfig(args.steps_per_loop,
                                            args.num_shards),
    )

    tpu_estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=lambda features, labels, mode, params: gan.tpu_model_fn(
            features, labels, mode, params),
        config=tpu_run_config,
        use_tpu=args.use_tpu,
        train_batch_size=args._batch_size,
        eval_batch_size=args._batch_size,
        predict_batch_size=args._batch_size,
        params=vars(args),
    )

    total_steps = 0

    if args.use_comet:
        experiment = Experiment(api_key="bRptcjkrwOuba29GcyiNaGDbj",
                                project_name="BigGAN",
                                workspace="davidhughhenrymack")
        experiment.log_parameters(vars(args))
        experiment.add_tags(args.tag)
        experiment.set_name(model_name(args))
    else:
        experiment = None

    prefetch_inception_model()

    with tf.gfile.Open(
            os.path.join(suffixed_folder(args, args.result_dir), "eval.txt"),
            "a") as eval_file:
        for epoch in range(args.epochs):
            logger.info(f"Training epoch {epoch}")
            tpu_estimator.train(input_fn=train_input_fn,
                                steps=args.train_steps)
            total_steps += args.train_steps

            logger.info(f"Evaluate {epoch}")
            evaluation = tpu_estimator.evaluate(input_fn=eval_input_fn,
                                                steps=args.eval_steps)

            if args.use_comet:
                experiment.set_step(total_steps)
                experiment.log_metrics(evaluation)

            logger.info(evaluation)
            save_evaluation(args, eval_file, evaluation, epoch, total_steps)

            logger.info(f"Generate predictions {epoch}")
            predictions = tpu_estimator.predict(input_fn=predict_input_fn)

            logger.info(f"Save predictions")
            save_predictions(args, suffixed_folder(args,
                                                   args.result_dir), eval_file,
                             predictions, epoch, total_steps, experiment)
Example #14
def run_HAC(FLAGS, env, agent):
    experiment = Experiment(api_key="M03EcOc9o9kiG95hws4mq1uqI",
                            project_name="HAC",
                            workspace="antonwiehe")

    # Print task summary
    print_summary(FLAGS, env)

    # Determine training mode.  If not testing and not solely training, interleave training and testing to track progress
    mix_train_test = False
    if not FLAGS.test and not FLAGS.train_only:
        mix_train_test = True

    for batch in range(NUM_BATCH):

        num_episodes = agent.other_params["num_exploration_episodes"]

        # Evaluate policy every TEST_FREQ batches if interleaving training and testing
        if mix_train_test and batch % TEST_FREQ == 0:
            print("\n--- TESTING ---")
            agent.FLAGS.test = True
            num_episodes = num_test_episodes

            # Reset successful episode counter
            successful_episodes = 0

        for episode in range(num_episodes):

            print("\nBatch %d, Episode %d" % (batch, episode))

            # Train for an episode
            success = agent.train(env, episode)

            if success:
                print("Batch %d, Episode %d End Goal Achieved\n" %
                      (batch, episode))

                # Increment successful episode counter if applicable
                if mix_train_test and batch % TEST_FREQ == 0:
                    successful_episodes += 1

        # Save agent
        agent.save_model(episode)

        # Finish evaluating policy if tested prior batch
        if mix_train_test and batch % TEST_FREQ == 0:

            # Log performance
            success_rate = successful_episodes / num_test_episodes * 100
            print("\nTesting Success Rate %.2f%%" % success_rate)
            agent.log_performance(success_rate)
            agent.FLAGS.test = False

            experiment.set_step(batch)
            experiment.log_metric("Success rate", success_rate)
            success_list.append(success_rate)
            with open("successRates.csv", 'w', newline='') as myfile:
                wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
                wr.writerow(success_list)

            if success_rate > 95:
                print("Success rate over 95\%!")
                break

            print("\n--- END TESTING ---\n")
Example #15
def train():
    """Train SqueezeSeg model"""
    assert FLAGS.dataset == 'KITTI', \
        'Currently only support KITTI dataset'

    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    with tf.Graph().as_default():

        assert FLAGS.net == 'squeezeSeg', \
            'Selected neural net architecture not supported: {}'.format(FLAGS.net)

        if FLAGS.net == 'squeezeSeg':
            mc = kitti_squeezeSeg_config()
            mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path
            model = SqueezeSeg(mc)

        imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc)

        # save model size, flops, activations by layers
        with open(os.path.join(FLAGS.train_dir, 'model_metrics.txt'),
                  'w') as f:
            f.write('Number of parameter by layer:\n')
            count = 0
            for c in model.model_size_counter:
                f.write('\t{}: {}\n'.format(c[0], c[1]))
                count += c[1]
            f.write('\ttotal: {}\n'.format(count))

            count = 0
            f.write('\nActivation size by layer:\n')
            for c in model.activation_counter:
                f.write('\t{}: {}\n'.format(c[0], c[1]))
                count += c[1]
            f.write('\ttotal: {}\n'.format(count))

            count = 0
            f.write('\nNumber of flops by layer:\n')
            for c in model.flop_counter:
                f.write('\t{}: {}\n'.format(c[0], c[1]))
                count += c[1]
            f.write('\ttotal: {}\n'.format(count))
        print('Model statistics saved to {}.'.format(
            os.path.join(FLAGS.train_dir, 'model_metrics.txt')))

        def enqueue(sess, coord):
            with coord.stop_on_exception():
                while not coord.should_stop():
                    # read batch input
                    lidar_per_batch, lidar_mask_per_batch, label_per_batch,\
                        weight_per_batch = imdb.read_batch()

                    feed_dict = {
                        model.ph_keep_prob: mc.KEEP_PROB,
                        model.ph_lidar_input: lidar_per_batch,
                        model.ph_lidar_mask: lidar_mask_per_batch,
                        model.ph_label: label_per_batch,
                        model.ph_loss_weight: weight_per_batch,
                    }

                    sess.run(model.enqueue_op, feed_dict=feed_dict)

        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(init)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        coord = tf.train.Coordinator()
        enq_threads = []
        for _ in range(mc.NUM_ENQUEUE_THREAD):
            eqth = threading.Thread(target=enqueue, args=[sess, coord])
            eqth.start()
            enq_threads.append(eqth)

        run_options = tf.RunOptions(timeout_in_ms=60000)

        # Create an experiment with your api key
        experiment = Experiment(api_key="lISr0JWgyUIsox8HYPC3isnTP",
                                project_name="squeezeSeg_1080ti",
                                workspace="asimonov")
        hyper_params = {
            "learning_rate": mc.LEARNING_RATE,
            "steps": FLAGS.max_steps,
            "batch_size": mc.BATCH_SIZE
        }
        experiment.log_multiple_params(hyper_params)
        # some_param = "some value"
        # experiment.log_parameter("param name", some_param)

        try:
            for step in range(FLAGS.max_steps):
                start_time = time.time()

                experiment.set_step(step)

                if step % FLAGS.summary_step == 0 or step == FLAGS.max_steps - 1:
                    op_list = [
                        model.lidar_input, model.lidar_mask, model.label,
                        model.train_op, model.loss, model.pred_cls, summary_op
                    ]

                    lidar_per_batch, lidar_mask_per_batch, label_per_batch, \
                        _, loss_value, pred_cls, summary_str = sess.run(op_list,
                                                                        options=run_options)

                    experiment.log_metric("loss", loss_value)

                    label_image = visualize_seg(label_per_batch[:6, :, :], mc)
                    pred_image = visualize_seg(pred_cls[:6, :, :], mc)

                    # Run evaluation on the batch
                    ious, _, _, _ = evaluate_iou(
                        label_per_batch,
                        pred_cls * np.squeeze(lidar_mask_per_batch),
                        mc.NUM_CLASS)

                    feed_dict = {}
                    # Assume that class-0 is the background class
                    for i in range(1, mc.NUM_CLASS):
                        feed_dict[model.iou_summary_placeholders[i]] = ious[i]

                    iou_summary_list = sess.run(model.iou_summary_ops[1:],
                                                feed_dict)

                    # Run visualization
                    viz_op_list = [
                        model.show_label, model.show_depth_img, model.show_pred
                    ]
                    viz_summary_list = sess.run(viz_op_list,
                                                feed_dict={
                                                    model.depth_image_to_show:
                                                    lidar_per_batch[:6, :, :,
                                                                    [4]],
                                                    model.label_to_show:
                                                    label_image,
                                                    model.pred_image_to_show:
                                                    pred_image,
                                                })

                    # Add summaries
                    summary_writer.add_summary(summary_str, step)

                    for sum_str in iou_summary_list:
                        summary_writer.add_summary(sum_str, step)

                    for viz_sum in viz_summary_list:
                        summary_writer.add_summary(viz_sum, step)

                    # force tensorflow to synchronise summaries
                    summary_writer.flush()

                else:
                    _, loss_value = sess.run([model.train_op, model.loss],
                                             options=run_options)

                duration = time.time() - start_time

                assert not np.isnan(loss_value), \
                    'Model diverged. Total loss: {}'.format(loss_value)

                if step % 10 == 0:
                    num_images_per_step = mc.BATCH_SIZE
                    images_per_sec = num_images_per_step / duration
                    sec_per_batch = float(duration)
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f images/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        images_per_sec, sec_per_batch))
                    sys.stdout.flush()

                # Save the model checkpoint periodically.
                if step % FLAGS.checkpoint_step == 0 or step == FLAGS.max_steps - 1:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
        except Exception as e:
            coord.request_stop(e)
        finally:
            coord.request_stop()
            sess.run(model.q.close(cancel_pending_enqueues=True))
            coord.join(enq_threads)
Example #16
    return rmse_test, auc_test, pearson_test, accuracy_test


with tf.Session() as sess:

    model = nn_model_tensorflow.Model(num_classes, num_steps, sess,
                                      RESTORE_MODEL)
    loss_list = []

    #
    # Training
    #
    for epoch_idx in range(NUM_EPOCHS):
        if LOG_COMET:
            experiment.set_step(epoch_idx)

        ###
        ### TRAINING DATASET
        ###
        if RUN_TRAIN:
            run_train(model, sess)
            model.save_model()

        ###
        ### TESTING DATASET
        ###
        if RUN_TEST:
            rmse, auc, pearson, accuracy = run_test(model)
        #
        # if RUN_MAPS: