def train(hyper_params):
    """Train the MNIST model built by build_model_graph() and log the run to Comet.ml.

    Args:
        hyper_params (dict): must contain "steps" (int) and "batch_size" (int);
            the whole dict is forwarded to build_model_graph() and logged as
            experiment parameters.

    Side effects: creates a Comet.ml Experiment, runs a TF session, and prints
    train/test accuracy to stdout.
    """
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(hyper_params)

    # log parameters to Comet.ml
    import os  # only needed if the API key is read from the environment

    # Setting the API key (saved as environment variable)
    exp = Experiment(
        api_key="<HIDDEN>",
        # or
        # api_key=os.environ.get("COMET_API_KEY"),
        project_name="prototype",
        workspace="jaimemarijke")
    exp.log_parameters(hyper_params)
    exp.log_dataset_hash(mnist)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        exp.set_model_graph(sess.graph)

        for i in range(hyper_params["steps"]):
            batch = mnist.train.next_batch(hyper_params["batch_size"])
            exp.set_step(i)

            # Compute train accuracy every 10 steps
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0],
                    y_: batch[1]
                })
                print('step %d, training accuracy %g' % (i, train_accuracy))
                exp.log_metric("acc", train_accuracy)

            # Update weights (back propagation).
            # BUG FIX: the original did `loss = train_step.run(...)`, but
            # tf.Operation.run() always returns None, so the logged "loss"
            # metric was None on every step.  Run the train op and fetch the
            # cross-entropy in a single sess.run instead.
            _, loss = sess.run([train_step, cross_entropy],
                               feed_dict={x: batch[0], y_: batch[1]})
            exp.log_metric("loss", loss)

        ### Finished Training ###

        # Compute test accuracy
        acc = accuracy.eval(feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels
        })
        print('test accuracy %g' % acc)
def train(hyper_params):
    """Build the model graph, train it on MNIST, and mirror metrics to Comet.ml.

    `hyper_params` must provide "steps" and "batch_size"; the dict is also
    logged as the experiment's parameters.
    """
    mnist = get_data()

    # Graph definition: training op, loss tensor, accuracy tensor, placeholders.
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(hyper_params)

    experiment = Experiment(project_name="tf")
    experiment.log_parameters(hyper_params)
    experiment.log_dataset_hash(mnist)

    with tf.Session() as sess:
        with experiment.train():
            sess.run(tf.global_variables_initializer())
            experiment.set_model_graph(sess.graph)

            for i in range(hyper_params["steps"]):
                batch = mnist.train.next_batch(hyper_params["batch_size"])
                experiment.set_step(i)
                feed = {x: batch[0], y_: batch[1]}

                # Report training accuracy on every 10th step.
                if i % 10 == 0:
                    train_accuracy = accuracy.eval(feed_dict=feed)
                    print('step %d, training accuracy %g' % (i, train_accuracy))
                    experiment.log_metric("accuracy", train_accuracy, step=i)

                # One optimization step; fetch the loss alongside it.
                _, loss_val = sess.run([train_step, cross_entropy], feed_dict=feed)
                experiment.log_metric("loss", loss_val, step=i)

        # Training finished -- evaluate once on the held-out test set.
        with experiment.test():
            acc = accuracy.eval(feed_dict={x: mnist.test.images,
                                           y_: mnist.test.labels})
            experiment.log_metric("accuracy", acc)
            print('test accuracy %g' % acc)
def train(hyper_params):
    """Train the MNIST model from build_model_graph() and log the run to Comet.ml.

    Args:
        hyper_params (dict): must contain "steps" and "batch_size"; it is
            also forwarded to build_model_graph() and logged as experiment
            parameters.
    """
    mnist = get_data()

    # Get graph definition, tensors and ops
    train_step, cross_entropy, accuracy, x, y, y_ = build_model_graph(hyper_params)

    # log parameters to Comet.ml
    exp = Experiment(api_key="YOUR-API-KEY", project_name='tensorflow examples')
    exp.log_multiple_params(hyper_params)
    exp.log_dataset_hash(mnist)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        exp.set_model_graph(sess.graph)

        for i in range(hyper_params["steps"]):
            batch = mnist.train.next_batch(hyper_params["batch_size"])
            exp.set_step(i)

            # Compute train accuracy every 10 steps
            if i % 10 == 0:
                train_accuracy = accuracy.eval(feed_dict={
                    x: batch[0],
                    y_: batch[1]
                })
                print('step %d, training accuracy %g' % (i, train_accuracy))
                exp.log_metric("acc", train_accuracy)

            # Update weights (back propagation).
            # BUG FIX: `train_step.run(...)` returns None (tf.Operation.run
            # has no return value), so the original logged loss=None on every
            # step.  Fetch the cross-entropy alongside the training op.
            _, loss = sess.run([train_step, cross_entropy],
                               feed_dict={x: batch[0], y_: batch[1]})
            exp.log_metric("loss", loss)

        ### Finished Training ###

        # Compute test accuracy
        acc = accuracy.eval(feed_dict={
            x: mnist.test.images,
            y_: mnist.test.labels
        })
        print('test accuracy %g' % acc)
def run_main_loop(args, train_estimator, predict_estimator):
    """Alternate training and prediction for `args.epochs` epochs.

    Trains `train_estimator` in chunks of `args.predict_every` epochs, then
    generates and saves predictions from `predict_estimator`.  When
    `args.use_comet` is set, parameters and steps are mirrored to a Comet.ml
    experiment; otherwise `experiment` stays None.
    """
    total_steps = 0
    train_steps = math.ceil(args.train_examples / args._batch_size)
    eval_steps = math.ceil(args.eval_examples / args._batch_size)

    if args.use_comet:
        experiment = Experiment(api_key=comet_ml_api_key,
                                project_name=comet_ml_project,
                                workspace=comet_ml_workspace)
        experiment.log_parameters(vars(args))
        experiment.add_tags(args.tag)
        experiment.set_name(model_name(args))
    else:
        experiment = None

    prefetch_inception_model()

    eval_path = os.path.join(suffixed_folder(args, args.result_dir), "eval.txt")
    with tf.gfile.Open(eval_path, "a") as eval_file:
        for epoch in range(0, args.epochs, args.predict_every):
            logger.info(f"Training epoch {epoch}")
            steps_this_round = train_steps * args.predict_every
            train_estimator.train(input_fn=train_input_fn, steps=steps_this_round)
            total_steps += steps_this_round

            if args.use_comet:
                experiment.set_step(epoch)

            # Evaluation is currently disabled:
            # evaluation = predict_estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
            # logger.info(evaluation)
            # save_evaluation(args, eval_file, evaluation, epoch, total_steps)
            # if args.use_comet:
            #     experiment.log_metrics(evaluation)

            logger.info(f"Generate predictions {epoch}")
            predictions = predict_estimator.predict(input_fn=predict_input_fn)

            logger.info(f"Save predictions")
            save_predictions(args, suffixed_folder(args, args.result_dir),
                             eval_file, predictions, epoch, total_steps,
                             experiment)

    logger.info(f"Completed {args.epochs} epochs")
# NOTE(review): this chunk begins mid-function -- the `if` matching the
# `else:` below, and the definitions of `parameters`, `epoch`, `save`,
# `metrics`, `experiment`, `t`, `new_train_F`, `new_dev_F`, etc., are all
# outside this view.  The indentation below is reconstructed from a
# whitespace-mangled source; confirm nesting against the original file.

        # First branch of a condition whose header is not visible here:
        # record the epoch at which something (presumably LR decay launch or
        # early stopping) was triggered -- TODO confirm against the full file.
        parameters['launch_epoch'] = epoch
        disable_flag = 1
        sample_count = len(train_batched)
    else:
        # Presumably a new best dev score: optionally checkpoint the model
        # and remember the epoch -- TODO confirm.
        if save:
            torch.save(model.state_dict(), model_name)
        best_idx = epoch

    # Evaluate on the test set and record all three F-scores for this epoch.
    best_test_F, new_test_F, _ = evaluating_batch(model, test_batched, best_test_F)
    all_F.append([0.0, new_dev_F, new_test_F])
    sys.stdout.flush()
    print('Epoch %d : train/dev/test : %.2f / %.2f / %.2f - %d' % (
        epoch, new_train_F, new_dev_F, new_test_F, best_idx))

    # Back to training mode; decay the learning rate with the number of
    # samples seen so far.
    model.train(True)
    adjust_learning_rate(optimizer,
                         lr=learning_rate / (1 + 0.05 * sample_count / len(train_data)))

    # Mirror this epoch's F-scores to Comet.ml; set_step(epoch + 1) so the
    # next epoch's metrics land on the following step.
    metrics['new_train_F'] = new_train_F
    metrics['new_test_F'] = new_test_F
    metrics['new_dev_F'] = new_dev_F
    experiment.log_metrics(metrics)
    experiment.set_step(epoch + 1)

# Wall-clock time for the whole run (t set outside this view).
print(time.time() - t)
criterion = nn.BCELoss() # Establish convention for real and fake labels during training real_label = 1 fake_label = 0 # Setup Adam optimizers for both G and D optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999)) optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999)) steps = 0 for epoch in range(num_epochs): experiment.log_current_epoch(epoch) for i, data in enumerate(dataloader, 0): experiment.set_step(steps) ############################ # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z))) ########################### ## Train with all-real batch netD.zero_grad() # Format batch real_cpu = data[0].to(device) b_size = real_cpu.size(0) label = torch.full((b_size,), real_label, device=device) # Forward pass real batch through D output = netD(real_cpu).view(-1) # Calculate loss on all-real batch errD_real = criterion(output, label) # Calculate gradients for D in backward pass
class CometMLMonitor(MonitorBase):
    """
    Send scalar data and the graph to https://www.comet.ml.

    Note:
        1. comet_ml requires you to `import comet_ml` before importing tensorflow or tensorpack.
        2. The "automatic output logging" feature of comet_ml will make the training progress bar
           appear to freeze. Therefore the feature is disabled by default.
    """

    def __init__(self, experiment=None, tags=None, **kwargs):
        """
        Args:
            experiment (comet_ml.Experiment): if provided, invalidate all other arguments
            tags (list[str]): experiment tags
            kwargs: arguments used to initialize :class:`comet_ml.Experiment`,
                such as project name, API key, etc.
                Refer to its documentation for details.
        """
        if experiment is None:
            from comet_ml import Experiment

            # 'log_code' is required for git-patch logging even though code
            # logging itself is not functional; automatic output logging is
            # disabled because it freezes the progress bar.
            kwargs.setdefault('log_code', True)
            kwargs.setdefault('auto_output_logging', None)
            self._exp = Experiment(**kwargs)
            if tags is not None:
                self._exp.add_tags(tags)
        else:
            # A caller-supplied experiment wins; no other options may be given.
            assert tags is None and len(kwargs) == 0
            self._exp = experiment

        self._exp.set_code("Code logging is impossible ...")
        self._exp.log_dependency('tensorpack', __git_version__)

    @property
    def experiment(self):
        """
        The :class:`comet_ml.Experiment` instance.
        """
        return self._exp

    def _before_train(self):
        self._exp.set_model_graph(tf.get_default_graph())

    @HIDE_DOC
    def process_scalar(self, name, val):
        self._exp.log_metric(name, val, step=self.global_step)

    @HIDE_DOC
    def process_image(self, name, val):
        self._exp.set_step(self.global_step)
        multiple = len(val) > 1
        for idx, v in enumerate(val):
            # Only disambiguate with an index suffix when several images share
            # the same name in this step.
            suffix = "_" + str(idx) if multiple else ""
            log_name = "{}_step{}{}".format(name, self.global_step, suffix)
            self._exp.log_image(v, image_format="jpeg", name=log_name,
                                image_minmax=(0, 255))

    def _after_train(self):
        self._exp.end()

    def _after_epoch(self):
        self._exp.log_epoch_end(self.epoch_num)
saver = tf.train.Saver(max_to_keep=1) # try: # saver.restore(session, "./model.ckpt") # print('Restored model.') # except ValueError: # print('Initialized.') for epoch in range(epochs): session.run(iterator.initializer) experiment.log_current_epoch(epoch) try: for step in itertools.count(start=0, step=1): steps_per_epoch = max(steps_per_epoch, step) experiment.set_step(steps_per_epoch * epoch + step) total_images_looked_at = (steps_per_epoch * epoch + step) * (model.batch_size // 2) current_resolution_schedule_period_length = (0.3+current_resolution*0.003)*60*60 #if abs(last_schedule_update-total_images_looked_at) > 5000 and not schedule_finalized: #if abs(time.time()-last_schedule_update_time) > 60*60*1.428 and not schedule_finalized: if abs(time.time() - last_schedule_update_time) > current_resolution_schedule_period_length and not schedule_finalized: if current_mode == 'train': current_mode = 'stabilize' try: current_resolution = sizes[sizes.index(current_resolution)+1] except IndexError: current_resolution = sizes[-1] current_mode = 'train' schedule_finalized = True
def train(self):
    """Main Glow training loop: per-batch forward/backward with NLL (+ optional
    class) loss, gradient clipping, checkpointing, periodic image plotting and
    sampling, all mirrored to a Comet.ml experiment.

    Indentation below is reconstructed from a whitespace-mangled source --
    verify nesting against the original file.
    """
    # comet_ml: create an experiment.
    # NOTE(review): the API key is hard-coded; prefer reading it from an
    # environment variable so it is not committed with the code.
    experiment = Experiment(api_key="B6hzNydshIpZSG2Xi9BDG9gdG",
                            project_name="glow-mnist", workspace="voletiv")
    hparams_dict = self.hparams_dict()
    experiment.log_parameters(hparams_dict)

    # set to training state
    self.graph.train()
    self.global_step = self.loaded_step

    # begin to train
    for epoch in range(self.n_epoches):
        print("epoch", epoch)
        progress = tqdm(self.data_loader)
        for i_batch, batch in enumerate(progress):
            experiment.set_step(self.global_step)

            # update learning rate from the schedule
            lr = self.lrschedule["func"](global_step=self.global_step,
                                         **self.lrschedule["args"])
            for param_group in self.optim.param_groups:
                param_group['lr'] = lr
            self.optim.zero_grad()
            # log the LR and a fractional epoch counter
            if self.global_step % self.scalar_log_gaps == 0:
                # self.writer.add_scalar("lr/lr", lr, self.global_step)
                experiment.log_metrics({"lr": lr,
                                        "epoch": epoch+i_batch/len(self.data_loader)})

            # get batch data, moved to the training device
            for k in batch:
                batch[k] = batch[k].to(self.data_device)
            x = batch["x"]
            y = None
            y_onehot = None
            if self.y_condition:
                if self.y_criterion == "multi-classes":
                    assert "y_onehot" in batch, "multi-classes ask for `y_onehot` (torch.FloatTensor onehot)"
                    y_onehot = batch["y_onehot"]
                elif self.y_criterion == "single-class":
                    assert "y" in batch, "single-class ask for `y` (torch.LongTensor indexes)"
                    y = batch["y"]
                    y_onehot = thops.onehot(y, num_classes=self.y_classes)

            # at first time, initialize ActNorm with a single-device slice
            if self.global_step == 0:
                self.graph(x[:self.batch_size // len(self.devices), ...],
                           y_onehot[:self.batch_size // len(self.devices), ...]
                           if y_onehot is not None else None)
            # wrap in DataParallel once, after ActNorm init
            if len(self.devices) > 1 and not hasattr(self.graph, "module"):
                print("[Parallel] move to {}".format(self.devices))
                self.graph = torch.nn.parallel.DataParallel(self.graph,
                                                            self.devices,
                                                            self.devices[0])
            # forward phase
            z, nll, y_logits = self.graph(x=x, y_onehot=y_onehot)

            # loss_generative
            loss_generative = Glow.loss_generative(nll)

            # loss_classes (only when conditioning on labels)
            loss_classes = 0
            if self.y_condition:
                loss_classes = (Glow.loss_multi_classes(y_logits, y_onehot)
                                if self.y_criterion == "multi-classes"
                                else Glow.loss_class(y_logits, y))

            # total loss
            loss = loss_generative + loss_classes * self.weight_y

            # log losses
            if self.global_step % self.scalar_log_gaps == 0:
                # self.writer.add_scalar("loss/loss_generative", loss_generative, self.global_step)
                experiment.log_metrics({"loss_generative": loss_generative})
                if self.y_condition:
                    # self.writer.add_scalar("loss/loss_classes", loss_classes, self.global_step)
                    experiment.log_metrics({"loss_classes": loss_classes,
                                            "total_loss": loss})

            # backward
            self.graph.zero_grad()
            self.optim.zero_grad()
            loss.backward()

            # operate grad: optional value clip and norm clip
            if self.max_grad_clip is not None and self.max_grad_clip > 0:
                torch.nn.utils.clip_grad_value_(self.graph.parameters(), self.max_grad_clip)
            if self.max_grad_norm is not None and self.max_grad_norm > 0:
                grad_norm = torch.nn.utils.clip_grad_norm_(self.graph.parameters(), self.max_grad_norm)
                if self.global_step % self.scalar_log_gaps == 0:
                    # self.writer.add_scalar("grad_norm/grad_norm", grad_norm, self.global_step)
                    experiment.log_metrics({"grad_norm": grad_norm})

            # step
            self.optim.step()

            # checkpoints
            if self.global_step % self.checkpoints_gap == 0 and self.global_step > 0:
                save(global_step=self.global_step, graph=self.graph,
                     optim=self.optim, pkg_dir=self.checkpoints_dir,
                     is_best=True, max_checkpoints=self.max_checkpoints)

            # plot images: reconstruct from this batch's z and compare to x
            if self.global_step % self.plot_gaps == 0:
                img = self.graph(z=z, y_onehot=y_onehot, reverse=True)
                # img = torch.clamp(img, min=0, max=1.0)

                if self.y_condition:
                    if self.y_criterion == "multi-classes":
                        y_pred = torch.sigmoid(y_logits)
                    elif self.y_criterion == "single-class":
                        y_pred = thops.onehot(torch.argmax(F.softmax(y_logits, dim=1),
                                                           dim=1, keepdim=True),
                                              self.y_classes)
                    y_true = y_onehot

                # plot images
                # self.writer.add_image("0_reverse/{}".format(bi), torch.cat((img[bi], batch["x"][bi]), dim=1), self.global_step)
                vutils.save_image(torch.stack([torch.cat((img[bi], batch["x"][bi]), dim=1)
                                               for bi in range(min([len(img), self.n_image_samples]))]),
                                  '/tmp/vikramvoleti.png', nrow=10)
                # NOTE(review): the grid is saved to /tmp/vikramvoleti.png but
                # /tmp/vikramvoleti_rev.png is what gets logged -- confirm a
                # rename happens elsewhere, otherwise this logs a stale or
                # missing file.
                experiment.log_image('/tmp/vikramvoleti_rev.png', file_name="0_reverse")

                # plot preds (disabled)
                # for bi in range(min([len(img), self.n_image_samples])):
                #     # wandb.log({"0_reverse_{}".format(bi): [wandb.Image(torch.cat((img[bi], batch["x"][bi]), dim=1), caption="0_reverse/{}".format(bi))]}, step=self.global_step)
                #     if self.y_condition:
                #         # self.writer.add_image("1_prob/{}".format(bi), plot_prob([y_pred[bi], y_true[bi]], ["pred", "true"]), self.global_step)
                #         wandb.log({"1_prob_{}".format(bi): [wandb.Image(plot_prob([y_pred[bi], y_true[bi]], ["pred", "true"]))]}, step=self.global_step)

            # inference: sample from the prior with a fixed one-hot batch
            if hasattr(self, "inference_gap"):
                if self.global_step % self.inference_gap == 0:
                    # Lazily build inference_y_onehot on first use (the
                    # NameError path), then reuse it on later steps.
                    try:
                        img = self.graph(z=None, y_onehot=inference_y_onehot,
                                         eps_std=0.5, reverse=True)
                    except NameError:
                        inference_y_onehot = torch.zeros_like(y_onehot, device=torch.device('cpu'))
                        for i in range(inference_y_onehot.size(0)):
                            inference_y_onehot[i, (i % inference_y_onehot.size(1))] = 1.
                        # now
                        inference_y_onehot = inference_y_onehot.to(y_onehot.device)
                        img = self.graph(z=None, y_onehot=inference_y_onehot,
                                         eps_std=0.5, reverse=True)
                    # grid
                    vutils.save_image(img[:min([len(img), self.n_image_samples])],
                                      '/tmp/vikramvoleti.png', nrow=10)
                    # NOTE(review): same save/log filename mismatch as above
                    # ('/tmp/vikramvoleti.png' vs '/tmp/vikramvoleti_sam.png').
                    experiment.log_image('/tmp/vikramvoleti_sam.png', file_name="1_samples")
                    # img = torch.clamp(img, min=0, max=1.0)
                    # for bi in range(min([len(img), n_images])):
                    #     # self.writer.add_image("2_sample/{}".format(bi), img[bi], self.global_step)
                    #     wandb.log({"2_sample_{}".format(bi): [wandb.Image(img[bi])]}, step=self.global_step)

            # one-off GPU report at the very first step
            if self.global_step == 0:
                subprocess.run('nvidia-smi')

            # global step
            self.global_step += 1
labels = Variable(labels) # Forward + Backward + Optimize optimizer.zero_grad() outputs = rnn(images) loss = criterion(outputs, labels) loss.backward() optimizer.step() # Compute train accuracy _, predicted = torch.max(outputs.data, 1) total += labels.size(0) correct += (predicted == labels.data).sum() # Log to Comet.ml experiment.set_step(i) experiment.log_metric("loss", loss.data[0]) experiment.log_metric("accuracy", correct / total) if (i + 1) % 100 == 0: print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f' % (epoch + 1, hyper_params['num_epochs'], i + 1, len(train_dataset) // hyper_params['batch_size'], loss.data[0])) # Test the Model correct = 0 total = 0 for images, labels in test_loader: images = Variable( images.view(-1, hyper_params['sequence_length'],
class Reptile(Task):
    """
    A meta-learning task that teaches an agent over a set of other tasks

    Indentation throughout this class is reconstructed from a
    whitespace-mangled source -- verify nesting against the original file.
    """

    def __init__(self, data_handler, load_key=None, sender=True, receiver=True,
                 image_captioner=True, image_selector=False, track_results=True):
        # Shared TF session for all agents.
        self.sess = Agent.sess
        self.N = 1  # number of steps taken for each task - should be > 1

        # Agents: sender/receiver pair plus an image captioner.
        self.S = SenderAgent()
        self.R = ReceiverAgent(*self.S.get_output())
        self.IC = ImageCaptioner()
        # self.IS = ImageSelector()
        self.S.all_agents_initialized(load_key)
        self.R.all_agents_initialized(load_key)

        self.train_metrics = {}
        self.val_metrics = {}

        # Comet.ml experiment; fully disabled when track_results is False.
        self.experiment = Experiment(api_key='1jl4lQOnJsVdZR6oekS6WO5FI',
                                     project_name='Reptile',
                                     auto_param_logging=False,
                                     auto_metric_logging=False,
                                     disabled=(not track_results))
        self.params = {}
        self.params.update(Agent.get_params())
        self.params.update(data_handler.get_params())
        self.experiment.log_parameters(self.params)

        # T maps a task name to a one-batch training callable.
        self.T = {}
        if image_captioner:
            self.ic = ImageCaptioning(self.IC, experiment=self.experiment,
                                      track_results=False)
            self.T["Image Captioner"] = lambda img, capts: self.ic.train_batch(
                (img, capts), mode="train")
        if image_selector:
            self.is_ = ImageSelection(self.IS, experiment=self.experiment,
                                      track_results=False)
            self.T["Image Selector"] = lambda img, capts: self.is_.train_batch(
                (img, capts), mode="train")
        if sender or receiver:
            self.rg = ReferentialGame(self.S, self.R, experiment=self.experiment,
                                      track_results=False)
            if receiver:
                self.T["Receiver"] = lambda img, capts: self.rg.train_batch(
                    img, mode="receiver_train")
            if sender:
                self.T["Sender"] = lambda img, capts: self.rg.train_batch(
                    img, mode="sender_train")

        # Initialize TF, skipping any weights restored from a checkpoint.
        variables_to_initialize = tf.global_variables()
        if load_key is not None:
            dont_initialize = []
            if SenderAgent.loaded:
                dont_initialize += SenderAgent.get_all_weights()
            if ReceiverAgent.loaded:
                dont_initialize += ReceiverAgent.get_all_weights()
            if ImageCaptioner.loaded:
                dont_initialize += ImageCaptioner.get_all_weights()
            variables_to_initialize = [
                v for v in tf.global_variables() if v not in dont_initialize
            ]
        # REMOVE LATER
        #variables_to_initialize += ImageCaptioner.optimizer.variables()
        Agent.sess.run(tf.variables_initializer(variables_to_initialize))

        # Snapshot handles for shared (sender<->receiver) and per-agent weights.
        self.sender_shared_state = VariableState(
            self.sess, SenderAgent.get_shared_weights())
        self.receiver_shared_state = VariableState(
            self.sess, ReceiverAgent.get_shared_weights())
        self.sender_own_state = VariableState(self.sess, SenderAgent.get_weights())
        self.receiver_own_state = VariableState(self.sess, ReceiverAgent.get_weights())
        # print(SenderAgent.get_shared_weights())
        # print(ReceiverAgent.get_shared_weights())
        # print(SenderAgent.get_weights())
        # print(ReceiverAgent.get_weights())
        # print(tf.trainable_variables())
        self.shared_states = {
            "shared_sender": self.sender_shared_state,
            "shared_receiver": self.receiver_shared_state
        }
        self.own_states = {
            "own_sender": self.sender_own_state,
            "own_receiver": self.receiver_own_state
        }

        # Start both agents from the element-wise average of their shared weights.
        shared_average = []
        for k, v in self.shared_states.items():
            shared_average.append(v.export_variables())
        shared_average = np.mean(shared_average, axis=0)
        self.set_weights(new_shared_weights=shared_average)

        self.dh = data_handler

        # CSV logs keyed by the comet experiment id.
        # NOTE(review): the accuracy file reuses the "...Loss" header --
        # probably a copy/paste slip; confirm before changing.
        with open(
                "{}/data/csv_loss_{}.csv".format(project_path,
                                                 self.experiment.get_key()),
                'w+') as csv_loss_file:
            csv_loss_file.write(
                "Image Captioner Loss,Image Selector Loss,Sender Loss,Receiver Loss\n"
            )
        with open(
                "{}/data/csv_accuracy_{}.csv".format(
                    project_path, self.experiment.get_key()), 'w+') as csv_acc_file:
            csv_acc_file.write(
                "Image Captioner Loss,Image Selector Loss,Sender Loss,Receiver Loss\n"
            )

        self.step = 0

    def get_diff(self, a, b):
        """Recursively sum absolute element-wise differences between two
        similarly-shaped nests (arrays / lists / dicts) of weights."""
        diff = 0.
        if isinstance(a, (np.ndarray, np.generic)):
            return np.sum(np.abs(a - b))
        elif isinstance(a, list):
            for i in range(len(a)):
                diff += self.get_diff(a[i], b[i])
        elif isinstance(a, dict):
            for k in a:
                diff += self.get_diff(a[k], b[k])
        return diff

    def set_weights(self, new_own_weights=None, new_shared_weights=None):
        # Import per-agent weights (keyed like own_states) and/or a single
        # shared weight set applied to both shared states.
        if new_own_weights is not None:
            for k, s in self.own_states.items():
                s.import_variables(new_own_weights[k])
        if new_shared_weights is not None:
            for k, s in self.shared_states.items():
                s.import_variables(new_shared_weights)

    def train_epoch(self, e, mode=None):
        """One Reptile meta-epoch: for each task, train from the current
        weights, then move the weights 20% of the way toward the average of
        the per-task results.  Returns (0, total weight change)."""
        self.dh.set_params(distractors=0)
        image_gen = self.dh.get_images(return_captions=True, mode="train")
        # Get current variables
        start_vars = {
            k: s.export_variables() for k, s in self.own_states.items()
        }
        start_vars["shared"] = self.shared_states[
            "shared_sender"].export_variables()

        while True:
            try:
                # Save current variables
                old_own = {
                    k: s.export_variables() for k, s in self.own_states.items()
                }
                new_own = {k: [] for k, s in self.own_states.items()}
                old_shared = self.shared_states[
                    "shared_sender"].export_variables()
                new_shared = []
                # For each task
                for task in ["Image Captioner", "Sender", "Receiver"]:
                    # parameter setup to not waste data
                    if task in ["Sender", "Receiver", "Image Selector"]:
                        self.dh.set_params(distractors=Agent.D)
                    else:
                        self.dh.set_params(distractors=0)
                    # Run task n times
                    for _ in range(self.N):
                        images, captions = next(image_gen)
                        acc, loss = self.T[task](images, captions)
                        self.train_metrics[task + " Accuracy"] = acc
                        self.train_metrics[task + " Loss"] = loss
                    # Store new variables
                    [
                        new_own[k].append(s.export_variables())
                        for k, s in self.own_states.items()
                    ]
                    [
                        new_shared.append(s.export_variables())
                        for k, s in self.shared_states.items()
                    ]
                    # Reset to old variables for next task
                    [
                        s.import_variables(old_own[k])
                        for k, s in self.own_states.items()
                    ]
                    [
                        s.import_variables(old_shared)
                        for k, s in self.shared_states.items()
                    ]
                self.step += 1
                self.experiment.set_step(self.step)
                self.experiment.log_metrics(self.train_metrics)
                # Average new variables; 0.2 is the Reptile interpolation rate.
                new_own = {
                    k: interpolate_vars(old_own[k], average_vars(new_own[k]), 0.2)
                    for k, s in self.own_states.items()
                }
                new_shared = interpolate_vars(old_shared,
                                              average_vars(new_shared), 0.2)
                # Set variables to new variables
                self.set_weights(new_own_weights=new_own,
                                 new_shared_weights=new_shared)
            except StopIteration:
                # Image generator exhausted: the epoch is over.
                break

        # Get change in weights
        end_vars = {
            k: s.export_variables() for k, s in self.own_states.items()
        }
        end_vars["shared"] = self.shared_states[
            "shared_sender"].export_variables()
        weight_diff = self.get_diff(start_vars, end_vars)
        #self.experiment.set_step(e)
        self.val_metrics["Weight Change"] = weight_diff
        self.experiment.log_metrics(self.val_metrics)

        # Log data to a csv
        with open("{}/data/csv_loss_{}.csv".format(project_path, self.experiment.get_key()), 'a') as csv_loss_file, \
             open("{}/data/csv_accuracy_{}.csv".format(project_path, self.experiment.get_key()), 'a') as csv_acc_file:
            losses = []
            accs = []
            for task in ["Image Captioner", "Sender", "Receiver"]:
                losses.append(str(self.train_metrics[task + " Loss"]))
                accs.append(str(self.train_metrics[task + " Accuracy"]))
            csv_loss_file.write(",".join(losses))
            csv_loss_file.write("\n")
            csv_acc_file.write(",".join(accs))
            csv_acc_file.write("\n")
        return 0, weight_diff
def main(_):
    """Entry point: train the RPN3D detector, mirror metrics to Comet.ml,
    periodically validate, dump predictions every 10 epochs and run the KITTI
    evaluation script.

    Indentation below is reconstructed from a whitespace-mangled source --
    verify nesting against the original file.
    """
    experiment = Experiment(api_key="xXtJguCo8yFdU7dpjEpo6YbHw",
                            project_name=args.experiment_name)
    hyper_params = {
        "learning_rate": args.lr,
        "num_epochs": args.max_epoch,
        "batch_size": args.single_batch_size,
        "alpha": args.alpha,
        "beta": args.beta,
        "gamma": args.gamma,
        "loss": args.loss
    }
    experiment.log_multiple_params(hyper_params)

    # TODO: split file support
    with tf.Graph().as_default():
        global save_model_dir
        start_epoch = 0
        global_counter = 0

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=cfg.GPU_MEMORY_FRACTION,
            visible_device_list=cfg.GPU_AVAILABLE,
            allow_growth=True)
        config = tf.ConfigProto(
            gpu_options=gpu_options,
            device_count={
                "GPU": cfg.GPU_USE_COUNT,
            },
            allow_soft_placement=True,
            log_device_placement=False,
        )
        with tf.Session(config=config) as sess:
            # sess=tf_debug.LocalCLIDebugWrapperSession(sess,ui_type='readline')
            model = RPN3D(cls=cfg.DETECT_OBJ,
                          single_batch_size=args.single_batch_size,
                          learning_rate=args.lr,
                          max_gradient_norm=5.0,
                          alpha=args.alpha,
                          beta=args.beta,
                          gamma=args.gamma,
                          loss_type=args.loss,
                          avail_gpus=cfg.GPU_AVAILABLE.split(','))

            # param init/restore: resume epoch and step counters from the
            # latest checkpoint when one exists.
            if tf.train.get_checkpoint_state(save_model_dir):
                print("Reading model parameters from %s" % save_model_dir)
                model.saver.restore(sess, tf.train.latest_checkpoint(save_model_dir))
                start_epoch = model.epoch.eval() + 1
                global_counter = model.global_step.eval() + 1
            else:
                print("Created model with fresh parameters.")
                tf.global_variables_initializer().run()

            # train and validate
            is_summary, is_summary_image, is_validate = False, False, False
            summary_interval = 5
            summary_val_interval = 10
            summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
            experiment.set_model_graph(sess.graph)

            # training
            with experiment.train():
                for epoch in range(start_epoch, args.max_epoch):
                    counter = 0
                    batch_time = time.time()
                    experiment.log_current_epoch(epoch)
                    for batch in iterate_data(
                            train_dir,
                            shuffle=True,
                            aug=True,
                            is_testset=False,
                            batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                            multi_gpu_sum=cfg.GPU_USE_COUNT):

                        counter += 1
                        global_counter += 1
                        experiment.set_step(global_counter)

                        # Only collect TF summaries every summary_interval steps.
                        if counter % summary_interval == 0:
                            is_summary = True
                        else:
                            is_summary = False

                        epochs = args.max_epoch
                        start_time = time.time()
                        ret = model.train_step(sess, batch, train=True,
                                               summary=is_summary)
                        forward_time = time.time() - start_time
                        batch_time = time.time() - batch_time

                        param = ret
                        params = {
                            "loss": param[0],
                            "cls_loss": param[1],
                            "cls_pos_loss": param[2],
                            "cls_neg_loss": param[3]
                        }
                        experiment.log_multiple_metrics(params)

                        # print(ret)
                        print(
                            'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'
                            .format(counter, epoch, epochs, ret[0], ret[1],
                                    ret[2], ret[3], forward_time, batch_time))
                        # with open('log/train.txt', 'a') as f:
                        #     f.write( 'train: {} @ epoch:{}/{} loss: {:.4f} cls_loss: {:.4f} cls_pos_loss: {:.4f} cls_neg_loss: {:.4f} forward time: {:.4f} batch time: {:.4f}'.format(counter,epoch, epochs, ret[0], ret[1], ret[2], ret[3], forward_time, batch_time))

                        #print(counter, summary_interval, counter % summary_interval)
                        if counter % summary_interval == 0:
                            print("summary_interval now")
                            summary_writer.add_summary(ret[-1], global_counter)

                        #print(counter, summary_val_interval, counter % summary_val_interval)
                        if counter % summary_val_interval == 0:
                            print("summary_val_interval now")
                            batch = sample_test_data(
                                val_dir,
                                args.single_batch_size * cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT)
                            ret = model.validate_step(sess, batch, summary=True)
                            summary_writer.add_summary(ret[-1], global_counter)
                            # NOTE(review): bare except silences ALL errors
                            # here, not only prediction failures.
                            try:
                                ret = model.predict_step(sess, batch, summary=True)
                                summary_writer.add_summary(ret[-1], global_counter)
                            except:
                                print("prediction skipped due to error")

                        # Cooperative pause: checkpoint and exit on request.
                        if check_if_should_pause(args.tag):
                            model.saver.save(sess,
                                             os.path.join(save_model_dir, 'checkpoint'),
                                             global_step=model.global_step)
                            print('pause and save model @ {} steps:{}'.format(
                                save_model_dir, model.global_step.eval()))
                            sys.exit(0)

                        batch_time = time.time()

                    experiment.log_epoch_end(epoch)
                    sess.run(model.epoch_add_op)
                    model.saver.save(sess,
                                     os.path.join(save_model_dir, 'checkpoint'),
                                     global_step=model.global_step)

                    # dump test data every 10 epochs
                    if (epoch + 1) % 10 == 0:
                        # create output folder
                        os.makedirs(os.path.join(args.output_path, str(epoch)),
                                    exist_ok=True)
                        os.makedirs(os.path.join(args.output_path, str(epoch), 'data'),
                                    exist_ok=True)
                        if args.vis:
                            os.makedirs(os.path.join(args.output_path, str(epoch), 'vis'),
                                        exist_ok=True)

                        for batch in iterate_data(
                                val_dir,
                                shuffle=False,
                                aug=False,
                                is_testset=False,
                                batch_size=args.single_batch_size * cfg.GPU_USE_COUNT,
                                multi_gpu_sum=cfg.GPU_USE_COUNT):

                            if args.vis:
                                tags, results, front_images, bird_views, heatmaps = model.predict_step(
                                    sess, batch, summary=False, vis=True)
                            else:
                                tags, results = model.predict_step(
                                    sess, batch, summary=False, vis=False)

                            # Write KITTI-format label files, one per frame tag.
                            for tag, result in zip(tags, results):
                                of_path = os.path.join(args.output_path, str(epoch),
                                                       'data', tag + '.txt')
                                with open(of_path, 'w+') as f:
                                    labels = box3d_to_label(
                                        [result[:, 1:8]], [result[:, 0]],
                                        [result[:, -1]], coordinate='lidar')[0]
                                    for line in labels:
                                        f.write(line)
                                    print('write out {} objects to {}'.format(
                                        len(labels), tag))

                            # dump visualizations
                            if args.vis:
                                for tag, front_image, bird_view, heatmap in zip(
                                        tags, front_images, bird_views, heatmaps):
                                    front_img_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_front.jpg')
                                    bird_view_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_bv.jpg')
                                    heatmap_path = os.path.join(
                                        args.output_path, str(epoch), 'vis',
                                        tag + '_heatmap.jpg')
                                    cv2.imwrite(front_img_path, front_image)
                                    cv2.imwrite(bird_view_path, bird_view)
                                    cv2.imwrite(heatmap_path, heatmap)

                        # execute evaluation code
                        cmd_1 = "./kitti_eval/launch_test.sh"
                        cmd_2 = os.path.join(args.output_path, str(epoch))
                        cmd_3 = os.path.join(args.output_path, str(epoch), 'log')
                        os.system(" ".join([cmd_1, cmd_2, cmd_3]))

                print('train done. total epoch:{} iter:{}'.format(
                    epoch, model.global_step.eval()))

            # finallly save model
            model.saver.save(sess,
                             os.path.join(save_model_dir, 'checkpoint'),
                             global_step=model.global_step)
def main():
    """Entry point: build BigGAN-128 as a TPUEstimator and run the
    train / evaluate / predict cycle for `args.epochs` epochs, optionally
    mirroring everything to a Comet.ml experiment."""
    args = parse_args()
    if args is None:
        exit()
    setup_logging(args)

    gan = BigGAN_128(args)

    # Resolve the TPU master address, or run locally when TPUs are disabled.
    if args.use_tpu:
        resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            tpu=args.tpu_name, zone=args.tpu_zone)
        master = resolver.get_master()
    else:
        master = ''

    run_config = tf.contrib.tpu.RunConfig(
        master=master,
        evaluation_master=master,
        model_dir=model_dir(args),
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=tf.contrib.tpu.TPUConfig(args.steps_per_loop,
                                            args.num_shards),
    )

    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=gan.tpu_model_fn,
        config=run_config,
        use_tpu=args.use_tpu,
        train_batch_size=args._batch_size,
        eval_batch_size=args._batch_size,
        predict_batch_size=args._batch_size,
        params=vars(args),
    )

    total_steps = 0

    # Optional Comet.ml tracking; `experiment` stays None when disabled.
    if args.use_comet:
        experiment = Experiment(api_key="bRptcjkrwOuba29GcyiNaGDbj",
                                project_name="BigGAN",
                                workspace="davidhughhenrymack")
        experiment.log_parameters(vars(args))
        experiment.add_tags(args.tag)
        experiment.set_name(model_name(args))
    else:
        experiment = None

    prefetch_inception_model()

    eval_path = os.path.join(suffixed_folder(args, args.result_dir), "eval.txt")
    with tf.gfile.Open(eval_path, "a") as eval_file:
        for epoch in range(args.epochs):
            logger.info(f"Training epoch {epoch}")
            estimator.train(input_fn=train_input_fn, steps=args.train_steps)
            total_steps += args.train_steps

            logger.info(f"Evaluate {epoch}")
            evaluation = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=args.eval_steps)
            if args.use_comet:
                experiment.set_step(total_steps)
                experiment.log_metrics(evaluation)
            logger.info(evaluation)
            save_evaluation(args, eval_file, evaluation, epoch, total_steps)

            logger.info(f"Generate predictions {epoch}")
            predictions = estimator.predict(input_fn=predict_input_fn)

            logger.info(f"Save predictions")
            save_predictions(args, suffixed_folder(args, args.result_dir),
                             eval_file, predictions, epoch, total_steps,
                             experiment)
def run_HAC(FLAGS, env, agent):
    """Run Hierarchical Actor-Critic training, optionally interleaved with
    periodic testing, and report success rates to Comet.ml.

    Args:
        FLAGS: parsed command-line flags; `test` and `train_only` select
            pure-test / pure-train / interleaved mode.
        env: environment the agent acts in.
        agent: agent exposing train(), save_model(), log_performance() and
            other_params.

    Side effects: logs to a Comet.ml experiment, checkpoints the agent every
    episode, and rewrites successRates.csv after each test batch.
    """
    experiment = Experiment(api_key="M03EcOc9o9kiG95hws4mq1uqI",
                            project_name="HAC", workspace="antonwiehe")

    # Print task summary
    print_summary(FLAGS, env)

    # Determine training mode.  If not testing and not solely training,
    # interleave training and testing to track progress.
    mix_train_test = False
    if not FLAGS.test and not FLAGS.train_only:
        mix_train_test = True

    for batch in range(NUM_BATCH):
        num_episodes = agent.other_params["num_exploration_episodes"]

        # Evaluate policy every TEST_FREQ batches if interleaving training and testing
        if mix_train_test and batch % TEST_FREQ == 0:
            print("\n--- TESTING ---")
            agent.FLAGS.test = True
            num_episodes = num_test_episodes
            # Reset successful episode counter
            successful_episodes = 0

        for episode in range(num_episodes):
            print("\nBatch %d, Episode %d" % (batch, episode))

            # Train for an episode
            success = agent.train(env, episode)
            if success:
                print("Batch %d, Episode %d End Goal Achieved\n" % (batch, episode))
                # Increment successful episode counter if applicable
                if mix_train_test and batch % TEST_FREQ == 0:
                    successful_episodes += 1

            # Save agent
            agent.save_model(episode)

        # Finish evaluating policy if tested prior batch
        if mix_train_test and batch % TEST_FREQ == 0:
            # Log performance
            success_rate = successful_episodes / num_test_episodes * 100
            print("\nTesting Success Rate %.2f%%" % success_rate)
            agent.log_performance(success_rate)
            agent.FLAGS.test = False

            experiment.set_step(batch)
            experiment.log_metric("Success rate", success_rate)

            # NOTE(review): success_list is not defined in this function --
            # presumably module-level state; confirm.
            success_list.append(success_rate)
            with open("successRates.csv", 'w', newline='') as myfile:
                wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
                wr.writerow(success_list)

            # BUG FIX: the original printed "Success rate over 95\%!" --
            # "\%" is not a valid escape sequence, so a literal backslash
            # was printed (and a DeprecationWarning raised on modern Python).
            if success_rate > 95:
                print("Success rate over 95%!")
                break

            print("\n--- END TESTING ---\n")
def train():
    """Train the SqueezeSeg model on KITTI.

    Builds the graph, dumps per-layer parameter/activation/flop counts
    to <train_dir>/model_metrics.txt, feeds batches into the model's
    input queue from background threads, and runs the optimisation loop
    with periodic TensorBoard summaries, Comet.ml metrics and
    checkpoints.
    """
    assert FLAGS.dataset == 'KITTI', \
        'Currently only support KITTI dataset'

    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    with tf.Graph().as_default():
        assert FLAGS.net == 'squeezeSeg', \
            'Selected neural net architecture not supported: {}'.format(FLAGS.net)
        if FLAGS.net == 'squeezeSeg':
            mc = kitti_squeezeSeg_config()
            mc.PRETRAINED_MODEL_PATH = FLAGS.pretrained_model_path
            model = SqueezeSeg(mc)
        imdb = kitti(FLAGS.image_set, FLAGS.data_path, mc)

        # Save model size, flops, activations by layers.
        # (The redundant f.close() was removed; `with` closes the file.)
        with open(os.path.join(FLAGS.train_dir, 'model_metrics.txt'), 'w') as f:
            f.write('Number of parameter by layer:\n')
            count = 0
            for c in model.model_size_counter:
                f.write('\t{}: {}\n'.format(c[0], c[1]))
                count += c[1]
            f.write('\ttotal: {}\n'.format(count))

            count = 0
            f.write('\nActivation size by layer:\n')
            for c in model.activation_counter:
                f.write('\t{}: {}\n'.format(c[0], c[1]))
                count += c[1]
            f.write('\ttotal: {}\n'.format(count))

            count = 0
            f.write('\nNumber of flops by layer:\n')
            for c in model.flop_counter:
                f.write('\t{}: {}\n'.format(c[0], c[1]))
                count += c[1]
            f.write('\ttotal: {}\n'.format(count))
        print('Model statistics saved to {}.'.format(
            os.path.join(FLAGS.train_dir, 'model_metrics.txt')))

        def enqueue(sess, coord):
            # Producer thread: keep the input queue filled until the
            # coordinator requests a stop.
            with coord.stop_on_exception():
                while not coord.should_stop():
                    # read batch input
                    lidar_per_batch, lidar_mask_per_batch, label_per_batch, \
                        weight_per_batch = imdb.read_batch()
                    feed_dict = {
                        model.ph_keep_prob: mc.KEEP_PROB,
                        model.ph_lidar_input: lidar_per_batch,
                        model.ph_lidar_mask: lidar_mask_per_batch,
                        model.ph_label: label_per_batch,
                        model.ph_loss_weight: weight_per_batch,
                    }
                    sess.run(model.enqueue_op, feed_dict=feed_dict)

        # tf.all_variables()/tf.initialize_all_variables() are long
        # deprecated; use the global_variables equivalents (matches the
        # rest of this file).
        saver = tf.train.Saver(tf.global_variables())
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        sess.run(init)

        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        coord = tf.train.Coordinator()
        enq_threads = []
        for _ in range(mc.NUM_ENQUEUE_THREAD):
            eqth = threading.Thread(target=enqueue, args=[sess, coord])
            eqth.start()
            enq_threads.append(eqth)

        run_options = tf.RunOptions(timeout_in_ms=60000)

        # Create a Comet.ml experiment.
        # SECURITY: prefer the key from the environment; the literal is
        # kept only as a legacy fallback. This key is committed to
        # source control -- rotate it and rely on COMET_API_KEY instead.
        experiment = Experiment(
            api_key=os.environ.get("COMET_API_KEY",
                                   "lISr0JWgyUIsox8HYPC3isnTP"),
            project_name="squeezeSeg_1080ti",
            workspace="asimonov")
        hyper_params = {
            "learning_rate": mc.LEARNING_RATE,
            "steps": FLAGS.max_steps,
            "batch_size": mc.BATCH_SIZE
        }
        # log_multiple_params is deprecated in the Comet SDK in favour
        # of log_parameters (already used elsewhere in this file).
        experiment.log_parameters(hyper_params)

        try:
            # `xrange` is Python 2 only and a NameError on Python 3;
            # this file otherwise targets Python 3 (f-strings).
            for step in range(FLAGS.max_steps):
                start_time = time.time()
                experiment.set_step(step)

                if step % FLAGS.summary_step == 0 or step == FLAGS.max_steps - 1:
                    # Summary step: also fetch inputs/predictions so we
                    # can compute IoU and render visualisations.
                    op_list = [
                        model.lidar_input, model.lidar_mask, model.label,
                        model.train_op, model.loss, model.pred_cls, summary_op
                    ]
                    lidar_per_batch, lidar_mask_per_batch, label_per_batch, \
                        _, loss_value, pred_cls, summary_str = sess.run(
                            op_list, options=run_options)
                    experiment.log_metric("loss", loss_value)

                    label_image = visualize_seg(label_per_batch[:6, :, :], mc)
                    pred_image = visualize_seg(pred_cls[:6, :, :], mc)

                    # Run evaluation on the batch
                    ious, _, _, _ = evaluate_iou(
                        label_per_batch,
                        pred_cls * np.squeeze(lidar_mask_per_batch),
                        mc.NUM_CLASS)

                    feed_dict = {}
                    # Assume that class-0 is the background class
                    for i in range(1, mc.NUM_CLASS):
                        feed_dict[model.iou_summary_placeholders[i]] = ious[i]
                    iou_summary_list = sess.run(
                        model.iou_summary_ops[1:], feed_dict)

                    # Run visualization
                    viz_op_list = [
                        model.show_label, model.show_depth_img, model.show_pred
                    ]
                    viz_summary_list = sess.run(
                        viz_op_list,
                        feed_dict={
                            model.depth_image_to_show:
                                lidar_per_batch[:6, :, :, [4]],
                            model.label_to_show: label_image,
                            model.pred_image_to_show: pred_image,
                        })

                    # Add summaries
                    summary_writer.add_summary(summary_str, step)
                    for sum_str in iou_summary_list:
                        summary_writer.add_summary(sum_str, step)
                    for viz_sum in viz_summary_list:
                        summary_writer.add_summary(viz_sum, step)
                    # force tensorflow to synchronise summaries
                    summary_writer.flush()
                else:
                    _, loss_value = sess.run([model.train_op, model.loss],
                                             options=run_options)

                duration = time.time() - start_time

                # BUG FIX: the original assert message referenced
                # undefined conf_loss/bbox_loss/class_loss, which would
                # raise a NameError exactly when the assertion fired.
                assert not np.isnan(loss_value), \
                    'Model diverged. Total loss: {}'.format(loss_value)

                if step % 10 == 0:
                    num_images_per_step = mc.BATCH_SIZE
                    images_per_sec = num_images_per_step / duration
                    sec_per_batch = float(duration)
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f images/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        images_per_sec, sec_per_batch))
                    sys.stdout.flush()

                # Save the model checkpoint periodically.
                if step % FLAGS.checkpoint_step == 0 or step == FLAGS.max_steps - 1:
                    checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
        except Exception as e:
            coord.request_stop(e)
        finally:
            # Always unblock and join the enqueue threads, even on error.
            coord.request_stop()
            sess.run(model.q.close(cancel_pending_enqueues=True))
            coord.join(enq_threads)
return rmse_test, auc_test, pearson_test, accuracy_test with tf.Session() as sess: model = nn_model_tensorflow.Model(num_classes, num_steps, sess, RESTORE_MODEL) loss_list = [] # # Training # for epoch_idx in range(NUM_EPOCHS): if LOG_COMET: experiment.set_step(epoch_idx) ### ### TRAINING DATASET ### if RUN_TRAIN: run_train(model, sess) model.save_model() ### ### TESTING DATASET ### if RUN_TEST: rmse, auc, pearson, accuracy = run_test(model) # # if RUN_MAPS: