# Module-level imports assumed by the methods below (they belong to a trainer
# class that also provides self.sess, self.params, self.tensors, self.saver,
# self.writer, self.num_towers, etc.; DataSet and get_pbar are project-local).
import itertools
import json
import logging
import os

import numpy as np


def eval(self, data_set, eval_tensor_names=(), eval_ph_names=(), num_batches=None):
    # TODO : eval_ph_names
    assert isinstance(data_set, DataSet)
    assert self.initialized, "Initialize tower before evaluating."
    params = self.params
    sess = self.sess
    epoch_op = self.tensors['epoch']
    epoch = sess.run(epoch_op)
    progress = params.progress
    num_batches = num_batches or data_set.get_num_batches(partial=True)
    num_iters = int(np.ceil(num_batches / self.num_towers))
    num_corrects, total, total_loss = 0, 0, 0.0
    eval_values = []
    idxs = []
    # N is display-only; cap it at the dataset size.
    N = data_set.batch_size * num_batches
    if N > data_set.num_examples:
        N = data_set.num_examples
    eval_args = self._get_eval_args(epoch)
    string = "eval on %s, N=%d|" % (data_set.name, N)
    if progress:
        pbar = get_pbar(num_iters, prefix=string).start()
    for iter_idx in range(num_iters):
        # Collect up to num_towers batches; the last iteration may have fewer.
        batches = []
        for _ in range(self.num_towers):
            if data_set.has_next_batch(partial=True):
                idxs.extend(data_set.get_batch_idxs(partial=True))
                batches.append(data_set.get_next_labeled_batch(partial=True))
        (cur_num_corrects, cur_avg_loss, _, global_step), eval_value_batches = \
            self._eval_batches(batches, eval_tensor_names=eval_tensor_names, **eval_args)
        num_corrects += cur_num_corrects
        cur_num = sum(len(batch[0]) for batch in batches)
        total += cur_num
        for eval_value_batch in eval_value_batches:
            eval_values.append([x.tolist() for x in eval_value_batch])  # numpy array -> list for JSON
        total_loss += cur_avg_loss * cur_num
        if progress:
            pbar.update(iter_idx)
    if progress:
        pbar.finish()
    loss = float(total_loss) / total
    data_set.reset()
    acc = float(num_corrects) / total
    print("%s at epoch %d: acc = %.2f%% = %d / %d, loss = %.4f" %
          (data_set.name, epoch, 100 * acc, num_corrects, total, loss))

    # For outputting eval json files
    if len(eval_tensor_names) > 0:
        ids = [data_set.idx2id[idx] for idx in idxs]
        # Transpose to per-tensor lists and flatten across iterations.
        zipped_eval_values = [list(itertools.chain(*each)) for each in zip(*eval_values)]
        values = {name: vals for name, vals in zip(eval_tensor_names, zipped_eval_values)}
        out = {'ids': ids, 'values': values}
        eval_path = os.path.join(params.eval_dir, "%s_%s.json" % (data_set.name, str(epoch).zfill(4)))
        with open(eval_path, 'w') as fp:
            json.dump(out, fp)

    return loss, acc
# Variant of eval() that always writes the JSON dump, even when
# eval_tensor_names is empty.
def eval(self, data_set, eval_tensor_names=(), eval_ph_names=(), num_batches=None):
    # TODO : eval_ph_names
    assert isinstance(data_set, DataSet)
    assert self.initialized, "Initialize tower before evaluating."
    params = self.params
    sess = self.sess
    epoch_op = self.tensors['epoch']
    epoch = sess.run(epoch_op)
    progress = params.progress
    num_batches = num_batches or data_set.get_num_batches(partial=True)
    num_iters = int(np.ceil(num_batches / self.num_towers))
    num_corrects, total, total_loss = 0, 0, 0.0
    eval_values = []
    idxs = []
    N = data_set.batch_size * num_batches
    if N > data_set.num_examples:
        N = data_set.num_examples
    eval_args = self._get_eval_args(epoch)
    string = "eval on %s, N=%d|" % (data_set.name, N)
    if progress:
        pbar = get_pbar(num_iters, prefix=string).start()
    for iter_idx in range(num_iters):
        batches = []
        for _ in range(self.num_towers):
            if data_set.has_next_batch(partial=True):
                idxs.extend(data_set.get_batch_idxs(partial=True))
                batches.append(data_set.get_next_labeled_batch(partial=True))
        (cur_num_corrects, cur_avg_loss, _, global_step), eval_value_batches = \
            self._eval_batches(batches, eval_tensor_names=eval_tensor_names, **eval_args)
        num_corrects += cur_num_corrects
        cur_num = sum(len(batch[0]) for batch in batches)
        total += cur_num
        for eval_value_batch in eval_value_batches:
            eval_values.append([x.tolist() for x in eval_value_batch])  # numpy array -> list for JSON
        total_loss += cur_avg_loss * cur_num
        if progress:
            pbar.update(iter_idx)
    if progress:
        pbar.finish()
    loss = float(total_loss) / total
    data_set.reset()
    acc = float(num_corrects) / total
    print("%s at epoch %d: acc = %.2f%% = %d / %d, loss = %.4f" %
          (data_set.name, epoch, 100 * acc, num_corrects, total, loss))

    # For outputting eval json files
    ids = [data_set.idx2id[idx] for idx in idxs]
    zipped_eval_values = [list(itertools.chain(*each)) for each in zip(*eval_values)]
    values = {name: vals for name, vals in zip(eval_tensor_names, zipped_eval_values)}
    out = {'ids': ids, 'values': values}
    eval_path = os.path.join(params.eval_dir, "%s_%s.json" % (data_set.name, str(epoch).zfill(4)))
    with open(eval_path, 'w') as fp:
        json.dump(out, fp)

    return loss, acc
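# --- Added illustration (not from the original source) ---
# The trickiest step in eval() above is merging per-iteration results:
# `eval_values` is a list over iterations, each entry holding one list per
# eval tensor; `zip(*eval_values)` groups values by tensor, and
# itertools.chain flattens across iterations. A self-contained toy run,
# with made-up tensor names and values:
def _demo_merge_eval_values():
    eval_tensor_names = ('logits', 'probs')  # hypothetical names
    eval_values = [
        [[1, 2], [0.1, 0.2]],  # iteration 0: one list per tensor
        [[3], [0.3]],          # iteration 1
    ]
    zipped = [list(itertools.chain(*each)) for each in zip(*eval_values)]
    values = {name: vals for name, vals in zip(eval_tensor_names, zipped)}
    assert values == {'logits': [1, 2, 3], 'probs': [0.1, 0.2, 0.3]}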
# Variant of train() that only writes summaries when self.write_log is set.
def train(self, train_data_set, num_epochs, val_data_set=None, eval_ph_names=(),
          eval_tensor_names=(), num_batches=None, val_num_batches=None):
    assert isinstance(train_data_set, DataSet)
    assert self.initialized, "Initialize tower before training."
    sess = self.sess
    writer = self.writer
    params = self.params
    progress = params.progress
    val_loss, val_acc = None, None  # returned as-is if validation never runs
    # if num_batches is specified, then train only that many
    num_batches = num_batches or train_data_set.get_num_batches(partial=False)
    num_iters_per_epoch = int(num_batches / self.num_towers)
    num_digits = int(np.log10(num_batches))
    epoch_op = self.tensors['epoch']
    epoch = sess.run(epoch_op)
    print("training %d epochs ... " % num_epochs)
    logging.info("num iters per epoch: %d" % num_iters_per_epoch)
    logging.info("starting from epoch %d." % (epoch + 1))
    while epoch < num_epochs:
        train_args = self._get_train_args(epoch)
        if progress:
            pbar = get_pbar(num_iters_per_epoch, "epoch %s|" % str(epoch + 1).zfill(num_digits)).start()
        for iter_idx in range(num_iters_per_epoch):
            batches = [train_data_set.get_next_labeled_batch() for _ in range(self.num_towers)]
            _, summary, global_step = self._train_batches(batches, **train_args)
            if self.write_log:
                writer.add_summary(summary, global_step)
            if progress:
                pbar.update(iter_idx)
        if progress:
            pbar.finish()
        train_data_set.complete_epoch()
        # The epoch counter lives in the graph so checkpoints resume correctly.
        assign_op = epoch_op.assign_add(1)
        _, epoch = sess.run([assign_op, epoch_op])

        if val_data_set and epoch % params.val_period == 0:
            self.eval(train_data_set, eval_tensor_names=eval_tensor_names, num_batches=val_num_batches)
            val_loss, val_acc = self.eval(val_data_set, eval_tensor_names=eval_tensor_names,
                                          num_batches=val_num_batches)
        if epoch % params.save_period == 0:
            self.save()

    return val_loss, val_acc
# Variant of train() that always writes summaries (no write_log guard).
def train(self, train_data_set, num_epochs, val_data_set=None, eval_ph_names=(),
          eval_tensor_names=(), num_batches=None, val_num_batches=None):
    assert isinstance(train_data_set, DataSet)
    assert self.initialized, "Initialize tower before training."
    sess = self.sess
    writer = self.writer
    params = self.params
    progress = params.progress
    val_loss, val_acc = None, None  # returned as-is if validation never runs
    # if num_batches is specified, then train only that many
    num_batches = num_batches or train_data_set.get_num_batches(partial=False)
    num_iters_per_epoch = int(num_batches / self.num_towers)
    num_digits = int(np.log10(num_batches))
    epoch_op = self.tensors['epoch']
    epoch = sess.run(epoch_op)
    print("training %d epochs ... " % num_epochs)
    logging.info("num iters per epoch: %d" % num_iters_per_epoch)
    logging.info("starting from epoch %d." % (epoch + 1))
    while epoch < num_epochs:
        train_args = self._get_train_args(epoch)
        if progress:
            pbar = get_pbar(num_iters_per_epoch, "epoch %s|" % str(epoch + 1).zfill(num_digits)).start()
        for iter_idx in range(num_iters_per_epoch):
            batches = [train_data_set.get_next_labeled_batch() for _ in range(self.num_towers)]
            _, summary, global_step = self._train_batches(batches, **train_args)
            writer.add_summary(summary, global_step)
            if progress:
                pbar.update(iter_idx)
        if progress:
            pbar.finish()
        train_data_set.complete_epoch()
        assign_op = epoch_op.assign_add(1)
        _, epoch = sess.run([assign_op, epoch_op])

        if val_data_set and epoch % params.val_period == 0:
            self.eval(train_data_set, eval_tensor_names=eval_tensor_names, num_batches=val_num_batches)
            val_loss, val_acc = self.eval(val_data_set, eval_tensor_names=eval_tensor_names,
                                          num_batches=val_num_batches)
        if epoch % params.save_period == 0:
            self.save()

    return val_loss, val_acc
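# --- Added illustration (not from the original source) ---
# The train() variants above keep the epoch counter in the graph
# (self.tensors['epoch']) and advance it with assign_add, so restoring a
# checkpoint also restores the training position. A minimal sketch of that
# pattern, assuming TensorFlow 1.x graph mode:
def _demo_epoch_variable():
    import tensorflow as tf  # assumes TF 1.x
    epoch_op = tf.Variable(0, name='epoch', trainable=False)
    assign_op = epoch_op.assign_add(1)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(assign_op)        # one epoch completed
        epoch = sess.run(epoch_op)
        assert epoch == 1          # a tf.train.Saver would checkpoint this counter too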
def train(self, train_data_set, val_data_set=None, eval_tensor_names=()):
    assert isinstance(train_data_set, DataSet)
    assert self.initialized, "Initialize tower before training."
    # TODO : allow partial batch
    sess = self.sess
    writer = self.writer
    params = self.params
    num_epochs = params.num_epochs
    num_batches = params.train_num_batches if params.train_num_batches >= 0 \
        else train_data_set.get_num_batches(partial=False)
    num_iters_per_epoch = int(num_batches / self.num_towers)
    num_digits = int(np.log10(num_batches))
    epoch_op = self.tensors['epoch']
    epoch = sess.run(epoch_op)
    print("training %d epochs ... " % num_epochs)
    print("num iters per epoch: %d" % num_iters_per_epoch)
    print("starting from epoch %d." % (epoch + 1))
    while epoch < num_epochs:
        train_args = self._get_train_args(epoch)
        pbar = get_pbar(num_iters_per_epoch, "epoch %s|" % str(epoch + 1).zfill(num_digits)).start()
        for iter_idx in range(num_iters_per_epoch):
            batches = [train_data_set.get_next_labeled_batch() for _ in range(self.num_towers)]
            _, summary, global_step = self._train_batches(batches, **train_args)
            writer.add_summary(summary, global_step)
            pbar.update(iter_idx)
        pbar.finish()
        train_data_set.complete_epoch()
        assign_op = epoch_op.assign_add(1)
        _, epoch = sess.run([assign_op, epoch_op])

        if val_data_set and epoch % params.val_period == 0:
            self.eval(train_data_set, is_val=True, eval_tensor_names=eval_tensor_names)
            self.eval(val_data_set, is_val=True, eval_tensor_names=eval_tensor_names)
        if epoch % params.save_period == 0:
            self.save()
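# --- Added illustration (not from the original source) ---
# The params-driven variant above pulls its whole schedule from `params`.
# A minimal sketch of the fields these methods read, using a plain
# namespace (the real project presumably builds this from its own config):
from types import SimpleNamespace

_demo_params = SimpleNamespace(
    num_epochs=12,          # total epochs to train
    train_num_batches=-1,   # negative -> use the full dataset
    val_period=1,           # validate every epoch
    save_period=1,          # checkpoint every epoch
    progress=True,          # show progress bars
    eval_dir='evals',       # where eval() writes its JSON dumps
)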
# Variant of train() with early stopping: stop after 5 validation rounds
# without improvement, then restore the best checkpoint before returning.
def train(self, train_data_set, num_epochs, val_data_set=None, eval_ph_names=(),
          eval_tensor_names=(), num_batches=None, val_num_batches=None):
    assert isinstance(train_data_set, DataSet)
    assert self.initialized, "Initialize tower before training."
    sess = self.sess
    writer = self.writer
    params = self.params
    progress = params.progress
    val_acc = None
    # if num_batches is specified, then train only that many
    num_batches = num_batches or train_data_set.get_num_batches(partial=False)
    num_iters_per_epoch = int(num_batches / self.num_towers)
    num_digits = int(np.log10(num_batches))
    epoch_op = self.tensors['epoch']
    epoch = sess.run(epoch_op)
    print("training %d epochs ... " % num_epochs)
    logging.info("num iters per epoch: %d" % num_iters_per_epoch)
    logging.info("starting from epoch %d." % (epoch + 1))
    best_global_step = sess.run(self.tensors['global_step'])  # value, not the tensor
    global_step = best_global_step  # in case the loop body never runs
    best_val_acc = 0.0
    best_val_loss = 99999
    count = 0  # validation rounds since the last improvement
    while epoch < num_epochs:
        train_args = self._get_train_args(epoch)
        if progress:
            pbar = get_pbar(num_iters_per_epoch, "epoch %s|" % str(epoch + 1).zfill(num_digits)).start()
        for iter_idx in range(num_iters_per_epoch):
            batches = [train_data_set.get_next_labeled_batch() for _ in range(self.num_towers)]
            _, summary, global_step = self._train_batches(batches, **train_args)
            if self.write_log:
                writer.add_summary(summary, global_step)
            if progress:
                pbar.update(iter_idx)
        if progress:
            pbar.finish()
        train_data_set.complete_epoch()
        assign_op = epoch_op.assign_add(1)
        _, epoch = sess.run([assign_op, epoch_op])
        global_step = sess.run(self.tensors['global_step'])

        if val_data_set and epoch % params.val_period == 0:
            self.eval(train_data_set, eval_tensor_names=eval_tensor_names, num_batches=val_num_batches)
            val_loss, val_acc = self.eval(val_data_set, eval_tensor_names=eval_tensor_names,
                                          num_batches=val_num_batches)
            if val_acc > best_val_acc or (val_acc == best_val_acc and val_loss < best_val_loss):
                count = 0
                best_val_acc = val_acc
                best_val_loss = val_loss
                best_global_step = global_step
                self.save()
            elif val_loss < best_val_loss:
                count = 0  # loss still improving; reset patience without saving
            else:
                count += 1
            if count >= 5:
                break

    if best_global_step != global_step:
        # Roll back to the best checkpoint before returning.
        save_dir = self.params.save_dir
        name = self.params.model_name
        save_path = os.path.join(save_dir, name)
        self.saver.restore(sess, '%s-%s' % (save_path, best_global_step))
    return best_val_loss, best_val_acc
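# --- Added illustration (not from the original source) ---
# In the early-stopping variant above, the patience counter resets whenever
# validation accuracy improves (loss breaking ties, which also saves a
# checkpoint) or validation loss alone improves; otherwise it increments,
# and training stops at 5. The reset rule in isolation, with made-up numbers:
def _patience_resets(val_acc, val_loss, best_val_acc, best_val_loss):
    improved = val_acc > best_val_acc or (val_acc == best_val_acc and val_loss < best_val_loss)
    return improved or val_loss < best_val_loss

assert _patience_resets(0.80, 1.0, best_val_acc=0.75, best_val_loss=1.2)      # better acc
assert _patience_resets(0.75, 1.1, best_val_acc=0.75, best_val_loss=1.2)      # same acc, lower loss
assert not _patience_resets(0.70, 1.3, best_val_acc=0.75, best_val_loss=1.2)  # worse on both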