def main(_argv):
    """Evaluate saved CIFAR-10 weights and print top-1 / top-5 accuracy.

    Loads the YAML config at ``FLAGS.cfg_path``, builds a ``CifarModel`` in
    inference mode, restores ``./checkpoints/<sub_name>/best.ckpt`` and runs
    it over the CIFAR-10 ``test`` split, printing the running and final
    averages.

    Args:
        _argv: Unused.

    Raises:
        SystemExit: With status 1 when the checkpoint cannot be loaded.
    """
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # load dataset
    test_dataset = load_cifar10_dataset(
        cfg['val_batch_size'], split='test', shuffle=False,
        drop_remainder=False, using_crop=False, using_flip=False,
        using_cutout=False)

    # Single source of truth for the weights location (used for both the
    # load call and the log messages).
    checkpoint_path = './checkpoints/' + cfg['sub_name'] + '/best.ckpt'

    # define network
    # TODO : change cfg
    # NOTE(review): `cfg` is not modified per iteration yet, so every pass
    # currently evaluates the same model/checkpoint — confirm intended scope
    # of this loop once the TODO above is resolved.
    for num_arch in range(50):
        model = CifarModel(cfg, training=False)
        model.summary(line_length=80)
        print("param size = {:f}MB".format(count_parameters_in_MB(model)))

        # load checkpoint
        try:
            model.load_weights(checkpoint_path)
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C / SystemExit still
            # propagate; report the underlying cause instead of hiding it.
            print("[*] Cannot find ckpt from {}.".format(checkpoint_path))
            print("    reason: {!r}".format(err))
            raise SystemExit(1)
        print("[*] load ckpt from {}.".format(checkpoint_path))

        # inference
        top1 = AvgrageMeter()
        top5 = AvgrageMeter()

        for step, (inputs, labels) in enumerate(test_dataset):
            # run model
            logits = model(inputs)

            # calculate top1, top5 acc
            prec1, prec5 = accuracy(
                logits.numpy(), labels.numpy(), topk=(1, 5))
            # Weight by the actual batch size: the last batch may be smaller
            # because drop_remainder=False.
            n = inputs.shape[0]
            top1.update(prec1, n)
            top5.update(prec5, n)

            print(" {:03d}: top1 {:f}, top5 {:f}".format(
                step, top1.avg, top5.avg))

        print("Test Acc: top1 {:.2f}%, top5 {:.2f}%".format(
            top1.avg, top5.avg))
def _make_train_step(model, optimizer, criterion, cfg):
    """Build a graph-compiled training step bound to one model/optimizer.

    A fresh ``tf.function`` is created on every call so that its trace
    captures exactly the objects passed in. Reusing one ``tf.function``
    across different models would keep executing the first trace (the input
    signature does not change), i.e. keep updating the first model.

    Args:
        model: Network called as ``model((inputs, drop_path_prob),
            training=True)`` and returning ``(logits, logits_aux)``.
        optimizer: Optimizer used to apply the clipped gradients.
        criterion: Loss callable invoked as ``criterion(labels, logits)``.
        cfg: Config mapping; reads ``auxiliary_weight`` and ``grad_clip``.

    Returns:
        A callable ``train_step(inputs, labels, drop_path_prob)`` returning
        ``(logits, total_loss, losses)`` where ``losses`` is a dict with the
        keys ``reg``, ``ce`` and ``ce_auxiliary``.
    """
    @tf.function
    def train_step(inputs, labels, drop_path_prob):
        with tf.GradientTape() as tape:
            logits, logits_aux = model((inputs, drop_path_prob), training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(model.losses)
            losses['ce'] = criterion(labels, logits)
            losses['ce_auxiliary'] = \
                cfg['auxiliary_weight'] * criterion(labels, logits_aux)
            total_loss = tf.add_n(list(losses.values()))

        grads = tape.gradient(total_loss, model.trainable_variables)
        # Each gradient tensor is clipped independently by its own norm.
        grads = [tf.clip_by_norm(grad, cfg['grad_clip']) for grad in grads]
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        return logits, total_loss, losses

    return train_step


def main(_):
    """Train a series of architectures on CIFAR-10 and report the best one.

    For each architecture index (50, or 1 when the module-level ``Debug``
    flag is set) this builds a ``CifarModel`` named ``<sub_name>_<index>``,
    trains it with SGD on a cosine-annealed learning rate, periodically
    validates on the ``test`` split, writes TensorBoard summaries and
    checkpoints, and records the best validation accuracy. After all
    architectures are trained, the row with the highest accuracy is printed.

    Args:
        _: Unused.
    """
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # Used to store the final accuracy for every arch
    final_acc = pd.DataFrame(data=None, columns=['arch_name', 'acc'])

    loop_num = 50
    if Debug:
        loop_num = 1

    # load dataset
    # The dataset arguments do not depend on the architecture, so the
    # pipelines are built once and re-iterated for every architecture.
    train_dataset = load_cifar10_dataset(
        cfg['batch_size'], split='train', shuffle=True, drop_remainder=True,
        using_normalize=cfg['using_normalize'], using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'], using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])
    val_dataset = load_cifar10_dataset(
        cfg['val_batch_size'], split='test', shuffle=False,
        drop_remainder=False, using_normalize=cfg['using_normalize'],
        using_crop=False, using_flip=False, using_cutout=False)

    steps_per_epoch = cfg['dataset_len'] // cfg['batch_size']
    total_steps = steps_per_epoch * cfg['epoch']

    # define network
    for arch_num in range(loop_num):
        # read the arch
        arch = f"{cfg['sub_name']}_{arch_num}"
        cfg['arch'] = arch
        model = CifarModel(cfg, training=True, file_name=FLAGS.file_name)
        if Debug:
            model.summary(line_length=80)
        print("param size = {:f}MB".format(count_parameters_in_MB(model)))

        # define optimizer
        learning_rate = CosineAnnealingLR(
            initial_learning_rate=cfg['init_lr'],
            t_period=cfg['epoch'] * steps_per_epoch, lr_min=cfg['lr_min'])
        optimizer = tf.keras.optimizers.SGD(
            learning_rate=learning_rate, momentum=cfg['momentum'])

        # define losses function
        criterion = CrossEntropyLoss()

        # define training step function
        # Built per architecture so the traced graph is bound to THIS
        # model/optimizer rather than the ones from the first iteration.
        train_step = _make_train_step(model, optimizer, criterion, cfg)

        # load checkpoint
        checkpoint_dir = './checkpoints/' + arch
        checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                         optimizer=optimizer,
                                         model=model)
        manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                             directory=checkpoint_dir,
                                             max_to_keep=3)
        if manager.latest_checkpoint:
            checkpoint.restore(manager.latest_checkpoint)
            print('[*] load ckpt from {} at step {}.'.format(
                manager.latest_checkpoint, checkpoint.step.numpy()))
        else:
            print("[*] training from scratch.")

        # training loop
        summary_writer = tf.summary.create_file_writer(
            './logs/' + cfg['sub_name'])
        # When resuming, only the steps not yet taken are run.
        remain_steps = max(total_steps - checkpoint.step.numpy(), 0)
        prog_bar = ProgressBar(steps_per_epoch,
                               checkpoint.step.numpy() % steps_per_epoch)

        train_acc = AvgrageMeter()
        val_acc = AvgrageMeter()
        best_acc = 0.
        for inputs, labels in train_dataset.take(remain_steps):
            checkpoint.step.assign_add(1)
            # Drop-path probability grows linearly with training progress.
            drop_path_prob = cfg['drop_path_prob'] * (
                tf.cast(checkpoint.step, tf.float32) / total_steps)
            steps = checkpoint.step.numpy()
            epochs = ((steps - 1) // steps_per_epoch) + 1

            logits, total_loss, losses = train_step(
                inputs, labels, drop_path_prob)
            train_acc.update(
                accuracy(logits.numpy(), labels.numpy())[0],
                cfg['batch_size'])

            prog_bar.update(
                "epoch={}/{}, loss={:.4f}, acc={:.2f}, lr={:.2e}".format(
                    epochs, cfg['epoch'], total_loss.numpy(), train_acc.avg,
                    optimizer.lr(steps).numpy()))

            if steps % cfg['val_steps'] == 0 and steps > 1:
                print("\n[*] validate...", end='')
                val_acc.reset()
                for inputs_val, labels_val in val_dataset:
                    # Validation runs with drop-path disabled (prob 0).
                    logits_val, _ = model((inputs_val, tf.constant([0.])))
                    val_acc.update(
                        accuracy(logits_val.numpy(), labels_val.numpy())[0],
                        inputs_val.shape[0])

                if val_acc.avg > best_acc:
                    best_acc = val_acc.avg
                    # NOTE(review): this path is keyed on `sub_name`, not on
                    # `arch`, so every architecture writes to the same
                    # best.ckpt. Left unchanged because the evaluation
                    # script appears to read exactly this path — confirm.
                    model.save_weights(
                        f"checkpoints/{cfg['sub_name']}/best.ckpt")

                val_str = " val acc {:.2f}%, best acc {:.2f}%"
                print(val_str.format(val_acc.avg, best_acc), end='')

            if steps % 10 == 0:
                with summary_writer.as_default():
                    tf.summary.scalar('acc/train', train_acc.avg, step=steps)
                    tf.summary.scalar('acc/val', val_acc.avg, step=steps)

                    tf.summary.scalar(
                        'loss/total_loss', total_loss, step=steps)
                    for k, l in losses.items():
                        tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                    tf.summary.scalar(
                        'learning_rate', optimizer.lr(steps), step=steps)

            if steps % cfg['save_steps'] == 0:
                manager.save()
                print("\n[*] save ckpt file at {}".format(
                    manager.latest_checkpoint))

            # Training accuracy is averaged per epoch.
            if steps % steps_per_epoch == 0:
                train_acc.reset()

        manager.save()
        print("\n[*] training one arch done! save ckpt file at {}".format(
            manager.latest_checkpoint))
        final_acc.loc[arch_num] = [arch, best_acc]

    print("Whole training ended, the best result is :")
    # The frame starts empty with object columns, so cast before reducing;
    # idxmax returns an index *label*, hence .loc rather than .iloc.
    best_label = final_acc['acc'].astype(float).idxmax()
    print("\t", final_acc.loc[best_label])
def main(_):
    """Randomly sample 50 architectures from the search network and log them.

    Builds the ``SearchNetArch`` supernet, restores its latest checkpoint if
    one exists, then repeatedly calls
    ``sna.get_genotype(random_search_flag=True)`` and appends each sampled
    genotype to ``./logs/<sub_name>/search_random_arch_genotype.py`` as
    ``<sub_name>_<index> = <genotype>``.

    The setup is, per the original note, basically the same as
    train_search.py. No training loop is executed here: the datasets and the
    two training-step functions are defined but not called yet.

    TODO: Add PGD here and calculate FSP

    Args:
        _: Unused.
    """
    num_samples = 50

    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    sna = SearchNetArch(cfg)
    sna.model.summary(line_length=80)
    print("param size = {:f}MB".format(count_parameters_in_MB(sna.model)))

    # load dataset
    # The training split is partitioned by percentage into a weight-training
    # part and a held-out part.
    train_percent = int(cfg['train_portion'] * 100)
    t_split = f"train[0%:{train_percent}%]"
    v_split = f"train[{train_percent}%:100%]"
    train_dataset = load_cifar10_dataset(
        cfg['batch_size'], split=t_split, shuffle=True, drop_remainder=True,
        using_normalize=cfg['using_normalize'], using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'], using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])
    val_dataset = load_cifar10_dataset(
        cfg['batch_size'], split=v_split, shuffle=True, drop_remainder=True,
        using_normalize=cfg['using_normalize'], using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'], using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])

    # define optimizer
    steps_per_epoch = int(
        cfg['dataset_len'] * cfg['train_portion'] // cfg['batch_size'])
    learning_rate = CosineAnnealingLR(
        initial_learning_rate=cfg['init_lr'],
        t_period=cfg['epoch'] * steps_per_epoch, lr_min=cfg['lr_min'])
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=learning_rate, momentum=cfg['momentum'])
    optimizer_arch = tf.keras.optimizers.Adam(
        learning_rate=cfg['arch_learning_rate'], beta_1=0.5, beta_2=0.999)

    # define losses function
    criterion = CrossEntropyLoss()

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     optimizer_arch=optimizer_arch,
                                     model=sna.model,
                                     alphas_normal=sna.alphas_normal,
                                     alphas_reduce=sna.alphas_reduce,
                                     betas_normal=sna.betas_normal,
                                     betas_reduce=sna.betas_reduce)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")
    print(f"[*] searching model after {cfg['start_search_epoch']} epochs.")

    # define training step function for model
    # (not called in this script yet; see the TODO in the docstring)
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(sna.model.losses)
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n(list(losses.values()))

        grads = tape.gradient(total_loss, sna.model.trainable_variables)
        grads = [tf.clip_by_norm(grad, cfg['grad_clip']) for grad in grads]
        optimizer.apply_gradients(zip(grads, sna.model.trainable_variables))

        return logits, total_loss, losses

    # define training step function for arch_parameters
    # (not called in this script yet; see the TODO in the docstring)
    @tf.function
    def train_step_arch(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            # L2 penalty on the architecture parameters.
            losses['reg'] = cfg['arch_weight_decay'] * tf.add_n(
                [tf.reduce_sum(p**2) for p in sna.arch_parameters])
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n(list(losses.values()))

        grads = tape.gradient(total_loss, sna.arch_parameters)
        optimizer_arch.apply_gradients(zip(grads, sna.arch_parameters))

        return losses

    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])

    print("[*] finished searching for one epoch")
    print("[*] Start sampling architetures")
    prog_bar = ProgressBar(num_samples, 0)

    # Make sure the target directory exists before appending to the file.
    log_dir = os.path.join('./logs', cfg['sub_name'])
    os.makedirs(log_dir, exist_ok=True)
    genotype_path = os.path.join(log_dir, 'search_random_arch_genotype.py')

    # Start sampling for 50 archs
    # The file is opened once in append mode; `with` guarantees it is closed
    # (and flushed) even if sampling raises part-way through.
    with open(genotype_path, 'a') as genotype_file:
        for geno_num in range(num_samples):
            genotype = sna.get_genotype(random_search_flag=True)
            prog_bar.update(f"\n Sampled{geno_num}th arch: {genotype}")
            genotype_file.write(
                f"\n{cfg['sub_name']}_{geno_num} = {genotype}\n")

    print("Sampling done!")
    # NOTE(review): this waits for a debugger client to attach after all
    # work is done — looks like a debugging leftover; confirm before removing.
    debugpy.wait_for_client()
def _cycle_batches(dataset):
    """Yield batches from ``dataset`` indefinitely, restarting on exhaustion.

    Stops (instead of spinning forever) if a full pass yields nothing.
    """
    while True:
        produced = False
        for batch in dataset:
            produced = True
            yield batch
        if not produced:
            return


def main(_):
    """Run the architecture search: train supernet weights and arch params.

    Trains ``SearchNetArch``'s model with SGD on the first
    ``train_portion`` of the CIFAR-10 training split. Once the current epoch
    exceeds ``cfg['start_search_epoch']``, each step additionally updates the
    architecture parameters with Adam on a batch from the remaining part of
    the split, and at the end of every such epoch the current genotype is
    printed and appended to ``./logs/<sub_name>/search_arch_genotype.py``.
    Summaries and checkpoints are written periodically; training resumes
    from the latest checkpoint when one exists.

    Args:
        _: Unused.
    """
    # init
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    logger = tf.get_logger()
    logger.disabled = True
    logger.setLevel(logging.FATAL)
    set_memory_growth()

    cfg = load_yaml(FLAGS.cfg_path)

    # define network
    sna = SearchNetArch(cfg)
    sna.model.summary(line_length=80)
    print("param size = {:f}MB".format(count_parameters_in_MB(sna.model)))

    # load dataset
    train_percent = int(cfg['train_portion'] * 100)
    t_split = f"train[0%:{train_percent}%]"
    v_split = f"train[{train_percent}%:100%]"
    train_dataset = load_cifar10_dataset(
        cfg['batch_size'], split=t_split, shuffle=True, drop_remainder=True,
        using_normalize=cfg['using_normalize'], using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'], using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])
    val_dataset = load_cifar10_dataset(
        cfg['batch_size'], split=v_split, shuffle=True, drop_remainder=True,
        using_normalize=cfg['using_normalize'], using_crop=cfg['using_crop'],
        using_flip=cfg['using_flip'], using_cutout=cfg['using_cutout'],
        cutout_length=cfg['cutout_length'])

    # define optimizer
    steps_per_epoch = int(
        cfg['dataset_len'] * cfg['train_portion'] // cfg['batch_size'])
    learning_rate = CosineAnnealingLR(
        initial_learning_rate=cfg['init_lr'],
        t_period=cfg['epoch'] * steps_per_epoch, lr_min=cfg['lr_min'])
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=learning_rate, momentum=cfg['momentum'])
    optimizer_arch = tf.keras.optimizers.Adam(
        learning_rate=cfg['arch_learning_rate'], beta_1=0.5, beta_2=0.999)

    # define losses function
    criterion = CrossEntropyLoss()

    # load checkpoint
    checkpoint_dir = './checkpoints/' + cfg['sub_name']
    checkpoint = tf.train.Checkpoint(step=tf.Variable(0, name='step'),
                                     optimizer=optimizer,
                                     optimizer_arch=optimizer_arch,
                                     model=sna.model,
                                     alphas_normal=sna.alphas_normal,
                                     alphas_reduce=sna.alphas_reduce,
                                     betas_normal=sna.betas_normal,
                                     betas_reduce=sna.betas_reduce)
    manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                         directory=checkpoint_dir,
                                         max_to_keep=3)
    if manager.latest_checkpoint:
        checkpoint.restore(manager.latest_checkpoint)
        print('[*] load ckpt from {} at step {}.'.format(
            manager.latest_checkpoint, checkpoint.step.numpy()))
    else:
        print("[*] training from scratch.")
    print(f"[*] searching model after {cfg['start_search_epoch']} epochs.")

    # define training step function for model
    @tf.function
    def train_step(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            losses['reg'] = tf.reduce_sum(sna.model.losses)
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n(list(losses.values()))

        grads = tape.gradient(total_loss, sna.model.trainable_variables)
        # Each gradient tensor is clipped independently by its own norm.
        grads = [tf.clip_by_norm(grad, cfg['grad_clip']) for grad in grads]
        optimizer.apply_gradients(zip(grads, sna.model.trainable_variables))

        return logits, total_loss, losses

    # define training step function for arch_parameters
    @tf.function
    def train_step_arch(inputs, labels):
        with tf.GradientTape() as tape:
            logits = sna.model((inputs, *sna.arch_parameters), training=True)

            losses = {}
            # L2 penalty on the architecture parameters.
            losses['reg'] = cfg['arch_weight_decay'] * tf.add_n(
                [tf.reduce_sum(p**2) for p in sna.arch_parameters])
            losses['ce'] = criterion(labels, logits)
            total_loss = tf.add_n(list(losses.values()))

        grads = tape.gradient(total_loss, sna.arch_parameters)
        optimizer_arch.apply_gradients(zip(grads, sna.arch_parameters))

        return losses

    # training loop
    summary_writer = tf.summary.create_file_writer('./logs/' + cfg['sub_name'])
    total_steps = steps_per_epoch * cfg['epoch']
    # When resuming, only the steps not yet taken are run.
    remain_steps = max(total_steps - checkpoint.step.numpy(), 0)
    prog_bar = ProgressBar(steps_per_epoch,
                           checkpoint.step.numpy() % steps_per_epoch)

    # One persistent iterator over the held-out split. Building a new
    # iterator on every step would restart the input pipeline each time and
    # only ever draw its first batch.
    val_batches = _cycle_batches(val_dataset)

    log_dir = os.path.join('./logs', cfg['sub_name'])
    genotype_path = os.path.join(log_dir, 'search_arch_genotype.py')

    train_acc = AvgrageMeter()
    for inputs, labels in train_dataset.take(remain_steps):
        checkpoint.step.assign_add(1)
        steps = checkpoint.step.numpy()
        epochs = ((steps - 1) // steps_per_epoch) + 1
        searching = epochs > cfg['start_search_epoch']

        if searching:
            inputs_val, labels_val = next(val_batches)
            arch_losses = train_step_arch(inputs_val, labels_val)

        logits, total_loss, losses = train_step(inputs, labels)
        train_acc.update(
            accuracy(logits.numpy(), labels.numpy())[0], cfg['batch_size'])

        prog_bar.update(
            "epoch={:d}/{:d}, loss={:.4f}, acc={:.2f}, lr={:.2e}".format(
                epochs, cfg['epoch'], total_loss.numpy(), train_acc.avg,
                optimizer.lr(steps).numpy()))

        if steps % 10 == 0:
            with summary_writer.as_default():
                tf.summary.scalar('acc/train', train_acc.avg, step=steps)

                tf.summary.scalar('loss/total_loss', total_loss, step=steps)
                for k, l in losses.items():
                    tf.summary.scalar('loss/{}'.format(k), l, step=steps)
                tf.summary.scalar(
                    'learning_rate', optimizer.lr(steps), step=steps)

                if searching:
                    for k, l in arch_losses.items():
                        tf.summary.scalar(
                            'arch_losses/{}'.format(k), l, step=steps)
                    tf.summary.scalar('arch_learning_rate',
                                      cfg['arch_learning_rate'], step=steps)

        if steps % cfg['save_steps'] == 0:
            manager.save()
            print("\n[*] save ckpt file at {}".format(
                manager.latest_checkpoint))

        # End of an epoch: restart the running accuracy and, while
        # searching, record the genotype derived at this point.
        if steps % steps_per_epoch == 0:
            train_acc.reset()
            if searching:
                genotype = sna.get_genotype()
                print(f"\nsearch arch: {genotype}")
                os.makedirs(log_dir, exist_ok=True)
                with open(genotype_path, 'a') as genotype_file:
                    genotype_file.write(
                        f"\n{cfg['sub_name']}_{epochs} = {genotype}\n")

    manager.save()
    print("\n[*] training done! save ckpt file at {}".format(
        manager.latest_checkpoint))