def op_sampler(self, model, architect, args):
    """
    Sample operations for each node, used mainly for the FairNAS procedure.

    Yields len(avail_ops) child models; across one full round, every
    available op is assigned exactly once to every intermediate node
    (strict fairness).

    :param model: supernet to be re-specified in place.
    :param architect: unused, kept for sampler interface compatibility.
    :param args: unused, kept for sampler interface compatibility.
    :return: generator over child models.
    """
    spec = self.model_spec
    ops = spec.ops
    avail_ops = self.search_space.available_ops
    try:
        # One row per intermediate node (ops minus 'input'/'output'),
        # each row an independent random permutation of the op indices.
        op_vs_choice = np.tile(np.arange(len(avail_ops)), (len(ops) - 2, 1))
        op_vs_choice = np.apply_along_axis(
            np.random.permutation, 1, op_vs_choice).transpose()
        for i in range(len(avail_ops)):
            new_ops = [avail_ops[ind] for ind in op_vs_choice[i]]
            spec.ops = ['input'] + new_ops + ['output']
            yield change_model_spec(model, spec)
    except ValueError as e:
        logging.warning(
            f'Op sampler: received exception {e}, '
            f'return the original model without any op sampling.')
        yield model
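# Usage sketch (illustrative only, not part of this class): under
# FairNAS-style strict fairness, all children sampled in one round share
# the same mini-batch, and the optimizer steps once per round. `policy`,
# `inputs`, `targets`, `criterion`, and `optimizer` are hypothetical names.
#
# def fairnas_round(policy, model, inputs, targets, criterion, optimizer):
#     optimizer.zero_grad()
#     for child in policy.op_sampler(model, architect=None, args=None):
#         logits, _ = child(inputs)
#         criterion(logits, targets).backward()  # accumulate grads fairly
#     optimizer.step()  # one update after every op has been trained once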
def child_valid(self, model, valid_queue, arch_pool, criterion):
    """Evaluate every architecture in arch_pool, one batch each."""
    valid_acc_list = []
    objs = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    logging.info("num valid arch {}".format(len(arch_pool)))
    with torch.no_grad():
        model.eval()
        for i, arch in enumerate(arch_pool):
            # valid_queue is a cyclic wrapper; each arch gets one fresh batch.
            inputs, targets = valid_queue.next_batch()
            inputs = inputs.cuda()
            targets = targets.cuda()
            n = inputs.size(0)
            model = change_model_spec(model, self.search_space.topologies[arch])
            logits, _ = model(inputs)
            loss = criterion(logits, targets)
            prec1, prec5 = utils.accuracy(logits, targets, topk=(1, 5))
            objs.update(loss.item(), n)
            top1.update(prec1.item(), n)
            top5.update(prec5.item(), n)
            valid_acc_list.append(prec1.data / 100)
            if (i + 1) % 100 == 0:
                logging.info(
                    'Valid arch %s\n loss %.2f top1 %f top5 %f',
                    self.search_space.process_archname_by_id(arch),
                    loss, prec1, prec5)
    self.save_arch_pool_performance(arch_pool, valid_acc_list, prefix='valid')
    return valid_acc_list, objs.avg, top1.avg
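# child_valid assumes valid_queue exposes next_batch(), i.e. a cyclic
# iterator over a DataLoader rather than the DataLoader itself. A minimal
# sketch of such a wrapper (hypothetical, assuming a standard PyTorch
# DataLoader underneath):
#
# class CyclicQueue:
#     def __init__(self, loader):
#         self.loader = loader
#         self._iter = iter(loader)
#
#     def next_batch(self):
#         try:
#             return next(self._iter)
#         except StopIteration:
#             self._iter = iter(self.loader)  # restart when exhausted
#             return next(self._iter)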
def random_sampler(self, model, architect, args):
    """
    Sample a random topology and update the supernet to match it.

    :param model: supernet to be re-specified in place.
    :param architect: unused, kept for sampler interface compatibility.
    :param args: unused, kept for sampler interface compatibility.
    :return: the supernet configured as the sampled child model.
    """
    rand_spec_id, rand_spec = self.search_space.random_topology()
    self.model_spec_id = rand_spec_id
    self.model_spec = rand_spec
    new_model = change_model_spec(model, rand_spec)
    # Track every sampled id; this is saved per sample.
    self.trained_model_spec_ids.append(rand_spec_id)
    return new_model
def validate_model(self, current_model, data_source, current_geno_id,
                   current_genotype, batch_size=10):
    """Evaluate all models in the validation pool, a few batches each."""
    # NOTE: validating a single child is flawed; evaluate across all
    # possible models instead, splitting the batches among them.
    complete_valid_queue = data_source
    _valid_queue = []
    nb_batch_per_model, nb_models, valid_model_pool = \
        self.search_space.validate_model_indices(len(complete_valid_queue))
    total_valid_acc = 0.
    total_valid_obj = 0.
    valid_accs = OrderedDict()
    valid_objs = OrderedDict()

    for step, val_d in enumerate(complete_valid_queue):
        if self.args.debug and step > 10:
            logging.debug("Break after 10 steps in validation.")
            break
        _valid_queue.append(val_d)
        if step % nb_batch_per_model == 0 and step > 0:
            # Move on to the next model once its batch share is collected.
            _id = valid_model_pool[min(int(step / nb_batch_per_model),
                                       nb_models - 1)]
            current_model = change_model_spec(
                current_model, self.search_space.topologies[_id])
            current_model.eval()
            _valid_acc, _valid_obj = self.eval_fn(
                _valid_queue, current_model, self._loss)
            # Update the running totals ...
            total_valid_acc += _valid_acc
            total_valid_obj += _valid_obj
            # ... and store the per-model results.
            valid_accs[_id] = _valid_acc
            valid_objs[_id] = _valid_obj
            _valid_queue = []
    self.save_arch_pool_performance(archs=list(valid_accs.keys()),
                                    perfs=list(valid_accs.values()),
                                    prefix='valid')
    return total_valid_acc / nb_models, total_valid_obj / nb_models
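# Batch-allocation sketch (illustrative): validate_model_indices is expected
# to split the validation batches evenly across the candidate models, e.g.
# 100 batches over 10 candidates gives nb_batch_per_model == 10, so each
# model is scored on its own 10-batch slice. A hypothetical minimal version:
#
# def validate_model_indices(self, nb_batches):
#     valid_model_pool = list(self.topologies.keys())
#     nb_models = len(valid_model_pool)
#     nb_batch_per_model = max(nb_batches // nb_models, 1)
#     return nb_batch_per_model, nb_models, valid_model_pool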
def random_sampler(self, model, architect, args):
    # Per the Aug 8 meeting: this becomes the new topology sampler.
    total = self.args.num_intermediate_nodes
    matrices_list = self.search_space.nasbench_sample_matrix_from_list(
        np.arange(1, total + 1),
        self.search_space.nasbench_topo_sample_probs)
    for matrix in matrices_list:
        if matrix is not None:
            spec = obtain_full_model_spec(total + 2)
            try:
                spec = ModelSpec_v2(matrix, spec.ops)
            except Exception:
                # Debug hook for invalid sampled matrices.
                IPython.embed()
            self.model_spec = spec
            self.model_spec_id = None
            yield change_model_spec(model, spec)
def evaluate(self, epoch, data_source, arch_pool=None, fitnesses_dict=None,
             train_queue=None, criterion=None):
    """
    Full evaluation of all candidate models.

    :param epoch: current epoch.
    :param data_source: validation data queue.
    :param arch_pool: candidate architectures; only the top
        self.top_K_complete_evaluate are evaluated.
    :param fitnesses_dict: stores model_spec_id -> accuracy.
    :return: the updated fitnesses_dict.
    """
    fitnesses_dict = fitnesses_dict or {}
    total_avg_acc = 0
    total_avg_obj = 0
    # Rank dict for the possible solutions.
    model_specs_rank = {}
    queries = {}  # as backup
    ind = 0
    eval_result = {}
    # Clean the sampled architectures and keep only the top K.
    clean_arch_pool = self.clean_arch_pool(
        arch_pool)[:self.top_K_complete_evaluate]

    while ind < len(clean_arch_pool):
        if self.args.debug and ind > 10:
            break
        arch = clean_arch_pool[ind]
        new_model_spec = self.process_arch(clean_arch_pool[ind])
        ind += 1
        try:
            model_spec_id = self.nasbench_hashs.index(
                new_model_spec.hash_spec())
        except Exception as e:
            logging.error(e)
            continue

        query = {
            'test accuracy':
                self.search_space.nasbench.perf_rank[model_spec_id]
        }
        # Select the current sub-DAG of the supernet DAG to evaluate.
        change_model_spec(self.parallel_model, new_model_spec)
        self.logger.info(
            'evaluate the model spec id: {}'.format(model_spec_id))
        _avg_val_acc, _avg_val_acc5, _avg_val_obj = self.child_test(
            data_source, self.parallel_model, arch, criterion=criterion)
        eval_result[model_spec_id] = _avg_val_acc, _avg_val_obj
        logging.info("Query: {}".format(query))
        # Update the running totals.
        total_avg_acc += _avg_val_acc
        total_avg_obj += _avg_val_obj
        # Save the fitness of this candidate.
        fitnesses_dict[model_spec_id] = _avg_val_acc
        ms_hash = self.nasbench_hashs[model_spec_id]
        model_specs_rank[ms_hash] = Rank(
            _avg_val_acc, _avg_val_obj, model_spec_id, model_spec_id)
        queries[ms_hash] = query
        gc.collect()

    # Save the ranking keyed by genotype hash rather than particle id.
    rank_gens = sorted(model_specs_rank.items(), key=operator.itemgetter(1))
    self.ranking_per_epoch[epoch] = rank_gens
    self.eval_result[epoch] = eval_result
    self.logger.info('VALIDATION RANKING OF PARTICLES')
    for pos, elem in enumerate(rank_gens):
        self.logger.info(
            f'particle gen id: {elem[1].geno_id}, acc: {elem[1].valid_acc}, '
            f'obj {elem[1].valid_obj}, hash: {elem[0]}, pos {pos}')
    if self.writer:
        # Summarize the per-model results to TensorBoard.
        accs_after, objs_after = zip(*eval_result.values())
        tensorboard_summarize_list(accs_after, writer=self.writer,
                                   key='neweval_after/acc', step=epoch,
                                   ascending=False)
        tensorboard_summarize_list(objs_after, writer=self.writer,
                                   key='neweval_after/obj', step=epoch)
    return fitnesses_dict
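# Ranking sketch (illustrative): Rank is defined elsewhere in this repo; the
# sort above relies on it comparing like a tuple, so
# sorted(..., key=operator.itemgetter(1)) orders entries by valid_acc first.
# Assuming a namedtuple layout consistent with the fields accessed above
# (the fourth field name is a guess):
#
# from collections import namedtuple
# Rank = namedtuple('Rank', ['valid_acc', 'valid_obj', 'geno_id', 'gt_rank'])
# ranked = sorted({'h1': Rank(0.91, 0.4, 3, 3),
#                  'h2': Rank(0.88, 0.5, 7, 7)}.items(),
#                 key=operator.itemgetter(1))
# # -> 'h2' (lower acc) first, 'h1' last: ascending by validation accuracy.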
def evaluate_extra_steps(policy, epoch, data_source, fitnesses_dict=None,
                         train_queue=None):
    """
    Full evaluation of all candidate models, with a few extra training
    steps per model before the final evaluation.

    :param epoch: current epoch.
    :param data_source: validation data queue.
    :param fitnesses_dict: stores model_spec_id -> accuracy.
    :return: the updated fitnesses_dict.
    """
    nb = policy.args.neweval_num_train_batches
    assert nb > 0
    if not train_queue:
        raise ValueError("New evaluation scheme requires a training queue.")

    fitnesses_dict = fitnesses_dict or {}
    total_avg_acc = 0
    total_avg_obj = 0
    # Rank dicts for the possible solutions, after and before training.
    model_specs_rank = {}
    model_specs_rank_before = {}
    # Keep the backup weights on CPU so they do not occupy GPU memory.
    backup_weights = policy.parallel_model.cpu().state_dict()
    policy.parallel_model.cuda()
    _train_iter = enumerate(train_queue)  # manually iterate the data here
    _train_queue = []
    ind = 0
    eval_before_train = {}
    eval_after_train = {}
    eval_pool = policy.evaluate_model_spec_id_pool()

    while ind < len(eval_pool):
        try:
            if policy.args.debug and ind > 10:
                logging.debug("Break after evaluating 10 architectures. "
                              "Total {}".format(len(eval_pool)))
                break
            model_spec_id = eval_pool[ind]
            ind += 1
            new_model_spec = policy.search_space.topologies[model_spec_id]
            # Select the current sub-DAG of the supernet DAG to train.
            change_model_spec(policy.parallel_model, new_model_spec)
            # Restore the shared weights before each candidate.
            logging.debug('Resetting parallel model weights ...')
            policy.parallel_model.load_state_dict(backup_weights)
            policy.parallel_model.cuda()  # make sure this is on GPU

            # Evaluate before the extra training steps.
            _avg_val_acc_before, _avg_val_obj_before = policy.eval_fn(
                data_source, policy.parallel_model,
                criterion=policy._loss, verbose=False)
            eval_before_train[model_spec_id] = (
                _avg_val_acc_before, _avg_val_obj_before)

            _train_queue = policy.next_batches(train_queue, nb)
            _batch_count = len(_train_queue)
            logging.debug('Train {} batches for model_id {} before '
                          'eval'.format(_batch_count, model_spec_id))
            lr = policy.scheduler.get_lr()[0]
            org_train_acc, org_train_obj = policy.eval_fn(
                _train_queue, policy.parallel_model,
                criterion=policy._loss, verbose=policy.args.debug)
            # Only train this specific child model; do not sample a new one.
            train_acc, train_obj = policy.eval_train_fn(
                _train_queue, None, policy.parallel_model,
                policy._loss, policy.optimizer, lr)
            # Clean up the train queue completely, destroying the graph.
            for d in _train_queue:
                del d
            _train_queue = []
            logging.debug('-> Train acc {} -> {} | train obj {} -> {} '.format(
                org_train_acc, train_acc, org_train_obj, train_obj))

            # Evaluate after the extra training steps.
            policy.logger.info(
                'evaluate the model spec id: {}'.format(model_spec_id))
            _avg_val_acc, _avg_val_obj = policy.eval_fn(
                data_source, policy.parallel_model,
                criterion=policy._loss, verbose=False)
            eval_after_train[model_spec_id] = _avg_val_acc, _avg_val_obj
            logging.info('eval acc {} -> {} | eval obj {} -> {}'.format(
                _avg_val_acc_before, _avg_val_acc,
                _avg_val_obj_before, _avg_val_obj))

            # Update the running totals.
            total_avg_acc += _avg_val_acc
            total_avg_obj += _avg_val_obj
            # Save the fitness of this candidate.
            fitnesses_dict[model_spec_id] = _avg_val_acc
            ms_hash = policy.search_space.hashs[model_spec_id]
            model_specs_rank[ms_hash] = Rank(
                _avg_val_acc, _avg_val_obj, model_spec_id,
                policy.search_space.rank_by_mid[model_spec_id])
            model_specs_rank_before[ms_hash] = Rank(
                _avg_val_acc_before, _avg_val_obj_before, model_spec_id,
                policy.search_space.rank_by_mid[model_spec_id])
            # Manually collect the unused graphs.
            gc.collect()
        except StopIteration as e:
            # Ran out of training batches: restart the iterator and redo
            # the current architecture.
            _train_iter = enumerate(train_queue)
            logging.debug("Run out of train queue, {}, restart ind {}".format(
                e, ind - 1))
            ind = ind - 1

    # Save the ranking keyed by genotype hash rather than particle id.
    rank_gens = sorted(model_specs_rank.items(), key=operator.itemgetter(1))
    rank_gens_before = sorted(model_specs_rank_before.items(),
                              key=operator.itemgetter(1))
    # Hash -> position mapping, before training.
    rank_gens_before_pos = {
        elem[0]: pos for pos, elem in enumerate(rank_gens_before)}
    policy.ranking_per_epoch[epoch] = rank_gens
    policy.ranking_per_epoch_before[epoch] = rank_gens_before
    policy.eval_result[epoch] = (eval_before_train, eval_after_train)
    policy.logger.info('VALIDATION RANKING OF PARTICLES')
    for pos, elem in enumerate(rank_gens):
        policy.logger.info(
            f'particle gen id: {elem[1].geno_id}, acc: {elem[1].valid_acc}, '
            f'obj {elem[1].valid_obj}, hash: {elem[0]}, '
            f'pos {pos} vs orig pos {rank_gens_before_pos[elem[0]]}')
    if policy.writer:
        # Summarize before/after results to TensorBoard.
        accs_before, objs_before = zip(*eval_before_train.values())
        accs_after, objs_after = zip(*eval_after_train.values())
        tensorboard_summarize_list(accs_before, writer=policy.writer,
                                   key='neweval_before/acc', step=epoch,
                                   ascending=False)
        tensorboard_summarize_list(accs_after, writer=policy.writer,
                                   key='neweval_after/acc', step=epoch,
                                   ascending=False)
        tensorboard_summarize_list(objs_before, writer=policy.writer,
                                   key='neweval_before/obj', step=epoch)
        tensorboard_summarize_list(objs_after, writer=policy.writer,
                                   key='neweval_after/obj', step=epoch)
    return fitnesses_dict
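# Weight backup/restore sketch (illustrative): the pattern above keeps one
# CPU copy of the shared weights and reloads it before fine-tuning each
# candidate, so the extra training steps of one child never leak into the
# next. A minimal standalone version, assuming a plain nn.Module and
# hypothetical finetune/evaluate helpers:
#
# backup = {k: v.cpu().clone() for k, v in model.state_dict().items()}
# for spec in candidate_specs:          # hypothetical candidate list
#     model.load_state_dict(backup)     # restore shared weights
#     model.cuda()
#     finetune(model, spec)             # hypothetical per-child fine-tune
#     evaluate(model, spec)             # hypothetical evaluation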