def _get_model(config, nclasses_train, nclasses_eval):
  with tf.name_scope("MetaTrain"):
    with tf.variable_scope("Model"):
      m = get_model(
          config.model_class,
          config,
          nclasses_train,
          is_training=True,
          nshot=FLAGS.nshot)
  with tf.name_scope("MetaValid"):
    with tf.variable_scope("Model", reuse=True):
      mvalid = get_model(
          config.model_class,
          config,
          nclasses_eval,
          is_training=False,
          nshot=FLAGS.nshot)
  return m, mvalid
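# A minimal sketch (assuming TensorFlow 1.x) of the variable-sharing pattern
# used by `_get_model` above: the train and eval graphs sit in separate name
# scopes but share parameters through `tf.variable_scope('Model', reuse=True)`.
# `_toy_model`, its shapes, and the placeholders are hypothetical, for
# illustration only.
import tensorflow as tf


def _toy_model(x):
  # `tf.get_variable` (not `tf.Variable`) is what allows the second scope to
  # reuse the weight created by the first.
  w = tf.get_variable('w', shape=[4, 2], initializer=tf.zeros_initializer())
  return tf.matmul(x, w)


x_train = tf.placeholder(tf.float32, [None, 4])
x_valid = tf.placeholder(tf.float32, [None, 4])
with tf.name_scope('MetaTrain'):
  with tf.variable_scope('Model'):
    y_train = _toy_model(x_train)
with tf.name_scope('MetaValid'):
  with tf.variable_scope('Model', reuse=True):
    y_valid = _toy_model(x_valid)  # Retrieves the same 'Model/w' variable.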
def build_net(config, backbone=None, memory=None, distributed=False):
  """Builds a memory-based lifelong learning model.

  Args:
    config: Model config.
    backbone: Backbone network.
    memory: Memory network.
    distributed: Whether to build the model in distributed mode.
  """
  if backbone is None:
    backbone = build_backbone(config)
  if memory is None:
    memory = build_memory_module(config, backbone)
  model = get_model(
      config.model_class, config, backbone, memory, distributed=distributed)
  return model
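# Usage sketch for `build_net` (hypothetical; `get_config`, the config path,
# and `my_memory` are assumptions, not part of this code):
#   config = get_config('configs/lifelong_resnet.prototxt')
#   model = build_net(config)                    # default backbone and memory
#   model = build_net(config, memory=my_memory)  # inject a custom memory module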
def __init__(self,
             config,
             x,
             y,
             x_b,
             y_b,
             x_b_v,
             y_b_v,
             num_classes_a,
             num_classes_b,
             is_training=True,
             y_sel=None,
             ext_wts=None):
  """Attractor model with RBP.

  Args:
    config: Model config object.
    x: Inputs on task A.
    y: Labels on task A.
    x_b: Support inputs on task B.
    y_b: Support labels on task B.
    x_b_v: Query inputs on task B.
    y_b_v: Query labels on task B.
    num_classes_a: Number of classes on task A.
    num_classes_b: Number of classes on task B.
    is_training: Whether in training mode.
    y_sel: Mask on base classes.
    ext_wts: External weights for initialization.
  """
  self._config = config
  self._is_training = is_training
  self._num_classes_a = num_classes_a
  self._num_classes_b = num_classes_b
  self._global_step = None

  if config.backbone_class == 'resnet_backbone':
    bb_config = config.resnet_config
  else:
    assert False, 'Not supported'
  opt_config = config.optimizer_config
  proto_config = config.protonet_config
  transfer_config = config.transfer_config
  ft_opt_config = transfer_config.ft_optimizer_config

  self._backbone = get_model(config.backbone_class, bb_config)
  self._inputs = x
  self._labels = y
  self._labels_all = self._labels
  self._y_sel = y_sel
  self._rnd = np.random.RandomState(0)  # Common random seed.

  # A step counter for the meta training stage.
  global_step = self.global_step
  log.info('LR decay steps {}'.format(opt_config.lr_decay_steps))
  log.info('LR list {}'.format(opt_config.lr_list))

  # Learning rate decay.
  learn_rate = tf.train.piecewise_constant(
      global_step,
      list(np.array(opt_config.lr_decay_steps).astype(np.int64)),
      list(opt_config.lr_list))
  self._learn_rate = learn_rate

  # Class matrix mask.
  self._mask = tf.placeholder(tf.bool, [], name='mask')

  # Optimizer definition.
  opt = self.get_optimizer(opt_config.optimizer, learn_rate)

  # Task A branch.
  with tf.name_scope('TaskA'):
    self.build_task_a(x, y, is_training, ext_wts=ext_wts)
    if is_training:
      grads_and_vars_a = self.build_task_a_grad()
      with tf.variable_scope('Optimizer'):
        bn_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(bn_ops):
          self._train_op_a = opt.apply_gradients(
              grads_and_vars_a, global_step=global_step)
  h_size = self._h_size  # Calculated in the function above.
  w_class_a = self.w_class_a
  b_class_a = self.b_class_a

  # The finetuning task.
  self._inputs_b = x_b
  self._labels_b = y_b
  self._inputs_b_v = x_b_v
  self._labels_b_v = y_b_v
  self._labels_b_v_all = y_b_v
  with tf.name_scope('TaskB'):
    self.build_task_b(x_b, y_b, x_b_v, y_sel)
    if is_training:
      grads_and_vars_b = self.build_task_b_grad(x_b_v, y_b_v, y_sel)

  # Task A and Task B cost weights.
  assert transfer_config.cost_a_ratio == 0.0
  assert transfer_config.cost_b_ratio == 1.0
  cost_a_ratio_var = tf.constant(
      transfer_config.cost_a_ratio, name='cost_a_ratio', dtype=self.dtype)
  cost_b_ratio_var = tf.constant(
      transfer_config.cost_b_ratio, name='cost_b_ratio', dtype=self.dtype)

  # Update gradients for meta-learning.
  if is_training:
    total_grads_and_vars_ab = self._aggregate_grads_and_vars(
        [grads_and_vars_a, grads_and_vars_b],
        weights=[cost_a_ratio_var, cost_b_ratio_var])
    with tf.variable_scope('Optimizer'):
      with tf.control_dependencies(bn_ops):
        self._train_op = opt.apply_gradients(
            total_grads_and_vars_ab, global_step=global_step)
        if len(grads_and_vars_b) > 0:
          self._train_op_b = opt.apply_gradients(
              grads_and_vars_b, global_step=global_step)
        else:
          self._train_op_b = tf.no_op()

  self._initializer = tf.global_variables_initializer()
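# A minimal, runnable sketch (assuming TensorFlow 1.x) of the piecewise-constant
# learning-rate schedule built in the constructor above. The decay steps and
# rates below are made-up stand-ins for `opt_config.lr_decay_steps` and
# `opt_config.lr_list`.
import numpy as np
import tensorflow as tf

global_step = tf.train.get_or_create_global_step()
lr_decay_steps = [4000, 6000]  # hypothetical opt_config.lr_decay_steps
lr_list = [1e-1, 1e-2, 1e-3]   # hypothetical opt_config.lr_list; one more entry than boundaries
learn_rate = tf.train.piecewise_constant(
    global_step,
    list(np.array(lr_decay_steps).astype(np.int64)),
    list(lr_list))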
def __init__(self,
             config,
             x,
             y,
             x_b,
             y_b,
             x_b_v,
             y_b_v,
             num_classes_a,
             num_classes_b,
             is_training=True,
             ext_wts=None,
             y_sel=None,
             w_class_a=None,
             b_class_a=None,
             nshot=None):
  self._config = config
  self._is_training = is_training
  self._num_classes_a = num_classes_a
  self._num_classes_b = num_classes_b

  if config.backbone_class == 'resnet_backbone':
    bb_config = config.resnet_config
  else:
    assert False, 'Not supported'
  opt_config = config.optimizer_config
  proto_config = config.protonet_config
  transfer_config = config.transfer_config

  self._backbone = get_model(config.backbone_class, bb_config)
  self._inputs = x
  self._labels = y
  # if opt_config.num_gpu > 1:
  #   self._labels_all = allgather(self._labels)
  # else:
  self._labels_all = self._labels
  self._inputs_b = x_b
  self._labels_b = y_b
  self._inputs_b_v = x_b_v
  self._labels_b_v = y_b_v
  # if opt_config.num_gpu > 1:
  #   self._labels_b_v_all = allgather(self._labels_b_v)
  # else:
  self._labels_b_v_all = self._labels_b_v
  self._y_sel = y_sel
  self._mask = tf.placeholder(tf.bool, [], name='mask')

  # global_step = tf.get_variable(
  #     'global_step', shape=[], dtype=tf.int64, trainable=False)
  global_step = tf.contrib.framework.get_or_create_global_step()
  self._global_step = global_step
  log.info('LR decay steps {}'.format(opt_config.lr_decay_steps))
  log.info('LR list {}'.format(opt_config.lr_list))
  learn_rate = tf.train.piecewise_constant(
      global_step,
      list(np.array(opt_config.lr_decay_steps).astype(np.int64)),
      list(opt_config.lr_list))
  self._learn_rate = learn_rate

  opt = self.get_optimizer(opt_config.optimizer, learn_rate)
  # if opt_config.num_gpu > 1:
  #   opt = hvd.DistributedOptimizer(opt)

  with tf.name_scope('TaskA'):
    h_a = self.backbone(x, is_training=is_training, ext_wts=ext_wts)
    self._h_a = h_a

  # Apply BN ops.
  bn_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

  with tf.name_scope('TaskB'):
    x_b_all = tf.concat([x_b, x_b_v], axis=0)
    if ext_wts is not None:
      h_b_all = self.backbone(
          x_b_all, is_training=is_training, reuse=True, ext_wts=ext_wts)
    else:
      h_b_all = self.backbone(x_b_all, is_training=is_training, reuse=True)

  with tf.name_scope('TaskA'):
    # Calculates hidden activation size.
    h_shape = h_a.get_shape()
    h_size = 1
    for ss in h_shape[1:]:
      h_size *= int(ss)

    if w_class_a is None:
      if ext_wts is not None:
        w_class_a = weight_variable(
            [h_size, num_classes_a],
            init_method='numpy',
            dtype=tf.float32,
            init_param={'val': np.transpose(ext_wts['w_class_a'])},
            wd=config.wd,
            name='w_class_a')
        b_class_a = weight_variable([],
                                    init_method='numpy',
                                    dtype=tf.float32,
                                    init_param={'val': ext_wts['b_class_a']},
                                    wd=0e0,
                                    name='b_class_a')
      else:
        w_class_a = weight_variable([h_size, num_classes_a],
                                    init_method='truncated_normal',
                                    dtype=tf.float32,
                                    init_param={'stddev': 0.01},
                                    wd=bb_config.wd,
                                    name='w_class_a')
        b_class_a = weight_variable([num_classes_a],
                                    init_method='constant',
                                    init_param={'val': 0.0},
                                    name='b_class_a')
      self._w_class_a_orig = w_class_a
      self._b_class_a_orig = b_class_a
    else:
      assert b_class_a is not None
      w_class_a_orig = weight_variable([h_size, num_classes_a],
                                       init_method='truncated_normal',
                                       dtype=tf.float32,
                                       init_param={'stddev': 0.01},
                                       wd=bb_config.wd,
                                       name='w_class_a')
      b_class_a_orig = weight_variable([num_classes_a],
                                       init_method='constant',
                                       init_param={'val': 0.0},
                                       name='b_class_a')
      self._w_class_a_orig = w_class_a_orig
      self._b_class_a_orig = b_class_a_orig

    self._w_class_a = w_class_a
    self._b_class_a = b_class_a
    num_classes_a_dyn = tf.cast(tf.shape(b_class_a)[0], tf.int64)
    num_classes_a_dyn32 = tf.shape(b_class_a)[0]

    if proto_config.cosine_a:
      if proto_config.cosine_tau:
        if ext_wts is None:
          init_val = 10.0
        else:
          init_val = ext_wts['tau'][0]
        tau = weight_variable([],
                              init_method='constant',
                              init_param={'val': init_val},
                              name='tau')
      else:
        tau = tf.constant(1.0)
      w_class_a_norm = self._normalize(w_class_a, 0)
      h_a_norm = self._normalize(h_a, 1)
      dot = tf.matmul(h_a_norm, w_class_a_norm)
      if ext_wts is not None:
        dot += b_class_a
      logits_a = tau * dot
    else:
      logits_a = compute_euc(tf.transpose(w_class_a), h_a)
    self._prediction_a = logits_a
    # if opt_config.num_gpu > 1:
    #   self._prediction_a_all = allgather(self._prediction_a)
    # else:
    self._prediction_a_all = self._prediction_a

    xent_a = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_a, labels=y)
    cost_a = tf.reduce_mean(xent_a, name='xent')
    self._cost_a = cost_a
    cost_a += self._decay()
    correct_a = tf.equal(tf.argmax(logits_a, axis=1), y)
    self._correct_a = correct_a
    self._acc_a = tf.reduce_mean(tf.cast(correct_a, cost_a.dtype))

  with tf.name_scope('TaskB'):
    h_b = h_b_all[:tf.shape(x_b)[0]]
    h_b_v = h_b_all[tf.shape(x_b)[0]:]
    # Add new axes for the `batch` dimension.
    h_b_ = tf.expand_dims(h_b, 0)
    h_b_v_ = tf.expand_dims(h_b_v, 0)
    y_b_ = tf.expand_dims(y_b, 0)
    y_b_v_ = tf.expand_dims(y_b_v, 0)
    if transfer_config.old_and_new:
      protos_b = self._compute_protos(num_classes_b, h_b_,
                                      y_b_ - num_classes_a)
    else:
      protos_b = self._compute_protos(num_classes_b, h_b_, y_b_)

    w_class_a_ = tf.expand_dims(tf.transpose(w_class_a), 0)
    if proto_config.protos_phi:
      w_p1 = weight_variable([h_size],
                             init_method='constant',
                             dtype=tf.float32,
                             init_param={'val': 1.0},
                             wd=bb_config.wd,
                             name='w_p1')
    if proto_config.cosine_attention:
      w_q = weight_variable([h_size, h_size],
                            init_method='truncated_normal',
                            dtype=tf.float32,
                            init_param={'stddev': 0.1},
                            wd=bb_config.wd,
                            name='w_q')
      k_b = weight_variable([num_classes_a, h_size],
                            init_method='truncated_normal',
                            dtype=tf.float32,
                            init_param={'stddev': 0.1},
                            wd=bb_config.wd,
                            name='k_b')
      tau_q = weight_variable([],
                              init_method='constant',
                              init_param={'val': 10.0},
                              name='tau_q')
      if transfer_config.old_and_new:
        w_class_b = self._compute_protos_attend_fix(
            num_classes_b, h_b_, y_b_ - num_classes_a_dyn, w_q, tau_q, k_b,
            self._w_class_a_orig)
      else:
        w_class_b = self._compute_protos_attend_fix(
            num_classes_b, h_b_, y_b_, w_q, tau_q, k_b, self._w_class_a_orig)
      assert proto_config.protos_phi
      w_p2 = weight_variable([h_size],
                             init_method='constant',
                             dtype=tf.float32,
                             init_param={'val': 1.0},
                             wd=bb_config.wd,
                             name='w_p2')
      self._k_b = tf.expand_dims(w_p2, 1) * self._w_class_a_orig
      self._k_b2 = k_b
      self.bias = w_class_b
      self.new_protos = w_p1 * protos_b
      self.new_bias = w_p2 * w_class_b
      w_class_b = w_p1 * protos_b + w_p2 * w_class_b
      self.protos = protos_b
      self.w_class_b_final = w_class_b
    else:
      w_class_b = protos_b
      if proto_config.protos_phi:
        w_class_b = w_p1 * w_class_b

    self._w_class_b = w_class_b

    if transfer_config.old_and_new:
      w_class_all = tf.concat([w_class_a_, w_class_b], axis=1)
    else:
      w_class_all = w_class_b

    if proto_config.cosine_softmax_tau:
      tau_b = weight_variable([],
                              init_method='constant',
                              init_param={'val': 10.0},
                              name='tau_b')
    else:
      tau_b = tf.constant(1.0)

    if proto_config.similarity == 'euclidean':
      logits_b_v = compute_logits(w_class_all, h_b_v_)
    elif proto_config.similarity == 'cosine':
      logits_b_v = tau_b * compute_logits_cosine(w_class_all, h_b_v_)
    else:
      raise ValueError('Unknown similarity')
    self._logits_b_v = logits_b_v
    self._prediction_b = logits_b_v[0]
    # if opt_config.num_gpu > 1:
    #   self._prediction_b_all = allgather(self._prediction_b)
    # else:
    self._prediction_b_all = self._prediction_b

    # Mask out the old classes.
    def mask_fn():
      bin_mask = tf.expand_dims(
          tf.reduce_sum(
              tf.one_hot(y_sel, num_classes_a + num_classes_b),
              0,
              keep_dims=True), 0)
      logits_b_v_m = logits_b_v * (1.0 - bin_mask)
      logits_b_v_m -= bin_mask * 100.0
      return logits_b_v_m

    # if transfer_config.old_and_new:
    #   logits_b_v = tf.cond(self._mask, mask_fn, lambda: logits_b_v)
    xent_b_v = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits_b_v, labels=y_b_v_)
    cost_b = tf.reduce_mean(xent_b_v, name='xent')
    self._cost_b = cost_b

  if transfer_config.old_and_new:
    total_cost = cost_b
  else:
    total_cost = (transfer_config.cost_a_ratio * cost_a +
                  transfer_config.cost_b_ratio * cost_b)
  self._total_cost = total_cost

  if not transfer_config.meta_only:
    # assert False, 'let us go for pretrained model first'
    var_list = tf.trainable_variables()
    var_list = list(filter(lambda x: 'phi' in x.name, var_list))
    layers = self.config.transfer_config.meta_layers
    if layers == "all":
      pass
    elif layers == "4":
      keywords = ['TaskB', 'unit_4_']
      filter_fn = lambda x: any([kw in x.name for kw in keywords])
      var_list = list(filter(filter_fn, var_list))
    else:
      raise ValueError('Unknown finetune layers {}'.format(layers))
    [log.info('Slow weights {}'.format(v.name)) for v in var_list]
  else:
    var_list = []

  if proto_config.cosine_softmax_tau:
    var_list += [tau_b]
  if proto_config.cosine_attention:
    var_list += [w_q, tau_q, k_b, w_p2]
  if proto_config.protos_phi:
    var_list += [w_p1]
  if transfer_config.train_wclass_a:
    if proto_config.similarity == 'euclidean':
      var_list += [w_class_a, b_class_a]
    elif proto_config.similarity == 'cosine':
      var_list += [w_class_a]

  if is_training:
    grads_and_vars = opt.compute_gradients(total_cost, var_list)
    with tf.control_dependencies(bn_ops):
      [log.info('BN op {}'.format(op.name)) for op in bn_ops]
      train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

    grads_and_vars_b = opt.compute_gradients(cost_b, var_list)
    with tf.control_dependencies(bn_ops):
      train_op_b = opt.apply_gradients(
          grads_and_vars_b, global_step=global_step)

    with tf.control_dependencies(bn_ops):
      train_op_a = opt.minimize(cost_a, global_step=global_step)
    self._train_op = train_op
    self._train_op_a = train_op_a
    self._train_op_b = train_op_b

  self._initializer = tf.global_variables_initializer()
  self._w_class_a = w_class_a
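# A minimal sketch (assuming TensorFlow 1.x) of the per-class prototype
# computation that `_compute_protos` performs above: each class vector is the
# mean embedding of that class's support examples. The function name, shapes,
# and broadcasting details here are illustrative assumptions, not the exact
# implementation used by this model.
import tensorflow as tf


def _compute_protos_sketch(num_classes, h, y):
  """h: [B, N, D] support embeddings; y: [B, N] int labels in [0, num_classes)."""
  one_hot = tf.one_hot(y, num_classes, dtype=h.dtype)  # [B, N, K]
  counts = tf.reduce_sum(one_hot, axis=1)              # [B, K] examples per class
  protos = tf.matmul(one_hot, h, transpose_a=True)     # [B, K, D] class sums
  return protos / tf.expand_dims(tf.maximum(counts, 1.0), 2)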
def _get_model(config):
  m = get_model(args.model, config, args.dataset)
  return m.cuda()
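# A device-agnostic variant of `_get_model` (a sketch, not the repo's code):
# fall back to CPU when CUDA is unavailable instead of calling `.cuda()`
# unconditionally. It relies on the same module-level `args` and `get_model`.
import torch


def _get_model_safe(config):
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  return get_model(args.model, config, args.dataset).to(device)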
def build_pretrain_net(config, backbone=None):
  """Builds a regular classification network for pretraining."""
  if backbone is None:
    backbone = build_backbone(config)
  model = get_model("pretrain_net", config, backbone)
  return model
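# Usage sketch (hypothetical; `get_config` and the config path are assumptions):
# the pretraining network and the memory-based model can share one backbone so
# the lifelong learner starts from pretrained features.
#   config = get_config('configs/pretrain_resnet.prototxt')
#   backbone = build_backbone(config)
#   pretrain_model = build_pretrain_net(config, backbone=backbone)
#   model = build_net(config, backbone=backbone)  # reuses the pretrained backbone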