def _lstm_cell(self, name, n_hidden, x_in, h=None, c=None):
    if h is None:
        h = nn.Variable.from_numpy_array(
            np.zeros((self._batch_size, self._cols_size)))
    if c is None:
        c = nn.Variable.from_numpy_array(
            np.zeros((self._batch_size, n_hidden)))
    h = F.concatenate(h, x_in, axis=1)  # LSTM_Concatenate -> cols_size * 2
    with nn.parameter_scope(name + '_Affine'):  # LSTM_Affine -> n_hidden
        h1 = PF.affine(h, (n_hidden,), base_axis=1)
    with nn.parameter_scope(name + '_IGate'):  # LSTM_IGate -> n_hidden
        h2 = PF.affine(h, (n_hidden,), base_axis=1)
    with nn.parameter_scope(name + '_FGate'):  # LSTM_FGate -> n_hidden
        h3 = PF.affine(h, (n_hidden,), base_axis=1)
    with nn.parameter_scope(name + '_OGate'):  # LSTM_OGate -> n_hidden
        h4 = PF.affine(h, (n_hidden,), base_axis=1)
    h1 = F.tanh(h1)      # LSTM_Tanh
    h2 = F.sigmoid(h2)   # LSTM_Sigmoid
    h3 = F.sigmoid(h3)   # LSTM_Sigmoid_2
    h4 = F.sigmoid(h4)   # LSTM_Sigmoid_3
    h5 = F.mul2(h2, h1)  # LSTM_Mul2 -> n_hidden
    h6 = F.mul2(h3, c)   # LSTM_Mul2_2 -> n_hidden
    h7 = F.add2(h5, h6, inplace=True)  # LSTM_Add2 -> n_hidden
    h8 = F.tanh(h7)      # LSTM_Tanh_2 -> n_hidden
    h9 = F.mul2(h4, h8)  # LSTM_Mul2_3 -> n_hidden
    c = h7  # LSTM_C
    h = h9  # LSTM_H
    return (h, c)
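# Hypothetical usage sketch (not part of the original source): `_lstm_cell`
# is written as a method, so a minimal holder class providing the attributes
# it reads (`_batch_size`, `_cols_size`) is assumed here for illustration.
import numpy as np
import nnabla as nn
import nnabla.functions as F
import nnabla.parametric_functions as PF

class _LSTMDemo:
    _lstm_cell = _lstm_cell  # reuse the function above as a method

    def __init__(self, batch_size, cols_size):
        self._batch_size = batch_size
        self._cols_size = cols_size

demo = _LSTMDemo(batch_size=4, cols_size=16)
x_in = nn.Variable((4, 16))
# Builds the graph; h and c are (4, 16) Variables.
h, c = demo._lstm_cell('lstm1', n_hidden=16, x_in=x_in)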
def LSTMCell(x, h2, h1):
    # first stack: h2 = hidden state, h1 = cell state
    units = h1.shape[1]
    h2 = F.concatenate(h2, x, axis=1)
    h3 = PF.affine(h2, units, name='Affine')
    h4 = PF.affine(h2, units, name='InputGate')
    h5 = PF.affine(h2, units, name='ForgetGate')
    h6 = PF.affine(h2, units, name='OutputGate')
    h3 = F.tanh(h3)
    h4 = F.sigmoid(h4)
    h5 = F.sigmoid(h5)
    h6 = F.sigmoid(h6)
    h4 = F.mul2(h4, h3)
    h5 = F.mul2(h5, h1)
    h4 = F.add2(h4, h5, True)
    h7 = F.tanh(h4)
    h6 = F.mul2(h6, h7)
    return h6, h4  # hidden, cell
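# Hypothetical usage sketch (not from the original source): unrolling
# LSTMCell over a short input sequence; `batch`, `units` and `seq_len`
# are illustrative values.
import numpy as np
import nnabla as nn

batch, units, seq_len = 4, 8, 5
x_seq = [nn.Variable((batch, units)) for _ in range(seq_len)]
h = nn.Variable.from_numpy_array(np.zeros((batch, units), dtype=np.float32))
c = nn.Variable.from_numpy_array(np.zeros((batch, units), dtype=np.float32))
for x_t in x_seq:
    # The named PF.affine calls above reuse the same parameters each step.
    h, c = LSTMCell(x_t, h, c)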
def graph(x1):
    x1 = F.identity(x1).apply(recompute=True)
    x2 = F.randn(shape=x1.shape, seed=123).apply(recompute=True)
    x3 = F.rand(shape=x1.shape, seed=456).apply(recompute=True)
    y = F.mul2(x1, x2).apply(recompute=True)
    y = F.mul2(y, x3).apply(recompute=True)
    y = F.identity(y)
    return y
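# Hedged usage sketch: with `recompute=True`, the flagged intermediate
# buffers may be cleared during forward and recomputed on demand during
# backward, trading compute for memory. The fixed seeds above make the
# recomputed random values reproducible.
import numpy as np
import nnabla as nn
import nnabla.functions as F

x1 = nn.Variable((2, 3), need_grad=True)
x1.d = np.random.randn(*x1.shape)
y = graph(x1)
y.forward(clear_no_need_grad=True)
y.backward()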
def network_LSTM(x, D, C, InputShape, HiddenSize, test=False):
    # Input_2:x -> 687
    # Delya_in:D -> 100
    # Cell_in:C -> 100

    # Concatenate -> 787
    h = F.concatenate(D, x, axis=1)
    # Affine -> 100
    h1 = PF.affine(h, HiddenSize, name='Affine')
    # InputGate -> 100
    h2 = PF.affine(h, HiddenSize, name='InputGate')
    # OutputGate -> 100
    h3 = PF.affine(h, HiddenSize, name='OutputGate')
    # ForgetGate -> 100
    h4 = PF.affine(h, HiddenSize, name='ForgetGate')
    # Sigmoid
    h1 = F.sigmoid(h1)
    # Sigmoid_2
    h2 = F.sigmoid(h2)
    # Sigmoid_3
    h3 = F.sigmoid(h3)
    # Sigmoid_4
    h4 = F.sigmoid(h4)
    # Mul2 -> 100
    h1 = F.mul2(h1, h2)
    # Mul2_3 -> 100
    h4 = F.mul2(h4, C)
    # Add2 -> 100
    h1 = F.add2(h1, h4, True)
    # Tanh
    h5 = F.tanh(h1)
    # Cell_out
    h6 = F.identity(h1)
    # Mul2_2 -> 100
    h5 = F.mul2(h5, h3)
    # Dropout
    if not test:
        h5 = F.dropout(h5)
    # Output
    h5 = F.identity(h5)
    # Concatenate_2 -> 200
    h5 = F.concatenate(h5, h6, axis=1)
    return h5
def propagate(h, edges, state_size=None, w_initializer=None,
              u_initializer1=None, u_initializer2=None,
              bias_initializer=None, edge_initializers=None):
    """
    Propagate vertex representations

    Arguments:
    h -- the input vertex representations (nnabla.Variable with shape (|V|, D))
    edges -- the dictionary that represents the graph edge ({label, [in, out]})
    state_size -- (optional) the size of hidden state (h.shape[1] is used if this argument is None)
    w_initializer -- (optional)
    u_initializer1 -- (optional)
    u_initializer2 -- (optional)
    bias_initializer -- (optional)
    edge_initializers -- (optional)

    Return value
    - Return a variable with shape (|V|, D)
    """
    if state_size is None:
        state_size = h.shape[1]
    h_size = h.shape[1]
    with nn.parameter_scope("activate"):
        a = activate(h, edges, state_size,
                     bias_initializer=bias_initializer,
                     edge_initializers=edge_initializers)
    with nn.parameter_scope("W_zr"):
        ws = PF.affine(a, (3, h_size), with_bias=False, w_init=w_initializer)
        (z1, r1, h_hat1) = split(ws, axis=1)
    with nn.parameter_scope("U_zr"):
        us = PF.affine(h, (2, state_size), with_bias=False,
                       w_init=u_initializer1)
        (z2, r2) = split(us, axis=1)
    z = F.sigmoid(F.add2(z1, z2))
    r = F.sigmoid(F.add2(r1, r2))
    with nn.parameter_scope("U"):
        h_hat2 = PF.affine(F.mul2(r, h), state_size, with_bias=False,
                           w_init=u_initializer2)
    h_hat = F.tanh(F.add2(h_hat1, h_hat2))
    return F.add2(F.sub2(h, F.mul2(z, h)), F.mul2(z, h_hat))
def disparityregression(x, maxdisp):
    disp = nn.Variable(x.shape, need_grad=False)
    for i in range(0, maxdisp):
        disp.d[:, :, i, :, :] = i
    dispx = F.mul2(disp, x)
    out = F.sum(dispx, axis=2)
    return out
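# Hedged usage sketch (shapes are assumptions): `x` is expected to be a
# probability volume over disparities, e.g. a softmax over axis 2 of a cost
# volume with shape (batch, 1, maxdisp, H, W); the result is the expected
# ("soft-argmax") disparity with shape (batch, 1, H, W).
import numpy as np
import nnabla as nn
import nnabla.functions as F

maxdisp = 4
cost = nn.Variable.from_numpy_array(
    np.random.randn(2, 1, maxdisp, 8, 8).astype(np.float32))
prob = F.softmax(cost, axis=2)  # normalize over the disparity axis
d_hat = disparityregression(prob, maxdisp)
d_hat.forward()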
def test_imperative_i2_o1():
    import nnabla.functions as F
    x0 = nn.NdArray([2, 3, 4])
    x1 = nn.NdArray([2, 1, 1])
    x0.fill(3)
    x1.fill(0.5)
    y = F.mul2(x0, x1)
    assert np.allclose(y.data, 1.5)
def call(self, input):
    if self._drop_prob == 0:
        return input
    mask = F.rand(shape=(input.shape[0], 1, 1, 1))
    mask = F.greater_equal_scalar(mask, self._drop_prob)
    out = F.mul_scalar(input, 1. / (1 - self._drop_prob))
    out = F.mul2(out, mask)
    return out
def call(self, input):
    if self._mode == 'full':
        out = F.stack(*[op(input) for op in self._ops], axis=0)
        out = F.mul2(out, F.softmax(self._alpha, axis=0))
        return F.sum(out, axis=0)
    # update active index
    self._update_active_index()
    return self._ops[self._active](input)
def test_large_transform_binary(fname, ctx, func_name):
    if not func_name.endswith('Cuda'):
        pytest.skip('Grid-strided loop is tested only for CUDA backend')

    with nn.context_scope(ctx), nn.auto_forward(True):
        a = nn.Variable.from_numpy_array(
            np.random.randn(1024, 64, 1)).apply(need_grad=True)
        b = nn.Variable.from_numpy_array(
            np.random.randn(1024, 64, 3)).apply(need_grad=True)
        c = F.mul2(a, b)
        c.backward()
def call(self, *input):
    if self._mode == 'concat' and len(input) > 1:
        return F.concatenate(*input, axis=self._axis)
    out = input[0]
    if self._mode == 'add':
        for i in range(1, len(input)):
            out = F.add2(out, input[i])
    if self._mode == 'mul':
        for i in range(1, len(input)):
            out = F.mul2(out, input[i])
    return out
def drop_path(x):
    """
    The same implementation as the PyTorch version.

    The drop rate is read from the parameter "drop_rate". If the random
    value drawn from a uniform distribution is less than the drop rate,
    the corresponding element becomes 0.
    """
    drop_prob = nn.parameter.get_parameter_or_create(
        "drop_rate", shape=(1, 1, 1, 1), need_grad=False)
    mask = F.rand(shape=(x.shape[0], 1, 1, 1))
    mask = F.greater_equal(mask, drop_prob)
    x = F.div2(x, 1 - drop_prob)
    x = F.mul2(x, mask)
    return x
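# Hedged usage sketch: drop_path reads the "drop_rate" parameter from the
# current parameter scope, so it must be created (and set) beforehand;
# the 0.2 below is an illustrative value.
import numpy as np
import nnabla as nn
import nnabla.functions as F

x = nn.Variable((8, 16, 4, 4))
x.d = np.random.randn(*x.shape)
rate = nn.parameter.get_parameter_or_create(
    "drop_rate", shape=(1, 1, 1, 1), need_grad=False)
rate.d = 0.2
y = drop_path(x)
y.forward()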
def test_recomputed_data_value(self, seed):
    rng = np.random.RandomState(seed)
    a0 = nn.Variable((2, 3), need_grad=True)
    b0 = nn.Variable((2, 3), need_grad=True)
    a0.d = rng.randn(*a0.shape)
    b0.d = rng.randn(*b0.shape)

    a1 = F.sin(a0).apply(recompute=True)
    a2 = F.sin(a1)
    a3 = F.sin(a2)

    b1 = F.sin(b0)
    b2 = F.sin(b1).apply(recompute=True)
    b3 = F.sin(b2)

    c0 = F.mul2(a3, b3).apply(recompute=True)
    c1 = F.sin(c0)

    # Forward
    # Get output data which will be recomputed.
    ref_data = []  # data of a1, b2 and c0 will be stored.

    def get_output_data(nnabla_func):
        outputs = nnabla_func.outputs
        for output in outputs:
            if output.recompute:
                ref_data.append(copy.deepcopy(output.d))
    c1.forward(function_post_hook=get_output_data)

    # Backward
    # Get recomputed data
    act_data = []

    def get_recomputed_data(nnabla_func):
        inputs = nnabla_func.inputs
        for input in inputs:
            if input.recompute:
                act_data.append(copy.deepcopy(input.d))
    c1.backward(function_pre_hook=get_recomputed_data)
    # Make the order the same as `ref_data`.
    act_data.reverse()

    # Check recomputed data
    for act, ref in zip(act_data, ref_data):
        assert_allclose(act, ref, rtol=0, atol=0)
def context_preserving_loss(xa, yb):
    def mask_weight(a, b):
        # much different from definition in the paper
        merged_mask = F.concatenate(a, b, axis=1)
        summed_mask = F.sum((merged_mask + 1) / 2, axis=1, keepdims=True)
        clipped = F.clip_by_value(summed_mask,
                                  F.constant(0, shape=summed_mask.shape),
                                  F.constant(1, shape=summed_mask.shape))
        z = clipped * 2 - 1
        mask = (1 - z) / 2
        return mask

    x = xa[:, :3, :, :]
    a = xa[:, 3:, :, :]
    y = yb[:, :3, :, :]
    b = yb[:, 3:, :, :]
    assert x.shape == y.shape and a.shape == b.shape

    W = mask_weight(a, b)
    return F.mean(F.mul2(F.absolute_error(x, y), W))
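# Hedged usage sketch: `xa` and `yb` are assumed to be RGB images with a
# one-channel mask in [-1, 1] appended along the channel axis.
import numpy as np
import nnabla as nn

xa = nn.Variable.from_numpy_array(
    np.random.uniform(-1, 1, (2, 4, 8, 8)).astype(np.float32))
yb = nn.Variable.from_numpy_array(
    np.random.uniform(-1, 1, (2, 4, 8, 8)).astype(np.float32))
l_cp = context_preserving_loss(xa, yb)
l_cp.forward()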
def __mul__(self, other):
    """
    Element-wise multiplication.

    Implements the multiplication operator expression ``A * B``,
    together with :func:`~nnabla.variable.__rmul__` .
    When a scalar is specified for ``other``, this function performs an
    element-wise operation for all elements in ``self``.

    Args:
        other (float or ~nnabla.Variable): Internally calling
            :func:`~nnabla.functions.mul2` or
            :func:`~nnabla.functions.mul_scalar` according to the type.

    Returns: :class:`nnabla.Variable`
    """
    import nnabla.functions as F
    if isinstance(other, Variable):
        return F.mul2(self, other)
    return F.mul_scalar(self, other)
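# A minimal sketch (not part of the original source) of the dispatch:
# Variable * Variable lowers to F.mul2, Variable * scalar to F.mul_scalar.
import numpy as np
import nnabla as nn

a = nn.Variable.from_numpy_array(np.full((2, 3), 3.0, dtype=np.float32))
b = nn.Variable.from_numpy_array(np.full((2, 3), 0.5, dtype=np.float32))
y = a * b    # -> F.mul2(a, b)
z = a * 2.0  # -> F.mul_scalar(a, 2.0)
y.forward()
z.forward()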
def test_clear_input_if_no_need_grad_branch1(self):
    x1 = nn.Variable([1, 5], need_grad=True)
    x2 = nn.Variable([1, 5], need_grad=True)
    x3 = nn.Variable([1, 5], need_grad=True)

    xx1 = F.identity(x1)
    xx2 = F.identity(x2)
    y1 = F.mul2(xx1, xx2)  # (1)
    xx3 = F.identity(x3)
    y2 = F.add2(xx2, xx3)  # (2)
    y3 = F.add2(y1, y2)  # (3)

    answer = []
    answer.append([False])
    answer.append([False])
    answer.append([False, False])  # (1)
    answer.append([False])
    answer.append([False, True])  # (2) use xx2 in backward
    answer.append([True, True])  # (3)

    y3.forward(clear_no_need_grad=True)
    self.check_input_data_clear_called_flags(answer)
def test_unnecessary_traverse_1(self):
    a0 = nn.Variable((2, 3), need_grad=False)
    # `a1` will not be recomputed since `a2` will not be cleared.
    a1 = F.sin(a0).apply(recompute=True)
    a2 = F.cos(a1)
    a3 = F.sin(a2).apply(recompute=True)  # `a3` will be recomputed.
    b0 = nn.Variable((2, 3), need_grad=True).apply(recompute=True)
    b1 = F.identity(b0).apply(recompute=True)
    c = F.mul2(a3, b1).apply(recompute=True)

    # Check that recomputation recursion stops when `a3.data` is calculated.
    c.forward(clear_buffer=False)
    # `a1.data` is cleared because its `recompute` flag is `true`.
    assert (a1.data.clear_called == True)
    # `a2.data` is not cleared because its `recompute` flag is `false`.
    assert (a2.data.clear_called == False)
    c.backward(clear_buffer=False)
    # If the recursive call reached `a1`, `a1.data` would be set by
    # recomputation. However, the recursive call stops at `a2`, whose data
    # is not cleared.
    assert (a1.data.clear_called == True)
def graph_representation(h, x, n_outmaps, w_init=None, b_init=None):
    """
    Outputs the graph representation model

    Arguments:
    h -- the input vertex representations (nnabla.Variable with shape (|V|, H))
    x -- the input vertex annotation (nnabla.Variable with shape (|V|, X))
    n_outmaps -- the size of node representation
    w_init -- (optional)
    b_init -- (optional)

    Return value
    - Return a variable with shape (n_outmaps)
    """
    with nn.parameter_scope("graph_representation"):
        output = F.concatenate(h, x)
        output = PF.affine(output, (2, n_outmaps),
                           w_init=w_init, b_init=b_init)
        (s, t) = F.split(output, axis=1)
        return F.sum(F.mul2(F.sigmoid(s), F.tanh(t)), axis=0, keepdims=True)
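# Hedged usage sketch with illustrative sizes: |V| = 6 vertices, hidden
# size H = 8, annotation size X = 3.
import numpy as np
import nnabla as nn

num_v, H, X, n_outmaps = 6, 8, 3, 10
h = nn.Variable.from_numpy_array(
    np.random.randn(num_v, H).astype(np.float32))
x = nn.Variable.from_numpy_array(
    np.random.randn(num_v, X).astype(np.float32))
r = graph_representation(h, x, n_outmaps)  # shape (1, n_outmaps)
r.forward()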
def train(args):
    if args.c_dim != len(args.selected_attrs):
        print("c_dim must be the same as the num of selected attributes. Modified c_dim.")
        args.c_dim = len(args.selected_attrs)

    # Dump the config information.
    config = dict()
    print("Used config:")
    for k in args.__dir__():
        if not k.startswith("_"):
            config[k] = getattr(args, k)
            print("'{}' : {}".format(k, getattr(args, k)))

    # Prepare Generator and Discriminator based on user config.
    generator = functools.partial(
        model.generator, conv_dim=args.g_conv_dim, c_dim=args.c_dim,
        num_downsample=args.num_downsample, num_upsample=args.num_upsample,
        repeat_num=args.g_repeat_num)
    discriminator = functools.partial(
        model.discriminator, image_size=args.image_size,
        conv_dim=args.d_conv_dim, c_dim=args.c_dim,
        repeat_num=args.d_repeat_num)

    x_real = nn.Variable(
        [args.batch_size, 3, args.image_size, args.image_size])
    label_org = nn.Variable([args.batch_size, args.c_dim, 1, 1])
    label_trg = nn.Variable([args.batch_size, args.c_dim, 1, 1])

    with nn.parameter_scope("dis"):
        dis_real_img, dis_real_cls = discriminator(x_real)

    with nn.parameter_scope("gen"):
        x_fake = generator(x_real, label_trg)
    x_fake.persistent = True  # to retain its value during computation.

    # get an unlinked_variable of x_fake
    x_fake_unlinked = x_fake.get_unlinked_variable()

    with nn.parameter_scope("dis"):
        dis_fake_img, dis_fake_cls = discriminator(x_fake_unlinked)

    # ---------------- Define Loss for Discriminator -----------------
    d_loss_real = (-1) * loss.gan_loss(dis_real_img)
    d_loss_fake = loss.gan_loss(dis_fake_img)
    d_loss_cls = loss.classification_loss(dis_real_cls, label_org)
    d_loss_cls.persistent = True

    # Gradient Penalty.
    alpha = F.rand(shape=(args.batch_size, 1, 1, 1))
    x_hat = F.mul2(alpha, x_real) + \
        F.mul2(F.r_sub_scalar(alpha, 1), x_fake_unlinked)

    with nn.parameter_scope("dis"):
        dis_for_gp, _ = discriminator(x_hat)
    grads = nn.grad([dis_for_gp], [x_hat])

    l2norm = F.sum(grads[0] ** 2.0, axis=(1, 2, 3)) ** 0.5
    d_loss_gp = F.mean((l2norm - 1.0) ** 2.0)

    # total discriminator loss.
    d_loss = d_loss_real + d_loss_fake + args.lambda_cls * \
        d_loss_cls + args.lambda_gp * d_loss_gp

    # ---------------- Define Loss for Generator -----------------
    g_loss_fake = (-1) * loss.gan_loss(dis_fake_img)
    g_loss_cls = loss.classification_loss(dis_fake_cls, label_trg)
    g_loss_cls.persistent = True

    # Reconstruct Images.
    with nn.parameter_scope("gen"):
        x_recon = generator(x_fake_unlinked, label_org)
    x_recon.persistent = True

    g_loss_rec = loss.recon_loss(x_real, x_recon)
    g_loss_rec.persistent = True

    # total generator loss.
    g_loss = g_loss_fake + args.lambda_rec * \
        g_loss_rec + args.lambda_cls * g_loss_cls

    # -------------------- Solver Setup ---------------------
    d_lr = args.d_lr  # initial learning rate for Discriminator
    g_lr = args.g_lr  # initial learning rate for Generator
    solver_dis = S.Adam(alpha=args.d_lr, beta1=args.beta1, beta2=args.beta2)
    solver_gen = S.Adam(alpha=args.g_lr, beta1=args.beta1, beta2=args.beta2)

    # register parameters to each solver.
    with nn.parameter_scope("dis"):
        solver_dis.set_parameters(nn.get_parameters())
    with nn.parameter_scope("gen"):
        solver_gen.set_parameters(nn.get_parameters())

    # -------------------- Create Monitors --------------------
    monitor = Monitor(args.monitor_path)
    monitor_d_cls_loss = MonitorSeries(
        'real_classification_loss', monitor, args.log_step)
    monitor_g_cls_loss = MonitorSeries(
        'fake_classification_loss', monitor, args.log_step)
    monitor_loss_dis = MonitorSeries(
        'discriminator_loss', monitor, args.log_step)
    monitor_recon_loss = MonitorSeries(
        'reconstruction_loss', monitor, args.log_step)
    monitor_loss_gen = MonitorSeries('generator_loss', monitor, args.log_step)
    monitor_time = MonitorTimeElapsed("Training_time", monitor, args.log_step)

    # -------------------- Prepare / Split Dataset --------------------
    using_attr = args.selected_attrs
    dataset, attr2idx, idx2attr = get_data_dict(args.attr_path, using_attr)
    random.seed(313)  # use fixed seed.
    random.shuffle(dataset)  # shuffle dataset.
    test_dataset = dataset[-2000:]  # extract 2000 images for test

    if args.num_data:
        # Use training data partially.
        training_dataset = dataset[:min(args.num_data, len(dataset) - 2000)]
    else:
        training_dataset = dataset[:-2000]
    print("Use {} images for training.".format(len(training_dataset)))

    # create data iterators.
    load_func = functools.partial(stargan_load_func,
                                  dataset=training_dataset,
                                  image_dir=args.celeba_image_dir,
                                  image_size=args.image_size,
                                  crop_size=args.celeba_crop_size)
    data_iterator = data_iterator_simple(load_func, len(training_dataset),
                                         args.batch_size,
                                         with_file_cache=False,
                                         with_memory_cache=False)

    load_func_test = functools.partial(stargan_load_func,
                                       dataset=test_dataset,
                                       image_dir=args.celeba_image_dir,
                                       image_size=args.image_size,
                                       crop_size=args.celeba_crop_size)
    test_data_iterator = data_iterator_simple(load_func_test,
                                              len(test_dataset),
                                              args.batch_size,
                                              with_file_cache=False,
                                              with_memory_cache=False)

    # Keep fixed test images for intermediate translation visualization.
    test_real_ndarray, test_label_ndarray = test_data_iterator.next()
    test_label_ndarray = test_label_ndarray.reshape(
        test_label_ndarray.shape + (1, 1))

    # -------------------- Training Loop --------------------
    one_epoch = data_iterator.size // args.batch_size
    num_max_iter = args.max_epoch * one_epoch

    for i in range(num_max_iter):
        # Get real images and labels.
        real_ndarray, label_ndarray = data_iterator.next()
        label_ndarray = label_ndarray.reshape(label_ndarray.shape + (1, 1))
        label_ndarray = label_ndarray.astype(float)
        x_real.d, label_org.d = real_ndarray, label_ndarray

        # Generate target domain labels randomly.
        rand_idx = np.random.permutation(label_org.shape[0])
        label_trg.d = label_ndarray[rand_idx]

        # ---------------- Train Discriminator -----------------
        # generate fake image.
        x_fake.forward(clear_no_need_grad=True)
        d_loss.forward(clear_no_need_grad=True)
        solver_dis.zero_grad()
        d_loss.backward(clear_buffer=True)
        solver_dis.update()

        monitor_loss_dis.add(i, d_loss.d.item())
        monitor_d_cls_loss.add(i, d_loss_cls.d.item())
        monitor_time.add(i)

        # -------------- Train Generator --------------
        if (i + 1) % args.n_critic == 0:
            g_loss.forward(clear_no_need_grad=True)
            solver_dis.zero_grad()
            solver_gen.zero_grad()
            x_fake_unlinked.grad.zero()
            g_loss.backward(clear_buffer=True)
            x_fake.backward(grad=None)
            solver_gen.update()
            monitor_loss_gen.add(i, g_loss.d.item())
            monitor_g_cls_loss.add(i, g_loss_cls.d.item())
            monitor_recon_loss.add(i, g_loss_rec.d.item())
            monitor_time.add(i)

        if (i + 1) % args.sample_step == 0:
            # save image.
            save_results(i, args, x_real, x_fake,
                         label_org, label_trg, x_recon)
            if args.test_during_training:
                # translate images from test dataset.
                x_real.d, label_org.d = test_real_ndarray, test_label_ndarray
                label_trg.d = test_label_ndarray[rand_idx]
                x_fake.forward(clear_no_need_grad=True)
                save_results(i, args, x_real, x_fake, label_org,
                             label_trg, None, is_training=False)

        # Learning rates get decayed
        if (i + 1) > int(0.5 * num_max_iter) and \
                (i + 1) % args.lr_update_step == 0:
            g_lr = max(0, g_lr - (args.lr_update_step *
                                  args.g_lr / float(0.5 * num_max_iter)))
            d_lr = max(0, d_lr - (args.lr_update_step *
                                  args.d_lr / float(0.5 * num_max_iter)))
            solver_gen.set_learning_rate(g_lr)
            solver_dis.set_learning_rate(d_lr)
            print('learning rates decayed, g_lr: {}, d_lr: {}.'.format(
                g_lr, d_lr))

    # Save parameters and training config.
    param_name = 'trained_params_{}.h5'.format(
        datetime.datetime.today().strftime("%m%d%H%M"))
    param_path = os.path.join(args.model_save_path, param_name)
    nn.save_parameters(param_path)
    config["pretrained_params"] = param_name

    with open(os.path.join(args.model_save_path,
                           "training_conf_{}.json".format(
                               datetime.datetime.today().strftime("%m%d%H%M"))),
              "w") as f:
        json.dump(config, f)

    # -------------------- Translation on test dataset --------------------
    for i in range(args.num_test):
        real_ndarray, label_ndarray = test_data_iterator.next()
        label_ndarray = label_ndarray.reshape(label_ndarray.shape + (1, 1))
        label_ndarray = label_ndarray.astype(float)
        x_real.d, label_org.d = real_ndarray, label_ndarray

        rand_idx = np.random.permutation(label_org.shape[0])
        label_trg.d = label_ndarray[rand_idx]

        x_fake.forward(clear_no_need_grad=True)
        save_results(i, args, x_real, x_fake, label_org, label_trg,
                     None, is_training=False)
def graph(x1, x2):
    x1 = F.identity(x1).apply(recompute=True)
    x2 = F.identity(x2).apply(recompute=True)
    y = F.mul2(x1, x2)
    y = F.identity(y)
    return y
def constructing_cell(args, ops, which_cell, cell_prev_prev, cell_prev,
                      output_filter, is_reduced_curr, is_reduced_prev,
                      test=False):
    """
    Constructing one cell.

    input:
        args: arguments set by user.
        ops: operations used in the network.
        which_cell: int. An index of the cell currently constructed.
        cell_prev_prev: Variable. Output of the cell behind the previous cell.
        cell_prev: Variable. Output of the previous cell.
        output_filter: the number of the filter used for this cell.
        is_reduced_curr: bool. True if the current cell is the reduction cell.
        is_reduced_prev: bool. True if the previous cell is the reduction cell.
        test: bool. True if the network is for validation.
    """
    # If True, all the parameters in batch_normalizations won't be updated.
    is_search = True

    if is_reduced_curr:
        keyname_basis = "alpha_reduction"
        output_shape = (cell_prev.shape[0], output_filter,
                        cell_prev.shape[2] // 2, cell_prev.shape[3] // 2)
    else:
        keyname_basis = "alpha_normal"
        output_shape = (cell_prev.shape[0], output_filter,
                        cell_prev.shape[2], cell_prev.shape[3])

    if is_reduced_prev:
        scope = "fr{}".format(which_cell)
        cell_prev_prev = factorized_reduction(
            cell_prev_prev, output_filter, scope, test, is_search)
    else:
        scope = "preprocess_cell{}_node{}".format(which_cell, 0)
        cell_prev_prev = conv1x1(
            cell_prev_prev, output_filter, scope, test, is_search)
    scope = "preprocess_cell{}_node{}".format(which_cell, 1)
    cell_prev = conv1x1(cell_prev, output_filter, scope, test, is_search)

    num_of_nodes = args.num_nodes

    # latter_nodes are all the intermediate nodes,
    # except for 2 input nodes and 1 output node.
    latter_nodes = [nn.Variable(output_shape)
                    for _ in range(num_of_nodes - 2 - 1)]
    for v in latter_nodes:
        v.d = 0  # initialize.

    num_of_ops = len(ops)

    # prepare a list to store all nodes.
    nodes = [cell_prev_prev, cell_prev] + latter_nodes

    for i in range(num_of_nodes - 2):
        successors = [_ for _ in range(i + 1, num_of_nodes - 1)]
        for j in successors:
            if j == 1:
                continue
            from_node, to_node = i, j
            scope = "cell{}/node{}_{}".format(which_cell, from_node, to_node)
            stacked_x = num_of_ops * (nodes[i],)
            stacked_x = tuple(
                [op(x, output_filter, scope + "/ops{}".format(op_id),
                    i, is_reduced_curr, test, is_search)
                 for x, op, op_id in zip(stacked_x,
                                         tuple(ops.values()),
                                         tuple(ops.keys()))])
            y = F.stack(*stacked_x, axis=0)
            alpha_name = keyname_basis + "_{}_{}".format(i, j)
            current_alpha = nn.parameter.get_parameter_or_create(
                alpha_name, (num_of_ops,) + (1, 1, 1, 1))
            alpha_prob = F.softmax(current_alpha, axis=0)
            y = F.mul2(y, alpha_prob)
            if i == 0:
                nodes[j] = F.sum(y, axis=0)
            else:
                nodes[j] = F.add2(nodes[j], F.sum(y, axis=0))

    intermediate_nodes = nodes[2:num_of_nodes - 1]
    output = F.concatenate(*intermediate_nodes, axis=1)

    is_reduced_prev = is_reduced_curr
    return output, is_reduced_curr, is_reduced_prev, output_filter
def sample_from_controller(args):
    """
    2-layer RNN (LSTM) based controller which outputs an architecture of CNN,
    represented as a sequence of integers and its list.
    Given the number of layers, for each layer it executes 2 types of
    computation: one samples the operation at that layer,
    the other samples the skip connection patterns.
    """
    entropys = nn.Variable([1, 1], need_grad=True)
    log_probs = nn.Variable([1, 1], need_grad=True)
    skip_penaltys = nn.Variable([1, 1], need_grad=True)

    entropys.d = log_probs.d = skip_penaltys.d = 0.0  # initialize them all

    num_layers = args.num_layers
    lstm_size = args.lstm_size
    state_size = args.state_size
    lstm_num_layers = args.lstm_layers
    skip_target = args.skip_prob
    temperature = args.temperature
    tanh_constant = args.tanh_constant
    num_branch = args.num_ops

    arc_seq = []
    initializer = I.UniformInitializer((-0.1, 0.1))

    prev_h = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]
    prev_c = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]

    for i in range(len(prev_h)):
        prev_h[i].d = 0  # initialize variables in lstm layers.
        prev_c[i].d = 0

    inputs = nn.Variable([1, lstm_size])
    inputs.d = np.random.normal(0, 0.5, [1, lstm_size])

    g_emb = nn.Variable([1, lstm_size])
    g_emb.d = np.random.normal(0, 0.5, [1, lstm_size])

    skip_targets = nn.Variable([1, 2])
    skip_targets.d = np.array([[1.0 - skip_target, skip_target]])

    for layer_id in range(num_layers):
        # One-step stacked LSTM.
        with nn.parameter_scope("controller_lstm"):
            next_h, next_c = stack_lstm(inputs, prev_h, prev_c, state_size)
        prev_h, prev_c = next_h, next_c  # shape:(1, lstm_size)

        # Compute for operation.
        with nn.parameter_scope("ops"):
            logit = PF.affine(next_h[-1], num_branch,
                              w_init=initializer, with_bias=False)

        if temperature is not None:
            logit = F.mul_scalar(logit, (1 / temperature))
        if tanh_constant is not None:
            logit = F.mul_scalar(F.tanh(logit),
                                 tanh_constant)  # (1, num_branch)

        # normalizing logits.
        normed_logit = np.e ** logit.d
        normed_logit = normed_logit / np.sum(normed_logit)

        # Sampling operation id from multinomial distribution.
        ops_id = np.random.multinomial(1, normed_logit[0], 1).nonzero()[1]
        ops_id = nn.Variable.from_numpy_array(ops_id)  # (1, )
        arc_seq.append(ops_id.d)

        # log policy for operation.
        log_prob = F.softmax_cross_entropy(
            logit, F.reshape(ops_id, shape=(1, 1)))  # (1, )
        # accumulate log policy as log probs
        log_probs = F.add2(log_probs, log_prob)

        entropy = log_prob * F.exp(-log_prob)
        entropys = F.add2(entropys, entropy)  # accumulate entropy as entropys.

        w_emb = nn.parameter.get_parameter_or_create(
            "w_emb", [num_branch, lstm_size], initializer, need_grad=False)

        inputs = F.reshape(w_emb[int(ops_id.d)],
                           (1, w_emb.shape[1]))  # (1, lstm_size)

        with nn.parameter_scope("controller_lstm"):
            next_h, next_c = stack_lstm(inputs, prev_h, prev_c, lstm_size)
        prev_h, prev_c = next_h, next_c  # (1, lstm_size)

        with nn.parameter_scope("skip_affine_3"):
            adding_w_1 = PF.affine(next_h[-1], lstm_size, w_init=initializer,
                                   with_bias=False)  # (1, lstm_size)

        if layer_id == 0:
            inputs = g_emb  # (1, lstm_size)
            anchors = next_h[-1]  # (1, lstm_size)
            anchors_w_1 = adding_w_1  # then goes back to the entry point of the loop
        else:
            # (layer_id, lstm_size) this shape during the process
            query = anchors_w_1

            with nn.parameter_scope("skip_affine_1"):
                query = F.tanh(
                    F.add2(query,
                           PF.affine(next_h[-1], lstm_size,
                                     w_init=initializer, with_bias=False)))
                # (layer_id, lstm_size) + (1, lstm_size);
                # broadcast occurs here. resulting shape is (layer_id, lstm_size)

            with nn.parameter_scope("skip_affine_2"):
                query = PF.affine(query, 1, w_init=initializer,
                                  with_bias=False)  # (layer_id, 1)
            # note that each weight for skip_affine_X is shared
            # across all steps of LSTM.

            # re-define logits, now its shape is (layer_id, 2)
            logit = F.concatenate(-query, query, axis=1)

            if temperature is not None:
                logit = F.mul_scalar(logit, (1 / temperature))
            if tanh_constant is not None:
                logit = F.mul_scalar(F.tanh(logit), tanh_constant)

            skip_prob_unnormalized = F.exp(logit)  # (layer_id, 2)

            # normalizing skip_prob_unnormalized.
            summed = F.sum(skip_prob_unnormalized, axis=1,
                           keepdims=True).apply(need_grad=False)
            summed = F.concatenate(summed, summed, axis=1)
            skip_prob_normalized = F.div2(
                skip_prob_unnormalized, summed)  # (layer_id, 2)

            # Sampling skip_pattern from multinomial distribution.
            skip_pattern = np.random.multinomial(
                1, skip_prob_normalized.d[0],
                layer_id).nonzero()[1]  # (layer_id, 1)
            arc_seq.append(skip_pattern)
            skip = nn.Variable.from_numpy_array(skip_pattern)

            # compute skip penalty.
            # (layer_id, 2) broadcast occurs here too
            kl = F.mul2(skip_prob_normalized,
                        F.log(F.div2(skip_prob_normalized, skip_targets)))
            kl = F.sum(kl, keepdims=True)
            # get the mean value here in advance.
            kl = kl * (1.0 / (num_layers - 1))
            # accumulate kl divergence as skip penalty.
            skip_penaltys = F.add2(skip_penaltys, kl)

            # log policy for connection.
            log_prob = F.softmax_cross_entropy(
                logit, F.reshape(skip, shape=(skip.shape[0], 1)))
            log_probs = F.add2(log_probs, F.sum(log_prob, keepdims=True))

            entropy = F.sum(log_prob * F.exp(-log_prob), keepdims=True)
            # accumulate entropy as entropys.
            entropys = F.add2(entropys, entropy)

            skip = F.reshape(skip, (1, layer_id))
            inputs = F.affine(skip, anchors).apply(
                need_grad=False)  # (1, lstm_size)
            inputs = F.mul_scalar(inputs, (1.0 / (1.0 + (np.sum(skip.d)))))

            # add new row for the next computation
            # (layer_id + 1, lstm_size)
            anchors = F.concatenate(anchors, next_h[-1], axis=0)
            # (layer_id + 1, lstm_size)
            anchors_w_1 = F.concatenate(anchors_w_1, adding_w_1, axis=0)

    return arc_seq, log_probs, entropys, skip_penaltys
def styled_conv_block(conv_input, w, noise=None, res=4, inmaps=512,
                      outmaps=512, kernel_size=3, pad_size=1, demodulate=True,
                      namescope="Conv", up=False, act=F.leaky_relu):
    """
    Conv block with skip connection for Generator
    """
    batch_size = conv_input.shape[0]
    with nn.parameter_scope(f'G_synthesis/{res}x{res}/{namescope}'):
        W, bias = weight_init_fn(shape=(w.shape[1], inmaps))
        runtime_coef = (1. / np.sqrt(512)).astype(np.float32)
        style = F.affine(w, W * runtime_coef, bias) + 1.0
    runtime_coef_for_conv = (
        1 / np.sqrt(np.prod([inmaps, kernel_size, kernel_size]))).astype(
            np.float32)

    if up:
        init_function = weight_init_fn(
            shape=(inmaps, outmaps, kernel_size, kernel_size),
            return_init=True)
        conv_weight = nn.parameter.get_parameter_or_create(
            name=f'G_synthesis/{res}x{res}/{namescope}/conv/W',
            shape=(inmaps, outmaps, kernel_size, kernel_size),
            initializer=init_function)
    else:
        init_function = weight_init_fn(
            shape=(outmaps, inmaps, kernel_size, kernel_size),
            return_init=True)
        conv_weight = nn.parameter.get_parameter_or_create(
            name=f'G_synthesis/{res}x{res}/{namescope}/conv/W',
            shape=(outmaps, inmaps, kernel_size, kernel_size),
            initializer=init_function)
    conv_weight = F.mul_scalar(conv_weight, runtime_coef_for_conv)

    if up:
        scale = F.reshape(style, (style.shape[0], style.shape[1], 1, 1, 1),
                          inplace=False)
    else:
        scale = F.reshape(style, (style.shape[0], 1, style.shape[1], 1, 1),
                          inplace=False)

    mod_w = F.mul2(
        F.reshape(conv_weight, (1,) + conv_weight.shape, inplace=False),
        scale)

    if demodulate:
        if up:
            denom_w = F.pow_scalar(
                F.sum(F.pow_scalar(mod_w, 2.), axis=[1, 3, 4], keepdims=True)
                + 1e-8, 0.5)
        else:
            denom_w = F.pow_scalar(
                F.sum(F.pow_scalar(mod_w, 2.), axis=[2, 3, 4], keepdims=True)
                + 1e-8, 0.5)
        demod_w = F.div2(mod_w, denom_w)
    else:
        demod_w = mod_w

    conv_input = F.reshape(
        conv_input, (1, -1, conv_input.shape[2], conv_input.shape[3]),
        inplace=False)
    demod_w = F.reshape(
        demod_w, (-1, demod_w.shape[2], demod_w.shape[3], demod_w.shape[4]),
        inplace=False)

    if up:
        k = [1, 3, 3, 1]
        conv_out = upsample_conv_2d(conv_input, demod_w, k, factor=2, gain=1,
                                    group=batch_size)
    else:
        conv_out = F.convolution(conv_input, demod_w,
                                 pad=(pad_size, pad_size), group=batch_size)

    conv_out = F.reshape(
        conv_out, (batch_size, -1, conv_out.shape[2], conv_out.shape[3]),
        inplace=False)

    if noise is not None:
        noise_coeff = nn.parameter.get_parameter_or_create(
            name=f'G_synthesis/{res}x{res}/{namescope}/noise_strength',
            shape=())
        conv_out = F.add2(conv_out,
                          noise * F.reshape(noise_coeff, (1, 1, 1, 1)))

    bias = nn.parameter.get_parameter_or_create(
        name=f'G_synthesis/{res}x{res}/{namescope}/conv/b',
        shape=(outmaps,),
        initializer=np.random.randn(outmaps,).astype(np.float32))
    conv_out = F.add2(conv_out,
                      F.reshape(bias, (1, outmaps, 1, 1), inplace=False))

    if act == F.leaky_relu:
        conv_out = F.mul_scalar(
            F.leaky_relu(conv_out, alpha=0.2, inplace=False),
            np.sqrt(2), inplace=False)
    else:
        conv_out = act(conv_out)

    return conv_out
def conv_block(input, w, noise=None, res=4, outmaps=512, inmaps=512,
               kernel_size=3, pad_size=1, demodulate=True, namescope="Conv",
               up=False, act=F.leaky_relu):
    """
    Single convolution block used in each resolution.
    """
    batch_size = input.shape[0]
    with nn.parameter_scope(f"G_synthesis/{res}x{res}/{namescope}"):
        W, bias = weight_init_fn(shape=(w.shape[1], inmaps))
        runtime_coef = 1. / np.sqrt(512)
        s = F.affine(w, W * runtime_coef, bias) + 1.0
    runtime_coef_for_conv = 1 / \
        np.sqrt(np.prod([inmaps, kernel_size, kernel_size]))

    if up:
        conv_weight = nn.parameter.get_parameter_or_create(
            name=f"G_synthesis/{res}x{res}/{namescope}/conv/W",
            shape=(inmaps, outmaps, kernel_size, kernel_size))
    else:
        conv_weight = nn.parameter.get_parameter_or_create(
            name=f"G_synthesis/{res}x{res}/{namescope}/conv/W",
            shape=(outmaps, inmaps, kernel_size, kernel_size))
    conv_weight = conv_weight * runtime_coef_for_conv

    if up:
        scale = F.reshape(s, (s.shape[0], s.shape[1], 1, 1, 1), inplace=True)
    else:
        scale = F.reshape(s, (s.shape[0], 1, s.shape[1], 1, 1), inplace=True)

    mod_w = F.mul2(
        F.reshape(conv_weight, (1,) + conv_weight.shape, inplace=True),
        scale)

    if demodulate:
        if up:
            denom_w = F.pow_scalar(
                F.sum(F.pow_scalar(mod_w, 2.), axis=[1, 3, 4], keepdims=True)
                + 1e-8, 0.5)
        else:
            denom_w = F.pow_scalar(
                F.sum(F.pow_scalar(mod_w, 2.), axis=[2, 3, 4], keepdims=True)
                + 1e-8, 0.5)
        demod_w = F.div2(mod_w, denom_w)
    else:
        demod_w = mod_w

    input = F.reshape(input, (1, -1, input.shape[2], input.shape[3]),
                      inplace=True)
    demod_w = F.reshape(
        demod_w, (-1, demod_w.shape[2], demod_w.shape[3], demod_w.shape[4]),
        inplace=True)

    if up:
        k = [1, 3, 3, 1]
        conv_out = upsample_conv_2d(input, demod_w, k, factor=2, gain=1,
                                    group=batch_size)
    else:
        conv_out = F.convolution(input, demod_w, pad=(pad_size, pad_size),
                                 group=batch_size)

    conv_out = F.reshape(
        conv_out, (batch_size, -1, conv_out.shape[2], conv_out.shape[3]),
        inplace=True)

    if noise is not None:
        noise_coeff = nn.parameter.get_parameter_or_create(
            name=f"G_synthesis/{res}x{res}/{namescope}/noise_strength",
            shape=())
        output = conv_out + noise * \
            F.reshape(noise_coeff, (1, 1, 1, 1), inplace=False)
    else:
        output = conv_out

    bias = nn.parameter.get_parameter_or_create(
        name=f"G_synthesis/{res}x{res}/{namescope}/conv/b", shape=(outmaps,))
    output = output + F.reshape(bias, (1, outmaps, 1, 1), inplace=False)

    if act == F.leaky_relu:
        output = F.leaky_relu(output, alpha=0.2) * np.sqrt(2)
    else:
        output = act(output)

    return output
def ssd_loss(_ssd_confs, _ssd_locs, _label, _alpha=1):
    # input
    #     _ssd_confs : type=nn.Variable, prediction of class. shape=(batch_size, default boxes, class num + 1)
    #     _ssd_locs : type=nn.Variable, prediction of location. shape=(batch_size, default boxes, 4)
    #     _label : type=nn.Variable, shape=(batch_size, default boxes, class num + 1 + 4)
    #     _alpha : type=float, hyperparameter. this is the weight of loc_loss.
    # output
    #     loss : type=nn.Variable

    def smooth_L1(__pred_locs, __label_locs):
        # input
        #     __pred_locs : type=nn.Variable,
        #     __label_locs : type=nn.Variable,
        # output
        #     _loss : type=nn.Variable, loss of location.
        return F.mul_scalar(F.huber_loss(__pred_locs, __label_locs), 0.5)

    # label_conf : type=nn.Variable, label of class. shape=(batch_size, default boxes, class num + 1) (after one_hot)
    # label_loc : type=nn.Variable, label of location. shape=(batch_size, default boxes, 4)
    label_conf = F.slice(_label, start=(0, 0, 4), stop=_label.shape,
                         step=(1, 1, 1))
    label_loc = F.slice(_label, start=(0, 0, 0),
                        stop=(_label.shape[0], _label.shape[1], 4),
                        step=(1, 1, 1))

    # conf
    ssd_pos_conf, ssd_neg_conf = ssd_separate_conf_pos_neg(_ssd_confs)
    label_conf_pos, _ = ssd_separate_conf_pos_neg(label_conf)
    # pos
    pos_loss = F.sum(
        F.mul2(F.softmax(ssd_pos_conf, axis=2), label_conf_pos), axis=2)
    # neg
    neg_loss = F.sum(F.log(ssd_neg_conf), axis=2)
    conf_loss = F.sum(F.sub2(pos_loss, neg_loss), axis=1)

    # loc
    # pos_label: =1 (if there is something), =0 (if there is nothing)
    pos_label = F.sum(label_conf_pos, axis=2)
    loc_loss = F.sum(
        F.mul2(F.sum(smooth_L1(_ssd_locs, label_loc), axis=2), pos_label),
        axis=1)

    # [2019/07/18]
    label_match_default_box_num = F.slice(
        _label, start=(0, 0, _label.shape[2] - 1), stop=_label.shape,
        step=(1, 1, 1))
    label_match_default_box_num = F.sum(label_match_default_box_num, axis=1)
    label_match_default_box_num = F.r_sub_scalar(
        label_match_default_box_num, _label.shape[1])
    label_match_default_box_num = F.reshape(
        label_match_default_box_num,
        (label_match_default_box_num.shape[0],), inplace=False)
    # label_match_default_box_num : type=nn.Variable,
    #     inverse number of default boxes that match with pos.

    # loss
    loss = F.mul2(
        F.add2(conf_loss, F.mul_scalar(loc_loss, _alpha)),
        label_match_default_box_num)
    loss = F.mean(loss)
    return loss