def downblock(x, out_features, norm=False, kernel_size=4, pool=False, sn=False, test=False):
    out = x

    if sn:
        def apply_w(w):
            return PF.spectral_norm(w, dim=0, test=test)
    else:
        apply_w = None

    inmaps, outmaps = out.shape[1], out_features
    k_w = I.calc_normal_std_he_forward(
        inmaps, outmaps, kernel=(kernel_size, kernel_size)) / np.sqrt(2.)
    k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
    w_init = I.UniformInitializer((-k_w, k_w))
    b_init = I.UniformInitializer((-k_b, k_b))

    out = PF.convolution(out, out_features,
                         kernel=(kernel_size, kernel_size), pad=(0, 0),
                         stride=(1, 1), w_init=w_init, b_init=b_init,
                         apply_w=apply_w)

    if norm:
        out = PF.instance_normalization(out)

    out = F.leaky_relu(out, 0.2, inplace=True)

    if pool:
        out = F.average_pooling(out, kernel=(2, 2))

    return out

def detect_keypoint(x, block_expansion, num_kp, num_channels, max_features,
                    num_blocks, temperature, estimate_jacobian=False,
                    scale_factor=1, single_jacobian_map=False, pad=0,
                    test=False, comm=None):
    if scale_factor != 1:
        x = anti_alias_interpolate(x, num_channels, scale_factor)

    with nn.parameter_scope("hourglass"):
        feature_map = hourglass(x, block_expansion, num_blocks=num_blocks,
                                max_features=max_features, test=test, comm=comm)

    with nn.parameter_scope("keypoint_detector"):
        inmaps, outmaps = feature_map.shape[1], num_kp
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(7, 7)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        prediction = PF.convolution(feature_map, outmaps=num_kp,
                                    kernel=(7, 7), pad=(pad, pad),
                                    w_init=w_init, b_init=b_init)

    final_shape = prediction.shape

    heatmap = F.reshape(prediction, (final_shape[0], final_shape[1], -1))
    heatmap = F.softmax(heatmap / temperature, axis=2)
    heatmap = F.reshape(heatmap, final_shape, inplace=False)

    out = gaussian2kp(heatmap)  # {"value": value}, keypoint positions.

    if estimate_jacobian:
        if single_jacobian_map:
            num_jacobian_maps = 1
        else:
            num_jacobian_maps = num_kp

        with nn.parameter_scope("jacobian_estimator"):
            jacobian_map = PF.convolution(feature_map,
                                          outmaps=4*num_jacobian_maps,
                                          kernel=(7, 7), pad=(pad, pad),
                                          w_init=I.ConstantInitializer(0),
                                          b_init=np.array([1, 0, 0, 1]*num_jacobian_maps))

        jacobian_map = F.reshape(
            jacobian_map, (final_shape[0], num_jacobian_maps, 4,
                           final_shape[2], final_shape[3]))
        heatmap = F.reshape(
            heatmap, heatmap.shape[:2] + (1,) + heatmap.shape[2:], inplace=False)

        jacobian = heatmap * jacobian_map
        jacobian = F.sum(jacobian, axis=(3, 4))
        jacobian = F.reshape(
            jacobian, (jacobian.shape[0], jacobian.shape[1], 2, 2), inplace=False)
        out['jacobian'] = jacobian  # jacobian near each keypoint.

    # out is a dictionary containing {"value": value, "jacobian": jacobian}
    return out

def resblock(x, in_features: int, kernel_size: int, padding: int,
             test: bool = False, comm=None):
    if comm:
        batchnorm = functools.partial(PF.sync_batch_normalization,
                                      comm=comm, group='world',
                                      axes=[1], decay_rate=0.9, eps=1e-05,
                                      batch_stat=not test)
    else:
        # 1 GPU
        batchnorm = functools.partial(PF.batch_normalization,
                                      axes=[1], decay_rate=0.9, eps=1e-05,
                                      batch_stat=not test)

    inmaps, outmaps = x.shape[1], in_features
    k_w = I.calc_normal_std_he_forward(
        inmaps, outmaps, kernel=(kernel_size, kernel_size)) / np.sqrt(2.)
    k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
    w_init = I.UniformInitializer((-k_w, k_w))
    b_init = I.UniformInitializer((-k_b, k_b))

    with nn.parameter_scope("convblock_0"):
        out = batchnorm(x)
        out = F.relu(out, inplace=True)
        out = PF.convolution(out, outmaps=in_features,
                             kernel=(kernel_size, kernel_size),
                             pad=(padding, padding), w_init=w_init, b_init=b_init)

    with nn.parameter_scope("convblock_2"):
        out = batchnorm(out)
        out = F.relu(out, inplace=True)
        out = PF.convolution(out, outmaps=in_features,
                             kernel=(kernel_size, kernel_size),
                             pad=(padding, padding), w_init=w_init, b_init=b_init)

    out = F.add2(out, x, inplace=True)
    return out

def inspecs_params():
    inspecs = []
    u = I.UniformInitializer((0.5, 1.0))
    inspecs.append([Inspec((64, 1000), u)])
    inspecs.append([Inspec((64, 32, 224, 224), u)])
    inspecs.append([Inspec((64, 128, 56, 56), u)])
    return inspecs

def pad_params():
    inspecs = []
    u = I.UniformInitializer((0.5, 1.0))
    inspecs.append([Inspec((2, 2, 2, 2), u)])
    inspecs.append([Inspec((2, 3, 2, 3), u)])
    inspecs.append([Inspec((2, 20, 200, 200), u)])
    return inspecs

def pairwise_inspecs_params():
    inspecs = []
    u = I.UniformInitializer((0, 2))
    inspecs.append(
        [Inspec((64, 32, 224, 224), u), Inspec((64, 32, 224, 224), u)])
    return inspecs

def nin(x, c, name, zeroing_w=False):
    lim = np.sqrt(x.shape[1])**-1
    w_init = I.UniformInitializer(lim=(-lim, lim))  # same as pytorch's default
    b_init = I.UniformInitializer(lim=(-lim, lim))  # same as pytorch's default
    if zeroing_w:
        w_init = I.ConstantInitializer(0)
        b_init = I.ConstantInitializer(0)
    return PF.convolution(x, c, kernel=(1, 1), pad=(0, 0), stride=(1, 1),
                          name=name, w_init=w_init, b_init=b_init)

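# Hypothetical usage sketch (not part of the original source): `nin` above is a
# 1x1 convolution whose weight/bias ranges follow the 1/sqrt(fan_in) uniform
# bound noted in its comments. Assumes the standard nnabla imports (nn, F, PF,
# I, np) used throughout this file.
def _example_nin_usage():
    x = nn.Variable((4, 256, 16, 16))
    y = nin(x, 128, name="proj")  # (4, 256, 16, 16) -> (4, 128, 16, 16)
    # zeroing_w=True starts the layer as an all-zero mapping.
    y_zero = nin(x, 128, name="proj_zero", zeroing_w=True)
    return y, y_zero
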
def discriminator(x, kp=None, num_channels=3, block_expansion=64, num_blocks=4,
                  max_features=512, sn=False, use_kp=False, num_kp=10,
                  kp_variance=0.01, test=False, **kwargs):
    down_blocks = []
    for i in range(num_blocks):
        down_blocks.append(
            functools.partial(downblock,
                              out_features=min(max_features,
                                               block_expansion * (2 ** (i + 1))),
                              norm=(i != 0), kernel_size=4,
                              pool=(i != num_blocks - 1), sn=sn, test=test))

    feature_maps = []
    out = x

    if use_kp:
        heatmap = kp2gaussian(kp, x.shape[2:], kp_variance)
        out = F.concatenate(out, heatmap, axis=1)

    for i, down_block in enumerate(down_blocks):
        with nn.parameter_scope(f"downblock_{i}"):
            feature_maps.append(down_block(out))
            out = feature_maps[-1]

    if sn:
        def apply_w(w):
            return PF.spectral_norm(w, dim=0, test=test)
    else:
        apply_w = None

    with nn.parameter_scope("prediction"):
        inmaps, outmaps = out.shape[1], 1
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(1, 1)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        prediction_map = PF.convolution(out, 1, kernel=(1, 1), pad=(0, 0),
                                        stride=(1, 1), w_init=w_init,
                                        b_init=b_init, apply_w=apply_w)
    return feature_maps, prediction_map

def downblock(x, out_features, kernel_size=3, padding=1, groups=1, test=False, comm=None):
    if comm:
        batchnorm = functools.partial(PF.sync_batch_normalization,
                                      comm=comm, group='world',
                                      axes=[1], decay_rate=0.9, eps=1e-05,
                                      batch_stat=not test)
    else:
        # 1 GPU
        batchnorm = functools.partial(PF.batch_normalization,
                                      axes=[1], decay_rate=0.9, eps=1e-05,
                                      batch_stat=not test)

    inmaps, outmaps = x.shape[1], out_features
    k_w = I.calc_normal_std_he_forward(
        inmaps, outmaps, kernel=(kernel_size, kernel_size)) / np.sqrt(2.)
    k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
    w_init = I.UniformInitializer((-k_w, k_w))
    b_init = I.UniformInitializer((-k_b, k_b))

    with nn.parameter_scope("downblock"):
        out = PF.convolution(x, outmaps=out_features,
                             kernel=(kernel_size, kernel_size),
                             pad=(padding, padding), group=groups,
                             w_init=w_init, b_init=b_init)
        out = batchnorm(out)
        out = F.relu(out, inplace=True)
        out = F.average_pooling(out, kernel=(2, 2))
    return out

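# Hypothetical usage sketch (not part of the original source): one encoder step
# using the generator-side `downblock` above on the single-GPU path (comm=None,
# so plain batch normalization is used). Assumes the standard nnabla imports.
def _example_downblock_usage():
    x = nn.Variable((8, 64, 128, 128))
    with nn.parameter_scope("encoder_down_0"):
        # conv + BN + ReLU + 2x2 average pooling -> (8, 128, 64, 64)
        y = downblock(x, out_features=128)
    return y
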
def __init__(self, embedding_dim, num_embedding, commitment_cost, rng,
             scope_name='vector_quantizer'):
    self.embedding_dim = embedding_dim
    self.num_embedding = num_embedding
    self.commitment_cost = commitment_cost
    self.rng = rng
    self.scope_name = scope_name

    with nn.parameter_scope(scope_name):
        self.embedding_weight = nn.parameter.get_parameter_or_create(
            'W', shape=(self.num_embedding, self.embedding_dim),
            initializer=I.UniformInitializer(
                (-1./self.num_embedding, 1./self.num_embedding), rng=self.rng),
            need_grad=True)

def pf_affine(r, num_classes=1000, channel_last=False):
    # Initializer supposes the final classification layer
    fan_in = int(np.prod(r.shape[1:]))
    k = 1 / np.sqrt(fan_in)
    init = I.UniformInitializer((-k, k), rng=RNG)
    r = PF.convolution(r, num_classes, (1, 1), channel_last=channel_last,
                       w_init=init, b_init=init, name='fc')
    return F.reshape(r, (r.shape[0], -1), inplace=False)

def embedding(x, input_dim, output_dim, init=None, mask_zero=False):
    if init is None:
        init = I.UniformInitializer((-0.1, 0.1))
    initialized = "embed/W" in nn.get_parameters()
    result = PF.embed(x, input_dim, output_dim)
    if not initialized:
        nn.get_parameters()["embed/W"].d = init(
            nn.get_parameters()["embed/W"].shape)
    if mask_zero:
        return result, 1 - F.equal_scalar(x, 0)
    else:
        return result

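# Hypothetical usage sketch (not part of the original source): embedding a
# batch of token-id sequences with the helper above; with mask_zero=True a
# 0/1 mask is also returned, treating id 0 as padding.
def _example_embedding_usage():
    tokens = nn.Variable((8, 20))  # integer token ids, shape (batch, length)
    emb, mask = embedding(tokens, input_dim=10000, output_dim=128, mask_zero=True)
    return emb, mask  # emb: (8, 20, 128), mask: (8, 20)
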
def conv(x, c, name, kernel=(3, 3), pad=(1, 1), stride=(1, 1), zeroing_w=False):
    # init weight and bias with uniform, which is the same as pytorch
    lim = I.calc_normal_std_he_forward(x.shape[1] * 2, c, tuple(kernel))
    w_init = I.UniformInitializer(lim=(-lim, lim), rng=None)
    b_init = I.UniformInitializer(lim=(-lim, lim), rng=None)
    if zeroing_w:
        w_init = I.ConstantInitializer(0)
        b_init = I.ConstantInitializer(0)
    return PF.convolution(x, c, kernel, pad=pad, stride=stride, name=name,
                          w_init=w_init, b_init=b_init)

def dense(x, output_dim, base_axis=1, w_init=None,
          b_init=I.ConstantInitializer(0), activation=F.tanh):
    if w_init is None:
        w_init = I.UniformInitializer(
            I.calc_uniform_lim_glorot(np.prod(x.shape[1:]), output_dim))
    return activation(
        PF.affine(x, output_dim, base_axis=base_axis, w_init=w_init, b_init=b_init))

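# Hypothetical usage sketch (not part of the original source): composing the
# `dense` helper above into a two-layer head. Distinct parameter scopes keep
# the two affine layers from sharing weights. Assumes the standard nnabla imports.
def _example_dense_usage():
    x = nn.Variable((32, 100))
    with nn.parameter_scope("fc1"):
        h = dense(x, 64)                            # tanh activation by default
    with nn.parameter_scope("fc2"):
        out = dense(h, 10, activation=lambda v: v)  # identity activation for logits
    return out
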
def resnet50_inspecs_params_without_broadcast():
    inspecs = []
    u = I.UniformInitializer((0.5, 1.0))
    inspecs.append([Inspec((5, 2048, 7, 7), u), Inspec((5, 2048, 7, 7), u)])
    inspecs.append(
        [Inspec((5, 1024, 14, 14), u), Inspec((5, 1024, 14, 14), u)])
    inspecs.append([Inspec((5, 512, 28, 28), u), Inspec((5, 512, 28, 28), u)])
    inspecs.append([Inspec((5, 256, 56, 56), u), Inspec((5, 256, 56, 56), u)])
    inspecs.append([Inspec((5, 56, 56, 256), u), Inspec((5, 56, 56, 256), u)])
    inspecs.append([Inspec((5, 28, 28, 512), u), Inspec((5, 28, 28, 512), u)])
    inspecs.append(
        [Inspec((5, 14, 14, 1024), u), Inspec((5, 14, 14, 1024), u)])
    inspecs.append([Inspec((5, 7, 7, 2048), u), Inspec((5, 7, 7, 2048), u)])
    return inspecs

def conv_initializer(f_in, n_out, base_axis, kernel, mode):
    '''
    Conv initializer function

    This function returns various types of initialization for weight
    and bias parameters in a convolution layer.

    Args:
        f_in (~nnabla.Variable): input variable.
        n_out (int): number of output neurons per data.
        base_axis (int): dimensions up to base_axis are treated as the sample dimensions.
        kernel (tuple of int): convolution kernel size.
        mode (str): type of initialization to use.
    Returns:
        w (~nnabla.initializer.BaseInitializer): weight parameters
        b (~nnabla.initializer.BaseInitializer): bias parameters
    '''
    if mode == 'nnabla':
        # https://github.com/sony/nnabla/blob/master/python/src/nnabla/parametric_functions.py, lines 415, 417
        # https://github.com/sony/nnabla/blob/master/python/src/nnabla/initializer.py, lines 224, 121
        # uniform_lim_glorot = uniform(sqrt(6/(fin+fout)))
        n_input_plane = f_in.shape[base_axis]
        s = np.sqrt(6.0 / (n_input_plane * np.prod(kernel) + n_out))
        w = I.UniformInitializer([-s, s])
        b = I.ConstantInitializer(0)
        return w, b

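# Hypothetical usage sketch (not part of the original source): building a
# convolution whose weight/bias initializers come from `conv_initializer`
# above with mode='nnabla' (Glorot-uniform weights, zero bias).
def _example_conv_initializer_usage():
    x = nn.Variable((16, 3, 32, 32))
    w_init, b_init = conv_initializer(x, n_out=64, base_axis=1,
                                      kernel=(3, 3), mode='nnabla')
    return PF.convolution(x, 64, kernel=(3, 3), pad=(1, 1),
                          w_init=w_init, b_init=b_init)
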
def main():
    """
    Start architecture search.
    """
    args = get_args()
    print(args)

    ctx = get_extension_context(args.context, device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)
    ext = nn.ext_utils.import_extension_module(args.context)

    ops = {
        0: dil_conv_3x3,
        1: dil_conv_5x5,
        2: sep_conv_3x3,
        3: sep_conv_5x5,
        4: max_pool_3x3,
        5: avg_pool_3x3,
        6: identity,
        7: zero
    }

    initializer = I.UniformInitializer((-0.1, 0.1))
    num_of_nodes = args.num_nodes
    alphas_dict = dict()
    w_shape = (len(ops), ) + (1, 1, 1, 1)

    # prepare architecture parameters in advance
    for i in range(num_of_nodes):
        for j in range(i + 1, num_of_nodes - 1):
            if j < 2:
                continue  # no connection exists between 1st and 2nd nodes.
            else:
                w_name_normal = "alpha_normal_{}_{}".format(i, j)
                w_name_reduction = "alpha_reduction_{}_{}".format(i, j)
                alphas_dict[w_name_normal] = \
                    nn.parameter.get_parameter_or_create(
                        w_name_normal, w_shape, initializer)
                alphas_dict[w_name_reduction] = \
                    nn.parameter.get_parameter_or_create(
                        w_name_reduction, w_shape, initializer)

    # run architecture search
    alphas_dict = CNN_run(args, ops, alphas_dict)

    for k in nn.get_parameters(grad_only=False).keys():
        if "alpha_" not in k:
            nn.parameter.pop_parameter(k)  # delete unnecessary parameters.

    print("Architecture Search is finished. The saved architecture is,")
    alpha_normal, alpha_reduction = arrange_weights(args, ops)
    arch_normal = parse_weights(args, alpha_normal)
    arch_reduction = parse_weights(args, alpha_reduction)
    show_derived_cell(args, ops, arch_normal, "normal")
    show_derived_cell(args, ops, arch_reduction, "reduction")

    arch_data = {"arch_normal": arch_normal, "arch_reduction": arch_reduction}

    print("Saving the architecture parameter: {}/{}".format(
        args.monitor_path, args.model_arch_name))
    model_path = args.model_arch_name
    with open(model_path, 'w') as f:
        json.dump(arch_data, f)

    print("when you want to train the network from scratch\n\
type 'python darts_train.py <OPTION> \
--monitor-path {} --model-arch-name {}".format(args.monitor_path,
                                               args.model_arch_name))

    return

            F.exp(-distance(u, x)) for x in F.split(negative_samples, axis=2)
        ])))


u = nn.Variable((batch_size, ))
v = nn.Variable((batch_size, ))
negative_samples = nn.Variable((batch_size, negative_sample_size))

_u = PF.embed(u, vocab_size, embedding_size)
_v = PF.embed(v, vocab_size, embedding_size)
_neg = PF.embed(negative_samples, vocab_size, embedding_size)
_neg = F.transpose(_neg, axes=(0, 2, 1))

loss = loss_function(_u, _v, _neg)
nn.get_parameters()["embed/W"].d = I.UniformInitializer(
    [-0.01, 0.01])(shape=(vocab_size, embedding_size))

solver = RiemannianSgd(lr=0.1)
solver.set_parameters(nn.get_parameters())
trainer = Trainer(inputs=[u, v, negative_samples], loss=loss, solver=solver)
trainer.run(train_data_iter, None, epochs=max_epoch)

line_points = [['mustang.n.01', 'odd-toed_ungulate.n.01'],
               ['elk.n.01', 'even-toed_ungulate.n.01'],
               ['even-toed_ungulate.n.01', 'ungulate.n.01'],
               ['squirrel.n.01', 'rodent.n.01'],
               ['beagle.n.01', 'dog.n.01'],
               ['dog.n.01', 'canine.n.02'],
               ['liger.n.01', 'carnivore.n.01'],
               ['bison.n.01', 'even-toed_ungulate.n.01'],
               ['collie.n.01', 'dog.n.01'],
               ['odd-toed_ungulate.n.01', 'ungulate.n.01'],
def lstm(x, mask, state_size, w_init=None, inner_w_init=None,
         forget_bias_init=I.ConstantInitializer(1),
         b_init=I.ConstantInitializer(0),
         initial_state=None, dropout=0, train=True, rng=np.random):
    """
    x: (batch_size, length, input_size)
    mask: (batch_size, length)
    """
    batch_size, length, input_size = x.shape

    if w_init is None:
        w_init = I.UniformInitializer(
            I.calc_uniform_lim_glorot(input_size, state_size))
    if inner_w_init is None:
        inner_w_init = orthogonal

    retain_prob = 1.0 - dropout
    z_w = nn.Variable((batch_size, 4, input_size), need_grad=False)
    z_w.d = 1
    z_u = nn.Variable((batch_size, 4, state_size), need_grad=False)
    z_u.d = 1

    if dropout > 0:
        if train:
            z_w = F.dropout(z_w, p=retain_prob)
            z_u = F.dropout(z_u, p=retain_prob)
        z_w *= retain_prob
        z_u *= retain_prob
    z_w = F.reshape(z_w, (batch_size, 4, 1, input_size))
    z_w = F.broadcast(z_w, (batch_size, 4, length, input_size))
    z_w = F.split(z_w, axis=1)
    z_u = F.split(z_u, axis=1)
    xi = z_w[0] * x
    xf = z_w[1] * x
    xc = z_w[2] * x
    xo = z_w[3] * x

    with nn.parameter_scope("lstm"):
        # (batch_size, length, state_size)
        xi = PF.affine(xi, state_size, base_axis=2, w_init=w_init,
                       b_init=b_init, name="Wi")
        xf = PF.affine(xf, state_size, base_axis=2, w_init=w_init,
                       b_init=forget_bias_init, name="Wf")
        xc = PF.affine(xc, state_size, base_axis=2, w_init=w_init,
                       b_init=b_init, name="Wc")
        xo = PF.affine(xo, state_size, base_axis=2, w_init=w_init,
                       b_init=b_init, name="Wo")

    if initial_state is None:
        h = nn.Variable((batch_size, state_size), need_grad=False)
        h.data.zero()
    else:
        h = initial_state
    c = nn.Variable((batch_size, state_size), need_grad=False)
    c.data.zero()

    # (batch_size, state_size)
    xi = split(xi, axis=1)
    xf = split(xf, axis=1)
    xc = split(xc, axis=1)
    xo = split(xo, axis=1)

    mask = F.reshape(mask, [batch_size, length, 1])  # (batch_size, length, 1)
    mask = F.broadcast(mask, [batch_size, length, state_size])
    # (batch_size, state_size)
    mask = split(mask, axis=1)

    hs = []
    cs = []

    with nn.parameter_scope("lstm"):
        for i, f, c2, o, m in zip(xi, xf, xc, xo, mask):
            i_t = PF.affine(z_u[0] * h, state_size,
                            w_init=inner_w_init(state_size, state_size),
                            with_bias=False, name="Ui")
            i_t = F.sigmoid(i + i_t)
            f_t = PF.affine(z_u[1] * h, state_size,
                            w_init=inner_w_init(state_size, state_size),
                            with_bias=False, name="Uf")
            f_t = F.sigmoid(f + f_t)
            c_t = PF.affine(z_u[2] * h, state_size,
                            w_init=inner_w_init(state_size, state_size),
                            with_bias=False, name="Uc")
            c_t = f_t * c + i_t * F.tanh(c2 + c_t)
            o_t = PF.affine(z_u[3] * h, state_size,
                            w_init=inner_w_init(state_size, state_size),
                            with_bias=False, name="Uo")
            o_t = F.sigmoid(o + o_t)
            h_t = o_t * F.tanh(c_t)
            h_t = (1 - m) * h + m * h_t
            c_t = (1 - m) * c + m * c_t
            h = h_t
            c = c_t
            h_t = F.reshape(h_t, (batch_size, 1, state_size), inplace=False)
            c_t = F.reshape(c_t, (batch_size, 1, state_size), inplace=False)
            hs.append(h_t)
            cs.append(c_t)

    return concatenate(*hs, axis=1), concatenate(*cs, axis=1)

def last_affine(self, x, dims, name):
    c = x.shape[1]
    l, u = I.calc_uniform_lim_glorot(c, 1)
    w_init = I.UniformInitializer((l, u))
    return PF.affine(x, 1, w_init=w_init, name=name)

        Inspec((64, 128, 56, 56)),
        Inspec((64, 128, 56, 56), label_init, False)
    ])
    return inspecs


@pytest.mark.parametrize('inspecs', pairwise_inspecs_params())
@pytest.mark.parametrize('loss', ['sigmoid_cross_entropy',
                                  'binary_cross_entropy'])
def test_binary_classification_loss(inspecs, loss, nnabla_opts):
    func = getattr(F, loss)
    fb = FunctionBenchmark(
        func, inspecs, [], {},
        nnabla_opts.ext, nnabla_opts.ext_kwargs)
    fb.benchmark()
    fb.write(writer=nnabla_opts.function_benchmark_writer)


@pytest.mark.parametrize('inspecs',
                         pairwise_inspecs_params(I.UniformInitializer((0, 1))))
@pytest.mark.parametrize('loss', ['squared_error', 'huber_loss',
                                  'kl_multinomial'])
def test_pairwise_loss(inspecs, loss, nnabla_opts):
    func = getattr(F, loss)
    fb = FunctionBenchmark(
        func, inspecs, [], {},
        nnabla_opts.ext, nnabla_opts.ext_kwargs)
    fb.benchmark()
    fb.write(writer=nnabla_opts.function_benchmark_writer)


# ============================================================================
def predict_dense_motion(source_image, kp_driving, kp_source, block_expansion,
                         num_blocks, max_features, num_kp, num_channels,
                         estimate_occlusion_map=False, scale_factor=1,
                         kp_variance=0.01, test=False, comm=None):
    if scale_factor != 1:
        source_image = anti_alias_interpolate(
            source_image, num_channels, scale_factor)

    bs, _, h, w = source_image.shape

    out_dict = dict()

    heatmap_representation = create_heatmap_representations(
        source_image, kp_driving, kp_source, kp_variance)
    sparse_motion = create_sparse_motions(
        source_image, kp_driving, kp_source, num_kp)
    deformed_source = create_deformed_source_image(
        source_image, sparse_motion, num_kp)

    out_dict['sparse_deformed'] = deformed_source

    input = F.concatenate(heatmap_representation, deformed_source, axis=2)
    input = F.reshape(input, (bs, -1, h, w))

    with nn.parameter_scope("hourglass"):
        prediction = hourglass(input, block_expansion=block_expansion,
                               num_blocks=num_blocks,
                               max_features=max_features,
                               test=test, comm=comm)

    with nn.parameter_scope("mask"):
        inmaps, outmaps = prediction.shape[1], num_kp + 1
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(7, 7)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        mask = PF.convolution(prediction, outmaps=num_kp + 1, kernel=(7, 7),
                              pad=(3, 3), w_init=w_init, b_init=b_init)

    mask = F.softmax(mask, axis=1)
    out_dict['mask'] = mask

    reshaped_mask = F.reshape(
        mask, mask.shape[:2] + (1, ) + mask.shape[2:], inplace=False)
    sparse_motion = F.transpose(sparse_motion, (0, 1, 4, 2, 3))

    deformation = F.sum(sparse_motion * reshaped_mask, axis=1)
    deformation = F.transpose(deformation, (0, 2, 3, 1))

    out_dict['deformation'] = deformation

    if estimate_occlusion_map:
        with nn.parameter_scope("occlusion_map"):
            occlusion_map = F.sigmoid(
                PF.convolution(prediction, outmaps=1, kernel=(7, 7),
                               pad=(3, 3), w_init=w_init, b_init=b_init))
        out_dict['occlusion_map'] = occlusion_map
    else:
        occlusion_map = None

    return out_dict

def occlusion_aware_generator(source_image, kp_driving, kp_source, num_channels,
                              num_kp, block_expansion, max_features,
                              num_down_blocks, num_bottleneck_blocks,
                              estimate_occlusion_map=False,
                              dense_motion_params=None, estimate_jacobian=False,
                              test=False, comm=None):
    # pre-downsampling
    out = sameblock(source_image, out_features=block_expansion,
                    kernel_size=7, padding=3, test=test, comm=comm)

    # downsampling
    for i in range(num_down_blocks):
        with nn.parameter_scope(f"downblock_{i}"):
            out_features = min(max_features, block_expansion * (2 ** (i + 1)))
            out = downblock(out, out_features=out_features,
                            kernel_size=3, padding=1, test=test, comm=comm)

    output_dict = {}

    if dense_motion_params is not None:
        with nn.parameter_scope("dense_motion_prediction"):
            dense_motion = predict_dense_motion(
                source_image=source_image, kp_driving=kp_driving,
                kp_source=kp_source, num_kp=num_kp, num_channels=num_channels,
                estimate_occlusion_map=estimate_occlusion_map,
                test=test, comm=comm, **dense_motion_params)
            # dense_motion is a dictionary containing:
            # 'sparse_deformed': <Variable((8, 11, 3, 256, 256)),
            # 'mask': <Variable((8, 11, 256, 256)),
            # 'deformation': <Variable((8, 256, 256, 2)),
            # 'occlusion_map': <Variable((8, 1, 256, 256))}

        output_dict['mask'] = dense_motion['mask']
        output_dict['sparse_deformed'] = dense_motion['sparse_deformed']

        # Transform feature representation by deformation (+ occlusion)
        if 'occlusion_map' in dense_motion:
            occlusion_map = dense_motion['occlusion_map']
            output_dict['occlusion_map'] = occlusion_map
        else:
            occlusion_map = None

        deformation = dense_motion['deformation']
        out = deform_input(out, deformation)

        if occlusion_map is not None:
            if out.shape[2] != occlusion_map.shape[2] or out.shape[3] != occlusion_map.shape[3]:
                resized_occlusion_map = F.interpolate(
                    occlusion_map, output_size=out.shape[2:], mode="linear",
                    align_corners=False, half_pixel=True)
            else:
                resized_occlusion_map = F.identity(occlusion_map)
            out = out * resized_occlusion_map

        if test:
            output_dict["deformed"] = deform_input(source_image, deformation)

    # intermediate residual blocks
    in_features = min(max_features, block_expansion * (2 ** num_down_blocks))
    for i in range(num_bottleneck_blocks):
        with nn.parameter_scope(f"residual_block_{i}"):
            out = resblock(out, in_features=in_features,
                           kernel_size=3, padding=1, test=test, comm=comm)

    # upsampling
    for i in range(num_down_blocks):
        with nn.parameter_scope(f"upblock_{i}"):
            out_features = min(max_features,
                               block_expansion * (2 ** (num_down_blocks - i - 1)))
            out = upblock(out, out_features=out_features,
                          kernel_size=3, padding=1, test=test, comm=comm)

    with nn.parameter_scope("final_conv"):
        inmaps, outmaps = out.shape[1], num_channels
        k_w = I.calc_normal_std_he_forward(
            inmaps, outmaps, kernel=(7, 7)) / np.sqrt(2.)
        k_b = I.calc_normal_std_he_forward(inmaps, outmaps) / np.sqrt(2.)
        w_init = I.UniformInitializer((-k_w, k_w))
        b_init = I.UniformInitializer((-k_b, k_b))
        out = PF.convolution(out, outmaps=num_channels, kernel=(7, 7),
                             pad=(3, 3), w_init=w_init, b_init=b_init)
    out = F.sigmoid(out)
    output_dict["prediction"] = out

    return output_dict

def sample_from_controller(args):
    """
    2-layer RNN (LSTM) based controller which outputs a CNN architecture,
    represented as a sequence of integers (returned as a list).
    Given the number of layers, for each layer it executes 2 types of computation:
    one for sampling the operation at that layer,
    another for sampling the skip connection patterns.
    """
    entropys = nn.Variable([1, 1], need_grad=True)
    log_probs = nn.Variable([1, 1], need_grad=True)
    skip_penaltys = nn.Variable([1, 1], need_grad=True)

    entropys.d = log_probs.d = skip_penaltys.d = 0.0  # initialize them all

    num_layers = args.num_layers
    lstm_size = args.lstm_size
    state_size = args.state_size
    lstm_num_layers = args.lstm_layers
    skip_target = args.skip_prob
    temperature = args.temperature
    tanh_constant = args.tanh_constant
    num_branch = args.num_ops

    arc_seq = []
    initializer = I.UniformInitializer((-0.1, 0.1))

    prev_h = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]
    prev_c = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]

    for i in range(len(prev_h)):
        prev_h[i].d = 0  # initialize variables in lstm layers.
        prev_c[i].d = 0

    inputs = nn.Variable([1, lstm_size])
    inputs.d = np.random.normal(0, 0.5, [1, lstm_size])

    g_emb = nn.Variable([1, lstm_size])
    g_emb.d = np.random.normal(0, 0.5, [1, lstm_size])

    skip_targets = nn.Variable([1, 2])
    skip_targets.d = np.array([[1.0 - skip_target, skip_target]])

    for layer_id in range(num_layers):
        # One-step stacked LSTM.
        with nn.parameter_scope("controller_lstm"):
            next_h, next_c = stack_lstm(inputs, prev_h, prev_c, state_size)
        prev_h, prev_c = next_h, next_c  # shape:(1, lstm_size)

        # Compute for operation.
        with nn.parameter_scope("ops"):
            logit = PF.affine(next_h[-1], num_branch,
                              w_init=initializer, with_bias=False)

        if temperature is not None:
            logit = F.mul_scalar(logit, (1 / temperature))

        if tanh_constant is not None:
            logit = F.mul_scalar(F.tanh(logit), tanh_constant)  # (1, num_branch)

        # normalizing logits.
        normed_logit = np.e**logit.d
        normed_logit = normed_logit / np.sum(normed_logit)

        # Sampling operation id from multinomial distribution.
        ops_id = np.random.multinomial(1, normed_logit[0], 1).nonzero()[1]
        ops_id = nn.Variable.from_numpy_array(ops_id)  # (1, )
        arc_seq.append(ops_id.d)

        # log policy for operation.
        log_prob = F.softmax_cross_entropy(
            logit, F.reshape(ops_id, shape=(1, 1)))  # (1, )
        # accumulate log policy as log probs
        log_probs = F.add2(log_probs, log_prob)

        entropy = log_prob * F.exp(-log_prob)
        entropys = F.add2(entropys, entropy)  # accumulate entropy as entropys.

        w_emb = nn.parameter.get_parameter_or_create(
            "w_emb", [num_branch, lstm_size], initializer, need_grad=False)

        inputs = F.reshape(w_emb[int(ops_id.d)], (1, w_emb.shape[1]))  # (1, lstm_size)

        with nn.parameter_scope("controller_lstm"):
            next_h, next_c = stack_lstm(inputs, prev_h, prev_c, lstm_size)
        prev_h, prev_c = next_h, next_c  # (1, lstm_size)

        with nn.parameter_scope("skip_affine_3"):
            adding_w_1 = PF.affine(next_h[-1], lstm_size,
                                   w_init=initializer, with_bias=False)  # (1, lstm_size)

        if layer_id == 0:
            inputs = g_emb  # (1, lstm_size)
            anchors = next_h[-1]  # (1, lstm_size)
            anchors_w_1 = adding_w_1
            # then goes back to the entry point of the loop

        else:
            # (layer_id, lstm_size) this shape during the process
            query = anchors_w_1

            with nn.parameter_scope("skip_affine_1"):
                query = F.tanh(
                    F.add2(query,
                           PF.affine(next_h[-1], lstm_size,
                                     w_init=initializer, with_bias=False)))
                # (layer_id, lstm_size) + (1, lstm_size)
                # broadcast occurs here; resulting shape is (layer_id, lstm_size)

            with nn.parameter_scope("skip_affine_2"):
                query = PF.affine(query, 1,
                                  w_init=initializer, with_bias=False)  # (layer_id, 1)
            # note that each weight for skip_affine_X is shared across all steps of LSTM.

            # re-define logits; now its shape is (layer_id, 2)
            logit = F.concatenate(-query, query, axis=1)

            if temperature is not None:
                logit = F.mul_scalar(logit, (1 / temperature))

            if tanh_constant is not None:
                logit = F.mul_scalar(F.tanh(logit), tanh_constant)

            skip_prob_unnormalized = F.exp(logit)  # (layer_id, 2)

            # normalizing skip_prob_unnormalized.
            summed = F.sum(skip_prob_unnormalized, axis=1,
                           keepdims=True).apply(need_grad=False)
            summed = F.concatenate(summed, summed, axis=1)

            skip_prob_normalized = F.div2(
                skip_prob_unnormalized, summed)  # (layer_id, 2)

            # Sampling skip_pattern from multinomial distribution.
            skip_pattern = np.random.multinomial(
                1, skip_prob_normalized.d[0], layer_id).nonzero()[1]  # (layer_id, 1)
            arc_seq.append(skip_pattern)
            skip = nn.Variable.from_numpy_array(skip_pattern)

            # compute skip penalty.
            # (layer_id, 2) broadcast occurs here too
            kl = F.mul2(skip_prob_normalized,
                        F.log(F.div2(skip_prob_normalized, skip_targets)))
            kl = F.sum(kl, keepdims=True)
            # get the mean value here in advance.
            kl = kl * (1.0 / (num_layers - 1))

            # accumulate kl divergence as skip penalty.
            skip_penaltys = F.add2(skip_penaltys, kl)

            # log policy for connection.
            log_prob = F.softmax_cross_entropy(
                logit, F.reshape(skip, shape=(skip.shape[0], 1)))
            log_probs = F.add2(log_probs, F.sum(log_prob, keepdims=True))

            entropy = F.sum(log_prob * F.exp(-log_prob), keepdims=True)
            # accumulate entropy as entropys.
            entropys = F.add2(entropys, entropy)

            skip = F.reshape(skip, (1, layer_id))

            inputs = F.affine(skip, anchors).apply(need_grad=False)  # (1, lstm_size)
            inputs = F.mul_scalar(inputs, (1.0 / (1.0 + (np.sum(skip.d)))))

            # add new row for the next computation
            # (layer_id + 1, lstm_size)
            anchors = F.concatenate(anchors, next_h[-1], axis=0)
            # (layer_id + 1, lstm_size)
            anchors_w_1 = F.concatenate(anchors_w_1, adding_w_1, axis=0)

    return arc_seq, log_probs, entropys, skip_penaltys

def main():
    """
    Start architecture search and save the architecture found by the controller
    during the search.
    """
    args = get_macro_args()
    arguments_assertion(args)

    ctx = get_extension_context(args.context, device_id=args.device_id,
                                type_config=args.type_config)
    nn.set_default_context(ctx)
    ext = nn.ext_utils.import_extension_module(args.context)

    if args.sampling_only:
        sample_from_pretrained_controller(args)
        return

    data_iterator = data_iterator_cifar10
    tdata = data_iterator(args.batch_size, True)
    vdata = data_iterator(args.batch_size, False)

    mean_val_train, std_val_train, channel, img_height, img_width, num_class = get_data_stats(
        tdata)
    mean_val_valid, std_val_valid, _, _, _, _ = get_data_stats(vdata)

    data_dict = {"train_data": (tdata, mean_val_train, std_val_train),
                 "valid_data": (vdata, mean_val_valid, std_val_valid),
                 "basic_info": (channel, img_height, img_width, num_class)}

    initializer = I.UniformInitializer((-0.1, 0.1))

    # Prepare all the weights in advance
    controller_weights_and_shape = {
        'controller_lstm/0/lstm/affine/W': (2 * args.lstm_size, 4, args.lstm_size),
        'controller_lstm/0/lstm/affine/b': (4, args.lstm_size),
        'controller_lstm/1/lstm/affine/W': (2 * args.lstm_size, 4, args.lstm_size),
        'controller_lstm/1/lstm/affine/b': (4, args.lstm_size),
        'ops/affine/W': (args.lstm_size, args.num_ops),
        'skip_affine_1/affine/W': (args.lstm_size, args.lstm_size),
        'skip_affine_2/affine/W': (args.lstm_size, 1),
        'skip_affine_3/affine/W': (args.lstm_size, args.lstm_size)}

    for w_name, w_shape in controller_weights_and_shape.items():
        nn.parameter.get_parameter_or_create(
            w_name, w_shape, initializer=initializer, need_grad=True)

    # create dictionary of controller's weights
    controller_weights_dict = {w_name: nn.get_parameters()[w_name]
                               for w_name in controller_weights_and_shape.keys()}

    arch_change, best_arch = search_architecture(
        args, data_dict, controller_weights_dict)

    if args.select_strategy == "best":
        print("saving the model which achieved the best validation accuracy as {}."
              .format(args.recommended_arch))
        check_arch = best_arch
    else:
        # Use the latest architecture; it is not necessarily the best-performing one.
        print("saving the latest model recommended by the controller as {}."
              .format(args.recommended_arch))
        check_arch = arch_change[-1]

    np.save(args.recommended_arch, np.array(check_arch))

    print("The saved architecture is;")
    show_arch(check_arch)
    print("when you want to train the network from scratch,\n\
type 'python macro_retrain.py <OPTION> --recommended-arch {}'".format(
        args.recommended_arch))

    # save the controller's weights so that other architectures can be generated later.
    all_params = nn.get_parameters(grad_only=False)
    controller_weights = list(controller_weights_and_shape.keys()) + ["w_emb"]
    for param_name in all_params.keys():
        if param_name not in controller_weights:
            nn.parameter.pop_parameter(param_name)

    nn.save_parameters(os.path.join(
        args.model_save_path, 'controller_params.h5'))

    # If you want to train the model recommended by the controller from scratch
    # right after architecture search, uncomment the lines below
    # nn.clear_parameters()
    # ext.clear_memory_cache()  # clear all the Variables
    # val_acc = CNN_run(args, check_arch, data_dict, with_train=True, after_search=True)

    return

def main():
    args = get_args()

    state_size = args.state_size
    batch_size = args.batch_size
    num_steps = args.num_steps
    num_layers = args.num_layers
    max_epoch = args.max_epoch
    max_norm = args.gradient_clipping_max_norm
    num_words = 10000
    lr = args.learning_rate

    train_data, val_data, test_data = get_data()

    # Get context.
    from nnabla.ext_utils import get_extension_context
    logger.info("Running in %s" % args.context)
    ctx = get_extension_context(
        args.context, device_id=args.device_id, type_config=args.type_config)
    nn.set_default_context(ctx)

    from nnabla.monitor import Monitor, MonitorSeries
    monitor = Monitor(args.work_dir)
    monitor_perplexity = MonitorSeries(
        "Training perplexity", monitor, interval=10)
    monitor_vperplexity = MonitorSeries("Validation perplexity", monitor, interval=(
        len(val_data)//(num_steps*batch_size)))
    monitor_tperplexity = MonitorSeries(
        "Test perplexity", monitor, interval=(len(test_data)//(num_steps*1)))

    l1 = LSTMWrapper(batch_size, state_size)
    l2 = LSTMWrapper(batch_size, state_size)

    # train graph
    x = nn.Variable((batch_size, num_steps))
    t = nn.Variable((batch_size, num_steps))
    w = I.UniformInitializer((-0.1, 0.1))
    b = I.ConstantInitializer(1)
    loss = get_loss(l1, l2, x, t, w, b, num_words,
                    batch_size, state_size, True)
    l1.share_data()
    l2.share_data()

    # validation graph
    vx = nn.Variable((batch_size, num_steps))
    vt = nn.Variable((batch_size, num_steps))
    vloss = get_loss(l1, l2, vx, vt, w, b, num_words, batch_size, state_size)

    solver = S.Sgd(lr)
    solver.set_parameters(nn.get_parameters())

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    best_val = 10000
    for epoch in range(max_epoch):
        l1.reset_state()
        l2.reset_state()
        for i in range(len(train_data)//(num_steps*batch_size)):
            x.d, t.d = get_batch(train_data, i*num_steps,
                                 batch_size, num_steps)
            solver.zero_grad()
            loss.forward()
            loss.backward(clear_buffer=True)
            solver.weight_decay(1e-5)
            gradient_clipping(nn.get_parameters().values(), max_norm)
            solver.update()
            perp = perplexity(loss.d.copy())
            monitor_perplexity.add(
                (len(train_data)//(num_steps*batch_size))*(epoch)+i, perp)

        l1.reset_state()
        l2.reset_state()
        vloss_avg = 0
        for i in range(len(val_data)//(num_steps * batch_size)):
            vx.d, vt.d = get_batch(val_data, i*num_steps,
                                   batch_size, num_steps)
            vloss.forward()
            vloss_avg += vloss.d.copy()
        vloss_avg /= float((len(val_data)//(num_steps*batch_size)))
        vper = perplexity(vloss_avg)

        if vper < best_val:
            best_val = vper
            if vper < 200:
                save_name = "params_epoch_{:02d}.h5".format(epoch)
                nn.save_parameters(os.path.join(args.save_dir, save_name))
        else:
            solver.set_learning_rate(solver.learning_rate()*0.25)
            logger.info("Decreased learning rate to {:05f}".format(
                solver.learning_rate()))
        monitor_vperplexity.add(
            (len(val_data)//(num_steps*batch_size))*(epoch)+i, vper)

    # for final test split
    t_batch_size = 1
    tl1 = LSTMWrapper(t_batch_size, state_size)
    tl2 = LSTMWrapper(t_batch_size, state_size)
    tloss_avg = 0
    tx = nn.Variable((t_batch_size, num_steps))
    tt = nn.Variable((t_batch_size, num_steps))
    tloss = get_loss(tl1, tl2, tx, tt, w, b, num_words, 1, state_size)

    tl1.share_data()
    tl2.share_data()

    for i in range(len(test_data)//(num_steps * t_batch_size)):
        tx.d, tt.d = get_batch(test_data, i*num_steps, 1, num_steps)
        tloss.forward()
        tloss_avg += tloss.d.copy()
    tloss_avg /= float((len(test_data)//(num_steps*t_batch_size)))
    tper = perplexity(tloss_avg)
    monitor_tperplexity.add(
        (len(test_data)//(num_steps*t_batch_size))*(epoch)+i, tper)

def pytorch_conv_init(inmaps, kernel):
    scale = 1 / np.sqrt(inmaps * np.prod(kernel))
    return I.UniformInitializer(lim=(-scale, scale))

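# Hypothetical usage sketch (not part of the original source): using
# `pytorch_conv_init` above, whose uniform bound 1/sqrt(inmaps * prod(kernel))
# mirrors PyTorch's default Conv2d initialization range.
def _example_pytorch_conv_init_usage():
    x = nn.Variable((8, 3, 64, 64))
    init = pytorch_conv_init(inmaps=x.shape[1], kernel=(3, 3))
    return PF.convolution(x, 32, kernel=(3, 3), pad=(1, 1),
                          w_init=init, b_init=init)
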
def cond_att_lstm(x, parent_index, mask, context, context_mask, state_size,
                  att_hidden_size, initial_state=None, initial_cell=None,
                  hist=None, dropout=0, train=True, w_init=None,
                  inner_w_init=None, b_init=I.ConstantInitializer(0),
                  forget_bias_init=I.ConstantInitializer(1)):
    """
    x: (batch_size, length, input_size)
    parent_index: (batch_size, length)
    mask: (batch_size, length)
    context: (batch_size, context_length, context_size)
    context_mask: (batch_size, context_length)
    hist: (batch_size, l, state_size)
    """
    batch_size, length, input_size = x.shape
    _, context_length, context_size = context.shape

    if w_init is None:
        w_init = I.UniformInitializer(
            I.calc_uniform_lim_glorot(input_size, state_size))
    if inner_w_init is None:
        inner_w_init = orthogonal

    retain_prob = 1.0 - dropout
    z_w = nn.Variable((batch_size, 4, input_size), need_grad=False)
    z_w.d = 1
    z_u = nn.Variable((batch_size, 4, state_size), need_grad=False)
    z_u.d = 1

    if dropout > 0:
        if train:
            z_w = F.dropout(z_w, p=retain_prob)
            z_u = F.dropout(z_u, p=retain_prob)
        z_w *= retain_prob
        z_u *= retain_prob
    z_w = F.reshape(z_w, (batch_size, 4, 1, input_size))
    z_w = F.broadcast(z_w, (batch_size, 4, length, input_size))
    z_w = F.split(z_w, axis=1)
    z_u = F.split(z_u, axis=1)
    xi = z_w[0] * x
    xf = z_w[1] * x
    xc = z_w[2] * x
    xo = z_w[3] * x

    with nn.parameter_scope("cond_att_lstm"):
        # (batch_size, length, state_size)
        with nn.parameter_scope("lstm"):
            xi = PF.affine(
                xi, state_size, base_axis=2, w_init=w_init, b_init=b_init,
                name="Wi")
            xf = PF.affine(
                xf, state_size, base_axis=2, w_init=w_init,
                b_init=forget_bias_init, name="Wf")
            xc = PF.affine(
                xc, state_size, base_axis=2, w_init=w_init, b_init=b_init,
                name="Wc")
            xo = PF.affine(
                xo, state_size, base_axis=2, w_init=w_init, b_init=b_init,
                name="Wo")

        with nn.parameter_scope("context"):
            # context_att_trans: (batch_size, context_size, att_hidden_size)
            context_att_trans = PF.affine(
                context, att_hidden_size, base_axis=2, w_init=w_init,
                b_init=b_init, name="layer1_c")

    if initial_state is None:
        h = nn.Variable((batch_size, state_size), need_grad=False)
        h.data.zero()
    else:
        h = initial_state

    if initial_cell is None:
        c = nn.Variable((batch_size, state_size), need_grad=False)
        c.data.zero()
    else:
        c = initial_cell

    if hist is None:
        hist = nn.Variable((batch_size, 1, state_size), need_grad=False)
        hist.data.zero()

    # (batch_size, state_size)
    xi = split(xi, axis=1)
    xf = split(xf, axis=1)
    xc = split(xc, axis=1)
    xo = split(xo, axis=1)

    mask = F.reshape(mask, [batch_size, length, 1])  # (batch_size, length, 1)
    mask = F.broadcast(mask, [batch_size, length, state_size])
    # (batch_size, state_size)
    mask = split(mask, axis=1)

    # (batch_size, max_action_length)
    parent_index = parent_index + 1  # index == 0 means that parent is root
    # (batch_size)
    parent_index = split(parent_index, axis=1)

    hs = []
    cs = []
    ctx = []

    for i, f, c2, o, m, p in zip(xi, xf, xc, xo, mask, parent_index):
        h_num = hist.shape[1]

        with nn.parameter_scope("context"):
            h_att_trans = PF.affine(
                h, att_hidden_size, with_bias=False, w_init=w_init,
                name="layer1_h")  # (batch_size, att_hidden_size)
            h_att_trans = F.reshape(h_att_trans,
                                    (batch_size, 1, att_hidden_size))
            h_att_trans = F.broadcast(
                h_att_trans, (batch_size, context_length, att_hidden_size))
            att_hidden = F.tanh(context_att_trans + h_att_trans)
            att_raw = PF.affine(
                att_hidden, 1, base_axis=2, w_init=w_init, b_init=b_init)
            # (batch_size, context_length, 1)
            att_raw = F.reshape(att_raw, (batch_size, context_length))

        ctx_att = F.exp(att_raw - F.max(att_raw, axis=1, keepdims=True))
        ctx_att = ctx_att * context_mask
        ctx_att = ctx_att / F.sum(ctx_att, axis=1, keepdims=True)
        ctx_att = F.reshape(ctx_att, (batch_size, context_length, 1))
        ctx_att = F.broadcast(ctx_att,
                              (batch_size, context_length, context_size))
        ctx_vec = F.sum(context * ctx_att, axis=1)  # (batch_size, context_size)

        # parent_history
        p = F.reshape(p, (batch_size, 1))
        p = F.one_hot(p, (h_num, ))
        p = F.reshape(p, (batch_size, 1, h_num))
        par_h = F.batch_matmul(p, hist)  # [batch_size, 1, state_size]
        par_h = F.reshape(par_h, (batch_size, state_size))

        with nn.parameter_scope("lstm"):
            i_t = PF.affine(
                z_u[0] * h, state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False, name="Ui")
            i_t += PF.affine(
                ctx_vec, state_size,
                w_init=inner_w_init(context_size, state_size),
                with_bias=False, name="Ci")
            i_t += PF.affine(
                par_h, state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False, name="Pi")
            i_t = F.sigmoid(i + i_t)

            f_t = PF.affine(
                z_u[1] * h, state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False, name="Uf")
            f_t += PF.affine(
                ctx_vec, state_size,
                w_init=inner_w_init(context_size, state_size),
                with_bias=False, name="Cf")
            f_t += PF.affine(
                par_h, state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False, name="Pf")
            f_t = F.sigmoid(f + f_t)

            c_t = PF.affine(
                z_u[2] * h, state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False, name="Uc")
            c_t += PF.affine(
                ctx_vec, state_size,
                w_init=inner_w_init(context_size, state_size),
                with_bias=False, name="Cc")
            c_t += PF.affine(
                par_h, state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False, name="Pc")
            c_t = f_t * c + i_t * F.tanh(c2 + c_t)

            o_t = PF.affine(
                z_u[3] * h, state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False, name="Uo")
            o_t += PF.affine(
                ctx_vec, state_size,
                w_init=inner_w_init(context_size, state_size),
                with_bias=False, name="Co")
            o_t += PF.affine(
                par_h, state_size,
                w_init=inner_w_init(state_size, state_size),
                with_bias=False, name="Po")
            o_t = F.sigmoid(o + o_t)

        h_t = o_t * F.tanh(c_t)
        h_t = (1 - m) * h + m * h_t
        c_t = (1 - m) * c + m * c_t
        h = h_t
        c = c_t
        h_t = F.reshape(h_t, (batch_size, 1, state_size), inplace=False)
        c_t = F.reshape(c_t, (batch_size, 1, state_size), inplace=False)
        ctx_vec = F.reshape(
            ctx_vec, (batch_size, 1, context_size), inplace=False)
        hs.append(h_t)
        cs.append(c_t)
        ctx.append(ctx_vec)

        hist = F.concatenate(
            hist, h_t, axis=1)  # (batch_size, h_num + 1, state_size)

    return concatenate(*hs, axis=1), concatenate(*cs, axis=1), \
        concatenate(*ctx, axis=1), hist

def sample_from_controller(args):
    """
    2-layer RNN (LSTM) based controller which outputs a CNN architecture,
    represented as a sequence of integers (returned as a list).
    Given the number of layers, for each layer it executes 2 types of computation:
    one for sampling the operation at that layer,
    another for sampling the skip connection patterns.
    """
    entropys = nn.Variable([1, 1], need_grad=True)
    log_probs = nn.Variable([1, 1], need_grad=True)

    entropys.d = log_probs.d = 0.0  # initialize them all

    num_cells = args.num_cells
    num_nodes = args.num_nodes
    lstm_size = args.lstm_size
    state_size = args.state_size
    lstm_num_layers = args.lstm_layers
    temperature = args.temperature
    tanh_constant = args.tanh_constant
    op_tanh_reduce = args.op_tanh_reduce
    num_branch = args.num_ops

    both_archs = [list(), list()]
    initializer = I.UniformInitializer((-0.1, 0.1))

    prev_h = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]
    prev_c = [nn.Variable([1, lstm_size], need_grad=True)
              for _ in range(lstm_num_layers)]

    for i in range(len(prev_h)):
        prev_h[i].d = 0  # initialize.
        prev_c[i].d = 0

    inputs = nn.Variable([1, lstm_size])
    inputs.d = np.random.normal(0, 0.5, [1, lstm_size])

    g_emb = nn.Variable([1, lstm_size])
    g_emb.d = np.random.normal(0, 0.5, [1, lstm_size])

    for ind in range(2):
        # first create conv cell and then reduc cell.
        idx_seq = list()
        ops_seq = list()

        for node_id in range(num_nodes):
            if node_id == 0:
                anchors = nn.parameter.get_parameter_or_create(
                    "anchors", [2, lstm_size], initializer, need_grad=False)
                anchors_w_1 = nn.parameter.get_parameter_or_create(
                    "anchors_w_1", [2, lstm_size], initializer, need_grad=False)
            else:
                assert anchors.shape[0] == node_id + \
                    2, "Something wrong with anchors."
                assert anchors_w_1.shape[0] == node_id + \
                    2, "Something wrong with anchors_w_1."

            # for each node, get the index used as inputs
            for i in range(2):
                # One-step stacked LSTM.
                with nn.parameter_scope("controller_lstm"):
                    next_h, next_c = stack_lstm(inputs, prev_h, prev_c, state_size)
                prev_h, prev_c = next_h, next_c  # shape:(1, lstm_size)

                query = anchors_w_1

                with nn.parameter_scope("skip_affine_1"):
                    query = F.tanh(
                        F.add2(query,
                               PF.affine(next_h[-1], lstm_size,
                                         w_init=initializer, with_bias=False)))
                    # (node_id + 2, lstm_size) + (1, lstm_size)
                    # broadcast occurs here; resulting shape is (node_id + 2, lstm_size)

                with nn.parameter_scope("skip_affine_2"):
                    # (node_id + 2, 1)
                    logit = PF.affine(query, 1,
                                      w_init=initializer, with_bias=False)

                if temperature is not None:
                    logit = F.mul_scalar(logit, (1 / temperature))

                if tanh_constant is not None:
                    logit = F.mul_scalar(F.tanh(logit), tanh_constant)

                index = F.exp(logit)
                index = F.mul_scalar(index, (1 / index.d.sum()))

                # Sampling input indices from multinomial distribution.
                index = np.random.multinomial(
                    1, np.reshape(index.d, (1, index.d.size))[0], 1)
                idx_seq.append(index.nonzero()[1])

                label = nn.Variable.from_numpy_array(
                    index.transpose())  # (node_id + 2, 1)
                log_prob = F.softmax_cross_entropy(logit, label)
                log_probs = F.add2(log_probs, F.sum(log_prob, keepdims=True))

                curr_ent = F.softmax_cross_entropy(logit, F.softmax(logit))
                entropy = F.sum(curr_ent, keepdims=True)
                entropys = F.add2(entropys, entropy)

                taking_ind = int(index.nonzero()[1][0])
                # (1, lstm_size)
                inputs = F.reshape(anchors[taking_ind], (1, anchors.shape[1]))

            # ops
            for j in range(2):
                with nn.parameter_scope("controller_lstm"):
                    next_h, next_c = stack_lstm(inputs, prev_h, prev_c, state_size)
                prev_h, prev_c = next_h, next_c  # shape:(1, lstm_size)

                # Compute for operation.
                with nn.parameter_scope("ops"):
                    logit = PF.affine(next_h[-1], num_branch,
                                      w_init=initializer, with_bias=False)
                    # shape of logit : (1, num_branch)

                if temperature is not None:
                    logit = F.mul_scalar(logit, (1 / temperature))

                if tanh_constant is not None:
                    op_tanh = tanh_constant / op_tanh_reduce
                    logit = F.mul_scalar(F.tanh(logit), op_tanh)

                # normalizing logits.
                normed_logit = np.e**logit.d
                normed_logit = normed_logit / np.sum(normed_logit)

                # Sampling operation id from multinomial distribution.
                branch_id = np.random.multinomial(
                    1, normed_logit[0], 1).nonzero()[1]
                branch_id = nn.Variable.from_numpy_array(branch_id)
                ops_seq.append(branch_id.d)

                # log policy for operation.
                log_prob = F.softmax_cross_entropy(
                    logit, F.reshape(branch_id, shape=(1, 1)))
                # accumulate log policy as log probs
                log_probs = F.add2(log_probs, log_prob)

                logit = F.transpose(logit, axes=(1, 0))
                curr_ent = F.softmax_cross_entropy(logit, F.softmax(logit))
                entropy = F.sum(curr_ent, keepdims=True)
                entropys = F.add2(entropys, entropy)

                w_emb = nn.parameter.get_parameter_or_create(
                    "w_emb", [num_branch, lstm_size], initializer, need_grad=False)
                # (1, lstm_size)
                inputs = F.reshape(w_emb[int(branch_id.d)], (1, w_emb.shape[1]))

            with nn.parameter_scope("controller_lstm"):
                next_h, next_c = stack_lstm(inputs, prev_h, prev_c, lstm_size)
            prev_h, prev_c = next_h, next_c

            with nn.parameter_scope("skip_affine_3"):
                adding_w_1 = PF.affine(next_h[-1], lstm_size,
                                       w_init=initializer, with_bias=False)

            # (node_id + 2 + 1, lstm_size)
            anchors = F.concatenate(anchors, next_h[-1], axis=0)
            # (node_id + 2 + 1, lstm_size)
            anchors_w_1 = F.concatenate(anchors_w_1, adding_w_1, axis=0)

        for idx, ops in zip(idx_seq, ops_seq):
            both_archs[ind].extend([int(idx), int(ops)])

    return both_archs, log_probs, entropys