def test_sparse_ftrl():
    """ test sparse ftrl """
    indices = Tensor(np.array([0, 1]).astype(np.int32))
    label = Tensor(np.zeros([2, 1, 2]).astype(np.float32))
    net = NetWithSparseGatherV2()
    net.set_train()
    optimizer = FTRL(net.trainable_params(), weight_decay=0.9, loss_scale=2.0)
    optimizer.target = 'Ascend'
    train_network = TrainOneStepCell(net, optimizer)
    _executor.compile(train_network, indices, label)
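# The test above depends on NetWithSparseGatherV2, which is defined elsewhere in the
# test file. A minimal sketch of such a cell, assuming it gathers rows of a learnable
# weight by index so that FTRL receives sparse gradients; the shapes and the extra
# `label` argument are assumptions chosen to match the tensors used in the tests, not
# the original definition. Assumed imports: numpy as np, mindspore.nn as nn,
# mindspore.ops.operations as P, and Parameter/Tensor from mindspore.
class NetWithSparseGatherV2(nn.Cell):
    """Sketch: index into weight1 with SparseGatherV2 and add weight2."""
    def __init__(self):
        super(NetWithSparseGatherV2, self).__init__()
        self.weight1 = Parameter(Tensor(np.ones([3, 1, 2]).astype(np.float32)), name="weight1")
        self.weight2 = Parameter(Tensor(np.ones([2, 1, 2]).astype(np.float32)), name="weight2")
        self.axis = 0
        self.gather = P.SparseGatherV2()

    def construct(self, indices, label):
        # label is accepted only to match TrainOneStepCell's (inputs, label) call convention
        return self.gather(self.weight1, indices, self.axis) + self.weight2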
def __init__(self, network, sens=1024.0, host_device_mix=False,
             parameter_server=False, sparse=False):
    super(TrainStepWrap, self).__init__()
    parallel_mode = context.get_auto_parallel_context("parallel_mode")
    is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                         ParallelMode.AUTO_PARALLEL)
    self.network = network
    self.network.set_train()
    self.trainable_params = network.trainable_params()
    weights_w = []
    weights_d = []
    for params in self.trainable_params:
        if 'wide' in params.name:
            weights_w.append(params)
        else:
            weights_d.append(params)
    self.weights_w = ParameterTuple(weights_w)
    self.weights_d = ParameterTuple(weights_d)

    if (sparse and is_auto_parallel) or parameter_server:
        self.optimizer_d = LazyAdam(self.weights_d, learning_rate=3.5e-4,
                                    eps=1e-8, loss_scale=sens)
        self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
        if host_device_mix or parameter_server:
            self.optimizer_w.target = "CPU"
            self.optimizer_d.target = "CPU"
    else:
        self.optimizer_d = Adam(self.weights_d, learning_rate=3.5e-4,
                                eps=1e-8, loss_scale=sens)
        self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)

    self.hyper_map = C.HyperMap()
    self.grad_w = C.GradOperation(get_by_list=True, sens_param=True)
    self.grad_d = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.loss_net_w = IthOutputCell(network, output_index=0)
    self.loss_net_d = IthOutputCell(network, output_index=1)
    self.loss_net_w.set_grad()
    self.loss_net_d.set_grad()

    self.grad_reducer_w = None
    self.grad_reducer_d = None
    self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL,
                                          ParallelMode.HYBRID_PARALLEL)
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = context.get_auto_parallel_context("device_num")
        self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
        self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
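# The __init__ above only wires up the two optimizer paths; the matching construct is not
# shown in this excerpt. A hedged sketch of what it would look like, not the model's
# verbatim code, assuming `from mindspore.ops import operations as P, functional as F`:
# run the network once, build a constant sensitivity for each loss head, differentiate
# each head against its own parameter group, all-reduce the gradients when a reducer is
# present, then apply the matching optimizer.
def construct(self, *inputs):
    loss_w, loss_d = self.network(*inputs)
    sens_w = P.Fill()(P.DType()(loss_w), P.Shape()(loss_w), self.sens)
    sens_d = P.Fill()(P.DType()(loss_d), P.Shape()(loss_d), self.sens)
    grads_w = self.grad_w(self.loss_net_w, self.weights_w)(*inputs, sens_w)
    grads_d = self.grad_d(self.loss_net_d, self.weights_d)(*inputs, sens_d)
    if self.reducer_flag:
        grads_w = self.grad_reducer_w(grads_w)
        grads_d = self.grad_reducer_d(grads_d)
    return (F.depend(loss_w, self.optimizer_w(grads_w)),
            F.depend(loss_d, self.optimizer_d(grads_d)))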
def __init__(self, network, sens=1000.0):
    super(TrainStepWarp, self).__init__()
    self.network = network
    self.network.set_train()
    self.trainable_params = network.trainable_params()
    weights_w = []
    weights_d = []
    for params in self.trainable_params:
        weights_w.append(params)
        weights_d.append(params)
    self.weights_w = ParameterTuple(weights_w)
    self.weights_d = ParameterTuple(weights_d)
    self.optimizer_w = FTRL(learning_rate=1e-2, params=self.weights_w,
                            l1=1e-8, l2=1e-8, initial_accum=1.0)
    self.optimizer_d = Adam(self.weights_d, learning_rate=3.5e-4,
                            eps=1e-8, loss_scale=sens)
    self.hyper_map = C.HyperMap()
    self.grad_w = C.GradOperation(get_by_list=True, sens_param=True)
    self.grad_d = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.loss_net_w = IthOutputCell(network, output_index=0)
    self.loss_net_d = IthOutputCell(network, output_index=1)
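# Each of these train-step wrappers hands IthOutputCell to GradOperation so that every
# optimizer differentiates only one head of the network's tuple output. IthOutputCell is
# defined elsewhere in the model code; a minimal sketch, assuming it simply indexes the
# output (import of mindspore.nn as nn assumed):
class IthOutputCell(nn.Cell):
    """Sketch: forward the wrapped network and return only output_index."""
    def __init__(self, network, output_index):
        super(IthOutputCell, self).__init__()
        self.network = network
        self.output_index = output_index

    def construct(self, *inputs):
        return self.network(*inputs)[self.output_index]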
def test_ftrl_net():
    indices = Tensor(np.array([0, 0, 1]).astype(np.int32))
    label = Tensor(np.zeros([2, 1, 2]).astype(np.float32))
    net = NetWithSparseGatherV2()
    optimizer = FTRL(net.trainable_params(), learning_rate=0.1,
                     weight_decay=0.9, loss_scale=2.0)
    optimizer.target = 'Ascend'
    train_network = TrainOneStepCell(net, optimizer)
    output = train_network(indices, label)
    assert np.allclose(output.asnumpy(),
                       np.array([[[2, 2]], [[2, 2]], [[2, 2]]]))
    assert np.allclose(net.weight1.asnumpy(),
                       np.array([[[0.7884067, 0.7884067]],
                                 [[0.68213105, 0.68213105]],
                                 [[1.0, 1.0]]]))
    assert np.allclose(net.weight2.asnumpy(),
                       np.array([[[0.6821311, 0.6821311]],
                                 [[0.6821311, 0.6821311]],
                                 [[0.6821311, 0.6821311]]]))
def test_ftrl():
    epoch = 3
    net = NetFtrl()
    optimizer = FTRL(filter(lambda x: x.requires_grad, net.get_parameters()),
                     learning_rate=0.01)
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    net_with_criterion = WithLossCell(net, criterion)
    train_network = TrainOneStepCell(net_with_criterion, optimizer)
    train_network.set_train()

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    losses1 = []
    for _ in range(epoch):
        data = Tensor(np.arange(0, 16).reshape(1, 1, 4, 4).astype(np.float32) * 0.01)
        label = Tensor(np.array([0]).astype(np.int32))
        loss = train_network(data, label)
        losses1.append(loss.asnumpy())
    assert losses1[0] > losses1[1]
    assert losses1[1] > losses1[2]

    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
    losses2 = []
    for _ in range(epoch):
        data = Tensor(np.arange(0, 16).reshape(1, 1, 4, 4).astype(np.float32) * 0.01)
        label = Tensor(np.array([0]).astype(np.int32))
        loss = train_network(data, label)
        losses2.append(loss.asnumpy())
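# For reference, the update these tests exercise: one dense FTRL-Proximal step written in
# NumPy. This follows the standard formulation found in common framework kernels; it is an
# illustrative sketch only, and MindSpore's ApplyFtrl kernel may differ in details such as
# how loss_scale and weight_decay are folded in.
import numpy as np

def ftrl_step(w, accum, linear, grad, lr=0.01, l1=0.0, l2=0.0, lr_power=-0.5):
    accum_new = accum + grad * grad                          # accumulate squared gradients
    sigma = (accum_new ** -lr_power - accum ** -lr_power) / lr
    linear = linear + grad - sigma * w                       # update the linear term
    quadratic = accum_new ** -lr_power / lr + 2.0 * l2
    w = np.where(np.abs(linear) > l1,                        # soft-threshold by l1
                 (np.sign(linear) * l1 - linear) / quadratic,
                 np.zeros_like(w))
    return w, accum_new, linear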
def test_ftrl_target():
    """ test_ftrl_target """
    net = NetWithSparseGatherV2()
    net.set_train()
    optimizer = FTRL(net.trainable_params(), weight_decay=0.9, loss_scale=2.0)
    if optimizer.target not in ('CPU', 'Ascend'):
        raise ValueError("The value must be 'CPU' or 'Ascend', but got value {}".format(optimizer.target))
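# The target property decides where the sparse FTRL update executes; the test above only
# checks that it holds one of the two accepted values. A hedged usage sketch, reusing the
# optimizer built above: route the update to host, e.g. when the embedding table lives on CPU.
optimizer.target = 'CPU'      # run the sparse update on host instead of the Ascend device
assert optimizer.target in ('CPU', 'Ascend')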
def __init__(self, network, config, sens=1000.0):
    super(TrainStepWrap, self).__init__()
    self.network = network
    self.network.set_train()
    self.trainable_params = network.trainable_params()
    weights_w = []
    weights_d = []
    for params in self.trainable_params:
        if 'wide' in params.name:
            weights_w.append(params)
        else:
            weights_d.append(params)
    self.weights_w = ParameterTuple(weights_w)
    self.weights_d = ParameterTuple(weights_d)
    self.optimizer_w = FTRL(learning_rate=config.ftrl_lr, params=self.weights_w,
                            l1=5e-4, l2=5e-4, initial_accum=0.1, loss_scale=sens)
    # self.optimizer_d = ProximalAdagrad(self.weights_d, learning_rate=config.adam_lr, loss_scale=sens)
    self.optimizer_d = Adam(self.weights_d, learning_rate=config.adam_lr,
                            eps=1e-6, loss_scale=sens)
    self.hyper_map = C.HyperMap()
    self.grad_w = C.GradOperation('grad_w', get_by_list=True, sens_param=True)
    self.grad_d = C.GradOperation('grad_d', get_by_list=True, sens_param=True)
    self.sens = sens
    self.loss_net_w = IthOutputCell(network, output_index=0)
    self.loss_net_d = IthOutputCell(network, output_index=1)
    self.reducer_flag = False
    self.grad_reducer_w = None
    self.grad_reducer_d = None
    parallel_mode = _get_parallel_mode()
    if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
        self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
def test_ftrl(): """ test_ftrl """ inputs = Tensor(np.ones([1, 64]).astype(np.float32)) label = Tensor(np.zeros([1, 10]).astype(np.float32)) net = Net() net.set_train() loss = nn.SoftmaxCrossEntropyWithLogits() optimizer = FTRL(net.trainable_params(), weight_decay=0.9, loss_scale=2.0) net_with_loss = WithLossCell(net, loss) train_network = TrainOneStepCell(net_with_loss, optimizer) _executor.compile(train_network, inputs, label)