def _test_max(test_case, placement, sbp, np_out, np_out_grad, input_arr, shape,
              dim, keepdims):
    # Build a global tensor and compare the OneFlow result with the expected
    # numpy output (note: the passed `placement`/`sbp` are not used here; the
    # tensor is broadcast on CPU).
    global_x = flow.tensor(
        input_arr,
        dtype=flow.float32,
        requires_grad=True,
        placement=flow.env.all_device_placement("cpu"),
        sbp=flow.sbp.broadcast,
    )
    if dim is None:
        of_out = flow.max(global_x)
    else:
        of_out = flow.max(global_x, dim, keepdims)[0]
    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
    of_out = of_out.sum()
    of_out.backward()
    test_case.assertTrue(
        np.allclose(global_x.grad.numpy(), np_out_grad, 0.0001, 0.0001))
def forward(self, src):
    src_key_padding_mask = self.make_len_mask(src).to(src.device)
    src_mask = None
    src = self.src_embedding(src)
    src = self.pos(src)
    out = self.transformer_encoder(src, src_mask, src_key_padding_mask)
    # flow.max with a dim argument returns (values, indices); keep the values
    # to max-pool over the sequence dimension.
    out, _ = flow.max(out, dim=1)
    out = self.linear(out)
    return out
def fill_tensor_based_index(tensor, index, value, blank=BLK):
    assert tensor.dim() == 2
    assert value.dim() == 1
    assert value.size(0) == tensor.size(0)
    assert index.size(0) == value.size(0)
    assert tensor.size(1) >= int(flow.max(index))
    for b in range(index.size(0)):
        pos = int(index[b])
        if int(value[b]) == blank:
            continue
        else:
            tensor[b, pos] = value[b]
    return tensor
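# The helper above writes value[b] into row b at column index[b], skipping blank
# tokens. A minimal usage sketch, assuming BLK = 0 and plain OneFlow tensors; the
# names `scores`, `positions`, and `tokens` are made up for illustration.
import oneflow as flow

BLK = 0  # assumed blank id

scores = flow.zeros(3, 5, dtype=flow.int64)   # (batch, length) target buffer
positions = flow.tensor([1, 4, 2])            # one write position per row
tokens = flow.tensor([7, BLK, 9])             # row 1 holds the blank and is skipped

out = fill_tensor_based_index(scores, positions, tokens, blank=BLK)
# row 0 gets 7 at column 1, row 2 gets 9 at column 2, row 1 stays all zeros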
def compute_loss(self, est, egs):
    # assumes `from itertools import permutations` at module level
    # spks x n x S
    ests = est
    # spks x n x S
    refs = egs["ref"]
    num_spks = len(refs)

    def sisnr_loss(permute):
        # for one permute
        return sum(
            [self.sisnr(ests[s], refs[t]) for s, t in enumerate(permute)]
        ) / len(permute)

    # P x N
    N = egs["mix"].size(0)
    sisnr_mat = flow.stack(
        [sisnr_loss(p) for p in permutations(range(num_spks))])
    max_perutt, _ = flow.max(sisnr_mat, dim=0)
    # si-snr
    return -flow.sum(max_perutt) / N
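# The permutation-invariant step above stacks one si-snr score per speaker
# permutation and lets flow.max(..., dim=0) pick the best assignment per
# utterance. A stripped-down sketch of that selection with a fabricated
# 2-speaker score matrix (the numbers are illustrative only).
import oneflow as flow
from itertools import permutations

# sisnr_mat[p, n]: si-snr of permutation p for utterance n
sisnr_mat = flow.tensor([[12.3, 4.1],
                         [5.0, 11.7]])

# best permutation per utterance; flow.max over dim 0 returns (values, indices)
max_perutt, best_perm = flow.max(sisnr_mat, dim=0)
loss = -flow.sum(max_perutt) / sisnr_mat.size(1)  # negate: we maximize si-snr

print(list(permutations(range(2))))  # [(0, 1), (1, 0)] -- the rows of sisnr_mat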
def _test_max(test_case, device, shape, dim, keepdims):
    input_arr = np.random.randn(*shape)
    np_out = np.amax(input_arr, axis=dim, keepdims=keepdims)
    x = flow.tensor(input_arr,
                    dtype=flow.float32,
                    device=flow.device(device),
                    requires_grad=True)
    of_out = flow.max(x, dim, keepdims)
    if dim is not None:
        of_out = of_out[0]
    test_case.assertTrue(np.allclose(of_out.numpy(), np_out, 1e-05, 1e-05))
    of_out = of_out.sum()
    of_out.backward()
    np_out_grad = np.zeros_like(input_arr)
    if dim is None:
        arg_max = np.argmax(input_arr)
        np.put(np_out_grad, arg_max, 1)
    else:
        arg_max = np.expand_dims(np.argmax(input_arr, axis=dim), axis=dim)
        np.put_along_axis(np_out_grad, arg_max, 1, axis=dim)
    test_case.assertTrue(
        np.allclose(x.grad.numpy(), np_out_grad, 0.0001, 0.0001))
def _max(self, *args, **kwargs):
    return flow.max(self, *args, **kwargs)
def _max(self, dim=None, keepdim=False):
    return flow.max(self, dim, keepdim)
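# The two registrations above expose flow.max as a Tensor method. As used in the
# surrounding snippets, a full reduction yields a single tensor, while passing
# `dim` yields a (values, indices) pair. A quick sketch of both call forms:
import oneflow as flow

x = flow.tensor([[1.0, 5.0], [3.0, 2.0]])

whole = flow.max(x)                    # 0-dim tensor holding 5.0
values, indices = flow.max(x, dim=1)   # per-row max and its column index
print(values.numpy(), indices.numpy()) # [5. 3.] [1 0]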
def clip_grad_norm_(
    parameters: _tensor_or_tensors,
    max_norm: float,
    norm_type: float = 2.0,
    error_if_nonfinite: bool = False,
) -> Tensor:
    r"""Clips gradient norm of an iterable of parameters.

    The norm is computed over all gradients together, as if they were
    concatenated into a single vector. Gradients are modified in place.

    Args:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.
        error_if_nonfinite (bool): if True, an error is thrown if the total norm
            of the gradients from :attr:`parameters` is ``nan``, ``inf``, or
            ``-inf``. Default: False (will switch to True in the future)

    Returns:
        Total norm of the parameters (viewed as a single vector), after clipping
        the gradients in place.

    For example:

    .. code-block:: python

        >>> import oneflow as flow
        >>> import numpy as np
        >>> x1 = flow.tensor(np.array([[2, 3, 4], [1.5, 2.6, 3.7]]).astype(np.float32), requires_grad=True)
        >>> m1 = flow.nn.ReLU()
        >>> out1 = m1(x1)
        >>> out1 = out1.sum()
        >>> out1.backward()
        >>> norm1 = flow.nn.utils.clip_grad_norm_(x1, 0.6, 1.0)
        >>> norm1
        tensor(6., dtype=oneflow.float32)
        >>> x1.grad
        tensor([[0.1000, 0.1000, 0.1000],
                [0.1000, 0.1000, 0.1000]], dtype=oneflow.float32)
        >>> x2 = flow.tensor(np.array([[-2, -3, -4], [2.5, 0, 3.2]]).astype(np.float32), requires_grad=True)
        >>> out2 = flow.atan(x2)
        >>> out2 = out2.sum()
        >>> out2.backward()
        >>> norm2 = flow.nn.utils.clip_grad_norm_(x2, 0.5)
        >>> norm2
        tensor(1.0394, dtype=oneflow.float32)
        >>> x2.grad
        tensor([[0.0962, 0.0481, 0.0283],
                [0.0663, 0.4810, 0.0428]], dtype=oneflow.float32)

    """
    if isinstance(parameters, (Tensor, flow._oneflow_internal.Tensor)):
        parameters = [parameters]
    parameters = [p for p in parameters if p.grad is not None]
    max_norm = float(max_norm)
    norm_type = float(norm_type)
    if len(parameters) == 0:
        return flow.tensor(0.0)
    if parameters[0].is_global:
        assert all([p.is_global for p in parameters
                    ]), "All parameters must be consistent (global) tensors."
        sbp_broadcast = [flow.sbp.broadcast for _ in parameters[0].sbp]
        param0_placement = parameters[0].placement
        if norm_type == float("inf"):
            norms = [
                p.grad.detach().to_global(
                    sbp=sbp_broadcast).abs().max().to_global(
                        placement=param0_placement) for p in parameters
            ]
            total_norm = norms[0] if len(norms) == 1 else flow.max(
                flow.stack(norms))
        elif norm_type == float("-inf"):
            norms = [
                p.grad.detach().to_global(
                    sbp=sbp_broadcast).abs().min().to_global(
                        placement=param0_placement) for p in parameters
            ]
            total_norm = norms[0] if len(norms) == 1 else flow.min(
                flow.stack(norms))
        else:
            total_norm = flow.linalg.vector_norm(
                flow.stack([
                    flow.linalg.vector_norm(
                        p.grad.detach().to_global(sbp=sbp_broadcast),
                        norm_type).to_global(placement=param0_placement)
                    for p in parameters
                ]),
                norm_type,
            )
        if error_if_nonfinite and flow.logical_or(total_norm.isnan(),
                                                  total_norm.isinf()):
            raise RuntimeError(
                f"The total norm of order {norm_type} for gradients from "
                "`parameters` is non-finite, so it cannot be clipped. To disable "
                "this error and scale the gradients by the non-finite norm anyway, "
                "set `error_if_nonfinite=False`")
        clip_coef = max_norm / (total_norm + 1e-6)
        clip_coef_clamped = clip_coef.clamp(max=1.0)
        for p in parameters:
            p.grad.detach().mul_(
                clip_coef_clamped.to_global(placement=p.placement))
    else:
        device = parameters[0].grad.device
        if norm_type == float("inf"):
            norms = [
                p.grad.detach().abs().max().to(device) for p in parameters
            ]
            total_norm = norms[0] if len(norms) == 1 else flow.max(
                flow.stack(norms))
        elif norm_type == float("-inf"):
            norms = [
                p.grad.detach().abs().min().to(device) for p in parameters
            ]
            total_norm = norms[0] if len(norms) == 1 else flow.min(
                flow.stack(norms))
        else:
            total_norm = flow.linalg.vector_norm(
                flow.stack([
                    flow.linalg.vector_norm(p.grad.detach(),
                                            norm_type).to(device)
                    for p in parameters
                ]),
                norm_type,
            )
        if error_if_nonfinite and flow.logical_or(total_norm.isnan(),
                                                  total_norm.isinf()):
            raise RuntimeError(
                f"The total norm of order {norm_type} for gradients from "
                "`parameters` is non-finite, so it cannot be clipped. To disable "
                "this error and scale the gradients by the non-finite norm anyway, "
                "set `error_if_nonfinite=False`")
        clip_coef = max_norm / (total_norm + 1e-6)
        clip_coef_clamped = clip_coef.clamp(max=1.0)
        for p in parameters:
            p.grad.detach().mul_(clip_coef_clamped.to(p.grad.device))
    return total_norm
def train(opt):
    # Step 1: init BrainDQN
    model = DeepQNetwork()
    model.to("cuda")
    optimizer = flow.optim.Adam(model.parameters(), lr=opt.lr)
    criterion = flow.nn.MSELoss()
    criterion.to("cuda")

    # Step 2: init Flappy Bird Game
    game_state = GameState()

    # Step 3: play game
    # image.shape = (288, 512, 3), reward: float, terminal: boolean
    image, reward, terminal = game_state.frame_step(0)
    # image.shape = (84, 84)
    image = pre_processing(
        image[: game_state.SCREENWIDTH, : int(game_state.BASEY)],
        opt.image_size,
        opt.image_size,
    )
    image = flow.Tensor(image, dtype=flow.float32)
    image = image.to("cuda")
    state = flow.cat(tuple(image for _ in range(4))).unsqueeze(0)

    replay_memory = []
    iter = 0

    # Step 4: run the game
    while iter < opt.num_iters:
        model.train()
        prediction = model(state)[0]
        # Exploration or exploitation
        epsilon = opt.final_epsilon + (
            (opt.num_iters - iter)
            * (opt.initial_epsilon - opt.final_epsilon)
            / opt.num_iters
        )
        u = random()
        random_action = u <= epsilon
        if random_action:
            print("Perform a random action")
            action = randint(0, 1)
        else:
            action = flow.argmax(prediction).numpy()[0]

        next_image, reward, terminal = game_state.frame_step(action)
        next_image = pre_processing(
            next_image[: game_state.SCREENWIDTH, : int(game_state.BASEY)],
            opt.image_size,
            opt.image_size,
        )
        next_image = flow.Tensor(next_image)
        next_image = next_image.to("cuda")
        next_state = flow.cat((state[0, 1:, :, :], next_image)).unsqueeze(0)

        replay_memory.append([state, action, reward, next_state, terminal])
        if len(replay_memory) > opt.replay_memory_size:
            del replay_memory[0]
        batch = sample(replay_memory, min(len(replay_memory), opt.batch_size))
        state_batch, action_batch, reward_batch, next_state_batch, terminal_batch = zip(
            *batch
        )

        state_batch = flow.cat(tuple(state for state in state_batch))
        action_batch = flow.Tensor(
            np.array(
                [[1, 0] if action == 0 else [0, 1] for action in action_batch],
                dtype=np.float32,
            )
        )
        reward_batch = flow.Tensor(np.array(reward_batch, dtype=np.float32)[:, None])
        next_state_batch = flow.cat(tuple(state for state in next_state_batch))

        state_batch = state_batch.to("cuda")
        action_batch = action_batch.to("cuda")
        reward_batch = reward_batch.to("cuda")
        next_state_batch = next_state_batch.to("cuda")

        current_prediction_batch = model(state_batch)
        next_prediction_batch = model(next_state_batch)

        # Bellman target: r for terminal transitions, r + gamma * max_a Q(s', a) otherwise
        y_batch = flow.cat(
            tuple(
                reward_batch[i]
                if terminal_batch[i]
                else reward_batch[i] + opt.gamma * flow.max(next_prediction_batch[i])
                for i in range(reward_batch.shape[0])
            )
        )

        q_value = flow.sum(current_prediction_batch * action_batch, dim=1)
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        state = next_state
        iter += 1
        print(
            "Iteration: {}/{}, Action: {}, Loss: {}, Epsilon {}, Reward: {}, Q-value: {}".format(
                iter + 1,
                opt.num_iters,
                action,
                loss.numpy(),
                epsilon,
                reward,
                flow.max(prediction).numpy()[0],
            )
        )
        if (iter + 1) % 100000 == 0:
            flow.save(
                model.state_dict(),
                os.path.join(opt.save_checkpoint_path, "epoch_%d" % (iter + 1)),
            )
    flow.save(
        model.state_dict(),
        os.path.join(opt.save_checkpoint_path, "epoch_%d" % (iter + 1)),
    )
    print("train success!")
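# The target construction in the loop above is the standard Q-learning update:
# terminal transitions keep the raw reward, non-terminal ones bootstrap with the
# discounted best next-state value. A tiny numeric sketch of that rule with
# dummy values, independent of the game code:
import oneflow as flow

gamma = 0.99
reward = flow.tensor([1.0])
next_q = flow.tensor([0.2, 0.7])  # Q(s', a) for the two candidate actions

terminal = False
y = reward if terminal else reward + gamma * flow.max(next_q)
print(y.numpy())  # [1.693] = 1.0 + 0.99 * 0.7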
def main(args):
    transform = vision.transforms.Compose([
        vision.transforms.ToTensor(),
        vision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    trainset = vision.datasets.CIFAR10(root=args.data_root,
                                       train=True,
                                       download=True,
                                       transform=transform)
    trainloader = flow.utils.data.DataLoader(trainset,
                                             batch_size=args.train_batch_size,
                                             shuffle=True,
                                             num_workers=1)

    testset = vision.datasets.CIFAR10(root=args.data_root,
                                      train=False,
                                      download=True,
                                      transform=transform)
    testloader = flow.utils.data.DataLoader(testset,
                                            batch_size=args.val_batch_size,
                                            shuffle=False,
                                            num_workers=1)

    classes = (
        "plane", "car", "bird", "cat", "deer",
        "dog", "frog", "horse", "ship", "truck",
    )

    device = flow.device("cuda")
    expert_network = MLP(input_size=3072, output_size=10, hidden_size=256)
    net = MoE(expert_network, 3072, 10, num_experts=10, noisy_gating=True, k=4)
    net.to(device)

    optimizer = optim.SGD(net.parameters(), lr=args.learning_rate, momentum=args.mom)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)

    for epoch in range(args.epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            inputs = inputs.view(inputs.shape[0], -1)
            outputs, aux_loss = net(inputs)
            loss = criterion(outputs, labels)
            total_loss = loss + aux_loss
            total_loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 100 == 99:  # print every 100 mini-batches
                print("[%d, %5d] loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / 100))
                running_loss = 0.0

    print("Finished Training")

    correct = 0
    total = 0
    with flow.no_grad():
        for i, data in enumerate(testloader, 0):
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs, _ = net(images.view(images.shape[0], -1))
            # flow.max along dim 1 returns (values, indices); the indices are the predictions
            _, predicted = flow.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print("Accuracy of the network on the 10000 test images: %d %%" %
          (100 * correct / total))