def linear(input: Tensor, weight: Tensor, bias: Tensor = None):
    """Apply a linear transformation: ``output = input @ weight.t() + bias``.

    Mirrors ``torch.nn.functional.linear`` semantics: ``weight`` is stored as
    ``(out_features, in_features)`` and is therefore transposed before the
    matmul in every branch.

    Args:
        input: input tensor; last dimension must equal ``in_features``.
        weight: weight tensor of shape ``(out_features, in_features)``.
        bias: optional bias tensor of shape ``(out_features,)``; skipped when None.

    Returns:
        Tensor of shape ``input.shape[:-1] + (out_features,)``.
    """
    # BUGFIX: original tested `input.dim == 2`, comparing the *method object*
    # to an int (always False), so this branch was dead; it also forgot the
    # transpose (`input.matmul(weight)`), which would have produced a wrong
    # (or shape-mismatched) result had the condition ever been fixed alone.
    if input.dim() == 2 and bias is not None:
        # Fused-friendly fast path for the common 2-D case.
        ret = input.matmul(weight.t()) + bias
    else:
        output = input.matmul(weight.t())
        if bias is not None:
            output += bias
        ret = output
    return ret
# Training loop for a 2-layer MLP (relu hidden layer + softmax output),
# run twice in parallel: once with the project's custom `Tensor` autograd
# (W1/b1/W2/b2, `optimizer`) and once with plain torch ops (W1t/b1t/W2t/b2t,
# `optimizer_t`) as a reference implementation to compare losses against.
print("GOING TO START")
losses = []    # custom-framework loss per batch (plain floats via .value)
losses_t = []  # torch reference loss per batch (still torch scalars)
for _ in range(15):  # epochs
    # ceil(len(X_train) / batch_size) mini-batches; the last one may be short.
    for i in range(0, (len(X_train) + batch_size - 1) // batch_size):
        X_batch = Tensor(X_train[i * batch_size:(i + 1) * batch_size], compute_grad=False)
        y_batch = onehot(y_train[i * batch_size:(i + 1) * batch_size], 10)  # onehot it
        # Actual batch size (handles the short final batch correctly).
        b_size = len(X_batch.value)
        # Hidden layer: `.max(0.0)` is the custom framework's relu.
        y1_out = (X_batch.matmul(W1) + b1).max(0.0)
        y1_out_t = relu(torch.matmul(torch.tensor(X_batch.value), W1t) + b1t)
        y_out = softmax(y1_out.matmul(W2) + b2, axis=-1)
        y_out_t = torch.softmax(torch.matmul(y1_out_t, W2t) + b2t, axis=-1)
        # Mean cross-entropy; `0 - x` instead of `-x` — the custom Tensor
        # presumably lacks __neg__ (TODO confirm).
        loss = 0 - (y_out.log() * y_batch).sum(axis=None) / b_size
        loss_t = -(torch.tensor(y_batch) * torch.log(y_out_t)).sum() / b_size
        # print(loss.value, loss_t)
        losses.append(loss.value)
        losses_t.append(loss_t)
        optimizer_t.zero_grad()
        optimizer.clear()
        # BUGFIX: the sibling training loops in this file end each step with a
        # backward pass, but this loop never computed gradients at all, so the
        # optimizer had nothing to apply. Seed the custom autograd with an
        # all-ones upstream gradient, as the other loops do.
        loss.backward(np.ones_like(loss.value))
        # NOTE(review): no optimizer.step()/optimizer_t.step() is visible in
        # this chunk for any of the loops — presumably applied outside this
        # view; confirm against the full file.
# Training loop for single-layer softmax regression (logits = X @ W + b),
# run twice in parallel: once with the project's custom `Tensor` autograd
# (W/b, `optimizer`) and once with plain torch ops (Wt/bt, `optimizer_t`)
# as a reference implementation to compare losses against.
print("GOING TO START")
losses = []    # custom-framework loss per batch (plain values via .value)
losses_t = []  # torch reference loss per batch (still torch scalars)
for _ in range(15):  # epochs
    # ceil(len(X_train) / batch_size) mini-batches; the last one may be short.
    for i in range(0, (len(X_train) + batch_size - 1) // batch_size):
        X_batch = Tensor(X_train[i * batch_size:(i + 1) * batch_size], compute_grad=False)
        y_batch = onehot(y_train[i * batch_size:(i + 1) * batch_size], 10)  # onehot it
        # Actual batch size (handles the short final batch correctly).
        b_size = len(X_batch.value)
        y_out = X_batch.matmul(W) + b
        y_out_t = torch.matmul(torch.tensor(X_batch.value), Wt) + bt
        # Mean cross-entropy over the batch; `0 - x` rather than `-x` —
        # presumably the custom Tensor lacks __neg__ (TODO confirm).
        loss = 0 - (softmax(y_out, axis=-1).log() * y_batch).sum(axis=None) / b_size
        loss_t = -(torch.tensor(y_batch) * torch.log(torch.softmax(y_out_t, axis=-1))).sum() / b_size
        # print(loss.value, loss_t)
        losses.append(loss.value)
        losses_t.append(loss_t)
        # Reset both frameworks' gradients before the backward pass.
        optimizer_t.zero_grad()
        optimizer.clear()
        # Seed the custom autograd with an all-ones upstream gradient.
        # NOTE(review): no loss_t.backward() or optimizer step is visible in
        # this chunk — presumably they follow outside this view; confirm.
        loss.backward(np.ones_like(loss.value))
# Training loop for linear regression with a halved MSE loss, run twice in
# parallel: once with the project's custom `Tensor` autograd (W/b,
# `optimizer`) and once with plain torch ops (Wt/bt, `optimizer_t`) as a
# reference implementation to compare losses against.
print("GOING TO START")
losses = []    # custom-framework loss per batch (plain values via .value)
losses_t = []  # torch reference loss per batch (still torch scalars)
for _ in range(15):  # epochs
    # ceil(len(X_train) / batch_size) mini-batches; the last one may be short.
    for i in range(0, (len(X_train) + batch_size - 1) // batch_size):
        X_batch = Tensor(X_train[i * batch_size: (i + 1) * batch_size], compute_grad=False)
        # Targets as a column vector so subtraction from (N, 1) outputs aligns.
        y_batch = Tensor(y_train[i * batch_size: (i + 1) * batch_size].reshape(-1, 1), compute_grad=False)
        # Tensor(np.array([[2*len(X_batch.value)]]), compute_grad=False)
        # 2N denominator makes loss = sum(err^2) / (2N) = 0.5 * MSE,
        # matching the torch-side `0.5 * mean()` below.
        b_size = 2 * len(X_batch.value)
        y_out = X_batch.matmul(W) + b
        y_out_t = torch.matmul(torch.tensor(X_batch.value), Wt) + bt
        # Residuals, squared elementwise via self-multiplication.
        l1 = y_out - y_batch
        l1_t = y_out_t - torch.tensor(y_batch.value)
        loss = (l1 * l1).sum(axis=None) / b_size
        loss_t = 0.5 * (l1_t * l1_t).mean()
        # print(loss.value, loss_t)
        losses.append(loss.value)
        losses_t.append(loss_t)
        # Reset both frameworks' gradients before the backward pass.
        optimizer_t.zero_grad()
        optimizer.clear()
        # Seed the custom autograd with an all-ones upstream gradient.
        # NOTE(review): no loss_t.backward() or optimizer step is visible in
        # this chunk — presumably they follow outside this view; confirm.
        loss.backward(np.ones_like(loss.value))