def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = utils.data_iter_random
    else:
        data_iter_fn = utils.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        if not is_random_iter:
            # With consecutive sampling, initialize the hidden state once at the start of each epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:
            if is_random_iter:
                # With random sampling, re-initialize the hidden state before each mini-batch
                state = init_rnn_state(batch_size, num_hiddens, device)
            else:
                # Otherwise detach the hidden state from the computation graph, so that the
                # gradient computation only depends on the mini-batch read in this iteration
                # (keeps the cost of backpropagation from growing within the epoch)
                for s in state:
                    s.detach_()

            inputs = to_onehot(X, vocab_size)
            # outputs is num_steps matrices of shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)
            # After concatenation the shape is (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)
            # Y has shape (batch_size, num_steps); transpose and flatten it into a vector of
            # length batch_size * num_steps so it lines up with the output rows one-to-one
            y = torch.transpose(Y, 0, 1).contiguous().view(-1)
            # Average classification error via the cross-entropy loss
            l = loss(outputs, y.long())

            # Zero the gradients
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip the gradients
            utils.sgd(params, lr, 1)  # the loss is already averaged, so no need to average the gradients
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens, vocab_size,
                                        device, idx_to_char, char_to_idx))
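# The training loop above calls grad_clipping(params, clipping_theta, device), but the helper
# itself is not shown. A minimal sketch of such a clipping-by-global-norm helper in the
# d2l/PyTorch style this code follows (an assumption, not the project's own definition):
import torch

def grad_clipping(params, theta, device):
    # Global L2 norm of all parameter gradients
    norm = torch.tensor([0.0], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    # Rescale every gradient so the global norm does not exceed theta
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)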
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, device, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = d2l.data_iter_random
    else:
        data_iter_fn = d2l.data_iter_consecutive
    params = get_params()
    loss = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):  # 250
        if not is_random_iter:
            # With consecutive sampling, initialize the hidden state at the start of each epoch
            state = init_rnn_state(batch_size, num_hiddens, device)
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, device)
        for X, Y in data_iter:  # [32, 35], [32, 35]
            if is_random_iter:
                # With random sampling, re-initialize the hidden state before each mini-batch
                state = init_rnn_state(batch_size, num_hiddens, device)  # [32, 256]
            else:
                # Otherwise detach the hidden state from the computation graph
                for s in state:
                    s.detach_()

            # inputs is num_steps matrices of shape (batch_size, vocab_size)
            inputs = to_onehot(X, vocab_size)  # [35, 32, 1027]
            # outputs is num_steps matrices of shape (batch_size, vocab_size)
            (outputs, state) = rnn(inputs, state, params)  # [35, 32, 1027], [1, 32, 256]
            # After concatenation the shape is (num_steps * batch_size, vocab_size)
            outputs = torch.cat(outputs, dim=0)  # [1120, 1027]
            # Y has shape (batch_size, num_steps); transpose and flatten it into a vector of
            # shape (num_steps * batch_size,) so it lines up with the output rows one-to-one
            y = torch.flatten(Y.t())  # [1120,]
            # Average classification error via the cross-entropy loss
            l = loss(outputs, y.long())

            # Zero the gradients
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params, clipping_theta, device)  # clip the gradients
            d2l.sgd(params, lr, 1)  # the loss is already averaged, so no need to average the gradients
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn(prefix, pred_len, rnn, params,
                                        init_rnn_state, num_hiddens, vocab_size,
                                        device, idx_to_char, char_to_idx))
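# The loop above relies on to_onehot(X, vocab_size) turning a (batch_size, num_steps) index
# matrix into a list of num_steps one-hot matrices of shape (batch_size, vocab_size). A minimal
# sketch of that conversion, assuming the same convention as the shape annotations above:
import torch

def one_hot(x, n_class, dtype=torch.float32):
    # x: 1-D tensor of indices -> (len(x), n_class) one-hot matrix
    x = x.long()
    res = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    res.scatter_(1, x.view(-1, 1), 1)
    return res

def to_onehot(X, n_class):
    # One one-hot matrix per time step (one per column of X)
    return [one_hot(X[:, i], n_class) for i in range(X.shape[1])]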
def train_and_predict_rnn(rnn, get_params, init_rnn_state, num_hiddens,
                          vocab_size, ctx, corpus_indices, idx_to_char,
                          char_to_idx, is_random_iter, num_epochs, num_steps,
                          lr, clipping_theta, batch_size, pred_period,
                          pred_len, prefixes):
    if is_random_iter:
        data_iter_fn = us.data_iter_random
    else:
        data_iter_fn = us.data_iter_consecutive
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()

    for epoch in range(num_epochs):  # number of training passes; independent of how many batches the data is split into
        if not is_random_iter:
            # With consecutive sampling, initialize the hidden state at the start of each epoch
            state = init_rnn_state(batch_size, num_hiddens, ctx)
        l_sum, n, start = 0.0, 0, time.time()
        # Produce multiple batches of lyrics from all the training data for mini-batch SGD
        data_iter = data_iter_fn(corpus_indices, batch_size, num_steps, ctx)
        # Each batch yields a group of lyric-segment index sequences X (each one a multi-time-step
        # input to the RNN) and the corresponding next-character label sequences Y
        for X, Y in data_iter:
            if is_random_iter:
                # With random sampling, re-initialize the hidden state before each mini-batch
                state = init_rnn_state(batch_size, num_hiddens, ctx)
            else:
                # Otherwise detach the hidden state from the computation graph.
                # When adjacent mini-batches are chained together by passing the hidden state along,
                # the gradient computation depends on every chained mini-batch, so within an epoch
                # the cost of computing gradients keeps growing. Detaching the hidden state before
                # reading each mini-batch makes the gradient computation depend only on the current
                # mini-batch. Each row (lyric segment) stays semantically continuous across batches,
                # so passing the hidden state between batches still strengthens semantic coherence.
                for s in state:
                    s.detach()

            with autograd.record():
                inputs = us.to_onehot(X, vocab_size)  # convert each character in the batch to one-hot
                # outputs is num_steps matrices of shape (batch_size, vocab_size)
                (outputs, state) = rnn(inputs, state, params)
                # After concatenation the shape is (num_steps * batch_size, vocab_size)
                outputs = nd.concat(*outputs, dim=0)
                # Y has shape (batch_size, num_steps); transpose it and flatten it into a vector of
                # length batch_size * num_steps. X and Y were aligned to begin with (lyric sequence
                # vs. next-character labels), so after the same transpose-and-flatten the rows still
                # correspond one-to-one, which is what the cross-entropy loss needs.
                y = Y.T.reshape((-1,))
                # Cross-entropy of each one-hot prediction (one per character), summed and averaged
                l = loss(outputs, y).mean()
            l.backward()
            us.grad_clipping(params, clipping_theta, ctx)  # clip the gradients
            # Gradient descent on the batch-averaged cross-entropy
            us.sgd(params, lr, 1)  # the loss is already averaged, so no need to average the gradients
            l_sum += l.asscalar() * y.size  # running total loss over all batches
            n += y.size  # running total number of characters over all batches

        if (epoch + 1) % pred_period == 0:
            # Report perplexity: exp of the average per-character loss
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            # Every pred_period epochs (each a full pass over all batches), print the loss
            # and some predicted lyric segments
            for prefix in prefixes:
                print(' -', predict_rnn(
                    prefix, pred_len, rnn, params, init_rnn_state,
                    num_hiddens, vocab_size, ctx, idx_to_char, char_to_idx))
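# Each of the loops above ends a step with a call like sgd(params, lr, 1): an in-place mini-batch
# SGD update where the gradients are divided by the batch size (1 here, because the loss was
# already averaged). A minimal sketch of that helper in the MXNet NDArray style used by this
# snippet (assumed, not the project's own us.sgd):
def sgd(params, lr, batch_size):
    for param in params:
        # In-place update so the attached gradient buffers stay valid
        param[:] = param - lr * param.grad / batch_size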
def train(options, train_data, valid_data, test_data):
    np.random.seed(12345)

    if not os.path.exists(options['saveto']):
        os.makedirs(options['saveto'])

    print('Building the model...')
    params = init_params(options)
    users_id, items_id, bow, y, y_pred, bow_pred, mse, nll, cost = build_model(options, params)

    print('Computing gradients...')
    lrt = sharedX(options['lr'])
    grads = T.grad(cost, params.values())
    updates = sgd(params.values(), grads, lrt)

    print('Compiling theano functions...')
    eval_fn = theano.function([users_id, items_id, y], mse)
    train_fn = theano.function([users_id, items_id, bow, y],
                               [cost, mse, nll], updates=updates)

    print('Training...')
    train_iter = MultiFixDimIterator(*train_data, batch_size=options['batch_size'],
                                     shuffle=True)
    valid_iter = MultiFixDimIterator(*valid_data, batch_size=100)
    test_iter = MultiFixDimIterator(*test_data, batch_size=100)
    best_valid = float('inf')
    best_test = float('inf')

    n_batches = np.ceil(train_data[0].shape[0] * 1. / options['batch_size']).astype('int')
    disp_str = ['Train COST', 'Train MSE', 'Train NLL']

    for eidx in range(options['n_epochs']):
        accum_cost, accum_mse, accum_nll = 0., 0., 0.
        for batch in train_iter:
            batch = prepare_batch_data(options, batch)
            b_cost, b_mse, b_nll = train_fn(*batch)
            accum_cost += b_cost
            accum_mse += b_mse
            accum_nll += b_nll

        disp_val = [val / n_batches for val in [accum_cost, accum_mse, accum_nll]]
        res_str = ('[%d] ' % eidx) + ", ".join("%s: %.4f" % (s, v)
                                               for s, v in zip(disp_str, disp_val))
        print(res_str)

        if (eidx + 1) % options['valid_freq'] == 0:
            disp_val = [np.mean([eval_fn(*vbatch) for vbatch in valid_iter]),
                        np.mean([eval_fn(*tbatch) for tbatch in test_iter])]
            res_str = ", ".join("%s: %.4f" % (s, v)
                                for s, v in zip(['Valid MSE', 'Test MSE'], disp_val))
            print(res_str)

            if best_valid > disp_val[0]:
                best_valid, best_test = disp_val
                dump_params(options['saveto'], eidx, "best_params", params)

    print('Done training...')
    print('Best Valid MSE: %.4f and Test MSE: %.4f' % (best_valid, best_test))
def fit_and_plot(lambd):
    w, b = init_params()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            # Add the L2 penalty term to the data loss (weight decay)
            l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l = l.sum()
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            l.backward()
            utils.sgd([w, b], lr, batch_size)
        train_ls.append(loss(net(train_features, w, b), train_labels).mean().item())
        test_ls.append(loss(net(test_features, w, b), test_labels).mean().item())
    utils.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                   range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print('L2 norm of w:', w.norm().item())
def fit_and_plot(lambd):
    w = nd.random.normal(scale=1, shape=true_w.shape)
    b = nd.zeros(shape=(1,))
    w.attach_grad()
    b.attach_grad()
    train_ls, test_ls = [], []
    for _ in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                # Add the L2 penalty term to the data loss (weight decay)
                l = loss(net(X, w, b), y) + lambd * l2_penalty(w)
            l.backward()
            utils.sgd([w, b], learning_rate, batch_size)
        train_ls.append(
            loss(net(train_features, w, b), train_labels).mean().asscalar())
        test_ls.append(
            loss(net(test_features, w, b), test_labels).mean().asscalar())
    utils.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'loss',
                   range(1, num_epochs + 1), test_ls, ['train', 'test'])
    print("L2 norm of w:", w.norm().asscalar())
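# The two fit_and_plot variants above add lambd * l2_penalty(w) to the loss to implement weight
# decay. A minimal sketch of that penalty for the MXNet version (the PyTorch variant is the same
# expression on a torch tensor); this is an assumption about the helper, not its actual source:
def l2_penalty(w):
    # Half the squared L2 norm of the weights
    return (w ** 2).sum() / 2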
def fit(self, X, y, eta=0.1, epochs=10000):
    """Fits multiclass SVM

    :param X: array-like, shape = [num_samples, num_inFeatures], input data
    :param y: array-like, shape = [num_samples,], input classes
    :param eta: learning rate for SGD
    :param epochs: maximum number of SGD iterations
    :return: self
    """
    self.coef_ = sgd(X, y, self.n_out, self.subgradient, eta, epochs)
    self.is_fit = True
    return self
def configure(self, flags):
    # Log the name and shape of every trainable parameter
    for name, para in self.network.named_parameters():
        print(name, para.size())

    self.optimizer = sgd(model=self.network,
                         parameters=self.network.parameters(),
                         lr=flags.lr,
                         weight_decay=flags.weight_decay,
                         momentum=flags.momentum)

    self.scheduler = lr_scheduler.StepLR(optimizer=self.optimizer,
                                         step_size=flags.step_size, gamma=0.1)
    self.loss_fn = crossentropyloss()
def train(rnn, get_params, init_rnn_state, num_hiddens, vocabulary_size,
          context, indices, index_to_char, char_to_index, is_random_iter,
          num_epochs, num_steps, learning_rate, clipping_theta, batch_size,
          predict_period, predict_length, prefixes):
    params = get_params()
    loss = gloss.SoftmaxCrossEntropyLoss()
    for epoch in range(num_epochs):
        l_sum = 0.0
        n = 0
        start = time.time()
        if not is_random_iter:
            # With consecutive sampling, initialize the hidden state once per epoch
            state = init_rnn_state(batch_size, num_hiddens, context)
        for X, y in data_iter(indices, batch_size, num_steps, is_random_iter, context):
            if is_random_iter:
                # With random sampling, re-initialize the hidden state for every mini-batch
                state = init_rnn_state(batch_size, num_hiddens, context)
            else:
                # Detach the hidden state so gradients only flow through the current mini-batch
                for s in state:
                    s.detach()
            with autograd.record():
                inputs = to_onehot(nd.array(X), vocabulary_size)
                (outputs, state) = rnn(inputs, state, params)
                outputs = nd.concat(*outputs, dim=0)
                y = nd.array(y).T.reshape((-1,))
                l = loss(outputs, y).mean()
            l.backward()
            grad_clipping(params, clipping_theta, context)
            utils.sgd(params, learning_rate, 1)
            l_sum += l.asscalar() * y.size
            n += y.size

        if (epoch + 1) % predict_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print('-', predict(prefix, predict_length, rnn, params,
                                   init_rnn_state, num_hiddens, vocabulary_size,
                                   context, index_to_char, char_to_index))
def fit_and_plot(self, lambd):
    [w, b] = self.init_weights()
    train_iter, train_features, test_features, train_labels, test_labels = self.load_dataset()
    train_ls, test_ls = [], []
    for _ in range(self.n_epochs):
        for X, y in train_iter:
            l = self.loss()(self.net()(X, w, b), y) + lambd * self.l2_penalty(w)
            l = l.sum()
            if w.grad is not None:
                w.grad.data.zero_()
                b.grad.data.zero_()
            l.backward()
            utils.sgd([w, b], self.lr, self.batch_size)
        train_ls.append(self.loss()(self.net()(train_features, w, b),
                                    train_labels).mean().item())
        test_ls.append(self.loss()(self.net()(test_features, w, b),
                                   test_labels).mean().item())
    utils.semilogy(range(1, self.n_epochs + 1), train_ls, 'epoch', 'loss',
                   range(1, self.n_epochs + 1), test_ls, ['train', 'test'])
    print('L2 norm of w:', w.norm().item())
def fit_sgd(self, Y, R):
    n_jokes = Y.shape[0]
    n_users = Y.shape[1]
    X, Theta = utils.init_par(n_users, n_jokes, self.n_features)
    start = time.time()
    for i in range(self.n_iter):
        X, Theta = utils.sgd(X, Theta, Y, self.lamb, R,
                             init_learning_rate=self.learning_rate, max_iter=8)
        J = utils.cost(X, Theta, Y, self.lamb, R)
        print('cost: ' + str(J), ', n_iter: ' + str(i))
        if J < 200:
            break
    self.features = X
    self.coef = Theta
    self.cost = utils.cost(X, Theta, Y, self.lamb, R)
    end = time.time()
    self.train_time = end - start
    print('final cost: ' + str(self.cost), '\n'
          'train time: ' + str(self.train_time))
    return
# insert 1 in every row for intercept b
X.insert(loc=len(X.columns), column='intercept', value=1)

# split data into train and test set
print("splitting dataset into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train = X_train.to_numpy()
X_test = X_test.to_numpy()

# train
print("Training started...")
W, lossHistory = sgd(X_train, y_train, learning_rate, regularization_strength, max_epochs)
print("Training finished.")

# testing
print("Testing...")
y_train_predicted = np.array([])
for i in range(X_train.shape[0]):
    yp = np.sign(np.dot(X_train[i], W))
    y_train_predicted = np.append(y_train_predicted, yp)

y_test_predicted = np.array([])
for i in range(X_test.shape[0]):
    yp = np.sign(np.dot(X_test[i], W))
    y_test_predicted = np.append(y_test_predicted, yp)
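# The script above stops after producing the sign predictions; a natural follow-up (not part of
# the original snippet) is to score them. A minimal sketch using scikit-learn, assuming y_train
# and y_test hold the true +1/-1 labels:
from sklearn.metrics import accuracy_score

print("accuracy on train set: %.4f" % accuracy_score(y_train, y_train_predicted))
print("accuracy on test set: %.4f" % accuracy_score(y_test, y_test_predicted))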
def net(X):
    X = X.reshape((-1, 784))
    H = relu(nd.dot(X, W1) + b1)
    H1 = relu(nd.dot(H, W2) + b2)
    return nd.dot(H1, W3) + b3

# train
loss_func = gluon.loss.SoftmaxCrossEntropyLoss()
lr = 0.03
# trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.03})
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_acc = 0., 0.
    for data, label in train_data:
        with ag.record():
            out = net(data)
            loss = loss_func(out, label)
        loss.backward()
        # trainer.step(batch_size)
        utils.sgd(params, lr, batch_size)
        label = label.astype('float32')
        train_loss += nd.mean(loss).asscalar()
        train_acc += nd.mean(out.argmax(axis=1) == label).asscalar()
    test_acc = utils.evaluate_accuracy(test_data, net)
    print('epoch %d. Loss: %f, Train acc %f, Test acc %f' % (
        epoch, train_loss / len(train_data), train_acc / len(train_data), test_acc))
# print(softmax(x).sum(axis=1))

def cross_entropy(yhat, y):
    return -nd.pick(nd.log(yhat), y)

def net(X):
    return softmax(nd.dot(X.reshape((-1, num_inputs)), W) + b)

epochs = 10
base_rate = 0.001
for epoch in range(epochs):
    train_loss = .0
    train_acc = .0
    for data, label in train_iter:
        with autograd.record():
            output = net(data)
            loss = cross_entropy(output, label)
        loss.backward()
        # Decay the learning rate as training progresses
        learning_rate = base_rate / (epoch + 1)
        sgd(params, learning_rate)
        train_loss += nd.mean(loss).asscalar()
        train_acc += accuracy(output, label)
    test_acc = evaluate_accuracy(test_iter, net)
    print('Epoch %d. Loss: %f, Train acc: %f, Test acc: %f' % (
        epoch, train_loss / len(train_iter), train_acc / len(train_iter), test_acc))
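# net above applies a softmax function that is only referenced (see the commented sanity check
# print(softmax(x).sum(axis=1))). A minimal sketch of that row-wise softmax in the same NDArray
# style, assumed rather than taken from the original source:
def softmax(X):
    exp = nd.exp(X)
    # Normalise each row so it sums to 1 (no max-subtraction, so large logits can overflow)
    partition = exp.sum(axis=1, keepdims=True)
    return exp / partition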
def relu(X):
    return nd.maximum(X, 0)

def net(data):
    h1 = nd.dot(data.reshape((-1, num_inputs)), w1) + b1
    h1 = relu(h1)
    output = nd.dot(h1, w2) + b2
    return output

learning_rate = 0.1
softmax_cross_loss = gluon.loss.SoftmaxCrossEntropyLoss()
epochs = 5
for epoch in range(epochs):
    total_loss = .0
    total_acc = .0
    for data, label in train_iter:
        with autograd.record():
            output = net(data)
            loss = softmax_cross_loss(output, label)
        loss.backward()
        sgd(params, learning_rate / batch_size)
        total_loss += nd.mean(loss).asscalar()
        total_acc += accuracy(output, label)
    test_acc = evaluate_accuracy(test_iter, net)
    print('Epoch %d, Train Loss: %f, Train Acc: %f, Test Acc: %f' % (
        epoch, total_loss / len(train_iter), total_acc / len(train_iter), test_acc))
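# The last three training loops report train_acc/test_acc through accuracy(...) and
# evaluate_accuracy(...), which are not shown. A minimal sketch of those helpers in the same
# Gluon/NDArray style (an assumption about their behaviour, not the projects' own code):
def accuracy(output, label):
    # Fraction of rows where the arg-max class matches the label
    return (output.argmax(axis=1) == label.astype('float32')).mean().asscalar()

def evaluate_accuracy(data_iter, net):
    # Average batch accuracy over an entire data iterator
    acc = 0.
    for data, label in data_iter:
        acc += accuracy(net(data), label)
    return acc / len(data_iter)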