def main():
    """Fit a 2-8-1 tanh network to four +/-1 samples with plain SGD.

    The four inputs are the corners (+/-1, +/-1) and each label equals the
    product of the two input coordinates.  A fresh graph is built on every
    one of the 100 iterations; the per-sample outputs and the batch-mean
    squared error are printed before each parameter update.
    """
    with DefaultScopeDevice(CPUDevice()):
        # Trainable parameters: hidden layer (w1, b1) and output layer (w2, b2).
        pw1 = Parameter("w1", [8, 2], I.XavierUniform())
        pb1 = Parameter("b1", [8], I.Constant(0))
        pw2 = Parameter("w2", [1, 8], I.XavierUniform())
        pb2 = Parameter("b2", [], I.Constant(0))

        trainer = T.SGD(0.1)
        for param in (pw1, pb1, pw2, pb2):
            trainer.add_parameter(param)

        # One row per sample.
        samples = [
            [1, 1],
            [1, -1],
            [-1, 1],
            [-1, -1],
        ]
        input_data = np.array(samples, dtype=np.float32)

        # One label per sample, in the same order as the rows above.
        labels = [1, -1, -1, 1]
        output_data = np.array(labels, dtype=np.float32)

        for i in range(100):
            g = Graph()
            with DefaultScopeGraph(g):
                # Forward network: y = w2 . tanh(w1 . x + b1) + b2
                x = F.input(data=input_data)
                w1 = F.input(param=pw1)
                b1 = F.input(param=pb1)
                w2 = F.input(param=pw2)
                b2 = F.input(param=pb2)
                h = F.tanh(F.matmul(w1, x) + b1)
                y = F.matmul(w2, h) + b2

                # Evaluate the outputs and report one value per sample.
                y_tensor = g.forward(y)
                y_val = y_tensor.to_list()
                print("epoch ", i, ":")
                for sample in range(4):
                    print(" [", sample, "]: ", y_val[sample])

                # Batch-mean squared error against the labels.
                t = F.input(data=output_data)
                diff = t - y
                loss = F.batch.mean(diff * diff)
                loss_tensor = g.forward(loss)
                loss_val = loss_tensor.to_list()[0]
                print(" loss: ", loss_val)

                # Clear old gradients, backpropagate, then apply the update.
                trainer.reset_gradients()
                g.backward(loss)
                trainer.update()
def test_sgd_virtual(self):
    """Set configs on a default T.SGD, read them back, and check the epoch."""
    sgd = T.SGD()

    uints_in = {'Trainer.epoch': 1}
    floats_in = {
        'SGD.eta': 0.0,
        'Trainer.clip_threshold': 0.0,
        'Trainer.lr_scale': 1.0,
        'Trainer.l2_strength': 0.0,
    }
    sgd.set_configs(uints_in, floats_in)

    # Only the unsigned-int part of the returned configs is inspected.
    uints_out, _floats_out = sgd.get_configs()
    self.assertEqual(uints_out['Trainer.epoch'], 1)
def main():
    """Train and evaluate a two-layer ReLU classifier split over two GPUs.

    The hidden layer's parameters live on CUDA device 0 and the output
    layer's on CUDA device 1; the hidden activations are copied between
    the devices inside the graph.  Each epoch shuffles the training
    samples, runs NUM_TRAIN_BATCHES SGD steps, then prints the accuracy
    measured over NUM_TEST_BATCHES test batches.
    """
    # Read the image / label files.
    train_inputs = load_images("data/train-images-idx3-ubyte", NUM_TRAIN_SAMPLES)
    train_labels = load_labels("data/train-labels-idx1-ubyte", NUM_TRAIN_SAMPLES)
    test_inputs = load_images("data/t10k-images-idx3-ubyte", NUM_TEST_SAMPLES)
    test_labels = load_labels("data/t10k-labels-idx1-ubyte", NUM_TEST_SAMPLES)

    # One device object per GPU.
    dev0 = D.CUDA(0)
    dev1 = D.CUDA(1)

    # Hidden-layer parameters are placed on GPU 0.
    pw1 = Parameter([NUM_HIDDEN_UNITS, NUM_INPUT_UNITS], I.XavierUniform(), dev0)
    pb1 = Parameter([NUM_HIDDEN_UNITS], I.Constant(0), dev0)

    # Output-layer parameters are placed on GPU 1.
    pw2 = Parameter([NUM_OUTPUT_UNITS, NUM_HIDDEN_UNITS], I.XavierUniform(), dev1)
    pb2 = Parameter([NUM_OUTPUT_UNITS], I.Constant(0), dev1)

    trainer = T.SGD(.1)
    for param in (pw1, pb1, pw2, pb2):
        trainer.add_parameter(param)

    def make_graph(inputs):
        """Build the forward network for one minibatch and return its output."""
        # Input values are stored explicitly on GPU 0.
        x = F.input(inputs, device=dev0)
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        # Hidden layer: computed from GPU 0 operands.
        h_on_gpu0 = F.relu(w1 @ x + b1)
        # Transfer the hidden layer to GPU 1 for the output layer.
        h_on_gpu1 = F.copy(h_on_gpu0, dev1)
        # Output layer: computed from GPU 1 operands.
        return w2 @ h_on_gpu1 + b2

    ids = list(range(NUM_TRAIN_SAMPLES))

    g = Graph()
    Graph.set_default(g)

    for epoch in range(MAX_EPOCH):
        random.shuffle(ids)

        # ---- training ----
        for batch in range(NUM_TRAIN_BATCHES):
            print("\rTraining... %d / %d" % (batch + 1, NUM_TRAIN_BATCHES), end="")
            start = batch * BATCH_SIZE
            inputs = [train_inputs[ids[start + k]] for k in range(BATCH_SIZE)]
            labels = [train_labels[ids[start + k]] for k in range(BATCH_SIZE)]

            g.clear()

            y = make_graph(inputs)
            loss = F.softmax_cross_entropy(y, labels, 0)
            avg_loss = F.batch.mean(loss)

            trainer.reset_gradients()
            avg_loss.backward()
            trainer.update()

        print()

        # ---- evaluation ----
        match = 0
        for batch in range(NUM_TEST_BATCHES):
            print("\rTesting... %d / %d" % (batch + 1, NUM_TEST_BATCHES), end="")
            start = batch * BATCH_SIZE
            inputs = [test_inputs[start + k] for k in range(BATCH_SIZE)]

            g.clear()

            y = make_graph(inputs)
            y_val = y.to_list()

            # y_val is indexed as sample * NUM_OUTPUT_UNITS + unit.
            for i in range(BATCH_SIZE):
                base = i * NUM_OUTPUT_UNITS
                best_score = -1e10
                best_unit = -1
                for j in range(NUM_OUTPUT_UNITS):
                    score = y_val[base + j]
                    if score > best_score:
                        best_score, best_unit = score, j
                if best_unit == test_labels[start + i]:
                    match += 1

        accuracy = 100.0 * match / NUM_TEST_SAMPLES
        print("\nepoch %d: accuracy: %.2f%%\n" % (epoch, accuracy))
def main():
    """Train an RNN language model and report train/validation perplexity.

    Builds the vocabulary from the training text, loads the train and
    validation corpora, and then for MAX_EPOCH epochs runs one shuffled
    training pass followed by one validation pass.  Whenever validation
    perplexity fails to improve on the best seen so far, the trainer's
    learning-rate scaling is halved.
    """
    # Vocabulary.
    vocab = make_vocab("data/ptb.train.txt")
    print("#vocab:", len(vocab))
    eos_id = vocab["<eos>"]

    # Corpora and their sizes.
    train_corpus = load_corpus("data/ptb.train.txt", vocab)
    valid_corpus = load_corpus("data/ptb.valid.txt", vocab)
    num_train_sents = len(train_corpus)
    num_valid_sents = len(valid_corpus)
    num_train_labels = count_labels(train_corpus)
    num_valid_labels = count_labels(valid_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    dev = D.CUDA(0)
    Device.set_default(dev)

    # Optimizer: SGD with gradient clipping.
    trainer = T.SGD(1)
    trainer.set_gradient_clipping(5)

    # The language model registers itself with the trainer it is given.
    lm = RNNLM(len(vocab), eos_id, trainer)

    # Sentence indices; only the training ones get shuffled.
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    best_valid_ppl = 1e10

    g = Graph()
    Graph.set_default(g)

    def run_pass(corpus, sent_ids, train):
        """Run one pass over `sent_ids` and return the accumulated loss.

        Each batch's loss value is weighted by the number of sentences in
        the batch.  When `train` is true the parameters are updated after
        every batch.
        """
        total = 0
        for ofs in range(0, len(sent_ids), BATCH_SIZE):
            # Slicing clamps at the end of the list, so the last batch may
            # be shorter than BATCH_SIZE.
            batch_ids = sent_ids[ofs:ofs + BATCH_SIZE]
            batch = make_batch(corpus, batch_ids, eos_id)

            g.clear()

            outputs = lm.forward(batch, train)
            loss = lm.loss(outputs, batch)
            total += loss.to_float() * len(batch_ids)

            if train:
                trainer.reset_gradients()
                loss.backward()
                trainer.update()

            print("\r%d" % ofs, end="")
            sys.stdout.flush()

        print()
        return total

    for epoch in range(MAX_EPOCH):
        print("epoch", epoch + 1, "/", MAX_EPOCH, ":")

        random.shuffle(train_ids)

        # Training pass.
        train_loss = run_pass(train_corpus, train_ids, True)
        train_ppl = math.exp(train_loss / num_train_labels)
        print(" train ppl =", train_ppl)

        # Validation pass (no parameter updates).
        valid_loss = run_pass(valid_corpus, valid_ids, False)
        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print(" valid ppl =", valid_ppl)

        if valid_ppl < best_valid_ppl:
            best_valid_ppl = valid_ppl
            print(" BEST")
            continue

        # No improvement: halve the learning-rate scaling.
        old_lr = trainer.get_learning_rate_scaling()
        new_lr = 0.5 * old_lr
        trainer.set_learning_rate_scaling(new_lr)
        print(" learning rate scaled:", old_lr, "->", new_lr)
def main():
    """Fit a 2-8-1 tanh network to four +/-1 samples with plain SGD.

    The four inputs are the corners (+/-1, +/-1) and each label equals the
    product of the two input coordinates.  A single graph object is reused
    and cleared at the start of each of the 10 iterations; the per-sample
    outputs and the batch-mean squared error are printed before each
    parameter update.
    """
    dev = D.Naive()  # D.CUDA(gpuid) is the GPU alternative
    Device.set_default(dev)

    # Trainable parameters: hidden layer (w1, b1) and output layer (w2, b2).
    pw1 = Parameter([8, 2], I.XavierUniform())
    pb1 = Parameter([8], I.Constant(0))
    pw2 = Parameter([1, 8], I.XavierUniform())
    pb2 = Parameter([], I.Constant(0))

    # Optimizer, with every parameter registered in declaration order.
    trainer = T.SGD(0.1)
    for param in (pw1, pb1, pw2, pb2):
        trainer.add_parameter(param)

    # Training data: one float32 array per sample / per label.
    input_data = [
        np.array(pair, dtype=np.float32)
        for pair in ([1, 1], [1, -1], [-1, 1], [-1, -1])
    ]
    output_data = [
        np.array([label], dtype=np.float32)
        for label in (1, -1, -1, 1)
    ]

    g = Graph()
    Graph.set_default(g)

    for i in range(10):
        g.clear()

        # Forward network: y = w2 . tanh(w1 . x + b1) + b2
        x = F.input(input_data)
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        h = F.tanh(w1 @ x + b1)
        y = w2 @ h + b2

        # Report one output value per sample.
        y_val = y.to_list()
        print("epoch ", i, ":")
        for sample in range(4):
            print(" [", sample, "]: ", y_val[sample])

        # Batch-mean squared error against the labels.
        t = F.input(output_data)
        diff = t - y
        loss = F.batch.mean(diff * diff)

        loss_val = loss.to_float()
        print(" loss: ", loss_val)

        # Clear old gradients, backpropagate, then apply the update.
        trainer.reset_gradients()
        loss.backward()
        trainer.update()
def main():
    """Train and evaluate a two-layer ReLU classifier with dropout.

    Everything runs on a single default device.  Each epoch shuffles the
    training samples, runs NUM_TRAIN_BATCHES SGD steps with dropout
    enabled, then prints the accuracy measured over NUM_TEST_BATCHES test
    batches with dropout disabled.
    """
    # Read the image / label files.
    train_inputs = load_images("data/train-images-idx3-ubyte", NUM_TRAIN_SAMPLES)
    train_labels = load_labels("data/train-labels-idx1-ubyte", NUM_TRAIN_SAMPLES)
    test_inputs = load_images("data/t10k-images-idx3-ubyte", NUM_TEST_SAMPLES)
    test_labels = load_labels("data/t10k-labels-idx1-ubyte", NUM_TEST_SAMPLES)

    dev = D.Naive()  # D.CUDA(gpuid) is the GPU alternative
    Device.set_default(dev)

    # Hidden-layer and output-layer parameters.
    pw1 = Parameter([NUM_HIDDEN_UNITS, NUM_INPUT_UNITS], I.XavierUniform())
    pb1 = Parameter([NUM_HIDDEN_UNITS], I.Constant(0))
    pw2 = Parameter([NUM_OUTPUT_UNITS, NUM_HIDDEN_UNITS], I.XavierUniform())
    pb2 = Parameter([NUM_OUTPUT_UNITS], I.Constant(0))

    trainer = T.SGD(.5)
    for param in (pw1, pb1, pw2, pb2):
        trainer.add_parameter(param)

    def make_graph(inputs, train):
        """Build the forward network for one minibatch and return its output.

        `train` is forwarded to F.dropout as its third argument.
        """
        x = F.input(inputs)

        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        hidden = F.relu(w1 @ x + b1)
        hidden = F.dropout(hidden, .5, train)

        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        return w2 @ hidden + b2

    def predict(scores, sample):
        """Return the index of the largest score for `sample`, or -1.

        `scores` is indexed as sample * NUM_OUTPUT_UNITS + unit.  Ties go to
        the lowest unit index; -1 is returned when no score exceeds -1e10.
        """
        best_score = -1e10
        best_unit = -1
        for unit in range(NUM_OUTPUT_UNITS):
            score = scores[unit + sample * NUM_OUTPUT_UNITS]
            if score > best_score:
                best_score = score
                best_unit = unit
        return best_unit

    ids = list(range(NUM_TRAIN_SAMPLES))

    g = Graph()
    Graph.set_default(g)

    for epoch in range(MAX_EPOCH):
        random.shuffle(ids)

        # ---- training ----
        for batch in range(NUM_TRAIN_BATCHES):
            print("\rTraining... %d / %d" % (batch + 1, NUM_TRAIN_BATCHES), end="")
            chosen = [ids[batch * BATCH_SIZE + i] for i in range(BATCH_SIZE)]
            inputs = [train_inputs[k] for k in chosen]
            labels = [train_labels[k] for k in chosen]

            g.clear()

            y = make_graph(inputs, True)
            loss = F.softmax_cross_entropy(y, labels, 0)
            avg_loss = F.batch.mean(loss)

            trainer.reset_gradients()
            avg_loss.backward()
            trainer.update()

        print()

        # ---- evaluation ----
        match = 0
        for batch in range(NUM_TEST_BATCHES):
            print("\rTesting... %d / %d" % (batch + 1, NUM_TEST_BATCHES), end="")
            positions = [batch * BATCH_SIZE + i for i in range(BATCH_SIZE)]
            inputs = [test_inputs[p] for p in positions]

            g.clear()

            y = make_graph(inputs, False)
            y_val = y.to_list()

            for i in range(BATCH_SIZE):
                if predict(y_val, i) == test_labels[positions[i]]:
                    match += 1

        accuracy = 100.0 * match / NUM_TEST_SAMPLES
        print("\nepoch %d: accuracy: %.2f%%\n" % (epoch, accuracy))