def setUp(self):
    self.device = Naive()
    self.graph = Graph()
    Device.set_default(self.device)
    Graph.set_default(self.graph)
    self.a = np.array([[1, 2], [3, 4]], np.float32)
    self.b = np.array([[1, 1], [4, 8]], np.float32)
def test_batch(encdec, src_vocab, trg_vocab, lines):
    g = Graph()
    Graph.set_default(g)

    src_batch = make_batch(lines, list(range(len(lines))), src_vocab)
    encdec.encode(src_batch, False)

    # Generates target words one-by-one.
    trg_ids = [np.array([trg_vocab["<bos>"]] * len(lines))]
    eos_id = trg_vocab["<eos>"]
    eos_ids = np.array([eos_id] * len(lines))
    while (trg_ids[-1] != eos_ids).any():
        if len(trg_ids) > GENERATION_LIMIT + 1:
            print("Warning: Sentence generation did not finish in",
                  GENERATION_LIMIT, "iterations.", file=sys.stderr)
            trg_ids.append(eos_ids)
            break
        y = encdec.decode_step(trg_ids[-1], False)
        trg_ids.append(np.array(y.argmax(0)).T)

    return [hyp[:np.where(hyp == eos_id)[0][0]]
            for hyp in np.array(trg_ids[1:]).T]
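# A minimal usage sketch for test_batch. Hedged: `inv_trg_vocab` is assumed
# to be a list mapping word IDs back to surface tokens, as built by the
# make_inv_vocab helper used elsewhere in these examples; the function name
# here is hypothetical.
def print_hypotheses(encdec, src_vocab, trg_vocab, inv_trg_vocab, lines):
    for hyp in test_batch(encdec, src_vocab, trg_vocab, lines):
        # Each `hyp` is a NumPy array of word IDs, truncated just before
        # the first <eos>.
        print(" ".join(inv_trg_vocab[int(i)] for i in hyp))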
def setUp(self):
    self.device = Naive()
    self.graph = Graph()
    Device.set_default(self.device)
    Graph.set_default(self.graph)
    self.ndarray_data = [
        np.array([
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
            [10, 11, 12],
        ], np.float32),
        np.array([
            [13, 14, 15],
            [16, 17, 18],
            [19, 20, 21],
            [22, 23, 24],
        ], np.float32),
    ]
    self.list_data = [
        1.0, 4.0, 7.0, 10.0,
        2.0, 5.0, 8.0, 11.0,
        3.0, 6.0, 9.0, 12.0,
        13.0, 16.0, 19.0, 22.0,
        14.0, 17.0, 20.0, 23.0,
        15.0, 18.0, 21.0, 24.0,
    ]
def train_func(trainer):
    dev = D.Naive(12345)
    Device.set_default(dev)
    g = Graph()
    Graph.set_default(g)

    pw1 = Parameter([8, 2], I.XavierUniform())
    pb1 = Parameter([8], I.Constant(0))
    pw2 = Parameter([1, 8], I.XavierUniform())
    pb2 = Parameter([1], I.Constant(0))

    trainer.add_parameter(pw1)
    trainer.add_parameter(pb1)
    trainer.add_parameter(pw2)
    trainer.add_parameter(pb2)

    input_data = [1, 1, 1, -1, -1, 1, -1, -1]
    output_data = [1, -1, -1, 1]

    for i in range(10):
        g.clear()
        x = F.input(input_data, Shape([2], 4))
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        h = F.tanh(w1 @ x + b1)
        y = w2 @ h + b2

        t = F.input(output_data, Shape([], 4))
        diff = t - y
        loss = F.batch.mean(diff * diff)

        trainer.reset_gradients()
        loss.backward()
        trainer.update()

    return [pw1.value.to_list(), pb1.value.to_list(),
            pw2.value.to_list(), pb2.value.to_list()]
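# A hedged sketch of how train_func can be exercised: because the Naive
# device is seeded with a fixed value (12345), two identically configured
# trainers should drive the parameters to the same snapshots. The T.SGD
# constructor and its learning rate below are assumptions borrowed from the
# other examples in this collection.
def check_deterministic():
    values_a = train_func(T.SGD(0.1))
    values_b = train_func(T.SGD(0.1))
    # Same seed, same ops: the returned parameter lists should match exactly.
    assert values_a == values_b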
def setUp(self):
    self.device = Naive()
    self.graph = Graph()
    Device.set_default(self.device)
    Graph.set_default(self.graph)
    self.input_data = [
        np.array([
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
        ], np.float32),
        np.array([
            [11, 12, 13],
            [14, 15, 16],
            [17, 18, 19],
        ], np.float32),
    ]
    self.list_expected = [
        1.0, 4.0, 7.0,
        2.0, 5.0, 8.0,
        3.0, 6.0, 9.0,
        11.0, 14.0, 17.0,
        12.0, 15.0, 18.0,
        13.0, 16.0, 19.0,
    ]
def main():
    # Loads vocab.
    vocab = make_vocab("data/ptb.train.txt")
    print("#vocab:", len(vocab))  # maybe 10000
    eos_id = vocab["<s>"]

    # Loads all corpus.
    train_corpus = load_corpus("data/ptb.train.txt", vocab)
    valid_corpus = load_corpus("data/ptb.valid.txt", vocab)
    num_train_sents = len(train_corpus)
    num_valid_sents = len(valid_corpus)
    num_train_labels = count_labels(train_corpus)
    num_valid_labels = count_labels(valid_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    # Device and computation graph.
    dev = D.CUDA(0)
    Device.set_default(dev)
    g = Graph()
    Graph.set_default(g)

    # Our LM.
    lm = RNNLM(len(vocab), eos_id)

    # Optimizer.
    optimizer = O.SGD(1)
    #optimizer.set_weight_decay(1e-6)
    optimizer.set_gradient_clipping(5)
    optimizer.add(lm)

    # Sentence IDs.
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    best_valid_ppl = 1e10

    # Train/valid loop.
    for epoch in range(MAX_EPOCH):
        print("epoch", epoch + 1, "/", MAX_EPOCH, ":")

        # Shuffles train sentence IDs.
        random.shuffle(train_ids)

        # Training.
        train_loss = 0
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            batch_ids = train_ids[ofs:min(ofs + BATCH_SIZE, num_train_sents)]
            batch = make_batch(train_corpus, batch_ids, eos_id)

            g.clear()
            outputs = lm.forward(batch, True)
            loss = lm.loss(outputs, batch)
            train_loss += loss.to_float() * len(batch_ids)

            optimizer.reset_gradients()
            loss.backward()
            optimizer.update()

            print("%d" % ofs, end="\r")
            sys.stdout.flush()

        train_ppl = math.exp(train_loss / num_train_labels)
        print(" train ppl =", train_ppl)

        # Validation.
        valid_loss = 0
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            batch_ids = valid_ids[ofs:min(ofs + BATCH_SIZE, num_valid_sents)]
            batch = make_batch(valid_corpus, batch_ids, eos_id)

            g.clear()
            outputs = lm.forward(batch, False)
            loss = lm.loss(outputs, batch)
            valid_loss += loss.to_float() * len(batch_ids)

            print("%d" % ofs, end="\r")
            sys.stdout.flush()

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print(" valid ppl =", valid_ppl)

        if valid_ppl < best_valid_ppl:
            best_valid_ppl = valid_ppl
            print(" BEST")
        else:
            old_lr = optimizer.get_learning_rate_scaling()
            new_lr = 0.5 * old_lr
            optimizer.set_learning_rate_scaling(new_lr)
            print(" learning rate scaled:", old_lr, "->", new_lr)
def main():
    # Loads data
    train_inputs = load_images("data/train-images-idx3-ubyte", NUM_TRAIN_SAMPLES)
    train_labels = load_labels("data/train-labels-idx1-ubyte", NUM_TRAIN_SAMPLES)
    test_inputs = load_images("data/t10k-images-idx3-ubyte", NUM_TEST_SAMPLES)
    test_labels = load_labels("data/t10k-labels-idx1-ubyte", NUM_TEST_SAMPLES)

    # Initializes 2 device objects which manage different GPUs.
    dev0 = D.CUDA(0)
    dev1 = D.CUDA(1)

    # Parameters on GPU 0.
    pw1 = Parameter([NUM_HIDDEN_UNITS, NUM_INPUT_UNITS], I.XavierUniform(), dev0)
    pb1 = Parameter([NUM_HIDDEN_UNITS], I.Constant(0), dev0)

    # Parameters on GPU 1.
    pw2 = Parameter([NUM_OUTPUT_UNITS, NUM_HIDDEN_UNITS], I.XavierUniform(), dev1)
    pb2 = Parameter([NUM_OUTPUT_UNITS], I.Constant(0), dev1)

    trainer = T.SGD(.1)
    trainer.add_parameter(pw1)
    trainer.add_parameter(pb1)
    trainer.add_parameter(pw2)
    trainer.add_parameter(pb2)

    def make_graph(inputs):
        # We first store input values explicitly on GPU 0.
        x = F.input(inputs, device=dev0)
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        # The hidden layer is calculated and implicitly stored on GPU 0.
        h_on_gpu0 = F.relu(w1 @ x + b1)
        # `copy()` transfers the hidden layer to GPU 1.
        h_on_gpu1 = F.copy(h_on_gpu0, dev1)
        # The output layer is calculated and implicitly stored on GPU 1.
        return w2 @ h_on_gpu1 + b2

    ids = list(range(NUM_TRAIN_SAMPLES))

    g = Graph()
    Graph.set_default(g)

    for epoch in range(MAX_EPOCH):
        random.shuffle(ids)

        # Training loop
        for batch in range(NUM_TRAIN_BATCHES):
            print("\rTraining... %d / %d" % (batch + 1, NUM_TRAIN_BATCHES), end="")
            inputs = [train_inputs[ids[batch * BATCH_SIZE + i]]
                      for i in range(BATCH_SIZE)]
            labels = [train_labels[ids[batch * BATCH_SIZE + i]]
                      for i in range(BATCH_SIZE)]

            g.clear()
            y = make_graph(inputs)
            loss = F.softmax_cross_entropy(y, labels, 0)
            avg_loss = F.batch.mean(loss)

            trainer.reset_gradients()
            avg_loss.backward()
            trainer.update()

        print()

        match = 0

        # Test loop
        for batch in range(NUM_TEST_BATCHES):
            print("\rTesting... %d / %d" % (batch + 1, NUM_TEST_BATCHES), end="")
            inputs = [test_inputs[batch * BATCH_SIZE + i]
                      for i in range(BATCH_SIZE)]

            g.clear()
            y = make_graph(inputs)
            y_val = y.to_list()
            for i in range(BATCH_SIZE):
                maxval = -1e10
                argmax = -1
                for j in range(NUM_OUTPUT_UNITS):
                    v = y_val[j + i * NUM_OUTPUT_UNITS]
                    if v > maxval:
                        maxval = v
                        argmax = j
                if argmax == test_labels[i + batch * BATCH_SIZE]:
                    match += 1

        accuracy = 100.0 * match / NUM_TEST_SAMPLES
        print("\nepoch %d: accuracy: %.2f%%\n" % (epoch, accuracy))
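# The only multi-device-specific step in make_graph above is the explicit
# F.copy between GPUs; everything else matches the single-device MNIST
# example. A condensed restatement of the pattern (names reused from
# make_graph, shown here only as commentary):
#
#   x = F.input(inputs, device=dev0)  # raw input pinned to GPU 0
#   h = F.relu(w1 @ x + b1)           # computed where its inputs live (GPU 0)
#   h = F.copy(h, dev1)               # explicit transfer to GPU 1
#   y = w2 @ h + b2                   # computed on GPU 1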
def main():
    dev = D.Naive()  # or D.CUDA(gpuid)
    Device.set_default(dev)

    # Parameters
    pw1 = Parameter([8, 2], I.XavierUniform())
    pb1 = Parameter([8], I.Constant(0))
    pw2 = Parameter([1, 8], I.XavierUniform())
    pb2 = Parameter([], I.Constant(0))

    # Optimizer
    optimizer = O.SGD(0.1)

    # Registers parameters.
    optimizer.add_parameter(pw1)
    optimizer.add_parameter(pb1)
    optimizer.add_parameter(pw2)
    optimizer.add_parameter(pb2)

    # Training data
    input_data = [
        np.array([1, 1], dtype=np.float32),    # Sample 1
        np.array([1, -1], dtype=np.float32),   # Sample 2
        np.array([-1, 1], dtype=np.float32),   # Sample 3
        np.array([-1, -1], dtype=np.float32),  # Sample 4
    ]
    output_data = [
        np.array([1], dtype=np.float32),   # Label 1
        np.array([-1], dtype=np.float32),  # Label 2
        np.array([-1], dtype=np.float32),  # Label 3
        np.array([1], dtype=np.float32),   # Label 4
    ]

    g = Graph()
    Graph.set_default(g)

    for i in range(10):
        g.clear()

        # Builds a computation graph.
        x = F.input(input_data)
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        h = F.tanh(w1 @ x + b1)
        y = w2 @ h + b2

        # Obtains values.
        y_val = y.to_list()
        print("epoch ", i, ":")
        for j in range(4):
            print(" [", j, "]: ", y_val[j])

        # Extends the computation graph to calculate loss values.
        t = F.input(output_data)
        diff = t - y
        loss = F.batch.mean(diff * diff)

        # Obtains the loss.
        loss_val = loss.to_float()
        print(" loss: ", loss_val)

        # Updates parameters.
        optimizer.reset_gradients()
        loss.backward()
        optimizer.update()
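    # A hypothetical follow-up sketch, meant to sit at the end of main()
    # above (it reuses the in-scope g, pw1, pb1, pw2, pb2): query the trained
    # XOR network on a single sample. The exact output depends on how far the
    # ten updates converged.
    g.clear()
    x = F.input([np.array([1, -1], dtype=np.float32)])
    h = F.tanh(F.parameter(pw1) @ x + F.parameter(pb1))
    y = F.parameter(pw2) @ h + F.parameter(pb2)
    print("f(1, -1) =", y.to_float())  # expected to approach -1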
def train(model, optimizer, config, best_valid):
    max_epoch = config.get("max_epoch", int(1e9))
    max_iteration = config.get("max_iteration", int(1e9))
    max_sentences = config.get("max_sentences", 1e9)
    max_tokens = config.get("max_tokens", 1e9)
    update_freq = config.get('update_freq', 1)
    optimizer.add(model)

    corpus_prefix = Path(config['corpus_prefix']) / "subword"
    model_path = corpus_prefix / "spm.model"
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.Load(str(model_path))

    train_src = load_corpus(corpus_prefix / Path(config["train_source"]).name, tokenizer)
    train_trg = load_corpus(corpus_prefix / Path(config["train_target"]).name, tokenizer)
    train_src, train_trg = clean_corpus(train_src, train_trg, config)
    dev_src = load_corpus(corpus_prefix / Path(config["dev_source"]).name, tokenizer)
    dev_trg = load_corpus(corpus_prefix / Path(config["dev_target"]).name, tokenizer)
    dev_src, dev_trg = clean_corpus(dev_src, dev_trg, config)
    num_train_sents = len(train_src)
    num_dev_sents = len(dev_src)
    eos_id = tokenizer.eos_id()

    epoch = 0
    iteration = 0
    while epoch < max_epoch and iteration < max_iteration:
        epoch += 1
        g = Graph()
        Graph.set_default(g)

        train_itr = create_batch_itr(train_src, train_trg, max_tokens,
                                     max_sentences, shuffle=True)
        train_itr = tqdm(train_itr, desc='train epoch {}'.format(epoch))
        train_loss = 0.
        itr_loss = 0.
        itr_tokens = 0
        itr_sentences = 0
        optimizer.reset_gradients()
        for step, batch_ids in enumerate(train_itr):
            src_batch = make_batch(train_src, batch_ids, eos_id)
            trg_batch = make_batch(train_trg, batch_ids, eos_id)
            src_mask = padding_mask(src_batch, eos_id)
            trg_mask = [x | subsequent_mask(len(trg_batch) - 1)
                        for x in padding_mask(trg_batch[:-1], eos_id)]
            itr_tokens += len(src_batch) * len(src_batch[0])
            itr_sentences += len(batch_ids)

            g.clear()
            loss = model.loss(src_batch, trg_batch, src_mask, trg_mask)
            loss /= update_freq
            loss.backward()
            loss_val = loss.to_float()
            train_loss += loss_val * update_freq * len(batch_ids)
            itr_loss += loss_val

            # with open('graph.dot', 'w') as f:
            #     print(g.dump("dot"), end="", file=f)

            if (step + 1) % update_freq == 0:
                step_num = optimizer.get_epoch() + 1
                new_scale = config['d_model'] ** (-0.5) * \
                    min(step_num ** (-0.5),
                        step_num * config['warmup_steps'] ** (-1.5))
                optimizer.set_learning_rate_scaling(new_scale)
                optimizer.update()
                optimizer.reset_gradients()
                iteration += 1

                train_itr.set_postfix(itr=("%d" % iteration),
                                      loss=("%.3lf" % itr_loss),
                                      wpb=("%d" % itr_tokens),
                                      spb=("%d" % itr_sentences),
                                      lr=optimizer.get_learning_rate_scaling())
                itr_loss = 0.
                itr_tokens = 0
                itr_sentences = 0

                if iteration >= max_iteration:
                    break
        print("\ttrain loss = %.4f" % (train_loss / num_train_sents))

        g.clear()
        valid_loss = 0.
        valid_itr = create_batch_itr(dev_src, dev_trg, max_tokens,
                                     max_sentences, shuffle=False)
        valid_itr = tqdm(valid_itr, desc='valid epoch {}'.format(epoch))
        for batch_ids in valid_itr:
            src_batch = make_batch(dev_src, batch_ids, eos_id)
            trg_batch = make_batch(dev_trg, batch_ids, eos_id)
            src_mask = padding_mask(src_batch, eos_id)
            trg_mask = [x | subsequent_mask(len(trg_batch) - 1)
                        for x in padding_mask(trg_batch[:-1], eos_id)]
            loss = model.loss(src_batch, trg_batch, src_mask, trg_mask, train=False)
            valid_loss += loss.to_float() * len(batch_ids)
            valid_itr.set_postfix(loss=loss.to_float())
        print("\tvalid loss = %.4f" % (valid_loss / num_dev_sents))

        if valid_loss < best_valid:
            best_valid = valid_loss
            print('\tsaving model/optimizer ... ', end="", flush=True)
            prefix = config['model_prefix']
            model.save(prefix + '.model')
            optimizer.save(prefix + '.optimizer')
            with Path(prefix).with_suffix('.valid').open('w') as f:
                f.write(str(best_valid))
            print('done.')
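# The learning-rate scaling applied every update_freq steps above is the
# warm-up schedule from "Attention Is All You Need" (Vaswani et al., 2017),
# factored out here for clarity; the name noam_scale is ours, not the
# example's.
def noam_scale(step_num, d_model, warmup_steps):
    # Grows linearly for the first warmup_steps updates, then decays
    # proportionally to step_num ** -0.5.
    return d_model ** -0.5 * min(step_num ** -0.5,
                                 step_num * warmup_steps ** -1.5)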
def main():
    # Loads data
    train_inputs = load_images("data/train-images-idx3-ubyte", NUM_TRAIN_SAMPLES)
    train_labels = load_labels("data/train-labels-idx1-ubyte", NUM_TRAIN_SAMPLES)
    test_inputs = load_images("data/t10k-images-idx3-ubyte", NUM_TEST_SAMPLES)
    test_labels = load_labels("data/t10k-labels-idx1-ubyte", NUM_TEST_SAMPLES)

    dev = D.Naive()  # or D.CUDA(gpuid)
    Device.set_default(dev)

    pw1 = Parameter([NUM_HIDDEN_UNITS, NUM_INPUT_UNITS], I.XavierUniform())
    pb1 = Parameter([NUM_HIDDEN_UNITS], I.Constant(0))
    pw2 = Parameter([NUM_OUTPUT_UNITS, NUM_HIDDEN_UNITS], I.XavierUniform())
    pb2 = Parameter([NUM_OUTPUT_UNITS], I.Constant(0))

    optimizer = O.SGD(.5)
    optimizer.add(pw1, pb1, pw2, pb2)

    def make_graph(inputs, train):
        x = F.input(inputs)
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        h = F.relu(w1 @ x + b1)
        h = F.dropout(h, .5, train)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        return w2 @ h + b2

    ids = list(range(NUM_TRAIN_SAMPLES))

    g = Graph()
    Graph.set_default(g)

    for epoch in range(MAX_EPOCH):
        random.shuffle(ids)

        # Training loop
        for batch in range(NUM_TRAIN_BATCHES):
            print("\rTraining... %d / %d" % (batch + 1, NUM_TRAIN_BATCHES), end="")
            inputs = [train_inputs[ids[batch * BATCH_SIZE + i]]
                      for i in range(BATCH_SIZE)]
            labels = [train_labels[ids[batch * BATCH_SIZE + i]]
                      for i in range(BATCH_SIZE)]

            g.clear()
            y = make_graph(inputs, True)
            loss = F.softmax_cross_entropy(y, labels, 0)
            avg_loss = F.batch.mean(loss)

            optimizer.reset_gradients()
            avg_loss.backward()
            optimizer.update()

        print()

        match = 0

        # Test loop
        for batch in range(NUM_TEST_BATCHES):
            print("\rTesting... %d / %d" % (batch + 1, NUM_TEST_BATCHES), end="")
            inputs = [test_inputs[batch * BATCH_SIZE + i]
                      for i in range(BATCH_SIZE)]

            g.clear()
            y = make_graph(inputs, False)
            y_val = y.to_list()
            for i in range(BATCH_SIZE):
                maxval = -1e10
                argmax = -1
                for j in range(NUM_OUTPUT_UNITS):
                    v = y_val[j + i * NUM_OUTPUT_UNITS]
                    if v > maxval:
                        maxval = v
                        argmax = j
                if argmax == test_labels[i + batch * BATCH_SIZE]:
                    match += 1

        accuracy = 100.0 * match / NUM_TEST_SAMPLES
        print("\nepoch %d: accuracy: %.2f%%\n" % (epoch, accuracy))
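# The manual argmax scan in the test loops of these MNIST examples can be
# replaced by a vectorized NumPy pass. A hedged equivalent sketch (the
# helper name is ours): y_val is laid out sample-major, so flat index
# j + i * num_output_units becomes row i, column j after reshaping.
import numpy as np

def count_matches(y_val, labels, batch_size, num_output_units):
    preds = np.asarray(y_val).reshape(batch_size, num_output_units).argmax(axis=1)
    return int((preds == np.asarray(labels)).sum())

# Usage inside the test loop, replacing the nested for-loops:
#   match += count_matches(
#       y_val, test_labels[batch * BATCH_SIZE:(batch + 1) * BATCH_SIZE],
#       BATCH_SIZE, NUM_OUTPUT_UNITS)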
def train(encdec, optimizer, prefix, best_valid_ppl):
    # Registers all parameters to the optimizer.
    optimizer.add_model(encdec)

    # Loads vocab.
    src_vocab = make_vocab(SRC_TRAIN_FILE, SRC_VOCAB_SIZE)
    trg_vocab = make_vocab(TRG_TRAIN_FILE, TRG_VOCAB_SIZE)
    inv_trg_vocab = make_inv_vocab(trg_vocab)
    print("#src_vocab:", len(src_vocab))
    print("#trg_vocab:", len(trg_vocab))

    # Loads all corpus
    train_src_corpus = load_corpus(SRC_TRAIN_FILE, src_vocab)
    train_trg_corpus = load_corpus(TRG_TRAIN_FILE, trg_vocab)
    valid_src_corpus = load_corpus(SRC_VALID_FILE, src_vocab)
    valid_trg_corpus = load_corpus(TRG_VALID_FILE, trg_vocab)
    test_src_corpus = load_corpus(SRC_TEST_FILE, src_vocab)
    test_ref_corpus = load_corpus_ref(REF_TEST_FILE, trg_vocab)
    num_train_sents = len(train_trg_corpus)
    num_valid_sents = len(valid_trg_corpus)
    num_test_sents = len(test_ref_corpus)
    num_train_labels = count_labels(train_trg_corpus)
    num_valid_labels = count_labels(valid_trg_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    # Sentence IDs
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    # Train/valid loop.
    for epoch in range(MAX_EPOCH):
        # Computation graph.
        g = Graph()
        Graph.set_default(g)

        print("epoch %d/%d:" % (epoch + 1, MAX_EPOCH))
        print(" learning rate scale = %.4e" %
              optimizer.get_learning_rate_scaling())

        # Shuffles train sentence IDs.
        random.shuffle(train_ids)

        # Training.
        train_loss = 0.
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            batch_ids = train_ids[ofs:min(ofs + BATCH_SIZE, num_train_sents)]
            src_batch = make_batch(train_src_corpus, batch_ids, src_vocab)
            trg_batch = make_batch(train_trg_corpus, batch_ids, trg_vocab)

            g.clear()
            encdec.encode(src_batch, True)
            loss = encdec.loss(trg_batch, True)
            train_loss += loss.to_float() * len(batch_ids)

            optimizer.reset_gradients()
            loss.backward()
            optimizer.update()

        train_ppl = math.exp(train_loss / num_train_labels)
        print(" train PPL = %.4f" % train_ppl)

        # Validation.
        valid_loss = 0.
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            batch_ids = valid_ids[ofs:min(ofs + BATCH_SIZE, num_valid_sents)]
            src_batch = make_batch(valid_src_corpus, batch_ids, src_vocab)
            trg_batch = make_batch(valid_trg_corpus, batch_ids, trg_vocab)

            g.clear()
            encdec.encode(src_batch, False)
            loss = encdec.loss(trg_batch, False)
            valid_loss += loss.to_float() * len(batch_ids)

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print(" valid PPL = %.4f" % valid_ppl)

        # Calculates test BLEU.
        stats = defaultdict(int)
        for ofs in range(0, num_test_sents, BATCH_SIZE):
            print("%d" % ofs, end="\r")
            sys.stdout.flush()

            src_batch = test_src_corpus[ofs:min(ofs + BATCH_SIZE, num_test_sents)]
            ref_batch = test_ref_corpus[ofs:min(ofs + BATCH_SIZE, num_test_sents)]

            hyp_ids = test_batch(encdec, src_vocab, trg_vocab, src_batch)
            for hyp_line, ref_line in zip(hyp_ids, ref_batch):
                for k, v in get_bleu_stats(ref_line[1:-1], hyp_line).items():
                    stats[k] += v

        bleu = calculate_bleu(stats)
        print(" test BLEU = %.2f" % (100 * bleu))

        # Saves best model/optimizer.
        if valid_ppl < best_valid_ppl:
            best_valid_ppl = valid_ppl
            print(" saving model/optimizer ... ", end="")
            sys.stdout.flush()
            encdec.save(prefix + ".model")
            optimizer.save(prefix + ".optimizer")
            save_ppl(prefix + ".valid_ppl", best_valid_ppl)
            print("done.")
        else:
            # Learning rate decay by 1/sqrt(2)
            new_scale = .7071 * optimizer.get_learning_rate_scaling()
            optimizer.set_learning_rate_scaling(new_scale)
def main():
    # Loads data
    train_inputs = load_images("data/train-images-idx3-ubyte", NUM_TRAIN_SAMPLES)
    train_labels = load_labels("data/train-labels-idx1-ubyte", NUM_TRAIN_SAMPLES)
    test_inputs = load_images("data/t10k-images-idx3-ubyte", NUM_TEST_SAMPLES)
    test_labels = load_labels("data/t10k-labels-idx1-ubyte", NUM_TEST_SAMPLES)

    dev = D.CUDA(0)
    Device.set_default(dev)
    g = Graph()
    Graph.set_default(g)

    # Parameters of CNNs
    # Shape: {kernel_height, kernel_width, in_channels, out_channels}
    pw_cnn1 = Parameter(Shape([KERNEL_SIZE1, KERNEL_SIZE1, 1, NUM_CHANNELS1]),
                        I.XavierUniformConv2D())
    pw_cnn2 = Parameter(
        Shape([KERNEL_SIZE2, KERNEL_SIZE2, NUM_CHANNELS1, NUM_CHANNELS2]),
        I.XavierUniformConv2D())

    # Parameters of FC layers
    pw_fc1 = Parameter(Shape([NUM_HIDDEN_UNITS, NUM_INPUT_UNITS]),
                       I.XavierUniform())
    pw_fc2 = Parameter(Shape([NUM_OUTPUT_UNITS, NUM_HIDDEN_UNITS]),
                       I.XavierUniform())
    pb_fc1 = Parameter(Shape([NUM_HIDDEN_UNITS]), I.Constant(0))
    pb_fc2 = Parameter(Shape([NUM_OUTPUT_UNITS]), I.Constant(0))

    # Optimizer
    optimizer = O.SGD(.1)
    optimizer.add(pw_cnn1, pw_cnn2, pw_fc1, pw_fc2, pb_fc1, pb_fc2)

    # Helper function to construct the predictor network.
    def make_graph(inputs, train):
        # Input and parameters.
        #x = F.input(Shape([IMAGE_HEIGHT, IMAGE_WIDTH], BATCH_SIZE), inputs)
        x = F.input(inputs)
        w_cnn1 = F.parameter(pw_cnn1)
        w_cnn2 = F.parameter(pw_cnn2)
        w_fc1 = F.parameter(pw_fc1)
        w_fc2 = F.parameter(pw_fc2)
        b_fc1 = F.parameter(pb_fc1)
        b_fc2 = F.parameter(pb_fc2)
        # CNNs
        h_cnn1 = F.relu(F.conv2d(x, w_cnn1, PADDING1, PADDING1, 1, 1, 1, 1))
        h_pool1 = F.max_pool2d(h_cnn1, 2, 2, 0, 0, 2, 2)
        h_cnn2 = F.relu(F.conv2d(h_pool1, w_cnn2, PADDING2, PADDING2, 1, 1, 1, 1))
        h_pool2 = F.max_pool2d(h_cnn2, 2, 2, 0, 0, 2, 2)
        # FC layers
        x_fc = F.dropout(F.flatten(h_pool2), .5, train)
        h_fc = F.dropout(F.relu(F.matmul(w_fc1, x_fc) + b_fc1), .5, train)
        return F.matmul(w_fc2, h_fc) + b_fc2

    # Batch randomizer
    ids = list(range(NUM_TRAIN_SAMPLES))

    for epoch in range(MAX_EPOCH):
        # Shuffles sample IDs.
        random.shuffle(ids)

        # Training loop
        for batch in range(NUM_TRAIN_BATCHES):
            print("\rTraining... %d / %d" % (batch + 1, NUM_TRAIN_BATCHES), end="")
            # Makes a minibatch for training.
            inputs = [train_inputs[ids[batch * BATCH_SIZE + i]]
                      for i in range(BATCH_SIZE)]
            labels = [train_labels[ids[batch * BATCH_SIZE + i]]
                      for i in range(BATCH_SIZE)]

            # Constructs the graph.
            g.clear()
            y = make_graph(inputs, True)
            loss = F.softmax_cross_entropy(y, labels, 0)
            avg_loss = F.batch.mean(loss)

            # Dump computation graph at the first time.
            # if epoch == 0 and batch == 0:
            #     print(g.dump("dot"))

            # Implicit forward, backward, and updates parameters.
            optimizer.reset_gradients()
            avg_loss.backward()
            optimizer.update()

        print()

        match = 0

        # Test loop
        for batch in range(NUM_TEST_BATCHES):
            print("\rTesting... %d / %d" % (batch + 1, NUM_TEST_BATCHES), end="")
            # Makes a test minibatch.
            inputs = [test_inputs[batch * BATCH_SIZE + i]
                      for i in range(BATCH_SIZE)]

            # Constructs the graph.
            g.clear()
            y = make_graph(inputs, False)

            # Gets outputs, argmax, and compares them with the label.
            y_val = y.to_list()
            for i in range(BATCH_SIZE):
                maxval = -1e10
                argmax = -1
                for j in range(NUM_OUTPUT_UNITS):
                    v = y_val[j + i * NUM_OUTPUT_UNITS]
                    if v > maxval:
                        maxval = v
                        argmax = j
                if argmax == test_labels[i + batch * BATCH_SIZE]:
                    match += 1

        accuracy = 100.0 * match / NUM_TEST_SAMPLES
        print("epoch %d: accuracy: %.2f%%" % (epoch, accuracy))

    return 0
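# A hedged note on the constants used above: for the stride-1 convolutions in
# make_graph, spatial size is preserved ("same" padding) when the padding is
# half the kernel width for odd kernels, i.e. the example presumably defines
# something like
#
#   PADDING1 = KERNEL_SIZE1 // 2
#   PADDING2 = KERNEL_SIZE2 // 2
#
# so that only the two max_pool2d calls (each halving height and width)
# change the spatial dimensions before F.flatten.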
def setUp(self):
    self.device = Naive()
    Device.set_default(self.device)
    self.graph = Graph()
    Graph.set_default(self.graph)
def main():
    # Loads vocab.
    vocab = make_vocab("data/ptb.train.txt")
    print("#vocab:", len(vocab))  # maybe 10000
    eos_id = vocab["<s>"]

    # Loads all corpus.
    train_corpus = load_corpus("data/ptb.train.txt", vocab)
    valid_corpus = load_corpus("data/ptb.valid.txt", vocab)
    num_train_sents = len(train_corpus)
    num_valid_sents = len(valid_corpus)
    num_train_labels = count_labels(train_corpus)
    num_valid_labels = count_labels(valid_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    dev = D.CUDA(0)
    Device.set_default(dev)

    # Trainer.
    trainer = T.Adam()
    trainer.set_weight_decay(1e-6)
    trainer.set_gradient_clipping(5)

    # Our LM.
    lm = RNNLM(len(vocab), eos_id, trainer)

    # Sentence IDs.
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    g = Graph()
    Graph.set_default(g)

    # Train/valid loop.
    for epoch in range(MAX_EPOCH):
        print("epoch", (epoch + 1), "/", MAX_EPOCH, ":")

        # Shuffles train sentence IDs.
        random.shuffle(train_ids)

        # Training.
        train_loss = 0
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            batch_ids = train_ids[ofs:min(ofs + BATCH_SIZE, num_train_sents)]
            batch = make_batch(train_corpus, batch_ids, eos_id)

            g.clear()
            outputs = lm.forward(batch)
            loss = lm.forward_loss(outputs, batch)
            train_loss += loss.to_float() * len(batch_ids)

            trainer.reset_gradients()
            loss.backward()
            trainer.update()

            print("\r%d" % ofs, end="")
            sys.stdout.flush()

        print()
        train_ppl = math.exp(train_loss / num_train_labels)
        print(" train ppl =", train_ppl)

        # Validation.
        valid_loss = 0
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            batch_ids = valid_ids[ofs:min(ofs + BATCH_SIZE, num_valid_sents)]
            batch = make_batch(valid_corpus, batch_ids, eos_id)

            g.clear()
            outputs = lm.forward(batch)
            loss = lm.forward_loss(outputs, batch)
            valid_loss += loss.to_float() * len(batch_ids)

            print("\r%d" % ofs, end="")
            sys.stdout.flush()

        print()
        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print(" valid ppl =", valid_ppl)