def test_pyoptimizer_parameter(self):
    """Adding a parameter to ``self.t`` must create both ``testadam`` stats entries."""
    naive_device = D.Naive()
    Device.set_default(naive_device)

    weight = Parameter([8, 2], I.XavierUniform())
    self.t.add(weight)

    # Both entries are expected to exist once the parameter is registered.
    for stat_name in ("testadam-m1", "testadam-m2"):
        self.assertIn(stat_name, weight.stats)
def setUp(self):
    """Install a fresh device and graph as defaults and build two 2x2 float32 fixtures."""
    self.device = Naive()
    self.graph = Graph()
    Device.set_default(self.device)
    Graph.set_default(self.graph)

    # Two small float32 matrices shared by the tests.
    self.a = np.array(
        [[1, 2],
         [3, 4]],
        dtype=np.float32,
    )
    self.b = np.array(
        [[1, 1],
         [4, 8]],
        dtype=np.float32,
    )
def main():
    """Run the EncoderDecoder example in train, resume, or test mode.

    Device id, mode and model prefix are taken from ``get_arguments()``.
    Any mode other than ``"train"`` or ``"resume"`` falls through to testing.
    """
    args = get_arguments()

    print("initializing device ... ", end="", file=sys.stderr, flush=True)
    # A negative GPU id selects the naive device; otherwise that CUDA device.
    if args.gpu < 0:
        device = D.Naive()
    else:
        device = D.CUDA(args.gpu)
    Device.set_default(device)
    print("done.", file=sys.stderr)

    mode = args.mode
    prefix = args.model

    if mode == "train":
        model = EncoderDecoder(args.dropout)
        model.init(args.src_vocab, args.trg_vocab, args.embed, args.hidden)
        adam = O.Adam()
        adam.set_weight_decay(1e-6)
        adam.set_gradient_clipping(5)
        # 1e10 fills the slot where the resume path passes the saved
        # validation perplexity.
        train(model, adam, args, 1e10)
        return

    if mode == "resume":
        print("loading model/optimizer ... ", end="", file=sys.stderr, flush=True)
        model = EncoderDecoder(args.dropout)
        model.load(prefix + ".model")
        adam = O.Adam()
        adam.load(prefix + ".optimizer")
        saved_ppl = load_ppl(prefix + ".valid_ppl")
        print("done.", file=sys.stderr)
        train(model, adam, args, saved_ppl)
        return

    # Every remaining mode only loads the model and runs the test routine.
    print("loading model ... ", end="", file=sys.stderr, flush=True)
    model = EncoderDecoder(args.dropout)
    model.load(prefix + ".model")
    print("done.", file=sys.stderr)
    test(model, args)
def test_pyoptimizer_not_implemented(self):
    """Each operation on an ``IncompleteOptimizer`` must raise NotImplementedError."""
    naive_device = D.Naive()
    Device.set_default(naive_device)
    optimizer = IncompleteOptimizer()
    param = Parameter()

    # Arguments for the set_configs call below.
    int_configs = {'Optimizer.epoch': 1}
    float_configs = {
        'Optimizer.clip_threshold': 0.0,
        'Optimizer.lr_scale': 1.0,
        'Optimizer.l2_strength': 0.0,
    }

    with self.assertRaises(NotImplementedError):
        optimizer.add(param)

    with self.assertRaises(NotImplementedError):
        optimizer.update()

    with self.assertRaises(NotImplementedError):
        Optimizer.get_configs(optimizer)

    with self.assertRaises(NotImplementedError):
        Optimizer.set_configs(optimizer, int_configs, float_configs)
def test_pytrainer_not_implemented(self):
    """Each operation on an ``IncompleteTrainer`` must raise NotImplementedError."""
    naive_device = D.Naive()
    Device.set_default(naive_device)
    trainer = IncompleteTrainer()
    param = Parameter(Shape([]))

    # Every entry is invoked inside its own assertRaises block, in order.
    failing_calls = (
        lambda: trainer.add_parameter(param),
        lambda: trainer.update(),
        lambda: Trainer.get_configs(trainer),
        lambda: Trainer.set_configs(
            trainer,
            {'Trainer.epoch': 1},
            {
                'Trainer.clip_threshold': 0.0,
                'Trainer.lr_scale': 1.0,
                'Trainer.l2_strength': 0.0,
            },
        ),
    )
    for call in failing_calls:
        with self.assertRaises(NotImplementedError):
            call()
def main():
    """Parse ``mode`` and ``model_prefix`` and run the matching EncoderDecoder workflow.

    Accepted modes are ``train``, ``resume`` and ``test``; anything else is
    reported on stderr and the function returns without touching the device.
    """
    parser = ArgumentParser()
    parser.add_argument("mode")
    parser.add_argument("model_prefix")
    args = parser.parse_args()

    mode, prefix = args.mode, args.model_prefix
    print("mode:", mode, file=sys.stderr)
    print("prefix:", prefix, file=sys.stderr)

    valid_modes = ("train", "resume", "test")
    if mode not in valid_modes:
        print("unknown mode:", mode, file=sys.stderr)
        return

    print("initializing device ... ", end="", file=sys.stderr, flush=True)
    device = D.Naive()  # alternatively: D.CUDA(0)
    Device.set_default(device)
    print("done.", file=sys.stderr)

    if mode == "train":
        model = EncoderDecoder(
            "encdec",
            SRC_VOCAB_SIZE,
            TRG_VOCAB_SIZE,
            NUM_EMBED_UNITS,
            NUM_HIDDEN_UNITS,
            DROPOUT_RATE,
        )
        adam = T.Adam()
        adam.set_weight_decay(1e-6)
        adam.set_gradient_clipping(5)
        # 1e10 fills the slot where the resume path passes the saved
        # validation perplexity.
        train(model, adam, prefix, 1e10)
        return

    if mode == "resume":
        print("loading model/trainer ... ", end="", file=sys.stderr, flush=True)
        model = EncoderDecoder.load("encdec", prefix + '.')
        adam = T.Adam()
        adam.load(prefix + ".trainer.config")
        saved_ppl = load_ppl(prefix + ".valid_ppl.config")
        print("done.", file=sys.stderr)
        train(model, adam, prefix, saved_ppl)
        return

    # Only "test" can reach this point.
    print("loading model ... ", end="", file=sys.stderr, flush=True)
    model = EncoderDecoder.load("encdec", prefix + '.')
    print("done.", file=sys.stderr)
    test(model)
def main():
    """Parse the command line and train, resume, or test an AttentionalEncoderDecoder.

    Accepted modes are ``train``, ``resume`` and ``test``; anything else is
    reported on stderr and the function returns before the CUDA device is
    created.
    """
    parser = ArgumentParser()
    parser.add_argument("mode", help="(train|resume|test)")
    parser.add_argument("model_prefix", help="prefix of the model files.")
    args = parser.parse_args()

    mode, prefix = args.mode, args.model_prefix
    print("mode:", mode, file=sys.stderr)
    print("prefix:", prefix, file=sys.stderr)

    valid_modes = ("train", "resume", "test")
    if mode not in valid_modes:
        print("unknown mode:", mode, file=sys.stderr)
        return

    print("initializing device ... ", end="", file=sys.stderr, flush=True)
    device = D.CUDA(0)
    Device.set_default(device)
    print("done.", file=sys.stderr)

    if mode == "train":
        model = AttentionalEncoderDecoder()
        model.init(SRC_VOCAB_SIZE, TRG_VOCAB_SIZE, NUM_EMBED_UNITS, NUM_HIDDEN_UNITS)
        adam = O.Adam()
        adam.set_weight_decay(1e-6)
        adam.set_gradient_clipping(5)
        # 1e10 fills the slot where the resume path passes the saved
        # validation perplexity.
        train(model, adam, prefix, 1e10)
        return

    if mode == "resume":
        print("loading model/optimizer ... ", end="", file=sys.stderr, flush=True)
        model = AttentionalEncoderDecoder()
        model.load(prefix + ".model")
        adam = O.Adam()
        adam.load(prefix + ".optimizer")
        saved_ppl = load_ppl(prefix + ".valid_ppl")
        print("done.", file=sys.stderr)
        train(model, adam, prefix, saved_ppl)
        return

    # Only "test" can reach this point.
    print("loading model ... ", end="", file=sys.stderr, flush=True)
    model = AttentionalEncoderDecoder()
    model.load(prefix + ".model")
    print("done.", file=sys.stderr)
    test(model)
def setUp(self):
    """Install a fresh device and graph as defaults and build the array fixtures.

    ``ndarray_data`` holds two 4x3 float32 matrices. ``list_data`` holds the
    same values as a flat list, each matrix written out column by column.
    """
    self.device = Naive()
    self.graph = Graph()
    Device.set_default(self.device)
    Graph.set_default(self.graph)

    first_matrix = np.array(
        [
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
            [10, 11, 12],
        ],
        dtype=np.float32,
    )
    second_matrix = np.array(
        [
            [13, 14, 15],
            [16, 17, 18],
            [19, 20, 21],
            [22, 23, 24],
        ],
        dtype=np.float32,
    )
    self.ndarray_data = [first_matrix, second_matrix]

    self.list_data = [
        # columns of the first matrix
        1.0, 4.0, 7.0, 10.0,
        2.0, 5.0, 8.0, 11.0,
        3.0, 6.0, 9.0, 12.0,
        # columns of the second matrix
        13.0, 16.0, 19.0, 22.0,
        14.0, 17.0, 20.0, 23.0,
        15.0, 18.0, 21.0, 24.0,
    ]
def train_func(trainer):
    """Train a tiny two-layer network for 10 steps with ``trainer`` and return its weights.

    Args:
        trainer: Object providing ``add_parameter``, ``reset_gradients`` and
            ``update``.

    Returns:
        A list with the values of ``pw1``, ``pb1``, ``pw2`` and ``pb2`` (in
        that order), each converted with ``to_list()``.
    """
    device = D.Naive(12345)
    Device.set_default(device)
    graph = Graph()
    Graph.set_default(graph)

    pw1 = Parameter([8, 2], I.XavierUniform())
    pb1 = Parameter([8], I.Constant(0))
    pw2 = Parameter([1, 8], I.XavierUniform())
    pb2 = Parameter([1], I.Constant(0))
    params = (pw1, pb1, pw2, pb2)

    for param in params:
        trainer.add_parameter(param)

    # Four 2-element samples and their four scalar targets, flattened.
    input_data = [1, 1, 1, -1, -1, 1, -1, -1]
    output_data = [1, -1, -1, 1]

    for _ in range(10):
        graph.clear()

        x = F.input(input_data, Shape([2], 4))
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        hidden = F.tanh(w1 @ x + b1)
        prediction = w2 @ hidden + b2

        target = F.input(output_data, Shape([], 4))
        error = target - prediction
        loss = F.batch.mean(error * error)

        trainer.reset_gradients()
        loss.backward()
        trainer.update()

    return [param.value.to_list() for param in params]
def setUp(self):
    """Install a fresh device and graph as defaults and build the array fixtures.

    ``input_data`` holds two 3x3 float32 matrices. ``list_expected`` holds the
    same values as a flat list, each matrix written out column by column.
    """
    self.device = Naive()
    self.graph = Graph()
    Device.set_default(self.device)
    Graph.set_default(self.graph)

    first_matrix = np.array(
        [
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
        ],
        dtype=np.float32,
    )
    second_matrix = np.array(
        [
            [11, 12, 13],
            [14, 15, 16],
            [17, 18, 19],
        ],
        dtype=np.float32,
    )
    self.input_data = [first_matrix, second_matrix]

    self.list_expected = [
        # columns of the first matrix
        1.0, 4.0, 7.0,
        2.0, 5.0, 8.0,
        3.0, 6.0, 9.0,
        # columns of the second matrix
        11.0, 14.0, 17.0,
        12.0, 15.0, 18.0,
        13.0, 16.0, 19.0,
    ]
def test_pyoptimizer_propagate_exception(self):
    """A ``TestException`` raised inside ``ExceptionOptimizer`` must surface with its message intact."""
    naive_device = D.Naive()
    Device.set_default(naive_device)
    optimizer = ExceptionOptimizer()
    param = Parameter()

    # (call that should raise, expected exception message), checked in order.
    cases = (
        (lambda: optimizer.add(param), "configure_parameter"),
        (lambda: optimizer.update(), "update_parameter"),
        (lambda: Optimizer.get_configs(optimizer), "get_configs"),
        (
            lambda: Optimizer.set_configs(
                optimizer,
                {'Optimizer.epoch': 1},
                {
                    'Optimizer.clip_threshold': 0.0,
                    'Optimizer.lr_scale': 1.0,
                    'Optimizer.l2_strength': 0.0,
                },
            ),
            "set_configs",
        ),
    )

    for trigger, expected_message in cases:
        with self.assertRaises(TestException) as ctx:
            trigger()
        # ctx.exception is only available once the with-block has exited.
        self.assertEqual(str(ctx.exception), expected_message)
def main(config):
    """Dispatch on ``config['mode']``: preprocess, train, resume training, or test a Transformer.

    Args:
        config: Mapping read for ``mode``, ``gpu``, ``model_prefix`` and the
            model hyper-parameters used below.

    ``preproc`` mode returns before any device is created. A mode that is
    none of ``preproc``/``train``/``resume``/``test`` only initializes the
    device and then does nothing.
    """
    mode = config['mode']
    if mode == 'preproc':
        preproc(config)
        return

    print('initializing device ...', end='', file=sys.stderr, flush=True)
    # A negative GPU id selects the naive device; otherwise that CUDA device.
    if config['gpu'] < 0:
        device = D.Naive()
    else:
        device = D.CUDA(config['gpu'])
    Device.set_default(device)
    print("done.", file=sys.stderr, flush=True)

    prefix = config['model_prefix']

    def build_model():
        # Same constructor arguments are used by every mode.
        return Transformer(config['n_heads'], config['n_stacks'],
                           config['dropout'], config['generation_limit'])

    def build_optimizer():
        # Adam configuration shared by the train and resume paths.
        adam = O.Adam(alpha=1, beta2=0.98, eps=1e-9)
        adam.set_gradient_clipping(5)
        return adam

    if mode == 'train':
        model = build_model()
        model.init(config['vocabulary_size'], config['d_model'], config['d_ff'])
        optimizer = build_optimizer()
        # 1e10 fills the slot where the resume path passes the saved value.
        train(model, optimizer, config, 1e10)
    elif mode == 'resume':
        print('loading model/optimizer ... ', end='', file=sys.stderr, flush=True)
        model = build_model()
        model.load(prefix + '.model')
        optimizer = build_optimizer()
        optimizer.load(prefix + '.optimizer')
        ppl_path = Path(prefix).with_suffix('.valid')
        with ppl_path.open() as ppl_file:
            valid_ppl = float(ppl_file.read().strip())
        print('done.', file=sys.stderr, flush=True)
        train(model, optimizer, config, valid_ppl)
    elif mode == 'test':
        model = build_model()
        model.load(prefix + '.model')
        test(model, config)
def main():
    """Train a two-convolution CNN classifier and print test accuracy after every epoch.

    Loads the image/label files under ``data/``, trains with SGD on CUDA
    device 0 for ``MAX_EPOCH`` epochs, and evaluates on the test set at the
    end of each epoch.

    Returns:
        Always ``0``.
    """
    # Load the training and test sets.
    train_inputs = load_images("data/train-images-idx3-ubyte", NUM_TRAIN_SAMPLES)
    train_labels = load_labels("data/train-labels-idx1-ubyte", NUM_TRAIN_SAMPLES)
    test_inputs = load_images("data/t10k-images-idx3-ubyte", NUM_TEST_SAMPLES)
    test_labels = load_labels("data/t10k-labels-idx1-ubyte", NUM_TEST_SAMPLES)

    device = D.CUDA(0)
    Device.set_default(device)
    graph = Graph()
    Graph.set_default(graph)

    # Convolution kernels.
    # Shape: {kernel_height, kernel_width, in_channels, out_channels}
    pw_cnn1 = Parameter(
        Shape([KERNEL_SIZE1, KERNEL_SIZE1, 1, NUM_CHANNELS1]),
        I.XavierUniformConv2D())
    pw_cnn2 = Parameter(
        Shape([KERNEL_SIZE2, KERNEL_SIZE2, NUM_CHANNELS1, NUM_CHANNELS2]),
        I.XavierUniformConv2D())

    # Fully connected layers: weights first, then biases.
    pw_fc1 = Parameter(
        Shape([NUM_HIDDEN_UNITS, NUM_INPUT_UNITS]), I.XavierUniform())
    pw_fc2 = Parameter(
        Shape([NUM_OUTPUT_UNITS, NUM_HIDDEN_UNITS]), I.XavierUniform())
    pb_fc1 = Parameter(Shape([NUM_HIDDEN_UNITS]), I.Constant(0))
    pb_fc2 = Parameter(Shape([NUM_OUTPUT_UNITS]), I.Constant(0))

    # Optimizer
    optimizer = O.SGD(.1)
    optimizer.add(pw_cnn1, pw_cnn2, pw_fc1, pw_fc2, pb_fc1, pb_fc2)

    def make_graph(inputs, train):
        """Build the predictor network for one minibatch and return its output node."""
        # Input and parameters.
        x = F.input(inputs)
        conv1_w = F.parameter(pw_cnn1)
        conv2_w = F.parameter(pw_cnn2)
        fc1_w = F.parameter(pw_fc1)
        fc2_w = F.parameter(pw_fc2)
        fc1_b = F.parameter(pb_fc1)
        fc2_b = F.parameter(pb_fc2)

        # Convolution -> ReLU -> max pooling, twice.
        conv1 = F.conv2d(x, conv1_w, PADDING1, PADDING1, 1, 1, 1, 1)
        pooled1 = F.max_pool2d(F.relu(conv1), 2, 2, 0, 0, 2, 2)
        conv2 = F.conv2d(pooled1, conv2_w, PADDING2, PADDING2, 1, 1, 1, 1)
        pooled2 = F.max_pool2d(F.relu(conv2), 2, 2, 0, 0, 2, 2)

        # Fully connected layers; both dropout calls receive the `train` flag.
        fc_input = F.dropout(F.flatten(pooled2), .5, train)
        fc_hidden = F.relu(F.matmul(fc1_w, fc_input) + fc1_b)
        fc_hidden = F.dropout(fc_hidden, .5, train)
        return F.matmul(fc2_w, fc_hidden) + fc2_b

    # Sample ids, reshuffled at the start of every epoch.
    ids = list(range(NUM_TRAIN_SAMPLES))

    for epoch in range(MAX_EPOCH):
        random.shuffle(ids)

        # Training loop
        for batch in range(NUM_TRAIN_BATCHES):
            print("\rTraining... %d / %d" % (batch + 1, NUM_TRAIN_BATCHES), end="")

            # Gather the minibatch selected by the shuffled ids.
            offset = batch * BATCH_SIZE
            inputs = [train_inputs[ids[offset + i]] for i in range(BATCH_SIZE)]
            labels = [train_labels[ids[offset + i]] for i in range(BATCH_SIZE)]

            # Build the graph and the averaged loss.
            graph.clear()
            y = make_graph(inputs, True)
            loss = F.softmax_cross_entropy(y, labels, 0)
            avg_loss = F.batch.mean(loss)

            # Debugging hint: print(graph.dump("dot")) here shows the graph.

            # Backward pass and parameter update.
            optimizer.reset_gradients()
            avg_loss.backward()
            optimizer.update()

        print()

        num_correct = 0

        # Test loop
        for batch in range(NUM_TEST_BATCHES):
            print("\rTesting... %d / %d" % (batch + 1, NUM_TEST_BATCHES), end="")

            # Test samples are taken in file order.
            offset = batch * BATCH_SIZE
            inputs = [test_inputs[offset + i] for i in range(BATCH_SIZE)]

            graph.clear()
            y = make_graph(inputs, False)

            # Find the arg-max output per sample and compare with the label.
            y_val = y.to_list()
            for i in range(BATCH_SIZE):
                best_score = -1e10
                best_label = -1
                row_start = i * NUM_OUTPUT_UNITS
                for j in range(NUM_OUTPUT_UNITS):
                    score = y_val[j + row_start]
                    if score > best_score:
                        best_score = score
                        best_label = j
                if best_label == test_labels[i + offset]:
                    num_correct += 1

        accuracy = 100.0 * num_correct / NUM_TEST_SAMPLES
        print("epoch %d: accuracy: %.2f%%" % (epoch, accuracy))

    return 0
def main():
    """Fit a tiny two-layer network to four 2-D samples, printing outputs and loss each epoch."""
    device = D.Naive()  # or D.CUDA(gpuid)
    Device.set_default(device)

    # Parameters
    pw1 = Parameter([8, 2], I.XavierUniform())
    pb1 = Parameter([8], I.Constant(0))
    pw2 = Parameter([1, 8], I.XavierUniform())
    pb2 = Parameter([], I.Constant(0))

    # Optimizer, with every parameter registered in declaration order.
    optimizer = O.SGD(0.1)
    for param in (pw1, pb1, pw2, pb2):
        optimizer.add_parameter(param)

    # Training data: one array per sample.
    input_data = [
        np.array([1, 1], dtype=np.float32),    # Sample 1
        np.array([1, -1], dtype=np.float32),   # Sample 2
        np.array([-1, 1], dtype=np.float32),   # Sample 3
        np.array([-1, -1], dtype=np.float32),  # Sample 4
    ]
    output_data = [
        np.array([1], dtype=np.float32),   # Label 1
        np.array([-1], dtype=np.float32),  # Label 2
        np.array([-1], dtype=np.float32),  # Label 3
        np.array([1], dtype=np.float32),   # Label 4
    ]

    graph = Graph()
    Graph.set_default(graph)

    for epoch in range(10):
        graph.clear()

        # Build the computation graph.
        x = F.input(input_data)
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        hidden = F.tanh(w1 @ x + b1)
        prediction = w2 @ hidden + b2

        # Report the current output for each of the four samples.
        predicted_values = prediction.to_list()
        print("epoch ", epoch, ":")
        for sample in range(4):
            print(" [", sample, "]: ", predicted_values[sample])

        # Extend the graph with the mean squared error.
        target = F.input(output_data)
        error = target - prediction
        loss = F.batch.mean(error * error)

        # Report the loss.
        loss_value = loss.to_float()
        print(" loss: ", loss_value)

        # Update parameters.
        optimizer.reset_gradients()
        loss.backward()
        optimizer.update()
def main():
    """Train a one-hidden-layer classifier and print test accuracy after every epoch.

    Loads the image/label files under ``data/``, trains with SGD for
    ``MAX_EPOCH`` epochs, and evaluates on the test set at the end of each
    epoch.
    """
    # Load the training and test sets.
    train_inputs = load_images("data/train-images-idx3-ubyte", NUM_TRAIN_SAMPLES)
    train_labels = load_labels("data/train-labels-idx1-ubyte", NUM_TRAIN_SAMPLES)
    test_inputs = load_images("data/t10k-images-idx3-ubyte", NUM_TEST_SAMPLES)
    test_labels = load_labels("data/t10k-labels-idx1-ubyte", NUM_TEST_SAMPLES)

    device = D.Naive()  # or D.CUDA(gpuid)
    Device.set_default(device)

    pw1 = Parameter([NUM_HIDDEN_UNITS, NUM_INPUT_UNITS], I.XavierUniform())
    pb1 = Parameter([NUM_HIDDEN_UNITS], I.Constant(0))
    pw2 = Parameter([NUM_OUTPUT_UNITS, NUM_HIDDEN_UNITS], I.XavierUniform())
    pb2 = Parameter([NUM_OUTPUT_UNITS], I.Constant(0))

    optimizer = O.SGD(.5)
    optimizer.add(pw1, pb1, pw2, pb2)

    def make_graph(inputs, train):
        """Build the predictor network for one minibatch and return its output node."""
        x = F.input(inputs)
        w1 = F.parameter(pw1)
        b1 = F.parameter(pb1)
        hidden = F.relu(w1 @ x + b1)
        # The `train` flag is forwarded to dropout.
        hidden = F.dropout(hidden, .5, train)
        w2 = F.parameter(pw2)
        b2 = F.parameter(pb2)
        return w2 @ hidden + b2

    # Sample ids, reshuffled at the start of every epoch.
    ids = list(range(NUM_TRAIN_SAMPLES))

    graph = Graph()
    Graph.set_default(graph)

    for epoch in range(MAX_EPOCH):
        random.shuffle(ids)

        # Training loop
        for batch in range(NUM_TRAIN_BATCHES):
            print("\rTraining... %d / %d" % (batch + 1, NUM_TRAIN_BATCHES), end="")

            # Gather the minibatch selected by the shuffled ids.
            offset = batch * BATCH_SIZE
            inputs = [train_inputs[ids[offset + i]] for i in range(BATCH_SIZE)]
            labels = [train_labels[ids[offset + i]] for i in range(BATCH_SIZE)]

            graph.clear()
            y = make_graph(inputs, True)
            loss = F.softmax_cross_entropy(y, labels, 0)
            avg_loss = F.batch.mean(loss)

            optimizer.reset_gradients()
            avg_loss.backward()
            optimizer.update()

        print()

        num_correct = 0

        # Test loop
        for batch in range(NUM_TEST_BATCHES):
            print("\rTesting... %d / %d" % (batch + 1, NUM_TEST_BATCHES), end="")

            # Test samples are taken in file order.
            offset = batch * BATCH_SIZE
            inputs = [test_inputs[offset + i] for i in range(BATCH_SIZE)]

            graph.clear()
            y = make_graph(inputs, False)

            # Find the arg-max output per sample and compare with the label.
            y_val = y.to_list()
            for i in range(BATCH_SIZE):
                best_score = -1e10
                best_label = -1
                row_start = i * NUM_OUTPUT_UNITS
                for j in range(NUM_OUTPUT_UNITS):
                    score = y_val[j + row_start]
                    if score > best_score:
                        best_score = score
                        best_label = j
                if best_label == test_labels[i + offset]:
                    num_correct += 1

        accuracy = 100.0 * num_correct / NUM_TEST_SAMPLES
        print("\nepoch %d: accuracy: %.2f%%\n" % (epoch, accuracy))
def setUp(self):
    """Install a fresh naive device as default and create ``self.p`` from an 8-element array."""
    naive_device = D.Naive()
    self.dev = naive_device
    Device.set_default(naive_device)

    initial_values = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    self.p = Parameter(init=initial_values)
def main():
    """Train an RNNLM with an Adam trainer, printing train/valid perplexity per epoch.

    Reads the corpora under ``data/``, runs on CUDA device 0 for
    ``MAX_EPOCH`` epochs, and evaluates on the validation corpus after every
    training pass.
    """
    # Vocabulary
    vocab = make_vocab("data/ptb.train.txt")
    print("#vocab:", len(vocab))  # maybe 10000
    eos_id = vocab["<s>"]

    # Corpora and their sizes
    train_corpus = load_corpus("data/ptb.train.txt", vocab)
    valid_corpus = load_corpus("data/ptb.valid.txt", vocab)
    num_train_sents = len(train_corpus)
    num_valid_sents = len(valid_corpus)
    num_train_labels = count_labels(train_corpus)
    num_valid_labels = count_labels(valid_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    device = D.CUDA(0)
    Device.set_default(device)

    # Trainer
    trainer = T.Adam()
    trainer.set_weight_decay(1e-6)
    trainer.set_gradient_clipping(5)

    # Language model; the trainer is handed to its constructor.
    lm = RNNLM(len(vocab), eos_id, trainer)

    # Sentence ids; only the training ids are shuffled.
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    graph = Graph()
    Graph.set_default(graph)

    for epoch in range(MAX_EPOCH):
        print("epoch", (epoch + 1), "/", MAX_EPOCH, ":")

        random.shuffle(train_ids)

        # Training pass
        train_loss = 0
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            # Slicing clamps at the end of the list, so the last batch may be short.
            batch_ids = train_ids[ofs:ofs + BATCH_SIZE]
            batch = make_batch(train_corpus, batch_ids, eos_id)

            graph.clear()
            outputs = lm.forward(batch)
            loss = lm.forward_loss(outputs, batch)
            # Weight the batch loss by the number of sentences in the batch.
            train_loss += loss.to_float() * len(batch_ids)

            trainer.reset_gradients()
            loss.backward()
            trainer.update()

            print("\r%d" % ofs, end="", flush=True)

        print()

        train_ppl = math.exp(train_loss / num_train_labels)
        print(" train ppl =", train_ppl)

        # Validation pass (no parameter updates)
        valid_loss = 0
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            batch_ids = valid_ids[ofs:ofs + BATCH_SIZE]
            batch = make_batch(valid_corpus, batch_ids, eos_id)

            graph.clear()
            outputs = lm.forward(batch)
            loss = lm.forward_loss(outputs, batch)
            valid_loss += loss.to_float() * len(batch_ids)

            print("\r%d" % ofs, end="", flush=True)

        print()

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print(" valid ppl =", valid_ppl)
def main():
    """Train an RNNLM with SGD, halving the learning-rate scale when validation stalls.

    Reads the corpora under ``data/``, runs on CUDA device 0 for
    ``MAX_EPOCH`` epochs, and prints train/valid perplexity per epoch. When
    the validation perplexity does not beat the best value seen so far, the
    optimizer's learning-rate scaling is multiplied by 0.5.
    """
    # Vocabulary
    vocab = make_vocab("data/ptb.train.txt")
    print("#vocab:", len(vocab))  # maybe 10000
    eos_id = vocab["<s>"]

    # Corpora and their sizes
    train_corpus = load_corpus("data/ptb.train.txt", vocab)
    valid_corpus = load_corpus("data/ptb.valid.txt", vocab)
    num_train_sents = len(train_corpus)
    num_valid_sents = len(valid_corpus)
    num_train_labels = count_labels(train_corpus)
    num_valid_labels = count_labels(valid_corpus)
    print("train:", num_train_sents, "sentences,", num_train_labels, "labels")
    print("valid:", num_valid_sents, "sentences,", num_valid_labels, "labels")

    # Device and computation graph
    device = D.CUDA(0)
    Device.set_default(device)
    graph = Graph()
    Graph.set_default(graph)

    # Language model
    lm = RNNLM(len(vocab), eos_id)

    # Optimizer: gradient clipping only, no weight decay is configured.
    optimizer = O.SGD(1)
    optimizer.set_gradient_clipping(5)
    optimizer.add(lm)

    # Sentence ids; only the training ids are shuffled.
    train_ids = list(range(num_train_sents))
    valid_ids = list(range(num_valid_sents))

    best_valid_ppl = 1e10

    for epoch in range(MAX_EPOCH):
        print("epoch", epoch + 1, "/", MAX_EPOCH, ":")

        random.shuffle(train_ids)

        # Training pass
        train_loss = 0
        for ofs in range(0, num_train_sents, BATCH_SIZE):
            # Slicing clamps at the end of the list, so the last batch may be short.
            batch_ids = train_ids[ofs:ofs + BATCH_SIZE]
            batch = make_batch(train_corpus, batch_ids, eos_id)

            graph.clear()
            outputs = lm.forward(batch, True)
            loss = lm.loss(outputs, batch)
            # Weight the batch loss by the number of sentences in the batch.
            train_loss += loss.to_float() * len(batch_ids)

            optimizer.reset_gradients()
            loss.backward()
            optimizer.update()

            print("%d" % ofs, end="\r", flush=True)

        train_ppl = math.exp(train_loss / num_train_labels)
        print(" train ppl =", train_ppl)

        # Validation pass (no parameter updates)
        valid_loss = 0
        for ofs in range(0, num_valid_sents, BATCH_SIZE):
            batch_ids = valid_ids[ofs:ofs + BATCH_SIZE]
            batch = make_batch(valid_corpus, batch_ids, eos_id)

            graph.clear()
            outputs = lm.forward(batch, False)
            loss = lm.loss(outputs, batch)
            valid_loss += loss.to_float() * len(batch_ids)

            print("%d" % ofs, end="\r", flush=True)

        valid_ppl = math.exp(valid_loss / num_valid_labels)
        print(" valid ppl =", valid_ppl)

        if valid_ppl < best_valid_ppl:
            best_valid_ppl = valid_ppl
            print(" BEST")
        else:
            # No improvement: halve the learning-rate scaling.
            previous_scale = optimizer.get_learning_rate_scaling()
            halved_scale = 0.5 * previous_scale
            optimizer.set_learning_rate_scaling(halved_scale)
            print(" learning rate scaled:", previous_scale, "->", halved_scale)
def setUp(self):
    """Install a fresh naive device as default and create ``self.p`` holding the values 1..8."""
    naive_device = D.Naive()
    self.dev = naive_device
    Device.set_default(naive_device)

    # Start from a zero-initialized 8-element parameter, then overwrite its value.
    param = Parameter([8], I.Constant(0))
    self.p = param
    param.value.reset_by_vector([1, 2, 3, 4, 5, 6, 7, 8])
def setUp(self):
    """Create a fresh device and graph for the test and install both as defaults."""
    device = Naive()
    self.device = device
    Device.set_default(device)

    graph = Graph()
    self.graph = graph
    Graph.set_default(graph)
def setUp(self):
    """Create a fresh naive device for the test and install it as the default."""
    naive_device = D.Naive()
    # Stored on the test instance as well as installed as the default.
    self.dev = naive_device
    Device.set_default(naive_device)
def setUp(self):
    """Install the class-level ``ModelTest.device`` as the default device before each test."""
    shared_device = ModelTest.device
    Device.set_default(shared_device)