def test_gather_op(device_id, precision):
    a_data = [AA([[0], [1]], dtype=PRECISION_TO_TYPE[precision]),
              AA([[3], [4]], dtype=PRECISION_TO_TYPE[precision])]
    a = C.input_variable((2, 1))
    r_data = np.arange(12).reshape(6, 2).astype('f')
    r = C.parameter(shape=r_data.shape, init=r_data)
    res = C.gather(r, a).eval({a: a_data})
    expected = np.asarray([[[[0., 1.]], [[2., 3.]]], [[[6., 7.]], [[8., 9.]]]])
    assert np.array_equal(res, expected)

    grads = C.gather(r, a).grad({a: a_data}, [r])
    expected_grad = np.asarray([[1, 1], [1, 1], [0, 0], [1, 1], [1, 1], [0, 0]],
                               dtype=np.float32)
    assert np.array_equal(grads, expected_grad)

    # Gather with indices taken from a learnable parameter: no gradients should
    # be passed through the indices -- zeros should be passed instead.
    indices_params = C.parameter(shape=(1,), init=1.0)
    grads = C.gather(r, (indices_params * a)).grad({a: a_data}, [r, indices_params])
    assert np.array_equal(grads[r], expected_grad)
    assert np.array_equal(grads[indices_params], np.asarray([0.0], dtype=np.float32))

    b_data = [AA([[0, 2], [1, 3]], dtype=PRECISION_TO_TYPE[precision]),
              AA([[2, 4], [3, 5]], dtype=PRECISION_TO_TYPE[precision])]
    b = C.input_variable((2, 2))
    res2 = C.gather(r, b).eval({b: b_data})
    expected2 = np.asarray([[[[0., 1.], [4., 5.]], [[2., 3.], [6., 7.]]],
                            [[[4., 5.], [8., 9.]], [[6., 7.], [10., 11.]]]])
    assert np.array_equal(res2, expected2)

    # The following small model tests the memory-reuse issue of the gather node.
    x = C.input((3, 4))
    x1 = C.to_sequence(x)
    w = C.parameter((5, 6), init=1)
    z = C.gather(w, x1)
    assert z.shape == (4, 6)
    # The unpack node is needed to trigger memory reuse.
    f = C.sequence.unpack(z, 0, no_mask_output=True)
    y = C.input((3, 4, 6))
    loss = C.reduce_mean(C.square(f - y), axis=-1)
    loss = C.reduce_mean(loss, axis=C.Axis.all_axes())

    g = C.constant(0, shape=w.shape)
    u = C.assign(w, g + 1)
    learner = C.cntk_py.universal_learner([w], [g], u)
    trainer = C.trainer.Trainer(loss, [loss], [learner])
    indices = np.asarray([[[1, 2, 1, 2]]])
    input = np.repeat(np.repeat(indices, 3, axis=1), 10, axis=0)
    label = np.full((10, 3, 4, 6), 2)
    trainer.train_minibatch({x: input, y: label})

    # The 2nd and 3rd rows should be updated by gradients.
    assert np.mean(w.value[1, :]) < 1
    assert np.mean(w.value[2, :]) < 1
    # The other three rows should stay at 1.
    assert np.isclose(np.mean(w.value[0, :]), 1)
    assert np.isclose(np.mean(w.value[3, :]), 1)
    assert np.isclose(np.mean(w.value[4, :]), 1)
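# For reference, a minimal NumPy sketch of the gather semantics the test above
# exercises (a hypothetical helper, not part of the test suite): the forward
# pass indexes rows of the reference tensor; the backward pass w.r.t. the
# reference scatter-adds the upstream gradient of 1 into the gathered rows,
# while the indices themselves receive zero gradient.
def _np_gather_sketch(reference, indices):
    idx = indices.astype(int)
    out = reference[idx]                                      # forward: row lookup
    ref_grad = np.zeros_like(reference)
    np.add.at(ref_grad, idx, 1.0)                             # backward w.r.t. reference
    idx_grad = np.zeros_like(indices, dtype=reference.dtype)  # indices get no gradient
    return out, ref_grad, idx_grad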
def test_universal():
    np.random.seed(98052)
    builtin_sgd = lambda params: sgd(params, lr=learning_rate_schedule(0.125, UnitType.minibatch))
    builtin_last_avg_error, builtin_avg_error, _ = ffnet(builtin_sgd)

    np.random.seed(98052)
    # A hand-written SGD step, p <- p - lr/minibatch_size * g, expressed with C.assign.
    my_sgd = lambda ps, gs: C.combine([C.assign(p, p - 0.125 / 25 * g) for p, g in zip(ps, gs)])
    universal_sgd = lambda params: universal(my_sgd, params)
    my_last_avg_error, my_avg_error, _ = ffnet(universal_sgd)

    assert np.all(np.less_equal(my_last_avg_error, builtin_last_avg_error))
    assert np.all(np.less_equal(my_avg_error, builtin_avg_error))
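# The callback passed to universal(), as used above, receives the parameter
# list and a matching list of gradient variables and must return one CNTK
# Function whose evaluation applies the update -- typically a C.combine of
# C.assign ops. A hypothetical momentum variant, as a sketch only (the
# velocity constant and hyperparameters are assumptions, not from the tests):
def my_momentum_update(parameters, gradients, lr=0.01, mu=0.9):
    updates = []
    for p, g in zip(parameters, gradients):
        v = C.constant(0, shape=p.shape, dtype=p.dtype)  # per-parameter velocity state
        v_new = mu * v + g
        updates.append(C.assign(v, v_new))               # persist the new velocity
        updates.append(C.assign(p, p - lr * v_new))      # apply the step
    return C.combine(updates)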
def test_assign_to_param(input_data, device_id, precision):
    dt = PRECISION_TO_TYPE[precision]
    data = AA(input_data, dtype=dt)

    value = C.parameter(init=data)
    dest = C.parameter(shape=data.shape, dtype=dt)
    assign_op = C.assign(dest, value)
    result = assign_op.eval()

    assert np.array_equal(dest.asarray(), data)
    assert np.array_equal(result, data)
def test_assign_dependency(input_data, device_id, precision):
    dt = PRECISION_TO_TYPE[precision]
    data = AA(input_data, dtype=dt)

    value = C.parameter(init=data)
    dest = C.parameter(shape=data.shape, dtype=dt)
    assign_op = C.assign(dest, value)
    y = dest + value
    result = C.combine([y, assign_op]).eval()

    assert np.array_equal(result[y.output], data)
    assert np.array_equal(dest.asarray(), data)
    assert np.array_equal(y.eval(), data + data)
def test_convert_dynamic_axis():
    # Test a fixed batch size.
    batch_size = 4
    a = C.parameter(shape=(batch_size, 2, 3), init=1)
    dynamic_a = C.to_batch(a)
    assert len(dynamic_a.dynamic_axes) == 1
    assert dynamic_a.shape == (2, 3)

    x = C.input_variable((2, 3))
    y = x * dynamic_a

    # Test gradients.
    data = np.arange(batch_size * 2 * 3).reshape(batch_size, 2, 3).astype('f')
    assert np.array_equal(y.grad({x: data}, [a]), data)

    const_a = C.unpack_batch(y)
    assert len(const_a.dynamic_axes) == 0
    assert const_a.shape == (C.FreeDimension, 2, 3)

    f = C.assign(a, const_a)
    f.eval({x: data})
    assert np.array_equal(a.value, data)

    # Test reshape with a batch axis.
    x = C.input_variable((2, 3))
    const_x = C.unpack_batch(x)
    assert len(const_x.dynamic_axes) == 0
    assert const_x.shape == (C.FreeDimension, 2, 3)

    const_y = C.reshape(const_x, (-1, 3))
    assert const_y.shape == (C.FreeDimension, 3)
    y = C.to_batch(const_y)
    assert len(y.dynamic_axes) == 1
    assert y.shape == (3,)

    z = y * 2
    expected = data.reshape((8, 3)) * 2
    assert np.array_equal(z.eval({x: data}), expected)

    # Test inferred dimension.
    x = C.input_variable((C.InferredDimension, 3))
    const_x = C.unpack_batch(x)
    assert len(const_x.dynamic_axes) == 0
    assert const_x.shape == (C.FreeDimension, C.InferredDimension, 3)

    const_y = const_x * 2
    y = C.to_batch(const_y)
    assert len(y.dynamic_axes) == 1
    assert y.shape == (C.InferredDimension, 3)
def test_assign_gradient(input_data, device_id, precision):
    dt = PRECISION_TO_TYPE[precision]
    data = AA(input_data, dtype=dt)

    value = C.parameter(init=data)
    dest = C.parameter(shape=data.shape, dtype=dt)
    assign_op = C.assign(dest, value)

    bwd, fwd = assign_op.forward({}, [assign_op.output], set([assign_op.output]))
    # dest is updated only after backprop, so verify that it isn't updated yet.
    assert not np.array_equal(dest.asarray(), data)
    result = list(fwd.values())[0]
    grad = assign_op.backward(bwd, {assign_op.output: np.ones_like(result)}, set([dest]))
    # dest should have the new value by now.
    assert np.array_equal(dest.asarray(), data)
    # Check the gradient: assign passes no gradient to its destination.
    assert np.array_equal(grad[dest], np.zeros_like(result))
def test_assign_timestamp_bump(input_data, device_id, precision):
    dt = PRECISION_TO_TYPE[precision]
    data = AA(input_data, dtype=dt)

    x = C.parameter(shape=data.shape, dtype=dt)
    w = C.constant(np.ones_like(data))
    y = w + 1
    z = C.assign(x, y)
    f = x + 1

    result = f.eval()
    assert np.array_equal(x.asarray(), 0 * np.ones_like(data))
    assert np.array_equal(result, 1 * np.ones_like(data))

    result = z.eval()
    assert np.array_equal(x.asarray(), 2 * np.ones_like(data))
    assert np.array_equal(result, 2 * np.ones_like(data))

    result = f.eval()
    assert np.array_equal(x.asarray(), 2 * np.ones_like(data))
    assert np.array_equal(result, 3 * np.ones_like(data))
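# Taken together, the assign tests above pin down the C.assign contract:
# forward returns the assigned value, the destination is updated once
# evaluation completes, the destination receives zero gradient, and the
# update bumps the parameter's timestamp so dependents recompute. A minimal
# standalone sketch of the first two points (fresh float32 parameter assumed):
p = C.parameter(shape=(2,), init=0.0)
op = C.assign(p, p + 1)
out = op.eval()  # the right-hand side is read first, then written back after eval
assert np.array_equal(out, np.ones((2,), dtype=np.float32))
assert np.array_equal(p.asarray(), np.ones((2,), dtype=np.float32))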
def train(data_path, model_path, log_file, config_file, restore=False, profiling=False, gen_heartbeat=False):
    training_config = importlib.import_module(config_file).training_config

    # Config for using multiple GPUs.
    if training_config['multi_gpu']:
        gpu_pad = training_config['gpu_pad']
        gpu_cnt = training_config['gpu_cnt']
        my_rank = C.Communicator.rank()
        my_gpu_id = (my_rank + gpu_pad) % gpu_cnt
        print("rank = " + str(my_rank) + ", using gpu " + str(my_gpu_id) + " of " + str(gpu_cnt))
        C.try_set_default_device(C.gpu(my_gpu_id))
    else:
        C.try_set_default_device(C.gpu(0))

    # Outputs while training.
    normal_log = os.path.join(data_path, training_config['logdir'], log_file)
    # Directory for TensorBoard files.
    tensorboard_logdir = os.path.join(data_path, training_config['logdir'], log_file)

    polymath = PolyMath(config_file)
    z, loss = polymath.model()

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    progress_writers = [C.logging.ProgressPrinter(
        num_epochs=max_epochs,
        freq=log_freq,
        tag='Training',
        log_to_file=normal_log,
        rank=C.Communicator.rank(),
        gen_heartbeat=gen_heartbeat)]

    # Add a TensorBoard writer for visualization.
    tensorboard_writer = C.logging.TensorBoardProgressWriter(
        freq=10,
        log_dir=tensorboard_logdir,
        rank=C.Communicator.rank(),
        model=z)
    progress_writers.append(tensorboard_writer)

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    ema = {}
    dummies_info = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, p)))
        dummies_info[dummies[-1].output] = (p.name, p.shape)
    dummy = C.combine(dummies)

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    trainer = C.Trainer(z, (loss, None), learner, progress_writers)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {
        'best_val_err': 100,
        'best_since': 0,
        'val_since': 0,
        'record_num': 0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        # After a restore, always re-evaluate.
        epoch_stat['best_val_err'] = validate_model(
            os.path.join(data_path, training_config['val_data']),
            model, polymath, config_file)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            # Validate with the EMA weights, then restore the live weights.
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(
                os.path.join(data_path, training_config['val_data']),
                model, polymath, config_file)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                os.system("ls -la >> log.log")
                os.system("ls -la ./Models >> log.log")
                save_flag = True
                fail_cnt = 0
                while save_flag:
                    if fail_cnt > 100:
                        print("ERROR: failed to save models")
                        break
                    try:
                        trainer.save_checkpoint(model_file)
                        epoch_stat['record_num'] += 1
                        record_file = os.path.join(
                            model_path,
                            str(epoch_stat['record_num']) + '-' + model_name)
                        trainer.save_checkpoint(record_file)
                        save_flag = False
                    except:
                        fail_cnt = fail_cnt + 1
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)

        minibatch_size = training_config['minibatch_size']  # number of samples
        epoch_size = training_config['epoch_size']

        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                    data = mb_source.next_minibatch(
                        minibatch_size * C.Communicator.num_workers(),
                        input_map=input_map,
                        num_data_partitions=C.Communicator.num_workers(),
                        partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                # print_para_info(dummy, dummies_info)
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config['minibatch_seqs']  # number of sequences

        for epoch in range(max_epochs):  # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update the model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
def train(data_path, model_path, log_file, config_file, restore=False, profiling=False, gen_heartbeat=False):
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    progress_writers = [C.logging.ProgressPrinter(
        num_epochs=max_epochs,
        freq=log_freq,
        tag='Training',
        log_to_file=log_file,
        rank=C.Communicator.rank(),
        gen_heartbeat=gen_heartbeat)]

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        # Exponential moving average of the weights, updated as a side effect of dummy.eval().
        dummies.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    trainer = C.Trainer(z, (loss, None), learner, progress_writers)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {
        'best_val_err': 100,
        'best_since': 0,
        'val_since': 0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        # After a restore, always re-evaluate.
        epoch_stat['best_val_err'] = validate_model(
            os.path.join(data_path, training_config['val_data']), model, polymath)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            # Validate with the EMA weights, then restore the live weights.
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(
                os.path.join(data_path, training_config['val_data']), model, polymath)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                trainer.save_checkpoint(model_file)
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)

        minibatch_size = training_config['minibatch_size']  # number of samples
        epoch_size = training_config['epoch_size']

        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                    data = mb_source.next_minibatch(
                        minibatch_size * C.Communicator.num_workers(),
                        input_map=input_map,
                        num_data_partitions=C.Communicator.num_workers(),
                        partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                dummy.eval()
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config['minibatch_seqs']  # number of sequences

        for epoch in range(max_epochs):  # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update the model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
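# The EMA bookkeeping used by the train() variants above, distilled into a
# standalone sketch (hypothetical names; a single parameter stands in for the
# model): each dummy.eval() folds the current weights into a shadow constant,
# and validation temporarily swaps the shadow weights in before restoring.
p = C.parameter(shape=(3,), init=1.0)
ema_p = C.constant(0, shape=p.shape, dtype=p.dtype)
update_ema = C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p))
update_ema.eval()        # one EMA step: ema <- 0.999 * ema + 0.001 * p
backup = p.value         # swap the EMA weights in for validation...
p.value = ema_p.value
# ... run validation here ...
p.value = backup         # ...then restore the live weights for training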
N = float(saxpy.N)
YVAL = float(saxpy.YVAL)
XVAL = float(saxpy.XVAL)
AVAL = float(saxpy.AVAL)

print("N: {}".format(N))

a = cntk.Constant(value=AVAL, shape=[N], dtype=np.float32, device=dev, name="a")
y = cntk.Parameter(shape=[N], init=YVAL, dtype=np.float32, device=dev, name="y")
x = cntk.Parameter(shape=[N], init=XVAL, dtype=np.float32, device=dev, name="x")

t0 = time.time()
# SAXPY: y <- a * x + y, written back in place via assign.
cntk.assign(y, y + a * x).eval()
t1 = time.time()

print("Elapsed: %.3f ms" % ((t1 - t0) * 1000))
saxpy.verify(y.asarray())
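# A NumPy cross-check of the same computation, on the assumption that
# saxpy.verify compares elementwise against a * x + y (the reference vectors
# below are reconstructed from the constants above, not from the saxpy module):
n = int(N)
y_ref = np.full(n, YVAL, dtype=np.float32)
x_ref = np.full(n, XVAL, dtype=np.float32)
expected = AVAL * x_ref + y_ref
assert np.allclose(y.asarray(), expected)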
def train(i2w, data_path, model_path, log_file, config_file, restore=True, profiling=False, gen_heartbeat=False):
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    progress_writers = [C.logging.ProgressPrinter(
        num_epochs=max_epochs,
        freq=log_freq,
        tag='Training',
        log_to_file=log_file,
        metric_is_pct=False,
        rank=C.Communicator.rank(),
        gen_heartbeat=gen_heartbeat)]

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)

    # learner = C.adadelta(z.parameters, lr)
    learner = C.fsadagrad(
        z.parameters,
        # Apply the learning rate as if it were a minibatch of size 1.
        lr,
        momentum=C.momentum_schedule(0.9366416204111472,
                                     minibatch_size=training_config['minibatch_size']),
        gradient_clipping_threshold_per_sample=2.3,
        gradient_clipping_with_truncation=True)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    trainer = C.Trainer(z, loss, learner, progress_writers)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(z.outputs + loss.outputs)  # this is for validation only

    epoch_stat = {
        'best_val_err': 1000,
        'best_since': 0,
        'val_since': 0}

    print(restore, os.path.isfile(model_file))
    if restore and os.path.isfile(model_file):
        z.restore(model_file)
        # After a restore, always re-evaluate.
        # TODO: replace with ROUGE-L via an external script (possibly).
        #epoch_stat['best_val_err'] = validate_model(i2w, os.path.join(data_path, training_config['val_data']), model, polymath)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            # TODO: replace with ROUGE-L via an external script (possibly).
            val_err = validate_model(
                i2w, os.path.join(data_path, training_config['val_data']), model, polymath)
            #if epoch_stat['best_val_err'] > val_err:
            #    epoch_stat['best_val_err'] = val_err
            #    epoch_stat['best_since'] = 0
            #    trainer.save_checkpoint(model_file)
            #    for p in trainer.model.parameters:
            #        p.value = temp[p.uid]
            #else:
            #    epoch_stat['best_since'] += 1
            #    if epoch_stat['best_since'] > training_config['stop_after']:
            #        return False
            z.save(model_file)
            epoch_stat['best_since'] += 1
            if epoch_stat['best_since'] > training_config['stop_after']:
                return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    init_pointer_importance = polymath.pointer_importance

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)

        minibatch_size = training_config['minibatch_size']  # number of samples
        epoch_size = training_config['epoch_size']

        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                    data = mb_source.next_minibatch(
                        minibatch_size * C.Communicator.num_workers(),
                        input_map=input_map,
                        num_data_partitions=C.Communicator.num_workers(),
                        partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                dummy.eval()
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
            print('Before Pointer_importance:', polymath.pointer_importance)
            if polymath.pointer_importance > 0.1 * init_pointer_importance:
                polymath.pointer_importance = polymath.pointer_importance * 0.9
                print('Pointer_importance:', polymath.pointer_importance)
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config['minibatch_seqs']  # number of sequences

        for epoch in range(max_epochs):  # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update the model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
def train(data_path, model_path, log_file, config_file, restore=False, profiling=False, gen_heartbeat=False):
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    progress_writers = [C.logging.ProgressPrinter(
        num_epochs=max_epochs,
        freq=log_freq,
        tag='Training',
        log_to_file=log_file,
        rank=C.Communicator.rank(),
        gen_heartbeat=gen_heartbeat)]

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir='log', model=z)
    trainer = C.Trainer(z, (loss, None), learner, tensorboard_writer)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {
        'best_val_err': 100,
        'best_since': 0,
        'val_since': 0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        # After a restore, always re-evaluate.
        epoch_stat['best_val_err'] = validate_model(
            os.path.join(data_path, training_config['val_data']), model, polymath)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(
                os.path.join(data_path, training_config['val_data']), model, polymath)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                trainer.save_checkpoint(model_file)
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)

        minibatch_size = training_config['minibatch_size']  # number of samples
        epoch_size = training_config['epoch_size']

        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                    data = mb_source.next_minibatch(
                        minibatch_size * C.Communicator.num_workers(),
                        input_map=input_map,
                        num_data_partitions=C.Communicator.num_workers(),
                        partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size, input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                dummy.eval()
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config['minibatch_seqs']  # number of sequences

        for epoch in range(max_epochs):  # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update the model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()