Example #1
    def create_model(self):
        hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 32]
        first_input = C.ops.splice(self._input, self._target)
        first_input_size = first_input.shape
        first_input = C.ops.reshape(
            first_input, (first_input_size[0], 1, first_input_size[1]))

        model = C.layers.Convolution2D((1, 3),
                                       num_filters=8,
                                       pad=True,
                                       reduction_rank=1,
                                       activation=C.ops.tanh)(first_input)
        print(model)
        for h in hidden_layers:
            input_new = C.ops.splice(model, first_input, axis=0)
            model = C.layers.Convolution2D((1, 3),
                                           num_filters=h,
                                           pad=True,
                                           reduction_rank=1,
                                           activation=C.ops.tanh)(input_new)
            print(model)
        ######
        #model = C.ops.splice(model, self._target)
        # Dense layers
        direction = C.layers.Sequential([
            C.layers.Dense(720, activation=C.ops.relu),
            C.layers.Dense(360, activation=None)
        ])(model)

        velocity = C.layers.Sequential([
            C.layers.Dense(128, activation=C.ops.relu),
            C.layers.Dense(64, activation=None),
            C.layers.Dense(1, activation=None)
        ])(model)

        model = C.ops.splice(direction, velocity)
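        # model now packs both heads into one vector: indices 0..359 are the
        # direction logits, index 360 is the scalar velocity; the load-model
        # branch below slices the same layout back apart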
        if self._load_model:
            model = C.load_model(self._file_name)
            direction = model[0:360]
            velocity = model[360]

        C.logging.log_number_of_parameters(model)
        print(model)
        #loss = C.squared_error(direction, self._output) + C.squared_error(velocity, self._output_velocity)
        #error = C.squared_error(direction, self._output)  + C.squared_error(velocity, self._output_velocity)
        loss = C.cross_entropy_with_softmax(
            direction, self._output) + C.squared_error(velocity,
                                                       self._output_velocity)
        error = C.classification_error(direction,
                                       self._output) + C.squared_error(
                                           velocity, self._output_velocity)

        learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
        progress_printer = C.logging.ProgressPrinter(tag='Training')
        trainer = C.Trainer(model, (loss, error), learner, progress_printer)
        return model, loss, learner, trainer
Example #2
def train():
    model = Model()
    z, loss, acc = model.model()

    progress_writers = [
        C.logging.ProgressPrinter(num_epochs=max_epochs,
                                  freq=log_freq,
                                  tag='Training',
                                  log_to_file='log/log_' + version)
    ]

    lr = C.learning_parameter_schedule(learning_rate,
                                       minibatch_size=None,
                                       epoch_size=None)
    learner = C.adadelta(z.parameters, lr)
    trainer = C.Trainer(z, (loss, acc), learner, progress_writers)

    mb_source, input_map = deserialize(loss, train_data, model)
    mb_valid, valid_map = deserialize(loss, valid_data, model)

    try:
        trainer.restore_from_checkpoint('../model/' + version)
    except Exception:
        print('No checkpoint.')

    for epoch in range(max_epochs):
        # train
        num_seq = 0
        with tqdm(total=epoch_size, ncols=79) as progress_bar:
            while True:
                data = mb_source.next_minibatch(minibatch_size,
                                                input_map=input_map)
                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                progress_bar.update(trainer.previous_minibatch_sample_count)
                if num_seq >= epoch_size:
                    break
            trainer.summarize_training_progress()
            trainer.save_checkpoint('../model/' + version + '/' + str(epoch))

        # validation
        num_seq = 0
        with tqdm(total=num_validation, ncols=79) as valid_progress_bar:
            while True:
                data = mb_valid.next_minibatch(minibatch_size,
                                               input_map=valid_map)
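                # an empty minibatch dict means the validation sweep is over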
                if not data:
                    break
                trainer.test_minibatch(data)
                num_seq += len(data)
                valid_progress_bar.update(len(data))
                if num_seq >= num_validation:
                    break
            trainer.summarize_test_progress()
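
The loop above depends on project-specific pieces (Model, deserialize, tqdm and the
version/epoch_size globals). For reference, a minimal self-contained sketch of the same
adadelta + Trainer + train_minibatch pattern on synthetic data could look like the
following; every name and value here is illustrative rather than taken from the example:

import numpy as np
import cntk as C

x = C.input_variable(2)
y = C.input_variable(1)
z = C.layers.Dense(1)(x)                      # a tiny linear model
loss = C.squared_error(z, y)

lr = C.learning_parameter_schedule(0.1)
learner = C.adadelta(z.parameters, lr)
trainer = C.Trainer(z, (loss, None), learner,
                    C.logging.ProgressPrinter(tag='Training'))

for _ in range(100):                          # 100 synthetic minibatches
    xs = np.random.rand(16, 2).astype(np.float32)
    ys = xs.sum(axis=1, keepdims=True)
    trainer.train_minibatch({x: xs, y: ys})
trainer.summarize_training_progress()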
Example #3
    def create_model(self):
        hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8]

        #first_input = C.ops.reshape(
        #    C.ops.splice(self._input, self._target),
        #    (1,self._input_size[0]*2,self._input_size[1]))
        #print(first_input)
        first_input = C.ops.reshape(
            self._input, (1, self._input_size[0], self._input_size[1]))
        model = C.layers.Convolution2D((1, 3),
                                       num_filters=8,
                                       pad=True,
                                       reduction_rank=1,
                                       activation=C.ops.tanh)(first_input)

        for h in hidden_layers:
            input_new = C.ops.splice(model, first_input)
            model = C.layers.Convolution2D((1, 3),
                                           num_filters=h,
                                           pad=True,
                                           reduction_rank=1,
                                           activation=C.ops.tanh)(input_new)

        model = C.ops.splice(model, self._target)
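        # append the target encoding to the convolutional features before the
        # dense prediction heads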

        ######
        # Dense layers
        direction = C.layers.Sequential([
            C.layers.Dense(256, activation=C.ops.relu),
            C.layers.Dense(128, activation=C.ops.relu),
            C.layers.Dense(64, activation=C.ops.relu),
            C.layers.Dense(32, activation=None),
        ])(model)

        velocity = C.layers.Sequential([
            C.layers.Dense(128, activation=C.ops.relu),
            C.layers.Dense(64, activation=None),
            C.layers.Dense(1, activation=None)
        ])(model)

        model = C.ops.splice(direction, velocity)
        print(model)

        loss = C.cross_entropy_with_softmax(
            direction, self._output) + C.squared_error(velocity,
                                                       self._output_velocity)
        error = C.classification_error(direction,
                                       self._output) + C.squared_error(
                                           velocity, self._output_velocity)

        learner = C.adadelta(model.parameters)
        progress_printer = C.logging.ProgressPrinter(tag='Training', freq=50)
        trainer = C.Trainer(model, (loss, error), learner, progress_printer)
        return model, loss, learner, trainer
Example #4
    def create_model(self):
        hidden_layers = self._hidden_layers
        with C.layers.default_options(init=C.layers.glorot_uniform(),
                                      activation=C.ops.relu):
            h = self._input
            for i in range(self._num_hidden_layers):
                h = C.layers.Dense(hidden_layers[i])(h)
            model = C.layers.Dense(self._output_size, activation=None)(h)
            loss = C.reduce_mean(C.square(model - self._output), axis=0)
            meas = C.reduce_mean(C.square(model - self._output), axis=0)
            learner = C.adadelta(model.parameters, self._lr_schedule)
            trainer = C.Trainer(model, (loss, meas), learner)
            return model, loss, learner, trainer
Example #5
    def create_model(self):
        hidden_layers = self._hidden_layers
        with cntk.layers.default_options(init=cntk.layers.glorot_uniform(),
                                         activation=cntk.ops.relu):
            h = self._input
            for i in range(self._num_hidden_layers):
                h = cntk.layers.Dense(hidden_layers[i],
                                      activation=cntk.ops.relu)(h)
            model = cntk.layers.Dense(self._output_size, activation=None)(h)
            loss = cntk.reduce_mean(cntk.square(model - self._output), axis=0)
            meas = cntk.reduce_mean(cntk.square(model - self._output), axis=0)
            learner = cntk.adadelta(model.parameters,
                                    self._lr_schedule,
                                    l2_regularization_weight=0.01)
            trainer = cntk.Trainer(model, (loss, meas), learner)
            return model, loss, learner, trainer
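
Example #4 and Example #5 assume an externally supplied self._lr_schedule. Such a
schedule is typically built with C.learning_parameter_schedule; the sketch below uses a
throwaway model and made-up values purely to show the call shape:

import cntk as C

x = C.input_variable(4)
model = C.layers.Dense(1)(x)

# 0.1 for the first 1000 samples, then 0.01, then 0.001 (illustrative values)
lr_schedule = C.learning_parameter_schedule([0.1, 0.01, 0.001], epoch_size=1000)
learner = C.adadelta(model.parameters, lr_schedule,
                     l2_regularization_weight=0.01)
print(learner.learning_rate())  # 0.1 until the first 1000 samples are seen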
Example #6
	def create_model(self):
		hidden_layers = [8,8,8,8,8,8,8,8,8]
		
		first_input = C.ops.reshape(
		    C.ops.splice(self._input,self._target),
		    (1,self._input_size[0]*2,self._input_size[1]))
		print(first_input)
		model = C.layers.Convolution2D(
		    (1,3), num_filters=8, pad=True, reduction_rank=1, activation=C.ops.tanh)(first_input)
		print(model)	
		for h in hidden_layers:
			input_new = C.ops.splice(model,first_input)
			model = C.layers.Convolution2D(
			    (1,3), num_filters=h, pad=True, 
			    reduction_rank=1, activation=C.ops.tanh)(input_new)
			print(model)
		######
		# Dense layers
		direction = C.layers.Sequential([
		C.layers.Dense(256, activation=C.ops.relu),
		C.layers.Dense(128, activation=C.ops.relu),
		C.layers.Dense(64, activation=C.ops.relu),
		C.layers.Dense(32, activation=None),
		])(model)

		velocity = C.layers.Sequential([
		C.layers.Dense(128,activation=C.ops.relu),
		C.layers.Dense(64,activation=None),
		C.layers.Dense(1,activation=None)
		])(model)

		model = C.ops.splice(direction,velocity)
		
		if self._load_model:
			model = C.load_model('dnns/GRP_f.dnn')
			direction = model[0:32]
			velocity  = model[32]
 
		print (model)
		loss = C.cross_entropy_with_softmax(direction, self._output) +  C.squared_error(velocity, self._output_velocity)
		error = C.classification_error(direction, self._output) +   C.squared_error(velocity, self._output_velocity)
		
		learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
		progress_printer = C.logging.ProgressPrinter(tag='Training')
		trainer = C.Trainer(model, (loss,error), learner, progress_printer)
		return model, loss, learner, trainer
Example #7
    def create_model(self):
        hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1]

        first_input = C.ops.reshape(
            C.ops.splice(self._input, self._target),
            (2, self._input_size[0], self._input_size[1]))
        print(first_input)
        model = C.layers.Convolution2D((1, 3),
                                       num_filters=8,
                                       pad=True,
                                       reduction_rank=1,
                                       activation=C.ops.tanh)(first_input)
        #print(model)
        for h in hidden_layers:
            input_new = C.ops.splice(model, first_input, axis=0)
            #print(input_new)
            model = C.layers.Convolution2D((1, 3),
                                           num_filters=h,
                                           pad=True,
                                           reduction_rank=1,
                                           activation=C.ops.tanh)(input_new)
            #print(model)
        model = C.ops.reshape(model, (1, 360))
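        # flatten the single-filter feature map into one 360-dimensional prediction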
        #print(model)
        #C.logging.log_number_of_parameters(model)
        if self._load_model:
            model = C.load_model('dnns/feature_predicter_GRP.dnn')

        #print(model)
        err = C.ops.reshape(C.ops.minus(model, self._output),
                            (self._output_size))
        sq_err = C.ops.square(err)
        mse = C.ops.reduce_mean(sq_err)
        rmse_loss = C.ops.sqrt(mse)
        rmse_eval = rmse_loss

        learner = C.adadelta(model.parameters)
        progress_printer = C.logging.ProgressPrinter(tag='Training')
        trainer = C.Trainer(model, (rmse_loss, rmse_eval), learner,
                            progress_printer)
        return model, rmse_loss, learner, trainer
Example #8
	def create_model(self):
		hidden_layers = [8,8,8,8,8,8,8,8,8]
		first_input = C.ops.splice(self._input,self._target)
		i_size = first_input.shape
		first_input = C.ops.reshape(first_input,(i_size[0],1,i_size[1]))
		model = C.layers.Convolution2D((1,3), num_filters=8, pad=True, reduction_rank=1,activation=C.ops.tanh)(first_input)
		print (model)
		for i, h in enumerate(hidden_layers):
			input_new = C.ops.splice(model, first_input,axis=0)
			model = C.layers.Convolution2D((1,3), num_filters=h, pad=True, reduction_rank=1, activation=C.ops.tanh, name='c_{}'.format(h))(input_new)
			print(model)
		model = C.layers.BatchNormalization()(model)
		model = C.layers.Dropout(0.1)(model)
		model = C.ops.splice(model, self._target)
		direction = C.layers.Sequential([
		C.layers.Recurrence(C.layers.LSTM(720)),
		C.layers.Dense(360, activation=None)
		]) (model)
		
		velocity = C.layers.Sequential([
		C.layers.Recurrence(C.layers.LSTM(128)),
		C.layers.Dense(64,activation=C.ops.tanh),
		C.layers.Dense(1, activation=None)
		])(model)
		model = C.ops.splice(direction,velocity)
		if self._load_model:
			model = C.load_model(self._file_name)
			direction = model[0:360]
			velocity = model[360]
		C.logging.log_number_of_parameters(model)
		print (model)
		#loss = C.cross_entropy_with_softmax(direction, self._output) + C.squared_error(velocity,self._output_velocity) + C.ops.relu(1-C.ops.log(C.reduce_min(self._input) + 1 -self._safe_dist))
		loss = C.cross_entropy_with_softmax(direction, self._output) + C.squared_error(velocity,self._output_velocity)

		error = C.classification_error(direction, self._output) + C.squared_error(velocity,self._output_velocity)
		learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
		progress_printer = C.logging.ProgressPrinter(tag='Training', freq=20)
		trainer = C.Trainer(model,(loss,error), learner, progress_printer)
		return model, loss, learner, trainer
Example #9
    def create_model(self):
        hidden_layers = [8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 16, 32]
        first_input = C.ops.splice(self._input, self._target)
        first_input_size = first_input.shape
        first_input = C.ops.reshape(
            first_input, (first_input_size[0], 1, first_input_size[1]))

        model = C.layers.Convolution2D((1, 3),
                                       num_filters=8,
                                       pad=True,
                                       reduction_rank=1,
                                       activation=C.ops.tanh)(first_input)
        for i, h in enumerate(hidden_layers):
            #if i >0:
            #input_new = C.ops.splice(model,input_new,axis=0)
            #else:
            input_new = C.ops.splice(model, first_input, axis=0)
            model1 = C.layers.Convolution2D((1, 3),
                                            num_filters=int(0.5 * h),
                                            pad=True,
                                            reduction_rank=1,
                                            activation=C.ops.tanh)(input_new)
            model2 = C.layers.Convolution2D((1, 5),
                                            num_filters=int(0.25 * h),
                                            pad=True,
                                            reduction_rank=1,
                                            activation=C.ops.tanh)(input_new)
            model3 = C.layers.Convolution2D((1, 1),
                                            num_filters=int(0.25 * h),
                                            pad=True,
                                            reduction_rank=1,
                                            activation=C.ops.tanh)(input_new)
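            # concatenate the three branch outputs along the channel axis
            # (axis=0), inception-style, before the next iteration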
            model = C.ops.splice(model1, model2, model3, axis=0)
            print(model)
        """	
		model = C.layers.Sequential([
		# Convolution layers
		C.layers.Convolution2D((1,3), num_filters=8, pad=True, reduction_rank=1, activation=C.ops.tanh, name='conv_a'),
		C.layers.Convolution2D((1,3), num_filters=16, pad=True, reduction_rank=1, activation=C.ops.tanh, name='conv2_a'),
		C.layers.Convolution2D((1,3), num_filters=32, pad=False, reduction_rank=1, activation=C.ops.tanh, name='conv3_a'),
		######
		# Dense layers
		C.layers.Dense(1024, activation=C.ops.relu,name='dense5_a'),
		# Recurrence
		#C.layers.Recurrence(C.layers.LSTM(2048, init=C.glorot_uniform()),name='lstm_a'),
		C.layers.Dense(1024,activation=None)
		])(model)
		######
		"""
        # Prediction
        direction = C.layers.Sequential([
            C.layers.Dense(720, activation=C.ops.relu, name='dense6_a'),
            C.layers.Dense(360, activation=None, name='dense7_a')
        ])(model)
        velocity = C.layers.Sequential([
            C.layers.Dense(128, activation=C.ops.relu),
            C.layers.Dense(64, activation=None),
            C.layers.Dense(1, activation=None)
        ])(model)
        model = C.ops.splice(direction, velocity)
        #model = velocity
        if self._load_model:
            model = C.load_model(self._file_name)
            direction = model[0:360]
            velocity = model[360]

        C.logging.log_number_of_parameters(model)
        print(model)
        #loss = C.squared_error(direction, self._output) +  C.squared_error(velocity, self._output_velocity)
        #error = C.classification_error(direction, self._output) +   C.squared_error(velocity, self._output_velocity)
        loss = C.cross_entropy_with_softmax(
            direction, self._output) + C.squared_error(velocity,
                                                       self._output_velocity)
        error = C.classification_error(direction,
                                       self._output) + C.squared_error(
                                           velocity, self._output_velocity)

        learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
        progress_printer = C.logging.ProgressPrinter(tag='Training')
        trainer = C.Trainer(model, (loss, error), learner, progress_printer)
        return model, loss, learner, trainer
Example #10
def train(data_path,
          model_path,
          log_file,
          config_file,
          restore=False,
          profiling=False,
          gen_heartbeat=False):
    training_config = importlib.import_module(config_file).training_config
    # config for using multi GPUs
    if training_config['multi_gpu']:
        gpu_pad = training_config['gpu_pad']
        gpu_cnt = training_config['gpu_cnt']
        my_rank = C.Communicator.rank()
        my_gpu_id = (my_rank + gpu_pad) % gpu_cnt
        print("rank = " + str(my_rank) + ", using gpu " + str(my_gpu_id) +
              " of " + str(gpu_cnt))
        C.try_set_default_device(C.gpu(my_gpu_id))
    else:
        C.try_set_default_device(C.gpu(0))
    # outputs while training
    normal_log = os.path.join(data_path, training_config['logdir'], log_file)
    # tensorboard files' dir
    tensorboard_logdir = os.path.join(data_path, training_config['logdir'],
                                      log_file)

    polymath = PolyMath(config_file)
    z, loss = polymath.model()

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    progress_writers = [
        C.logging.ProgressPrinter(num_epochs=max_epochs,
                                  freq=log_freq,
                                  tag='Training',
                                  log_to_file=normal_log,
                                  rank=C.Communicator.rank(),
                                  gen_heartbeat=gen_heartbeat)
    ]
    # add tensorboard writer for visualize
    tensorboard_writer = C.logging.TensorBoardProgressWriter(
        freq=10,
        log_dir=tensorboard_logdir,
        rank=C.Communicator.rank(),
        model=z)
    progress_writers.append(tensorboard_writer)

    lr = C.learning_parameter_schedule(training_config['lr'],
                                       minibatch_size=None,
                                       epoch_size=None)

    ema = {}
    dummies_info = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0,
                           shape=p.shape,
                           dtype=p.dtype,
                           name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, p)))
        dummies_info[dummies[-1].output] = (p.name, p.shape)
    dummy = C.combine(dummies)
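    # despite the "ema" naming, C.assign(ema_p, p) is a plain copy: evaluating
    # `dummy` snapshots the current parameter values into the ema_* constants,
    # which post_epoch_work() temporarily swaps in for validation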

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    trainer = C.Trainer(z, (loss, None), learner, progress_writers)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
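    # bundle the network outputs and the loss into one function for validation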
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {
        'best_val_err': 100,
        'best_since': 0,
        'val_since': 0,
        'record_num': 0
    }

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        #after restore always re-evaluate
        epoch_stat['best_val_err'] = validate_model(
            os.path.join(data_path, training_config['val_data']), model,
            polymath, config_file)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(
                os.path.join(data_path, training_config['val_data']), model,
                polymath, config_file)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                os.system("ls -la >> log.log")
                os.system("ls -la ./Models >> log.log")
                save_flag = True
                fail_cnt = 0
                while save_flag:
                    if fail_cnt > 100:
                        print("ERROR: failed to save models")
                        break
                    try:
                        trainer.save_checkpoint(model_file)
                        epoch_stat['record_num'] += 1
                        record_file = os.path.join(
                            model_path,
                            str(epoch_stat['record_num']) + '-' + model_name)
                        trainer.save_checkpoint(record_file)
                        save_flag = False
                    except Exception:
                        fail_cnt = fail_cnt + 1
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file,
                                                 polymath)

        minibatch_size = training_config['minibatch_size']  # number of samples
        epoch_size = training_config['epoch_size']

        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                if trainer.total_number_of_samples_seen >= training_config[
                        'distributed_after']:
                    data = mb_source.next_minibatch(
                        minibatch_size * C.Communicator.num_workers(),
                        input_map=input_map,
                        num_data_partitions=C.Communicator.num_workers(),
                        partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size,
                                                    input_map=input_map)

                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                # print_para_info(dummy, dummies_info)
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config[
            'minibatch_seqs']  # number of sequences

        for epoch in range(max_epochs):  # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs,
                                           C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count %
                        C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
Example #11
    ((0.2, 0), [0.2, 0.2, 0.2, 0.2], 0),
    (([0.2, 0.4], 0, 5), [0.2] * 5 + [0.4] * 20, 0),
    (([(3, 0.2), (2, 0.4),
       (1, 0.8)], 0, 5), [0.2] * 15 + [0.4] * 10 + [0.8] * 20, 0),
]

MOMENTUM_SCHEDULE_PARAMS = [
    ((0.2, ), [0.2]),
    ((0.2, ), [0.2, 0.2, 0.2, 0.2]),
    (([0.2, 0.4], 5), [0.2] * 5 + [0.4] * 20),
    (([(3, 0.2), (2, 0.4),
       (1, 0.8)], 5), [0.2] * 15 + [0.4] * 10 + [0.8] * 20),
]

LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params), lambda params: C.adagrad(
        params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.adam(params,
                          lr=learning_rate_schedule(1, UnitType.minibatch),
                          momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params,
                               lr=learning_rate_schedule(
                                   1, UnitType.minibatch),
                               momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params,
                              lr=learning_rate_schedule(1, UnitType.minibatch),
                              momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params,
                             lr=learning_rate_schedule(1, UnitType.minibatch),
                             gamma=0.1,
                             inc=3.0,
Example #12
def test_learner_init():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))

    res = i * w

    #test new API: learning_parameter_schedule

    #explicitly specify reference minibatch size and learning rate is in number:
    learner = sgd(res.parameters, lr=0.1, minibatch_size = 25)
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == 25 #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == 25
    assert learner.learning_rate() == 0.1

    #no explicitly specification of reference minibatch size and learning rate is in number:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1))
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1


    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20), minibatch_size = 25)
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == 25 #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1


    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20))
    assert learner.is_compatible_mode() == False
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    #no explicitly specification of reference minibatch size and learning rate is in number:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1))
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1


    #no explicitly specification of reference minibatch size and learning rate is in number:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1


    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    #no explicitly specification of reference minibatch size and learning rate is in number:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    mysgd = C.sgd(parameters=res.parameters, lr=0.4, minibatch_size=32)
    assert mysgd.minibatch_size == 32
    assert mysgd._learning_rate_schedule.minibatch_size == 32
    assert mysgd.learning_rate() == 0.4

    mymomentum = C.momentum_sgd(parameters=res.parameters, lr=0.4, momentum=0.9, minibatch_size=32)
    assert mymomentum.minibatch_size == 32
    assert mymomentum._learning_rate_schedule.minibatch_size == 32
    assert mymomentum.learning_rate() == 0.4

    myadadelta = C.adadelta(parameters=res.parameters, lr=0.4, minibatch_size=32)
    assert myadadelta.minibatch_size == 32
    assert myadadelta._learning_rate_schedule.minibatch_size == 32
    assert myadadelta.learning_rate() == 0.4

    myadam = C.adam(parameters=res.parameters, lr=0.4, momentum=0.9, variance_momentum=0.9, minibatch_size=32)
    assert myadam.minibatch_size == 32
    assert myadam._learning_rate_schedule.minibatch_size == 32
    assert myadam.learning_rate() == 0.4

    myadagrad = C.adagrad(parameters=res.parameters, lr=0.4, minibatch_size=32)
    assert myadagrad.minibatch_size == 32
    assert myadagrad._learning_rate_schedule.minibatch_size == 32
    assert myadagrad.learning_rate() == 0.4

    myfsadagrad = C.fsadagrad(parameters=res.parameters, lr=0.4, momentum=0.9, variance_momentum=0.9,
                              minibatch_size=32)
    assert myfsadagrad.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule.minibatch_size == 32
    assert myfsadagrad.learning_rate() == 0.4

    mynesterov = C.nesterov(parameters=res.parameters, lr=0.4, momentum=0.9, minibatch_size=32)
    assert mynesterov.minibatch_size == 32
    assert mynesterov._learning_rate_schedule.minibatch_size == 32
    assert mynesterov.learning_rate() == 0.4

    myrmsrop = C.rmsprop(parameters=res.parameters, lr=0.4, gamma=0.5, inc=1.2, dec=0.7, max=10, min=1e-8,
                         minibatch_size=32)
    assert myrmsrop.minibatch_size == 32
    assert myrmsrop._learning_rate_schedule.minibatch_size == 32
    assert myrmsrop.learning_rate() == 0.4

    mysgd = C.sgd(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512)
    assert mysgd.minibatch_size == 32
    assert mysgd._learning_rate_schedule.minibatch_size == 32
    assert mysgd._learning_rate_schedule[0] == 0.4
    assert mysgd._learning_rate_schedule[512] == 0.1
    assert mysgd._learning_rate_schedule[512 * 2] == 0.001

    mymomentum = C.momentum_sgd(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9],
                                minibatch_size=32, epoch_size=512)
    assert mymomentum.minibatch_size == 32
    assert mymomentum._learning_rate_schedule.minibatch_size == 32
    assert mymomentum._learning_rate_schedule[0] == 0.4
    assert mymomentum._learning_rate_schedule[512] == 0.1
    assert mymomentum._learning_rate_schedule[512 * 2] == 0.001


    myadadelta = C.adadelta(parameters=res.parameters, lr=[0.4, 0.1, 0.001],
                            minibatch_size=32, epoch_size=512)
    assert myadadelta.minibatch_size == 32
    assert myadadelta._learning_rate_schedule.minibatch_size == 32
    assert myadadelta._learning_rate_schedule[0] == 0.4
    assert myadadelta._learning_rate_schedule[512] == 0.1
    assert myadadelta._learning_rate_schedule[512 * 2] == 0.001

    myadam = C.adam(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9, 0.1, 0.001], variance_momentum=[0.9],
                    minibatch_size=32, epoch_size=512)
    assert myadam.minibatch_size == 32
    assert myadam._learning_rate_schedule.minibatch_size == 32
    assert myadam._learning_rate_schedule[0] == 0.4
    assert myadam._learning_rate_schedule[512] == 0.1
    assert myadam._learning_rate_schedule[512 * 2] == 0.001

    myadagrad = C.adagrad(parameters=res.parameters, lr=[0.4, 0.1, 0.001], minibatch_size=32, epoch_size=512)
    assert myadagrad.minibatch_size == 32
    assert myadagrad._learning_rate_schedule.minibatch_size == 32
    assert myadagrad._learning_rate_schedule[0] == 0.4
    assert myadagrad._learning_rate_schedule[512] == 0.1
    assert myadagrad._learning_rate_schedule[512 * 2] == 0.001

    myfsadagrad = C.fsadagrad(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9],
                              variance_momentum=[0.9],
                              minibatch_size=32, epoch_size=512)
    assert myfsadagrad.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule[0] == 0.4
    assert myfsadagrad._learning_rate_schedule[512] == 0.1
    assert myfsadagrad._learning_rate_schedule[512 * 2] == 0.001

    mynesterov = C.nesterov(parameters=res.parameters, lr=[0.4, 0.1, 0.001], momentum=[0.9],
                            minibatch_size=32, epoch_size=512)
    assert mynesterov.minibatch_size == 32
    assert mynesterov._learning_rate_schedule.minibatch_size == 32
    assert mynesterov._learning_rate_schedule[0] == 0.4
    assert mynesterov._learning_rate_schedule[512] == 0.1
    assert mynesterov._learning_rate_schedule[512 * 2] == 0.001

    myrmsrop = C.rmsprop(parameters=res.parameters, lr=[0.4, 0.1, 0.001], gamma=0.5, inc=1.2, dec=0.7, max=10,
                         min=1e-8,
                         minibatch_size=32, epoch_size=512)
    assert myrmsrop.minibatch_size == 32
    assert myrmsrop._learning_rate_schedule.minibatch_size == 32
    assert myrmsrop._learning_rate_schedule[0] == 0.4
    assert myrmsrop._learning_rate_schedule[512] == 0.1
    assert myrmsrop._learning_rate_schedule[512 * 2] == 0.001

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    momentum = C.momentum_schedule(0.999, minibatch_size=1)
    lr_per_sample = learning_parameter_schedule(0.1, minibatch_size = 1)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size = 1)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum)
    C.nesterov(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum, unit_gain=unit_gain_value)

    lr_per_sample = learning_parameter_schedule([0.1]*3 +[0.2]*2 +[0.3], minibatch_size=1)
    C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True)

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_parameter_schedule([(3,0.1), (2, 0.2), (1, 0.3)], minibatch_size=1)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum)
    C.fsadagrad(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum, unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8]
    lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size = 1, epoch_size = 100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.adadelta(res.parameters, lr_per_sample)
Example #13
        #not specifying reference mb sizes
        ((0.2, 0), [0.2], 0),
        ((0.2, 0), [0.2, 0.2, 0.2, 0.2], 0),
        (([0.2,0.4], 0, 5), [0.2]*5+[0.4]*20, 0),
        (([(3,0.2),(2,0.4),(1,0.8)], 0, 5), [0.2]*15+[0.4]*10+[0.8]*20, 0),
        ]

MOMENTUM_SCHEDULE_PARAMS = [
        ((0.2,), [0.2]),
        ((0.2,), [0.2, 0.2, 0.2, 0.2]),
        (([0.2,0.4], 5), [0.2]*5+[0.4]*20),
        (([(3,0.2),(2,0.4),(1,0.8)], 5), [0.2]*15+[0.4]*10+[0.8]*20),
        ]
        
LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params),
    lambda params: C.adagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.adam(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params, lr=learning_rate_schedule(1, UnitType.minibatch), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8),
    lambda params: C.sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.momentum_sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9))]

@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS_LEGACY)
def test_learning_rate_schedule(params, expectation, minibatch_size):
    l = learning_rate_schedule(*params)
    assert l.minibatch_size == minibatch_size
    assert [l[i] for i in range(len(expectation))] == expectation

@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS)
Example #14
def test_learner_init():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))

    res = i * w

    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample))
    assert learner.learning_rate() == 0.1
    
    learner.reset_learning_rate(learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_rate_schedule(0.1, UnitType.sample)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    lr_per_sample = learning_rate_schedule([0.1, 0.2], UnitType.sample)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant)
    C.nesterov(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant, unit_gain=unit_gain_value)

    lr_per_sample = learning_rate_schedule([0.1]*3 +[0.2]*2 +[0.3], UnitType.sample)
    C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True)

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_rate_schedule([(3,0.1), (2, 0.2), (1, 0.3)], UnitType.sample)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant)
    C.fsadagrad(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant, unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.1]*5
    lr_per_sample = learning_rate_schedule([0.1, 0.2], UnitType.sample, 100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.set_default_use_mean_gradient_value(False)
    use_mean_gradient_value = C.default_use_mean_gradient_value()
    assert not use_mean_gradient_value

    C.adadelta(res.parameters, lr_per_sample)
    
    C.set_default_use_mean_gradient_value(True)
    use_mean_gradient_value = C.default_use_mean_gradient_value()
    assert use_mean_gradient_value

    C.adadelta(res.parameters, lr_per_sample)
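
Example #14 exercises the legacy UnitType-based schedule API. For comparison, the newer
API used in Example #12 expresses the same per-sample schedule via
learning_parameter_schedule with minibatch_size=1; a minimal sketch with illustrative
values:

import cntk as C

i = C.input_variable(1)
w = C.parameter(shape=(1,))
res = i * w

# legacy:  learning_rate_schedule(0.1, UnitType.sample)
# newer:   a schedule whose reference minibatch size is a single sample
lr_per_sample = C.learning_parameter_schedule(0.1, minibatch_size=1)
learner = C.adadelta(res.parameters, lr_per_sample)
assert learner.learning_rate() == 0.1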
Example #15
    ((0.2, UnitType.sample), [0.2, 0.2, 0.2, 0.2]),
    (([0.2, 0.4], UnitType.sample, 5), [0.2] * 5 + [0.4] * 20),
    (([(3, 0.2), (2, 0.4),
       (1, 0.8)], UnitType.sample, 5), [0.2] * 15 + [0.4] * 10 + [0.8] * 20),
]

MOMENTUM_SCHEDULE_PARAMS = [
    ((0.2, ), [0.2]),
    ((0.2, ), [0.2, 0.2, 0.2, 0.2]),
    (([0.2, 0.4], 5), [0.2] * 5 + [0.4] * 20),
    (([(3, 0.2), (2, 0.4),
       (1, 0.8)], 5), [0.2] * 15 + [0.4] * 10 + [0.8] * 20),
]

LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params), lambda params: C.adagrad(
        params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.adam(params,
                          lr=learning_rate_schedule(1, UnitType.minibatch),
                          momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params,
                               lr=learning_rate_schedule(
                                   1, UnitType.minibatch),
                               momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params,
                              lr=learning_rate_schedule(1, UnitType.minibatch),
                              momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params,
                             lr=learning_rate_schedule(1, UnitType.minibatch),
                             gamma=0.1,
                             inc=3.0,
Example #16
def create_learner(paras, lr):
    import cntk as C
    return C.adadelta(paras, lr, rho=0.95, epsilon=1e-6,
                      l2_regularization_weight=1.0,
                      gaussian_noise_injection_std_dev=0,
                      gradient_clipping_threshold_per_sample=3.0)
Example #17
def test_learner_init_legacy():
    i = C.input_variable(shape=(1, ), needs_gradient=True, name='a')
    w = parameter(shape=(1, ))

    res = i * w

    # for backcompatibility test
    # this will be deprecated in future version
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.sample))
    assert learner._learning_rate_schedule.minibatch_size == 1  # the deprecated per sample schedule should not use compatible mode
    assert learner.learning_rate() == 0.1

    # for backcompatibility test
    # this will be deprecated in future version
    # The UnitType will provide per minibatch instruction for the learner
    # this will be deprecated in future version
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.minibatch))
    assert learner.is_compatible_mode() == False
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == 0

    # for backcompatibility test, in reset learning rate, the learner won't receive the reference minibatch size from the schedule
    # user will need to specify the reference minibatch size explicitly
    # this will be deprecated in future version
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(
        learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    # for backcompatibility test
    # this will be deprecated in future version
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.sample),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size is still 0

    # this will be deprecated in a future version: this is a logically invalid combination, but it was the only way to use mean gradient and set the learning rate in the past.
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.sample),
                  use_mean_gradient=True)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    #test the override in the new version
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size is still 0

    # for backcompatibility test
    # this will be deprecated in future version
    # The UnitType will provide per minibatch instruction for the learner
    # this will be deprecated in future version
    learner = sgd(res.parameters,
                  lr=learning_rate_schedule(0.1, UnitType.minibatch),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE

    # for backcompatibility test, in reset learning rate, the learner won't receive the reference minibatch size from the schedule
    # user will need to specify the reference minibatch size explicitly
    # this will be deprecated in future version
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(
        learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    # back compatible API test
    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_parameter_schedule(0.1, minibatch_size=1)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant,
                   unit_gain_value)
    C.momentum_sgd(res.parameters,
                   lr_per_sample,
                   momentum_time_constant,
                   unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_rate_schedule([(3, 0.1), (2, 0.2), (1, 0.3)],
                                           unit=UnitType.sample)
    C.fsadagrad(res.parameters,
                lr=lr_per_sample,
                momentum=momentum_time_constant)
    C.fsadagrad(res.parameters, lr_per_sample, momentum_time_constant,
                unit_gain_value)
    C.fsadagrad(res.parameters,
                lr=lr_per_sample,
                momentum=momentum_time_constant,
                unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8]
    lr_per_sample = learning_rate_schedule([0.1, 0.2],
                                           unit=UnitType.sample,
                                           epoch_size=100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.adadelta(res.parameters, lr_per_sample, use_mean_gradient=True)
Example #18
	def create_model(self):
		model1i = C.layers.Sequential([
			# Convolution layers
			C.layers.Convolution2D((1,3), num_filters=8, pad=True, reduction_rank=0, activation=C.ops.tanh,name='conv_f'),
			C.layers.Convolution2D((1,3), num_filters=16, pad=True, reduction_rank=1, activation=C.ops.tanh,name='conv2_f'),
			C.layers.Convolution2D((1,3), num_filters=32, strides=(1,3), pad=False, reduction_rank=1, activation=C.ops.tanh,name='conv3_f'),
			######
			# Dense layers
			C.layers.Dense(32, activation=C.ops.relu,name='dense1_f'),
			#C.layers.Dense(32, activation=C.ops.relu,name='dense1_f'),
			#C.layers.Dense(16, activation=C.ops.relu,name='dense1_f')
		]) (self._input)

		### target
		model1t = C.layers.Sequential([
			#C.layers.Dense(16, activation=C.ops.relu,name='dense2_f'),
			C.layers.Dense(32, activation=C.ops.relu,name='dense3_f')
		]) (self._target)

		### concatenate both processed target and observations
		inputs = C.ops.splice(model1i,model1t)

		### Use input to predict next hidden state, and generate
		### next observation
		model1 = C.layers.Sequential([
			C.layers.Dense(64, activation=C.ops.relu),
			######
			# Recurrence
			C.layers.Recurrence(C.layers.LSTM(2048, init=C.glorot_uniform()),name='lstm_f'),
			######
			# Prediction
			#C.layers.Dense(16, activation=C.ops.relu,name='predict'),
			######
			# Decoder layers
			C.layers.Dense(256, activation=C.ops.relu,name='dense4_f'),
			#C.layers.Dense(64, activation=C.ops.relu,name='dense2'),
			C.layers.Dense(114, activation=C.ops.relu,name='dense5_f')
		])(inputs)

		######
		# Reshape output
		model2 = C.ops.reshape(model1,(1,1,114))

		model3 = C.layers.Sequential([
			######
			# Deconvolution layers
			C.layers.ConvolutionTranspose((1,7), num_filters=8, strides=(1,1), pad=False, bias=False, init=C.glorot_uniform(1),name='deconv1'),
			C.layers.ConvolutionTranspose((1,3), num_filters=4, strides=(1,3), pad=False, bias=False, init=C.glorot_uniform(1),name='deconv2'),
			C.layers.ConvolutionTranspose((1,3), num_filters=1,  pad=True,name='deconv3')
		])(model2)

		model = C.ops.reshape(model3,(1,360))

		if self._load_model:
			model = C.load_model('dnns/feature_predicter_ours.dnn')
		
		err = C.ops.reshape(C.ops.minus(model,self._output), (self._output_size))
		sq_err = C.ops.square(err)
		mse = C.ops.reduce_mean(sq_err)
		rmse_loss = C.ops.sqrt(mse)
		rmse_eval = rmse_loss

		learner = C.adadelta(model.parameters)
		progress_printer = C.logging.ProgressPrinter(tag='Training')
		trainer = C.Trainer(model, (rmse_loss,rmse_eval), learner, progress_printer)
		return model, rmse_loss, learner, trainer
Example #19
def test_learner_init():
    i = C.input_variable(shape=(1, ), needs_gradient=True, name='a')
    w = parameter(shape=(1, ))

    res = i * w

    #test new API: learning_parameter_schedule

    # explicitly specify the reference minibatch size; the learning rate is given as a plain number:
    learner = sgd(res.parameters, lr=0.1, minibatch_size=25)
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == 25  #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == 25
    assert learner.learning_rate() == 0.1

    # no explicit reference minibatch size specified; the learning rate is given as a number:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1))
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == C.learners.IGNORE  #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters,
                  lr=learning_parameter_schedule(0.1, 20),
                  minibatch_size=25)
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == 25  #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1, 20))
    assert learner.is_compatible_mode() == False
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    # no explicit reference minibatch size specified; the learning rate is given as a number:
    learner = sgd(res.parameters, lr=learning_parameter_schedule(0.1))
    assert learner.is_compatible_mode() == False
    assert learner.minibatch_size == C.learners.IGNORE  #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    # no explicit reference minibatch size specified; the learning rate is given as a number:
    learner = sgd(res.parameters,
                  lr=learning_parameter_schedule(0.1),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE  #the learner's reference minibatch
    #with direct learner learning rate number specification, the learning rate schedule get the reference minibatch size from the learner parameters:
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1

    learner = sgd(res.parameters,
                  lr=learning_parameter_schedule(0.1, 20),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size
    # the schedule keeps the reference minibatch size (20) it was created with:
    assert learner._learning_rate_schedule.minibatch_size == 20
    assert learner.learning_rate() == 0.1

    # the compatible-mode case again: minibatch_size=C.learners.IGNORE is passed to the learner:
    learner = sgd(res.parameters,
                  lr=learning_parameter_schedule(0.1),
                  minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size
    # the schedule has no reference minibatch size of its own, so it also reports IGNORE:
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.learning_rate() == 0.1
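    # Summary of the cases above: a learner is in "compatible mode" only when
    # minibatch_size=C.learners.IGNORE is passed to the learner itself, and a reference
    # minibatch size set on the schedule (e.g. 20) is kept by the schedule regardless
    # of what the learner specifies.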

    mysgd = C.sgd(parameters=res.parameters, lr=0.4, minibatch_size=32)
    assert mysgd.minibatch_size == 32
    assert mysgd._learning_rate_schedule.minibatch_size == 32
    assert mysgd.learning_rate() == 0.4

    mymomentum = C.momentum_sgd(parameters=res.parameters,
                                lr=0.4,
                                momentum=0.9,
                                minibatch_size=32)
    assert mymomentum.minibatch_size == 32
    assert mymomentum._learning_rate_schedule.minibatch_size == 32
    assert mymomentum.learning_rate() == 0.4

    myadadelta = C.adadelta(parameters=res.parameters,
                            lr=0.4,
                            minibatch_size=32)
    assert myadadelta.minibatch_size == 32
    assert myadadelta._learning_rate_schedule.minibatch_size == 32
    assert myadadelta.learning_rate() == 0.4

    myadam = C.adam(parameters=res.parameters,
                    lr=0.4,
                    momentum=0.9,
                    variance_momentum=0.9,
                    minibatch_size=32)
    assert myadam.minibatch_size == 32
    assert myadam._learning_rate_schedule.minibatch_size == 32
    assert myadam.learning_rate() == 0.4

    myadagrad = C.adagrad(parameters=res.parameters, lr=0.4, minibatch_size=32)
    assert myadagrad.minibatch_size == 32
    assert myadagrad._learning_rate_schedule.minibatch_size == 32
    assert myadagrad.learning_rate() == 0.4

    myfsadagrad = C.fsadagrad(parameters=res.parameters,
                              lr=0.4,
                              momentum=0.9,
                              variance_momentum=0.9,
                              minibatch_size=32)
    assert myfsadagrad.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule.minibatch_size == 32
    assert myfsadagrad.learning_rate() == 0.4

    mynesterov = C.nesterov(parameters=res.parameters,
                            lr=0.4,
                            momentum=0.9,
                            minibatch_size=32)
    assert mynesterov.minibatch_size == 32
    assert mynesterov._learning_rate_schedule.minibatch_size == 32
    assert mynesterov.learning_rate() == 0.4

    myrmsprop = C.rmsprop(parameters=res.parameters,
                          lr=0.4,
                          gamma=0.5,
                          inc=1.2,
                          dec=0.7,
                          max=10,
                          min=1e-8,
                          minibatch_size=32)
    assert myrmsprop.minibatch_size == 32
    assert myrmsprop._learning_rate_schedule.minibatch_size == 32
    assert myrmsprop.learning_rate() == 0.4

    mysgd = C.sgd(parameters=res.parameters,
                  lr=[0.4, 0.1, 0.001],
                  minibatch_size=32,
                  epoch_size=512)
    assert mysgd.minibatch_size == 32
    assert mysgd._learning_rate_schedule.minibatch_size == 32
    assert mysgd._learning_rate_schedule[0] == 0.4
    assert mysgd._learning_rate_schedule[512] == 0.1
    assert mysgd._learning_rate_schedule[512 * 2] == 0.001

    mymomentum = C.momentum_sgd(parameters=res.parameters,
                                lr=[0.4, 0.1, 0.001],
                                momentum=[0.9],
                                minibatch_size=32,
                                epoch_size=512)
    assert mymomentum.minibatch_size == 32
    assert mymomentum._learning_rate_schedule.minibatch_size == 32
    assert mymomentum._learning_rate_schedule[0] == 0.4
    assert mymomentum._learning_rate_schedule[512] == 0.1
    assert mymomentum._learning_rate_schedule[512 * 2] == 0.001

    myadadelta = C.adadelta(parameters=res.parameters,
                            lr=[0.4, 0.1, 0.001],
                            minibatch_size=32,
                            epoch_size=512)
    assert myadadelta.minibatch_size == 32
    assert myadadelta._learning_rate_schedule.minibatch_size == 32
    assert myadadelta._learning_rate_schedule[0] == 0.4
    assert myadadelta._learning_rate_schedule[512] == 0.1
    assert myadadelta._learning_rate_schedule[512 * 2] == 0.001

    myadam = C.adam(parameters=res.parameters,
                    lr=[0.4, 0.1, 0.001],
                    momentum=[0.9, 0.1, 0.001],
                    variance_momentum=[0.9],
                    minibatch_size=32,
                    epoch_size=512)
    assert myadam.minibatch_size == 32
    assert myadam._learning_rate_schedule.minibatch_size == 32
    assert myadam._learning_rate_schedule[0] == 0.4
    assert myadam._learning_rate_schedule[512] == 0.1
    assert myadam._learning_rate_schedule[512 * 2] == 0.001

    myadagrad = C.adagrad(parameters=res.parameters,
                          lr=[0.4, 0.1, 0.001],
                          minibatch_size=32,
                          epoch_size=512)
    assert myadagrad.minibatch_size == 32
    assert myadagrad._learning_rate_schedule.minibatch_size == 32
    assert myadagrad._learning_rate_schedule[0] == 0.4
    assert myadagrad._learning_rate_schedule[512] == 0.1
    assert myadagrad._learning_rate_schedule[512 * 2] == 0.001

    myfsadagrad = C.fsadagrad(parameters=res.parameters,
                              lr=[0.4, 0.1, 0.001],
                              momentum=[0.9],
                              variance_momentum=[0.9],
                              minibatch_size=32,
                              epoch_size=512)
    assert myfsadagrad.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule.minibatch_size == 32
    assert myfsadagrad._learning_rate_schedule[0] == 0.4
    assert myfsadagrad._learning_rate_schedule[512] == 0.1
    assert myfsadagrad._learning_rate_schedule[512 * 2] == 0.001

    mynesterov = C.nesterov(parameters=res.parameters,
                            lr=[0.4, 0.1, 0.001],
                            momentum=[0.9],
                            minibatch_size=32,
                            epoch_size=512)
    assert mynesterov.minibatch_size == 32
    assert mynesterov._learning_rate_schedule.minibatch_size == 32
    assert mynesterov._learning_rate_schedule[0] == 0.4
    assert mynesterov._learning_rate_schedule[512] == 0.1
    assert mynesterov._learning_rate_schedule[512 * 2] == 0.001

    myrmsprop = C.rmsprop(parameters=res.parameters,
                          lr=[0.4, 0.1, 0.001],
                          gamma=0.5,
                          inc=1.2,
                          dec=0.7,
                          max=10,
                          min=1e-8,
                          minibatch_size=32,
                          epoch_size=512)
    assert myrmsprop.minibatch_size == 32
    assert myrmsprop._learning_rate_schedule.minibatch_size == 32
    assert myrmsprop._learning_rate_schedule[0] == 0.4
    assert myrmsprop._learning_rate_schedule[512] == 0.1
    assert myrmsprop._learning_rate_schedule[512 * 2] == 0.001

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    momentum = C.momentum_schedule(0.999, minibatch_size=1)
    lr_per_sample = learning_parameter_schedule(0.1, minibatch_size=1)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.momentum_sgd(res.parameters,
                   lr_per_sample,
                   momentum,
                   unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    lr_per_sample = learning_parameter_schedule([0.1, 0.2], minibatch_size=1)
    C.nesterov(res.parameters, lr=lr_per_sample, momentum=momentum)
    C.nesterov(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.nesterov(res.parameters,
               lr=lr_per_sample,
               momentum=momentum,
               unit_gain=unit_gain_value)

    lr_per_sample = learning_parameter_schedule([0.1] * 3 + [0.2] * 2 + [0.3],
                                                minibatch_size=1)
    C.adagrad(res.parameters, lr=lr_per_sample, need_ave_multiplier=True)

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_parameter_schedule([(3, 0.1), (2, 0.2), (1, 0.3)],
                                                minibatch_size=1)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum)
    C.fsadagrad(res.parameters, lr_per_sample, momentum, unit_gain_value)
    C.fsadagrad(res.parameters,
                lr=lr_per_sample,
                momentum=momentum,
                unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8]
    lr_per_sample = learning_parameter_schedule([0.1, 0.2],
                                                minibatch_size=1,
                                                epoch_size=100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.adadelta(res.parameters, lr_per_sample)
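The asserts above only inspect learner properties. A minimal sketch of actually driving one of these learners with a Trainer on toy data (everything below is illustrative and not part of the test) could look like this:

import numpy as np
import cntk as C

x = C.input_variable(shape=(1,), name='x')
y = C.input_variable(shape=(1,), name='y')
w = C.parameter(shape=(1,), init=0.0)
pred = x * w
se = C.squared_error(pred, y)

toy_learner = C.sgd(pred.parameters, lr=0.1, minibatch_size=25)
toy_trainer = C.Trainer(pred, (se, se), [toy_learner])
for _ in range(20):
    xs = np.random.rand(25, 1).astype(np.float32)
    toy_trainer.train_minibatch({x: xs, y: 2 * xs})   # fit y = 2x
print(w.value)   # should approach 2.0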
Example #20
0
        #not specifying reference mb sizes
        ((0.2, 0), [0.2], 0),
        ((0.2, 0), [0.2, 0.2, 0.2, 0.2], 0),
        (([0.2,0.4], 0, 5), [0.2]*5+[0.4]*20, 0),
        (([(3,0.2),(2,0.4),(1,0.8)], 0, 5), [0.2]*15+[0.4]*10+[0.8]*20, 0),
        ]

MOMENTUM_SCHEDULE_PARAMS = [
        ((0.2,), [0.2]),
        ((0.2,), [0.2, 0.2, 0.2, 0.2]),
        (([0.2,0.4], 5), [0.2]*5+[0.4]*20),
        (([(3,0.2),(2,0.4),(1,0.8)], 5), [0.2]*15+[0.4]*10+[0.8]*20),
        ]

LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params),
    lambda params: C.adagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.adam(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params, lr=learning_rate_schedule(1, UnitType.minibatch), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8),
    lambda params: C.sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.momentum_sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9))]
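
A sketch of how LEARNER_LAMBDAS might be consumed in a parametrized test; the test body below is an assumption, not taken from the source:

@pytest.mark.parametrize("learner_factory", LEARNER_LAMBDAS)
def test_learner_lambda_creates_learner(learner_factory):
    p = C.parameter(shape=(1,), init=1.0)
    learner = learner_factory([p])
    # each factory should return a learner bound to exactly the parameter we passed in
    assert len(learner.parameters) == 1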

@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS_LEGACY)
def test_learning_rate_schedule(params, expectation, minibatch_size):
    l = learning_rate_schedule(*params)
    assert l.minibatch_size == minibatch_size
    assert [l[i] for i in range(len(expectation))] == expectation

@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS)
def train(data_path,
          model_path,
          log_file,
          config_file,
          restore=False,
          profiling=False,
          gen_heartbeat=False):
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    minibatch_size = training_config['minibatch_size']
    max_epochs = training_config['max_epochs']
    epoch_size = training_config['epoch_size']
    log_freq = training_config['log_freq']

    progress_writers = [
        C.logging.ProgressPrinter(num_epochs=max_epochs,
                                  freq=log_freq,
                                  tag='Training',
                                  log_to_file=log_file,
                                  rank=C.Communicator.rank(),
                                  gen_heartbeat=gen_heartbeat)
    ]

    lr = C.learning_parameter_schedule(training_config['lr'],
                                       minibatch_size=None,
                                       epoch_size=None)

    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0,
                           shape=p.shape,
                           dtype=p.dtype,
                           name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p,
                                             0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)
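    # Evaluating `dummy` after each minibatch (see the training loops below) runs the
    # C.assign ops above, i.e. it updates the exponential moving average (EMA) copies
    # of the parameters that post_epoch_work() temporarily swaps in for validation.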

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner,
                                                      num_quantization_bits=1)

    trainer = C.Trainer(z, (loss, None), learner, progress_writers)

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {'best_val_err': 100, 'best_since': 0, 'val_since': 0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        epoch_stat['best_val_err'] = validate_model(
            os.path.join(data_path, training_config['val_data']), model,
            polymath)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(
                os.path.join(data_path, training_config['val_data']), model,
                polymath)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                trainer.save_checkpoint(model_file)
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file,
                                                 polymath)

        for epoch in range(max_epochs):
            num_seq = 0
            with tqdm(total=epoch_size, ncols=32,
                      smoothing=0.1) as progress_bar:
                while True:
                    if trainer.total_number_of_samples_seen >= training_config[
                            'distributed_after']:
                        data = mb_source.next_minibatch(
                            minibatch_size * C.Communicator.num_workers(),
                            input_map=input_map,
                            num_data_partitions=C.Communicator.num_workers(),
                            partition_index=C.Communicator.rank())
                    else:
                        data = mb_source.next_minibatch(minibatch_size,
                                                        input_map=input_map)

                    trainer.train_minibatch(data)
                    num_seq += trainer.previous_minibatch_sample_count
                    dummy.eval()
                    if num_seq >= epoch_size:
                        break
                    else:
                        progress_bar.update(
                            trainer.previous_minibatch_sample_count)
                if not post_epoch_work(epoch_stat):
                    break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config[
            'minibatch_seqs']  # number of sequences

        for epoch in range(max_epochs):  # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath,
                                           minibatch_seqs,
                                           C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count %
                        C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data)  # update model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
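
`create_mb_and_map`, `create_tsv_reader`, `validate_model`, `argument_by_name`, and `model_name` are defined elsewhere in the original script. Purely as an illustration, a minimal `argument_by_name` helper consistent with how it is called above might look like this (an assumption, not the original implementation):

def argument_by_name(func, name):
    # pick out the single argument of a CNTK Function whose name matches `name`
    found = [arg for arg in func.arguments if arg.name == name]
    if len(found) != 1:
        raise ValueError('expected exactly one argument named "%s", found %d' % (name, len(found)))
    return found[0]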
Example #22
0
#input = np.array(input_map)

#input = C.input_variable(1000)
#label = C.input_variable(num_output_classes)

z = create_model(input)


loss = C.cross_entropy_with_softmax(z, label)

label_error = C.squared_error(z, label)

# Instantiate the trainer object to drive the model training
learning_rate = 0.8
lr_schedule = C.learning_parameter_schedule(learning_rate)
learner = C.adadelta(z.parameters, lr_schedule)
trainer = C.Trainer(z, (loss, label_error), [learner])

# Define a utility function to compute the moving average.
# A more efficient implementation is possible with the np.cumsum() function.
def moving_average(a, w=5):
    if len(a) < w:
        return a[:]    # Need to send a copy of the array
    return [val if idx < w else sum(a[(idx-w):idx])/w for idx, val in enumerate(a)]
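# For example (an illustrative call, not part of the original script):
#   moving_average([1, 2, 3, 4, 5, 6], w=3)  ->  [1, 2, 3, 2.0, 3.0, 4.0]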


# Defines a utility that prints the training progress
def print_training_progress(trainer, mb, frequency, verbose=1):
    loss = "NA"
    error = "NA"
Example #23
0
        #not specifying reference mb sizes
        ((0.2, 0), [0.2], 0),
        ((0.2, 0), [0.2, 0.2, 0.2, 0.2], 0),
        (([0.2,0.4], 0, 5), [0.2]*5+[0.4]*20, 0),
        (([(3,0.2),(2,0.4),(1,0.8)], 0, 5), [0.2]*15+[0.4]*10+[0.8]*20, 0),
        ]

MOMENTUM_SCHEDULE_PARAMS = [
        ((0.2,), [0.2]),
        ((0.2,), [0.2, 0.2, 0.2, 0.2]),
        (([0.2,0.4], 5), [0.2]*5+[0.4]*20),
        (([(3,0.2),(2,0.4),(1,0.8)], 5), [0.2]*15+[0.4]*10+[0.8]*20),
        ]

LEARNER_LAMBDAS = [
    lambda params: C.adadelta(params),
    lambda params: C.adagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.adam(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.fsadagrad(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.nesterov(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9)),
    lambda params: C.rmsprop(params, lr=learning_rate_schedule(1, UnitType.minibatch), gamma=0.1, inc=3.0, dec=0.1, max=np.inf, min=1e-8),
    lambda params: C.sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch)),
    lambda params: C.momentum_sgd(params, lr=learning_rate_schedule(1, UnitType.minibatch), momentum=C.momentum_schedule(0.9))]

@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS_LEGACY)
def test_learning_rate_schedule(params, expectation, minibatch_size):
    l = learning_rate_schedule(*params)
    assert l.minibatch_size == minibatch_size
    assert [l[i] for i in range(len(expectation))] == expectation

@pytest.mark.parametrize("params, expectation, minibatch_size", LR_SCHEDULE_PARAMS)
Example #24
0
def test_learner_init_legacy():
    i = C.input_variable(shape=(1,), needs_gradient=True, name='a')
    w = parameter(shape=(1,))

    res = i * w

    # backward-compatibility test
    # this will be deprecated in a future version
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample))
    assert learner._learning_rate_schedule.minibatch_size == 1  # the deprecated per sample schedule should not use compatible mode
    assert learner.learning_rate() == 0.1

    # backward-compatibility test; this will be deprecated in a future version
    # UnitType.minibatch tells the learner to treat the rate as a per-minibatch rate
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.minibatch))
    assert learner.is_compatible_mode() == False
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == 0

    # backward-compatibility test: when the learning rate is reset, the learner does not pick up
    # the reference minibatch size from the schedule; the user must specify it explicitly
    # this will be deprecated in a future version
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    # backward-compatibility test
    # this will be deprecated in a future version
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size is still 0

    # this will be deprecated in a future version: this is a logically invalid combination, but it was the only way to use the mean gradient and set a learning rate in the past.
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.sample), use_mean_gradient=True)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    #test the override in the new version
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.minibatch_size == C.learners.IGNORE  # the learner's reference minibatch size is still 0


    # backward-compatibility test; this will be deprecated in a future version
    # UnitType.minibatch tells the learner to treat the rate as a per-minibatch rate
    learner = sgd(res.parameters, lr=learning_rate_schedule(0.1, UnitType.minibatch), minibatch_size=C.learners.IGNORE)
    assert learner.is_compatible_mode() == True
    assert learner.learning_rate() == 0.1
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE

    # backward-compatibility test: when the learning rate is reset, the learner does not pick up
    # the reference minibatch size from the schedule; the user must specify it explicitly
    # this will be deprecated in a future version
    learner = sgd(res.parameters, lr=0.1)
    learner.reset_learning_rate(learning_rate_schedule([1, 2, 3], UnitType.minibatch))
    assert learner.learning_rate() == 1.0
    learner.minibatch_size = C.learners.IGNORE  # reset to be per minibatch
    assert learner.minibatch_size == C.learners.IGNORE
    assert learner._learning_rate_schedule.minibatch_size == C.learners.IGNORE
    assert learner.is_compatible_mode() == True

    learner_parameter = learner.parameters
    from cntk.variables import Parameter
    param = learner_parameter[0]
    assert isinstance(param, Parameter)

    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    # backward-compatible API test
    momentum_time_constant = C.momentum_as_time_constant_schedule(1100)
    lr_per_sample = learning_parameter_schedule(0.1, minibatch_size=1)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.momentum_sgd(res.parameters, lr_per_sample, momentum_time_constant, unit_gain=unit_gain_value)

    C.set_default_unit_gain_value(False)
    unit_gain_value = C.default_unit_gain_value()
    assert not unit_gain_value

    C.set_default_unit_gain_value(True)
    unit_gain_value = C.default_unit_gain_value()
    assert unit_gain_value

    lr_per_sample = learning_rate_schedule([(3, 0.1), (2, 0.2), (1, 0.3)], unit=UnitType.sample)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant)
    C.fsadagrad(res.parameters, lr_per_sample, momentum_time_constant, unit_gain_value)
    C.fsadagrad(res.parameters, lr=lr_per_sample, momentum=momentum_time_constant, unit_gain=unit_gain_value)

    gamma, inc, dec, max, min = [0.5, 1.2, 0.7, 10, 1e-8]
    lr_per_sample = learning_rate_schedule([0.1, 0.2], unit=UnitType.sample, epoch_size=100)
    C.rmsprop(res.parameters, lr_per_sample, gamma, inc, dec, max, min, True)

    C.adadelta(res.parameters, lr_per_sample, use_mean_gradient=True)
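
    # For comparison with the new API exercised in test_learner_init above, the legacy
    # schedules used here map roughly as follows (an illustrative sketch, not asserted by this test):
    lr_legacy_sample = learning_rate_schedule(0.1, UnitType.sample)
    lr_new_sample = learning_parameter_schedule(0.1, minibatch_size=1)                        # rate per sample
    lr_legacy_minibatch = learning_rate_schedule(0.1, UnitType.minibatch)
    lr_new_minibatch = learning_parameter_schedule(0.1, minibatch_size=C.learners.IGNORE)     # rate per whole minibatch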
    def create_model(self):
        modeli = C.layers.Sequential([
            # Convolution layers
            C.layers.Convolution2D((1, 3),
                                   num_filters=8,
                                   pad=True,
                                   reduction_rank=0,
                                   activation=C.ops.tanh,
                                   name='conv_a'),
            C.layers.Convolution2D((1, 3),
                                   num_filters=16,
                                   pad=True,
                                   reduction_rank=1,
                                   activation=C.ops.tanh,
                                   name='conv2_a'),
            C.layers.Convolution2D((1, 3),
                                   num_filters=32,
                                   pad=False,
                                   reduction_rank=1,
                                   activation=C.ops.tanh,
                                   name='conv3_a'),
            ######
            # Dense layers
            #C.layers.Dense(128, activation=C.ops.relu,name='dense1_a'),
            #C.layers.Dense(64, activation=C.ops.relu,name='dense2_a'),
            C.layers.Dense(361, activation=C.ops.relu, name='dense3_a')
        ])(self._input)
        ### target
        modelt = C.layers.Sequential(
            [C.layers.Dense(360, activation=C.ops.relu,
                            name='dense4_a')])(self._target)
        ### concatenate both processed target and observations
        inputs = C.ops.splice(modeli, modelt)
        ### Use input to predict next hidden state, and generate
        ### next observation
        model = C.layers.Sequential([
            ######
            C.layers.Dense(720, activation=C.ops.relu, name='dense5_a'),
            # Recurrence
            C.layers.Recurrence(C.layers.LSTM(2048, init=C.glorot_uniform()),
                                name='lstm_a'),
            C.layers.Dense(1024, activation=None)
        ])(inputs)
        ######
        # Prediction
        direction = C.layers.Sequential([
            C.layers.Dense(720, activation=None, name='dense6_a'),
            C.layers.Dense(360, activation=C.ops.softmax, name='dense7_a')
        ])(model)
        velocity = C.layers.Sequential([
            C.layers.Dense(128, activation=C.ops.relu),
            C.layers.Dense(64, activation=None),
            C.layers.Dense(1, activation=None)
        ])(model)
        model = C.ops.splice(direction, velocity)

        if self._load_model:
            model = C.load_model('dnns/action_predicter_f.dnn')
            direction = model[0:360]
            velocity = model[360]

        print(model)
        loss = C.squared_error(direction, self._output) + C.squared_error(
            velocity, self._output_velocity)
        error = C.classification_error(direction,
                                       self._output) + C.squared_error(
                                           velocity, self._output_velocity)
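
        # the direction head ends in a 360-way softmax, trained here with squared error and
        # evaluated with classification_error; the velocity head is a scalar regression
        # trained and evaluated with squared error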

        learner = C.adadelta(model.parameters, l2_regularization_weight=0.001)
        progress_printer = C.logging.ProgressPrinter(tag='Training')
        trainer = C.Trainer(model, (loss, error), learner, progress_printer)
        return model, loss, learner, trainer
Example #26
0
def train(data_path, model_path, log_file, config_file, restore=False, profiling=False, gen_heartbeat=False):
    polymath = PolyMath(config_file)
    z, loss = polymath.model()
    training_config = importlib.import_module(config_file).training_config

    max_epochs = training_config['max_epochs']
    log_freq = training_config['log_freq']

    progress_writers = [C.logging.ProgressPrinter(
                            num_epochs = max_epochs,
                            freq = log_freq,
                            tag = 'Training',
                            log_to_file = log_file,
                            rank = C.Communicator.rank(),
                            gen_heartbeat = gen_heartbeat)]

    lr = C.learning_parameter_schedule(training_config['lr'], minibatch_size=None, epoch_size=None)

    ema = {}
    dummies = []
    for p in z.parameters:
        ema_p = C.constant(0, shape=p.shape, dtype=p.dtype, name='ema_%s' % p.uid)
        ema[p.uid] = ema_p
        dummies.append(C.reduce_sum(C.assign(ema_p, 0.999 * ema_p + 0.001 * p)))
    dummy = C.combine(dummies)

    learner = C.adadelta(z.parameters, lr)

    if C.Communicator.num_workers() > 1:
        learner = C.data_parallel_distributed_learner(learner)

    tensorboard_writer = TensorBoardProgressWriter(freq=10, log_dir='log', model=z)
    trainer = C.Trainer(z, (loss, None), learner, progress_writers + [tensorboard_writer])

    if profiling:
        C.debugging.start_profiler(sync_gpu=True)

    train_data_file = os.path.join(data_path, training_config['train_data'])
    train_data_ext = os.path.splitext(train_data_file)[-1].lower()

    model_file = os.path.join(model_path, model_name)
    model = C.combine(list(z.outputs) + [loss.output])
    label_ab = argument_by_name(loss, 'ab')

    epoch_stat = {
        'best_val_err' : 100,
        'best_since'   : 0,
        'val_since'    : 0}

    if restore and os.path.isfile(model_file):
        trainer.restore_from_checkpoint(model_file)
        # after restore, always re-evaluate
        epoch_stat['best_val_err'] = validate_model(os.path.join(data_path, training_config['val_data']), model, polymath)

    def post_epoch_work(epoch_stat):
        trainer.summarize_training_progress()
        epoch_stat['val_since'] += 1

        if epoch_stat['val_since'] == training_config['val_interval']:
            epoch_stat['val_since'] = 0
            temp = dict((p.uid, p.value) for p in z.parameters)
            for p in trainer.model.parameters:
                p.value = ema[p.uid].value
            val_err = validate_model(os.path.join(data_path, training_config['val_data']), model, polymath)
            if epoch_stat['best_val_err'] > val_err:
                epoch_stat['best_val_err'] = val_err
                epoch_stat['best_since'] = 0
                trainer.save_checkpoint(model_file)
                for p in trainer.model.parameters:
                    p.value = temp[p.uid]
            else:
                epoch_stat['best_since'] += 1
                if epoch_stat['best_since'] > training_config['stop_after']:
                    return False

        if profiling:
            C.debugging.enable_profiler()

        return True

    if train_data_ext == '.ctf':
        mb_source, input_map = create_mb_and_map(loss, train_data_file, polymath)

        minibatch_size = training_config['minibatch_size'] # number of samples
        epoch_size = training_config['epoch_size']

        for epoch in range(max_epochs):
            num_seq = 0
            while True:
                if trainer.total_number_of_samples_seen >= training_config['distributed_after']:
                    data = mb_source.next_minibatch(minibatch_size*C.Communicator.num_workers(), input_map=input_map, num_data_partitions=C.Communicator.num_workers(), partition_index=C.Communicator.rank())
                else:
                    data = mb_source.next_minibatch(minibatch_size, input_map=input_map)

                trainer.train_minibatch(data)
                num_seq += trainer.previous_minibatch_sample_count
                dummy.eval()
                if num_seq >= epoch_size:
                    break
            if not post_epoch_work(epoch_stat):
                break
    else:
        if train_data_ext != '.tsv':
            raise Exception("Unsupported format")

        minibatch_seqs = training_config['minibatch_seqs'] # number of sequences

        for epoch in range(max_epochs):       # loop over epochs
            tsv_reader = create_tsv_reader(loss, train_data_file, polymath, minibatch_seqs, C.Communicator.num_workers())
            minibatch_count = 0
            for data in tsv_reader:
                if (minibatch_count % C.Communicator.num_workers()) == C.Communicator.rank():
                    trainer.train_minibatch(data) # update model with it
                    dummy.eval()
                minibatch_count += 1
            if not post_epoch_work(epoch_stat):
                break

    if profiling:
        C.debugging.stop_profiler()
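
A hedged sketch of how a distributed entry point like this is typically invoked; all paths and the config module name below are placeholders, not taken from the source:

if __name__ == '__main__':
    try:
        train(data_path='data', model_path='model', log_file='train.log',
              config_file='config', restore=True, profiling=False)
    finally:
        C.Communicator.finalize()   # required so MPI workers shut down cleanly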