Example #1
import itertools
from typing import List

from mxnet import init
from mxnet.ndarray import NDArray


# The surrounding project raises its own ConfigurationError; a thin
# stand-in keeps this snippet self-contained.
class ConfigurationError(ValueError):
    pass


def block_orthogonal(tensor: NDArray,
                     split_sizes: List[int],
                     gain: float = 1.0) -> None:
    """
    An initializer which allows initializing model parameters in "blocks". This is helpful
    in the case of recurrent models which use multiple gates applied to linear projections,
    which can be computed efficiently if they are concatenated together. However, they are
    separate parameters which should be initialized independently.

    Parameters
    ----------
    tensor : ``mxnet.ndarray.ndarray.NDArray``, required.
        A tensor to initialize.
    split_sizes : List[int], required.
        A list of length ``tensor.ndim`` specifying the size of the
        blocks along that particular dimension. E.g. ``[10, 20]`` would
        result in the tensor being split into chunks of size 10 along the
        first dimension and 20 along the second.
    gain : float, optional (default = 1.0)
        The gain (scaling) applied to the orthogonal initialization.
    """

    sizes = list(tensor.shape)
    if any(a % b != 0 for a, b in zip(sizes, split_sizes)):
        raise ConfigurationError(
            "tensor dimensions must be divisible by their respective "
            "split_sizes. Found size: {} and split_sizes: {}".format(
                sizes, split_sizes))
    indexes = [
        list(range(0, max_size, split))
        for max_size, split in zip(sizes, split_sizes)
    ]
    # Iterate over all possible blocks within the tensor.
    for block_start_indices in itertools.product(*indexes):
        # A list of tuples containing the index to start at for this block
        # and the appropriate step size (i.e split_size[i] for dimension i).
        index_and_step_tuples = zip(block_start_indices, split_sizes)
        # This is a tuple of slices corresponding to:
        # tensor[index: index + step_size, ...]. This is
        # required because we could have an arbitrary number
        # of dimensions. The actual slices we need are the
        # start_index: start_index + step for each dimension in the tensor.
        block_slice = tuple([
            slice(start_index, start_index + step)
            for start_index, step in index_and_step_tuples
        ])
        # init.Orthogonal only exposes its logic through the internal
        # _init_weight hook, so call it directly on the block. Indexing
        # with a tuple of slices returns a copy in MXNet, so the
        # initialized block must be written back into the tensor.
        orthogonal = init.Orthogonal(scale=gain)
        temp = tensor[block_slice]
        orthogonal._init_weight(None, temp)
        tensor[block_slice] = temp
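
Typical usage is on a recurrent layer's concatenated gate weights, so that each gate's projection gets its own orthogonal block. A minimal sketch; the 4x-hidden LSTM gate layout here is an assumption for illustration:

import mxnet as mx

hidden_size, input_size = 64, 32
# Four LSTM gate projections stacked along the first axis.
weight = mx.nd.zeros((4 * hidden_size, input_size))
# Splits the tensor into four (hidden_size, input_size) blocks and
# initializes each block orthogonally (scaled by `gain`).
block_orthogonal(weight, split_sizes=[hidden_size, input_size])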
Example #2
    def __init__(
            self,
            units: int,
            in_units: int,
            coeff: float = 0.9,
            activation: Optional[str] = None,
            use_bias: bool = True,
            flatten: bool = True,
            weight_initializer: init.Initializer = init.Orthogonal(scale=0.9),
            bias_initializer="zeros",
            dtype="float32",
            num_power_iter: int = 1,
            ctx: Optional[mx.Context] = None,
            **kwargs):
        super().__init__(**kwargs)
        self._coeff = coeff  # upper bound enforced on the layer's spectral norm
        self._flatten = flatten
        self._ctx = ctx if ctx is not None else get_mxnet_context()
        self._num_power_iter = num_power_iter
        with self.name_scope():
            self._units = units
            self._in_units = in_units
            self._weight = self.params.get(
                "weight",
                shape=(units, in_units),
                init=weight_initializer,
                dtype=dtype,
            )
            # Running estimate of the leading left singular vector used by
            # the spectral-norm power iteration (see num_power_iter).
            self._u = self.params.get("u",
                                      init=mx.init.Normal(),
                                      shape=(1, units))

            if use_bias:
                self._bias = self.params.get("bias",
                                             shape=(units, ),
                                             init=bias_initializer,
                                             dtype=dtype)
            else:
                self._bias = None

            if activation is not None:
                self._act = get_activation(activation, prefix=activation + "_")
            else:
                self._act = None
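
Only the constructor is shown here; at call time a layer set up like this typically refines `u` with power iteration to estimate the weight's largest singular value, then rescales the weight so its spectral norm stays at or below `coeff`. A standalone sketch of that estimate using plain NDArray ops (this is not the class's actual forward pass, which is truncated on this page):

import mxnet as mx

def estimate_sigma(weight, u, num_power_iter=1):
    # Power iteration on W with shape (units, in_units), using the running
    # vector u with shape (1, units). Returns (sigma, updated u), where
    # sigma approximates the largest singular value of W.
    for _ in range(num_power_iter):
        v = mx.nd.L2Normalization(mx.nd.dot(u, weight))                     # (1, in_units)
        u = mx.nd.L2Normalization(mx.nd.dot(v, weight, transpose_b=True))   # (1, units)
    sigma = mx.nd.dot(mx.nd.dot(u, weight), v, transpose_b=True)            # (1, 1)
    return sigma, u

With sigma in hand, the weight would be scaled by roughly coeff / max(sigma, coeff) before the dense projection, which keeps the layer's Lipschitz constant bounded by coeff.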
Example #3
            batch_size,
            ctx,
            mx_train_data,
            mx_valid_data,
            init_type=init.Xavier(),
            path='xavier')
        print('Finished Training the Model 3')

        print('Training the Model 4 with Orthogonal Initialization')
        train_loss_hist_m3, train_acc_hist_m3, valid_loss_hist_m3, valid_acc_hist_m3 = model_fit(
            no_epochs,
            batch_size,
            ctx,
            mx_train_data,
            mx_valid_data,
            init_type=init.Orthogonal(),
            path='orthogonal')
        print('Finished Training the Model 4')

        print('\nPlotting the Performance')

        epochs = np.arange(no_epochs)

        fig, ax = plt.subplots()
        ax.plot(epochs,
                train_loss_hist_m0,
                linewidth=2.0,
                label='Training Loss')
        ax.plot(epochs,
                valid_loss_hist_m0,
                linewidth=2.0,
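
Both ends of this snippet are truncated; `model_fit` is the example's own helper, called once per initializer. A hypothetical stub consistent with the calls above (name, signature, and return values are inferred, not taken from the source):

from mxnet import init

def model_fit(no_epochs, batch_size, ctx, train_data, valid_data,
              init_type=init.Xavier(), path='model'):
    # Hypothetical: build a network, call net.initialize(init_type, ctx=ctx),
    # train for no_epochs, checkpoint under `path`, and return per-epoch
    # histories of loss and accuracy for the training and validation splits.
    train_loss_hist, train_acc_hist = [], []
    valid_loss_hist, valid_acc_hist = [], []
    # ... training loop elided ...
    return train_loss_hist, train_acc_hist, valid_loss_hist, valid_acc_hist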
Example #4
def main(args):
	with open(args.file, 'r') as f:
		settings = yaml.safe_load(f)
	assert args.file[:-5].endswith(settings['model']['name']), \
		'The model name is not consistent! %s != %s' % (args.file[:-5], settings['model']['name'])

	mx.random.seed(settings['seed'])
	np.random.seed(settings['seed'])
	random.seed(settings['seed'])
	
	dataset_setting = settings['dataset']
	model_setting = settings['model']
	train_setting = settings['training']

	name = os.path.join(PARAM_PATH, model_setting['name'])
	# NB: `eval` shadows the Python builtin here; it is the validation loader.
	train, eval, test, scaler = getattr(data.dataloader, dataset_setting['dataloader'])(settings)

	model_type = getattr(model, model_setting['type'])
	net = model_type.net(settings)

	try:
		logger = Logger.load('%s.yaml' % name)
		net.load_parameters('%s-%04d.params' % (name, logger.best_epoch()), ctx=args.gpus)
		logger.set_net(net)
		print('Successfully loaded the model %s [epoch: %d]' % (model_setting['name'], logger.best_epoch()))
	except Exception:
		# No usable checkpoint; start fresh with orthogonal initialization.
		logger = Logger(name, net, train_setting['early_stop_metric'], train_setting['early_stop_epoch'])
		net.initialize(init.Orthogonal(), ctx=args.gpus)
		print('Initializing the model')

	num_params = 0
	for v in net.collect_params().values():
		num_params += np.prod(v.shape)
	print(net.collect_params())
	print('NUMBER OF PARAMS:', num_params)

	# net.hybridize()
	model_trainer = ModelTrainer(
		net = net,
		trainer = gluon.Trainer(
			net.collect_params(),
			mx.optimizer.Adam(
				learning_rate	= train_setting['lr'],
				multi_precision	= True,
				lr_scheduler	= mx.lr_scheduler.FactorScheduler(
					step			= train_setting['lr_decay_step'] * len(args.gpus),
					factor			= train_setting['lr_decay_factor'],
					stop_factor_lr	= 1e-6
				)
			),
			update_on_kvstore = False
		),
		clip_gradient = train_setting['clip_gradient'],
		logger = logger,
		ctx = args.gpus
	)

	model_trainer.fit(
		begin_epoch = logger.best_epoch(),
		num_epochs	= args.epochs,
		train		= train,
		eval		= eval,
		test		= test,
		metrics		= [MAE(scaler), RMSE(scaler), IndexMAE(scaler, [0,1,2]), IndexRMSE(scaler, [0,1,2])],
	)

	net.load_parameters('%s-%04d.params' % (name, logger.best_epoch()), ctx=args.gpus)
	# print(net.collect_params()['seq2seq_decoder_c1_dense_r_b_dense1_weight'].data())
	# print(net.collect_params()['decode_cell1_out_weight'].data())
	model_trainer.fit(
		begin_epoch	= 0,
		num_epochs	= 1,
		train		= None,
		eval		= eval,
		test		= test,
		metrics		= [MAE(scaler), RMSE(scaler), IndexMAE(scaler, [0,1,2]), IndexRMSE(scaler, [0,1,2])]
	)
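
main() assumes the YAML file parses into a nested mapping whose keys it reads directly, and that the file's stem matches model.name. Reconstructed from those lookups, a hypothetical minimal `settings` structure looks like this (only the keys come from the code above; every value is a placeholder):

# Keys mirror the lookups in main(); values are illustrative only.
settings = {
    'seed': 42,
    'dataset': {
        'dataloader': 'my_dataloader',   # attribute looked up on data.dataloader
    },
    'model': {
        'name': 'my_model',              # must match the YAML file's stem
        'type': 'MyModelType',           # attribute looked up on model
    },
    'training': {
        'lr': 1e-3,
        'lr_decay_step': 10,
        'lr_decay_factor': 0.7,
        'clip_gradient': 5.0,
        'early_stop_metric': 'mae',
        'early_stop_epoch': 20,
    },
}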