def block_orthogonal(tensor: NDArray, split_sizes: List[int], gain: float = 1.0) -> None:
    """
    An initializer which allows initializing model parameters in "blocks".

    This is helpful in the case of recurrent models which use multiple gates
    applied to linear projections, which can be computed efficiently if they
    are concatenated together. However, they are separate parameters which
    should be initialized independently.

    The tensor is modified in place; nothing is returned.

    Parameters
    ----------
    tensor : ``mxnet.ndarray.ndarray.NDArray``, required.
        A tensor to initialize.
    split_sizes : List[int], required.
        The size of the blocks along each dimension of ``tensor``, one entry
        per dimension. E.g. ``[10, 20]`` would result in the tensor being
        split into chunks of size 10 along the first dimension and 20 along
        the second.
    gain : float, optional (default = 1.0)
        The gain (scaling) applied to the orthogonal initialization.

    Raises
    ------
    ConfigurationError
        If a dimension of ``tensor`` is not divisible by its split size.
    """
    sizes = list(tensor.shape)
    if any(size % split != 0 for size, split in zip(sizes, split_sizes)):
        raise ConfigurationError(
            "tensor dimensions must be divisible by their respective "
            "split_sizes. Found size: {} and split_sizes: {}".format(
                sizes, split_sizes))

    # Start offsets of every block along each dimension.
    indexes = [
        list(range(0, max_size, split))
        for max_size, split in zip(sizes, split_sizes)
    ]

    # The initializer depends only on ``gain``, so build it once rather than
    # once per block.
    orthogonal = init.Orthogonal(scale=gain)

    # Iterate over all possible blocks within the tensor.
    for block_start_indices in itertools.product(*indexes):
        # One slice ``start: start + step`` per dimension; a tuple is needed
        # because the tensor may have an arbitrary number of dimensions.
        block_slice = tuple(
            slice(start_index, start_index + step)
            for start_index, step in zip(block_start_indices, split_sizes)
        )
        temp = tensor[block_slice]
        orthogonal._init_weight(None, temp)
        # Write the block back explicitly (presumably because the slice above
        # may be a copy rather than a view of ``tensor`` -- TODO confirm).
        tensor[block_slice] = temp
def __init__(
    self,
    units: int,
    in_units: int,
    coeff: float = 0.9,
    activation: Optional[str] = None,
    use_bias: bool = True,
    flatten: bool = True,
    weight_initializer: init.Initializer = init.Orthogonal(scale=0.9),
    bias_initializer="zeros",
    dtype="float32",
    num_power_iter: int = 1,
    ctx: Optional[mx.Context] = None,
    **kwargs
):
    """
    Set up the layer's configuration and register its parameters.

    Parameters
    ----------
    units
        First dimension of the weight; also the length of the bias and of
        the auxiliary ``u`` parameter.
    in_units
        Second dimension of the weight.
    coeff
        Stored as ``self._coeff`` for use elsewhere in the class.
    activation
        Name handed to ``get_activation``; ``None`` means no activation block.
    use_bias
        Whether a ``bias`` parameter of shape ``(units,)`` is created.
    flatten
        Stored as ``self._flatten`` for use elsewhere in the class.
    weight_initializer
        Initializer for the ``weight`` parameter.
    bias_initializer
        Initializer for the ``bias`` parameter.
    dtype
        Data type of the ``weight`` and ``bias`` parameters.
    num_power_iter
        Stored as ``self._num_power_iter`` (presumably the number of power
        iterations run per forward pass -- confirm against the forward code).
    ctx
        Context to use; falls back to ``get_mxnet_context()`` when ``None``.
    **kwargs
        Forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwargs)

    # Resolve the default context lazily, only when the caller gave none.
    if ctx is None:
        ctx = get_mxnet_context()

    self._units = units
    self._in_units = in_units
    self._coeff = coeff
    self._flatten = flatten
    self._num_power_iter = num_power_iter
    self._ctx = ctx

    with self.name_scope():
        # Parameters are requested in the order weight, u, bias.
        self._weight = self.params.get(
            "weight",
            shape=(units, in_units),
            init=weight_initializer,
            dtype=dtype,
        )
        self._u = self.params.get(
            "u", init=mx.init.Normal(), shape=(1, units)
        )
        self._bias = (
            self.params.get(
                "bias", shape=(units, ), init=bias_initializer, dtype=dtype
            )
            if use_bias
            else None
        )
        self._act = (
            get_activation(activation, prefix=activation + "_")
            if activation is not None
            else None
        )
batch_size, ctx, mx_train_data, mx_valid_data, init_type=init.Xavier(), path='xavier') print('Finished Traing the Model 3') print('Traing the Model 4 with Orthogonal Initialization') train_loss_hist_m3, train_acc_hist_m3, valid_loss_hist_m3, valid_acc_hist_m3 = model_fit( no_epochs, batch_size, ctx, mx_train_data, mx_valid_data, init_type=init.Orthogonal(), path='orthogonal') print('Finished Traing the Model 4') print('\nPlotting the Performance') epochs = np.arange(no_epochs) fig, ax = plt.subplots() ax.plot(epochs, train_loss_hist_m0, linewidth=2.0, label='Training Loss') ax.plot(epochs, valid_loss_hist_m0, linewidth=2.0,
def main(args):
    """Train (or resume training) the model described by a YAML settings file, then evaluate it.

    Steps, in order:

    1. Load the settings and seed the MXNet, NumPy and ``random`` generators.
    2. Build the data splits and the network.
    3. Try to restore the best checkpoint recorded by a saved ``Logger``;
       if anything in that step fails, start from a fresh ``Logger`` and an
       orthogonally initialized network.
    4. Train with Adam and a factor learning-rate schedule.
    5. Reload the best checkpoint and run one final pass with ``train=None``.

    Parameters
    ----------
    args
        Parsed command-line arguments. The attributes read here are
        ``file`` (path to the settings file), ``gpus`` (passed as ``ctx`` and
        measured with ``len``) and ``epochs`` (passed as ``num_epochs``).

    Raises
    ------
    AssertionError
        If ``args.file`` without its last five characters does not end with
        the model name from the settings.
    """
    with open(args.file, 'r') as f:
        # ``yaml.load`` without an explicit Loader is deprecated since
        # PyYAML 5.1 and a TypeError on PyYAML 6; ``safe_load`` also refuses
        # to construct arbitrary Python objects from the file.
        settings = yaml.safe_load(f)

    # ``[:-5]`` drops the last five characters of the path (presumably the
    # ".yaml" extension) before comparing with the model name.
    assert args.file[:-5].endswith(settings['model']['name']), \
        'The model name is not consistent! %s != %s' % (args.file[:-5], settings['model']['name'])

    seed = settings['seed']
    mx.random.seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    dataset_setting = settings['dataset']
    model_setting = settings['model']
    train_setting = settings['training']

    name = os.path.join(PARAM_PATH, model_setting['name'])

    # ``eval_data`` rather than ``eval`` so the builtin is not shadowed.
    train, eval_data, test, scaler = getattr(data.dataloader, dataset_setting['dataloader'])(settings)

    model_type = getattr(model, model_setting['type'])
    net = model_type.net(settings)

    def build_metrics():
        # A fresh set of metric objects for each ``fit`` call.
        return [
            MAE(scaler),
            RMSE(scaler),
            IndexMAE(scaler, [0, 1, 2]),
            IndexRMSE(scaler, [0, 1, 2]),
        ]

    try:
        logger = Logger.load('%s.yaml' % name)
        net.load_parameters('%s-%04d.params' % (name, logger.best_epoch()), ctx=args.gpus)
        logger.set_net(net)
        print('Successfully loading the model %s [epoch: %d]' % (model_setting['name'], logger.best_epoch()))
    except Exception:
        # Best-effort resume: any failure above means starting from scratch.
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate instead of silently re-initializing.
        logger = Logger(name, net, train_setting['early_stop_metric'], train_setting['early_stop_epoch'])
        net.initialize(init.Orthogonal(), ctx=args.gpus)
        print('Initialize the model')

    num_params = sum(np.prod(v.shape) for v in net.collect_params().values())
    print(net.collect_params())
    print('NUMBER OF PARAMS:', num_params)

    # net.hybridize()
    model_trainer = ModelTrainer(
        net=net,
        trainer=gluon.Trainer(
            net.collect_params(),
            mx.optimizer.Adam(
                learning_rate=train_setting['lr'],
                multi_precision=True,
                lr_scheduler=mx.lr_scheduler.FactorScheduler(
                    step=train_setting['lr_decay_step'] * len(args.gpus),
                    factor=train_setting['lr_decay_factor'],
                    stop_factor_lr=1e-6,
                ),
            ),
            update_on_kvstore=False,
        ),
        clip_gradient=train_setting['clip_gradient'],
        logger=logger,
        ctx=args.gpus,
    )

    model_trainer.fit(
        begin_epoch=logger.best_epoch(),
        num_epochs=args.epochs,
        train=train,
        eval=eval_data,
        test=test,
        metrics=build_metrics(),
    )

    # Reload the best checkpoint before the final pass.
    net.load_parameters('%s-%04d.params' % (name, logger.best_epoch()), ctx=args.gpus)

    # Final pass: no training data, a single epoch.
    model_trainer.fit(
        begin_epoch=0,
        num_epochs=1,
        train=None,
        eval=eval_data,
        test=test,
        metrics=build_metrics(),
    )