Example #1
 def __init__(
     self,
     num_features,
     bias=True,
     fix_weight=False,
     fix_bias=False,
     inplace=False,
 ):
     super(Affine, self).__init__()
     self.num_features = num_features
     self.inplace = inplace
     if not fix_weight:
         self.weight = Parameter(ones(num_features))
         if inplace:
             raise ValueError('Inplace computation requires fixed weight.')
     else:
         self.register_buffer('weight', ones(num_features))
     if bias:
         if not fix_bias:
             self.bias = Parameter(zeros(num_features))
         else:
             self.register_buffer('bias', zeros(num_features))
     else:
         self.bias = None
     self.inputs = [self.weight, self.bias] if bias else [self.weight]
     self.register_op()
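A minimal, purely illustrative usage sketch of the constructor above (the values are hypothetical; Affine itself comes from the surrounding framework):

    # A 64-channel affine transform whose weight is registered as a fixed buffer,
    # which is exactly the precondition the constructor enforces for inplace=True.
    scale = Affine(64, bias=True, fix_weight=True, fix_bias=False, inplace=True)
    # With fix_weight=False and inplace=True the constructor raises ValueError,
    # since in-place computation requires a non-trainable weight.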
Example #2
 def __init__(self, in_channels, out_channels, kernel_size, stride, padding,
              dilation, transposed, output_padding, groups, bias):
     super(_ConvNd, self).__init__()
     if in_channels % groups != 0:
         raise ValueError('in_channels must be divisible by groups')
     if out_channels % groups != 0:
         raise ValueError('out_channels must be divisible by groups')
     self.in_channels = in_channels
     self.out_channels = out_channels
     self.kernel_size = kernel_size
     self.stride = stride
     self.padding = padding
     self.dilation = dilation
     self.transposed = transposed
     self.output_padding = output_padding
     self.groups = groups
     if transposed:
         self.weight = Parameter(
             Tensor(in_channels, out_channels // groups, *kernel_size))
     else:
         self.weight = Parameter(
             Tensor(out_channels, in_channels // groups, *kernel_size))
     if bias:
         self.bias = Parameter(Tensor(out_channels))
     else:
         self.bias = None
     self.reset_parameters()
     self.register_op()
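For reference, a small plain-Python sketch of the weight shapes chosen above (the concrete numbers are hypothetical):

    # Grouped convolution: each group only sees in_channels // groups input maps.
    in_channels, out_channels, groups, kernel_size = 16, 32, 4, (3, 3)
    # Normal conv weight:     (out_channels, in_channels // groups, *kernel_size)
    normal_shape = (out_channels, in_channels // groups) + kernel_size      # (32, 4, 3, 3)
    # Transposed conv weight: (in_channels, out_channels // groups, *kernel_size)
    transposed_shape = (in_channels, out_channels // groups) + kernel_size  # (16, 8, 3, 3)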
Example #3
 def __init__(self,
              num_features,
              eps=1e-5,
              momentum=0.1,
              affine=True,
              track_running_stats=True):
     super(_BatchNorm, self).__init__()
     self.num_features = num_features
     self.eps = eps
     self.momentum = momentum
     self.affine = affine
     self.track_running_stats = track_running_stats
     if self.affine:
         self.weight = Parameter(Tensor(num_features))
         self.bias = Parameter(Tensor(num_features))
     else:
         self.register_buffer('weight', ones(num_features))
         self.register_buffer('bias', zeros(num_features))
     self.register_buffer('running_mean', zeros(num_features))
     self.register_buffer('running_var', ones(num_features))
     self.inputs = [
         self.running_mean, self.running_var, self.weight, self.bias
     ]
     self.reset_parameters()
     self.register_op()
     self.op_metas = {'TRAIN': None, 'TEST': None}
Example #4
 def __init__(
     self,
     in_channels,
     out_channels,
     kernel_size,
     stride,
     padding,
     dilation,
     bias,
 ):
     super(_DepthwiseConvNd, self).__init__()
     if in_channels != out_channels:
         raise ValueError('in_channels and out_channels must be the same')
     self.in_channels = in_channels
     self.out_channels = out_channels
     self.kernel_size = kernel_size
     self.stride = stride
     self.padding = padding
     self.dilation = dilation
     self.weight = Parameter(Tensor(out_channels, 1, *kernel_size))
     if bias:
         self.bias = Parameter(Tensor(out_channels))
     else:
         self.bias = None
     self.reset_parameters()
     self.register_op()
Example #5
 def __init__(self, num_features, bias=True, fix_weight=False, fix_bias=False):
     super(Affine, self).__init__()
     self.num_features = num_features
     self.weight = Parameter(ones(num_features), requires_grad=not fix_weight)
     if bias:
         self.bias = Parameter(zeros(num_features), requires_grad=not fix_bias)
     else:
         self.bias = None
     self.inputs = [self.weight, self.bias] if bias else [self.weight]
     self.register_op()
Example #6
 def __init__(self, in_features, out_features, bias=True):
     super(Linear, self).__init__()
     self.in_features = in_features
     self.out_features = out_features
     self.weight = Parameter(Tensor(out_features, in_features))
     if bias:
         self.bias = Parameter(Tensor(out_features))
     else:
         self.bias = None
     self.reset_parameters()
     self.register_op()
Example #7
    def _plan_params(self):
        if self.mode == 'lstm': gate_size = 4 * self.hidden_size
        elif self.mode == 'gru': gate_size = 3 * self.hidden_size
        else: gate_size = self.hidden_size
        # 1. plan weights
        self._matrix_weights = []
        self._bias_weights = []
        for layer in range(self.num_layers):
            for direction in range(self.num_directions):
                layer_input_size = self.input_size if layer == 0 \
                    else self.hidden_size * self.num_directions
                w_names = [
                    'layer_{}/{}/{}'.format(layer, p,
                                            'L' if direction == 0 else 'R')
                    for p in ('matrix_ih', 'matrix_hh', 'bias_ih', 'bias_hh')
                ]
                w_ih = dg.Tensor(name=w_names[0],
                                 shape=[gate_size, layer_input_size])
                w_hh = dg.Tensor(name=w_names[1],
                                 shape=[gate_size, self.hidden_size])
                b_ih = dg.Tensor(name=w_names[2], shape=[gate_size])
                b_hh = dg.Tensor(name=w_names[3], shape=[gate_size])
                # W (0 ~ 3), R (4 ~ 7)
                self._matrix_weights.extend([w_ih, w_hh])
                # Bw (0 ~ 3), Br (4 ~ 7)
                self._bias_weights.extend([b_ih, b_hh])

        # 2. compute total number of parameters
        self._weights_count = 0
        for w in self._matrix_weights + self._bias_weights:
            self._weights_count += np.prod(w.shape)

        # 3. register the packed weights
        self.weights = Parameter(Tensor(int(self._weights_count)))

        # 4. create the initialization grids
        if self.mode == 'lstm': num_params_per_layer = 8
        elif self.mode == 'gru': num_params_per_layer = 6
        else: num_params_per_layer = 2
        self._matrix_init_grids = [[[
            'orthogonal' for _ in range(num_params_per_layer)
        ] for _ in range(self.num_directions)] for _ in range(self.num_layers)]
        self._bias_init_grids = [[[
            'zero' for _ in range(num_params_per_layer)
        ] for _ in range(self.num_directions)] for _ in range(self.num_layers)]

        # 5. set the init flag
        self._init_params = False
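To make the bookkeeping above concrete, here is a small standalone sketch (hypothetical sizes) that reproduces the packed parameter count for a bidirectional LSTM:

    input_size, hidden_size, num_layers, num_directions = 10, 20, 2, 2
    gate_size = 4 * hidden_size  # 'lstm' packs 4 gates per cell

    count = 0
    for layer in range(num_layers):
        for direction in range(num_directions):
            layer_input_size = input_size if layer == 0 \
                else hidden_size * num_directions
            # matrix_ih + matrix_hh, as planned above
            count += gate_size * layer_input_size + gate_size * hidden_size
            # bias_ih + bias_hh
            count += gate_size + gate_size
    # 'count' equals self._weights_count, the length of the packed weights vector.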
Example #8
 def __init__(self, num_features, group=32, eps=1e-5, affine=True):
     super(_GroupNorm, self).__init__()
     self.num_features = num_features
     self.group = group
     self.eps = eps
     self.affine = affine
     if self.affine:
         self.weight = Parameter(Tensor(num_features))
         self.bias = Parameter(Tensor(num_features))
     else:
         self.weight = self.bias = None
     self.inputs = [self.weight, self.bias] if self.affine else []
     self.reset_parameters()
     self.register_op()
Example #9
class Linear(Module):
    def __init__(self, in_features, out_features, bias=True):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(Tensor(out_features))
        else:
            self.bias = None
        self.reset_parameters()
        self.register_op()

    def register_op(self):
        self.op_meta = {
            'op_type': 'InnerProduct',
            'n_inputs': 3 if self.bias else 2, 'n_outputs': 1,
            'arguments': {
                'num_output': self.weight.shape[0],
                'axis': -1,
            }
        }

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, input):
        inputs = [input, self.weight] + ([self.bias] if self.bias else [])
        self.unify_devices(inputs)
        outputs = [self.register_output(input.dtype)]
        return self.run(inputs, outputs)
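A rough usage note for the class above (illustrative only; it assumes the surrounding framework behaves like its PyTorch counterpart):

    # Hypothetical construction of a fully-connected layer.
    fc = Linear(in_features=128, out_features=64, bias=True)
    # reset_parameters() samples weight and bias from U(-stdv, stdv),
    # where stdv = 1 / sqrt(in_features) = 1 / sqrt(128) here.
    # forward() feeds [input, weight, bias] to the registered 'InnerProduct' op
    # and returns a single output computed along the last axis (axis=-1).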
Example #10
 def __init__(self, input_size, hidden_size, bias, num_chunks):
     super(RNNCellBase, self).__init__()
     self.input_size = input_size
     self.hidden_size = hidden_size
     self.bias = bias
     self.weight_ih = Parameter(Tensor(num_chunks * hidden_size,
                                       input_size))
     self.weight_hh = Parameter(
         Tensor(num_chunks * hidden_size, hidden_size))
     if bias:
         self.bias_ih = Parameter(Tensor(num_chunks * hidden_size))
         self.bias_hh = Parameter(Tensor(num_chunks * hidden_size))
     else:
         self.register_parameter('bias_ih', None)
         self.register_parameter('bias_hh', None)
     self.reset_parameters()
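As a hedged note on num_chunks: concrete cells stack one weight block per gate, so a hypothetical LSTM-style subclass would pass num_chunks=4 (3 for GRU, 1 for a vanilla RNN cell):

    # weight_ih then has shape (4 * hidden_size, input_size).
    cell = RNNCellBase(input_size=32, hidden_size=64, bias=True, num_chunks=4)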
Example #11
    def _plan_params(self):
        if self.mode == 'lstm': gate_size = 4 * self.hidden_size
        elif self.mode == 'gru': gate_size = 3 * self.hidden_size
        else: gate_size = self.hidden_size
        # 1. Plan weights
        self._matrix_shape, self._bias_shape = [], []
        for layer in range(self.num_layers):
            for direction in range(self.num_directions):
                layer_input_size = self.input_size if layer == 0 \
                    else self.hidden_size * self.num_directions
                w_ih_shape = [gate_size, layer_input_size]
                w_hh_shape = [gate_size, self.hidden_size]
                b_ih_shape, b_hh_shape = [gate_size], [gate_size]
                # W (0 ~ 3), R (4 ~ 7)
                self._matrix_shape.extend([w_ih_shape, w_hh_shape])
                # Bw (0 ~ 3), Br (4 ~ 7)
                self._bias_shape.extend([b_ih_shape, b_hh_shape])

        # 2. Compute total number of parameters
        self._weights_count = 0
        for shape in self._matrix_shape + self._bias_shape:
            self._weights_count += numpy.prod(shape)

        # 3. Register the packed weights
        self.weights = Parameter(Tensor(int(self._weights_count)))

        # 4. Create the initialization grids
        if self.mode == 'lstm': num_params_per_layer = 8
        elif self.mode == 'gru': num_params_per_layer = 6
        else: num_params_per_layer = 2
        self._matrix_init_grids = [[[
            'orthogonal' for _ in range(num_params_per_layer)
        ] for _ in range(self.num_directions)] for _ in range(self.num_layers)]
        self._bias_init_grids = [[[
            'zero' for _ in range(num_params_per_layer)
        ] for _ in range(self.num_directions)] for _ in range(self.num_layers)]

        # 5. Set the init flag
        self._init_params = False
Example #12
class RNNBase(Module):
    def __init__(self,
                 mode,
                 input_size,
                 hidden_size,
                 num_layers=1,
                 bias=True,
                 batch_first=False,
                 dropout=0,
                 bidirectional=False):
        super(RNNBase, self).__init__()
        self.mode = mode
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = dropout if dropout != 0 else None
        self.dropout_state = {}
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1
        if batch_first:
            raise NotImplementedError('Batch first is disabled.')
        if not bias:
            raise NotImplementedError('Bias is required.')

        if not isinstance(dropout, numbers.Number) or not 0 <= dropout <= 1 or \
                isinstance(dropout, bool):
            raise ValueError(
                "dropout should be a number in range [0, 1] "
                "representing the probability of an element being "
                "zeroed")
        if dropout > 0 and num_layers == 1:
            warnings.warn("dropout option adds dropout after all but last "
                          "recurrent layer, so non-zero dropout expects "
                          "num_layers greater than 1, but got dropout={} and "
                          "num_layers={}".format(dropout, num_layers))
        self._plan_params()
        self.register_op()
        self.meta_in_phase = {'TRAIN': [None, None], 'TEST': [None, None]}

    def register_op(self):
        self.op_meta = {
            'op_type': 'Recurrent',
            'n_inputs': 4,
            'n_outputs': 2,  # meaningless
            'arguments': {
                'hidden_size': self.hidden_size,
                'num_layers': self.num_layers,
                'bidirectional': self.bidirectional,
                'rnn_mode': self.mode,
                'rnn_input_mode': 'linear',
                'dropout_ratio': self.dropout,
                'phase': 'TEST',
            }
        }

    def make_meta_from_phase(self, phase):
        def reset_meta(self, phase):
            # Re-Gen Key
            self._persistent_key = None
            _ = self.persistent_key
            self._persistent_key += '/{}'.format(phase)
            self.op_meta['arguments']['phase'] = phase
            # Re-Gen Op
            self._gen_op()
            self.meta_in_phase[phase][0] = self._persistent_key
            self.meta_in_phase[phase][1] = self._op

        if self._persistent_key is None:
            # Init or CTX has changed
            reset_meta(self, phase)
        else:
            # CTX unchanged & Run into a new phase
            if self.meta_in_phase[phase][0] is None:
                reset_meta(self, phase)

        return self.meta_in_phase[phase]

    def forward(self, input, hx=None):
        if hx and not isinstance(hx, Tensor):
            raise TypeError('Expected hx to be a Tensor, got {}.'.format(
                type(hx)))

        if not self._init_params: self._reset_params()

        inputs = [input, self.weights] + ([hx] if hx else [])
        self.unify_devices(inputs)
        outputs = [self.register_output(input.dtype) for _ in range(2)]

        requires_grad = False
        for input in inputs:
            if input.requires_grad: requires_grad = True
        requires_grad = requires_grad and is_grad_enabled()
        meta = [
            'PERSISTENT',
        ] + self.make_meta_from_phase('TRAIN' if requires_grad else 'TEST')

        return RunOperator(inputs, outputs, meta)

    def _plan_params(self):
        if self.mode == 'lstm': gate_size = 4 * self.hidden_size
        elif self.mode == 'gru': gate_size = 3 * self.hidden_size
        else: gate_size = self.hidden_size
        # 1. plan weights
        self._matrix_weights = []
        self._bias_weights = []
        for layer in range(self.num_layers):
            for direction in range(self.num_directions):
                layer_input_size = self.input_size if layer == 0 \
                    else self.hidden_size * self.num_directions
                w_names = [
                    'layer_{}/{}/{}'.format(layer, p,
                                            'L' if direction == 0 else 'R')
                    for p in ('matrix_ih', 'matrix_hh', 'bias_ih', 'bias_hh')
                ]
                w_ih = dg.Tensor(name=w_names[0],
                                 shape=[gate_size, layer_input_size])
                w_hh = dg.Tensor(name=w_names[1],
                                 shape=[gate_size, self.hidden_size])
                b_ih = dg.Tensor(name=w_names[2], shape=[gate_size])
                b_hh = dg.Tensor(name=w_names[3], shape=[gate_size])
                # W (0 ~ 3), R (4 ~ 7)
                self._matrix_weights.extend([w_ih, w_hh])
                # Bw (0 ~ 3), Br (4 ~ 7)
                self._bias_weights.extend([b_ih, b_hh])

        # 2. compute total number of parameters
        self._weights_count = 0
        for w in self._matrix_weights + self._bias_weights:
            self._weights_count += np.prod(w.shape)

        # 3. register the packed weights
        self.weights = Parameter(Tensor(int(self._weights_count)))

        # 4. create the initialization grids
        if self.mode == 'lstm': num_params_per_layer = 8
        elif self.mode == 'gru': num_params_per_layer = 6
        else: num_params_per_layer = 2
        self._matrix_init_grids = [[[
            'orthogonal' for _ in range(num_params_per_layer)
        ] for _ in range(self.num_directions)] for _ in range(self.num_layers)]
        self._bias_init_grids = [[[
            'zero' for _ in range(num_params_per_layer)
        ] for _ in range(self.num_directions)] for _ in range(self.num_layers)]

        # 5. set the init flag
        self._init_params = False

    ##############################################
    #                                            #
    #                INITIALIZER                 #
    #                                            #
    ##############################################

    def _uniform_init(self, shape, dtype='float32'):
        stdv = 1.0 / np.sqrt(self.hidden_size)
        return np.random.uniform(-stdv, stdv, shape).astype(dtype)

    def _orthogonal_init(self, shape, gain=1, dtype='float32'):
        num_rows = 1
        for dim in shape[:-1]:
            num_rows *= dim
        num_cols = shape[-1]
        flat_shape = (num_cols, num_rows) if num_rows < num_cols \
            else (num_rows,  num_cols)
        W = np.random.randn(*flat_shape)
        q, r = np.linalg.qr(W)
        # Make Q uniform
        d = np.diag(r)
        q *= np.sign(d)
        if num_rows < num_cols: q = q.T
        return gain * q.reshape(shape).astype(dtype)

    def _zero_init(self, shape, dtype='float32'):
        return np.zeros(shape, dtype=dtype)

    ##############################################
    #                                            #
    #                 PARAMETERS                 #
    #                                            #
    ##############################################

    def set_param(self,
                  layer=0,
                  direction=0,
                  param_id=0,
                  type='matrix',
                  initializer=None):
        if type == 'matrix':
            self._matrix_init_grids[layer][direction][param_id] = initializer
        elif type == 'bias':
            self._bias_init_grids[layer][direction][param_id] = initializer
        else:
            raise ValueError('Unknown param type: ' + type)

    def _set_param(self, layer_id, param_id, param_type, param):
        if not isinstance(param, Tensor):
            if isinstance(param, np.ndarray):
                paramT = dg.Tensor('/tmp/rnn_param').Variable()
                paramT.set_value(param)
                param = paramT
            else:
                raise ValueError('Expected a tensor or a numpy array.')
        W = self.weights.dragon()
        outputs = RNNParamSet([W, param],
                              layer_id,
                              param_id,
                              param_type,
                              rnn_mode=self.mode,
                              input_size=self.input_size,
                              hidden_size=self.hidden_size,
                              num_layers=self.num_layers,
                              num_directions=self.num_directions)
        for k, v in outputs.expressions.items():
            dg.workspace.RunOperator(v)

    def _reset_params(self):
        np.random.seed(dg.config.GetRandomSeed())
        if self.mode == 'lstm': num_gates = 4
        elif self.mode == 'gru': num_gates = 3
        else: num_gates = 1
        for layer in range(len(self._matrix_init_grids)):
            for direction in range(len(self._matrix_init_grids[0])):
                for param_id in range(len(self._matrix_init_grids[0][0])):
                    matrix_init = self._matrix_init_grids[layer][direction][
                        param_id]
                    bias_init = self._bias_init_grids[layer][direction][
                        param_id]
                    if isinstance(matrix_init, str):
                        matrix_init = getattr(self,
                                              '_{}_init'.format(matrix_init))
                    if isinstance(bias_init, str):
                        bias_init = getattr(self, '_{}_init'.format(bias_init))
                    pseudo_layer_id = layer * self.num_directions + direction
                    packed_id = pseudo_layer_id * 2 + int(param_id / num_gates)
                    matrix_shape = self._matrix_weights[packed_id].shape[:]
                    bias_shape = self._bias_weights[packed_id].shape[:]
                    matrix_shape[0] = bias_shape[0] = int(matrix_shape[0] /
                                                          num_gates)
                    self._set_param(layer_id=pseudo_layer_id,
                                    param_id=param_id,
                                    param_type='matrix',
                                    param=matrix_init(matrix_shape))
                    self._set_param(layer_id=pseudo_layer_id,
                                    param_id=param_id,
                                    param_type='bias',
                                    param=bias_init(bias_shape))
        self._init_params = True
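For reference, the QR-based recipe in _orthogonal_init above can be checked in isolation with a short numpy-only sketch (shapes are hypothetical):

    import numpy as np

    def orthogonal(shape, gain=1, dtype='float32'):
        # Same recipe as _orthogonal_init: QR-decompose a Gaussian matrix,
        # fix the sign ambiguity with diag(R), then reshape to the target shape.
        num_rows = int(np.prod(shape[:-1]))
        num_cols = shape[-1]
        flat_shape = (num_cols, num_rows) if num_rows < num_cols else (num_rows, num_cols)
        q, r = np.linalg.qr(np.random.randn(*flat_shape))
        q *= np.sign(np.diag(r))
        if num_rows < num_cols:
            q = q.T
        return gain * q.reshape(shape).astype(dtype)

    w = orthogonal((80, 20))
    # Columns are orthonormal, so w.T @ w is (close to) the identity matrix.
    assert np.allclose(w.T @ w, np.eye(20), atol=1e-4)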
Example #13
class RNNBase(Module):
    def __init__(
        self,
        mode,
        input_size,
        hidden_size,
        num_layers=1,
        bias=True,
        batch_first=False,
        dropout=0,
        bidirectional=False,
    ):
        super(RNNBase, self).__init__()
        self.mode = mode
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.dropout = dropout if dropout != 0 else None
        self.dropout_state = {}
        self.bidirectional = bidirectional
        self.num_directions = 2 if bidirectional else 1
        if batch_first:
            raise NotImplementedError('Batch first is disabled.')
        if not bias:
            raise NotImplementedError('Bias is required.')

        if not isinstance(dropout, numbers.Number) or \
                not 0 <= dropout <= 1 or isinstance(dropout, bool):
            raise ValueError(
                "dropout should be a number in range [0, 1] "
                "representing the probability of an element being "
                "zeroed")
        if dropout > 0 and num_layers == 1:
            warnings.warn("dropout option adds dropout after all but last "
                          "recurrent layer, so non-zero dropout expects "
                          "num_layers greater than 1, but got dropout={} and "
                          "num_layers={}".format(dropout, num_layers))
        self._plan_params()
        self.register_op()
        self.op_metas = {'TRAIN': None, 'TEST': None}

    def register_op(self):
        self.op_meta = {
            'op_type': 'Recurrent',
            'arguments': {
                'hidden_size': self.hidden_size,
                'num_layers': self.num_layers,
                'bidirectional': self.bidirectional,
                'rnn_mode': self.mode,
                'rnn_input_mode': 'linear',
                'dropout_ratio': self.dropout,
                'phase': 'TEST',
            }
        }

    def extra_repr(self):
        s = '{input_size}, {hidden_size}'
        if self.num_layers != 1:
            s += ', num_layers={num_layers}'
        if self.bias is not True:
            s += ', bias={bias}'
        if self.batch_first is not False:
            s += ', batch_first={batch_first}'
        if self.dropout != 0:
            s += ', dropout={dropout}'
        if self.bidirectional is not False:
            s += ', bidirectional={bidirectional}'
        return s.format(**self.__dict__)

    def make_meta_from_phase(self, phase):
        def reset_meta(self, phase):
            self._module_key = None
            _ = self.module_key
            self._module_key += '/{}'.format(phase)
            self.op_meta['arguments']['phase'] = phase
            self._gen_module_def()
            self.op_metas[phase] = (self._module_key, self._module_def)

        if self._module_key is None:
            # Init or Context has changed
            reset_meta(self, phase)
        else:
            # Context unchanged
            if self.op_metas[phase] is None:
                reset_meta(self, phase)

        return self.op_metas[phase]

    def forward(self, input, hx=None):
        if hx and not isinstance(hx, Tensor):
            raise TypeError('Expected hx to be a Tensor, got {}.'.format(
                type(hx)))

        if not self._init_params: self._reset_params()

        inputs = [input, self.weights] + ([hx] if hx else [])
        self.unify_devices(inputs)
        outputs = [self.register_output() for _ in range(2)]

        meta = self.make_meta_from_phase('TRAIN' if self.training else 'TEST')
        return RunOperator(inputs, outputs, meta)

    def _plan_params(self):
        if self.mode == 'lstm': gate_size = 4 * self.hidden_size
        elif self.mode == 'gru': gate_size = 3 * self.hidden_size
        else: gate_size = self.hidden_size
        # 1. Plan weights
        self._matrix_shape, self._bias_shape = [], []
        for layer in range(self.num_layers):
            for direction in range(self.num_directions):
                layer_input_size = self.input_size if layer == 0 \
                    else self.hidden_size * self.num_directions
                w_ih_shape = [gate_size, layer_input_size]
                w_hh_shape = [gate_size, self.hidden_size]
                b_ih_shape, b_hh_shape = [gate_size], [gate_size]
                # W (0 ~ 3), R (4 ~ 7)
                self._matrix_shape.extend([w_ih_shape, w_hh_shape])
                # Bw (0 ~ 3), Br (4 ~ 7)
                self._bias_shape.extend([b_ih_shape, b_hh_shape])

        # 2. Compute total number of parameters
        self._weights_count = 0
        for shape in self._matrix_shape + self._bias_shape:
            self._weights_count += numpy.prod(shape)

        # 3. Register the packed weights
        self.weights = Parameter(Tensor(int(self._weights_count)))

        # 4. Create the initialization grids
        if self.mode == 'lstm': num_params_per_layer = 8
        elif self.mode == 'gru': num_params_per_layer = 6
        else: num_params_per_layer = 2
        self._matrix_init_grids = [[[
            'orthogonal' for _ in range(num_params_per_layer)
        ] for _ in range(self.num_directions)] for _ in range(self.num_layers)]
        self._bias_init_grids = [[[
            'zero' for _ in range(num_params_per_layer)
        ] for _ in range(self.num_directions)] for _ in range(self.num_layers)]

        # 5. Set the init flag
        self._init_params = False

    ##############################################
    #                                            #
    #                INITIALIZER                 #
    #                                            #
    ##############################################

    def _uniform_init(self, shape, dtype='float32'):
        stdv = 1.0 / numpy.sqrt(self.hidden_size)
        return numpy.random.uniform(-stdv, stdv, shape).astype(dtype)

    def _orthogonal_init(self, shape, gain=1, dtype='float32'):
        num_rows = 1
        for dim in shape[:-1]:
            num_rows *= dim
        num_cols = shape[-1]
        flat_shape = (num_cols, num_rows) if num_rows < num_cols \
            else (num_rows,  num_cols)
        W = numpy.random.randn(*flat_shape)
        q, r = numpy.linalg.qr(W)
        # Make Q uniform
        d = numpy.diag(r)
        q *= numpy.sign(d)
        if num_rows < num_cols: q = q.T
        return gain * q.reshape(shape).astype(dtype)

    def _zero_init(self, shape, dtype='float32'):
        return numpy.zeros(shape, dtype=dtype)

    ##############################################
    #                                            #
    #                 PARAMETERS                 #
    #                                            #
    ##############################################

    def set_param(self,
                  layer=0,
                  direction=0,
                  param_id=0,
                  type='matrix',
                  initializer=None):
        if type == 'matrix':
            self._matrix_init_grids[layer][direction][param_id] = initializer
        elif type == 'bias':
            self._bias_init_grids[layer][direction][param_id] = initializer
        else:
            raise ValueError('Unknown param type: ' + type)

    def _set_param(self, layer_id, param_id, param_type, param):
        if isinstance(param, numpy.ndarray):
            param_temp = dragon.Tensor.Ref('/tmp/rnn_param')
            param_temp.set_value(param)
            param = param_temp
        else:
            raise ValueError('Expected a numpy array.')
        W = self.weights.dragon()
        outputs = RNNParamSet([W, param],
                              layer_id,
                              param_id,
                              param_type,
                              rnn_mode=self.mode,
                              input_size=self.input_size,
                              hidden_size=self.hidden_size,
                              num_layers=self.num_layers,
                              num_directions=self.num_directions)
        for k, v in outputs.expressions.items():
            dragon.workspace.RunOperator(v)

    def _reset_params(self):
        numpy.random.seed(dragon.config.GetRandomSeed())
        if self.mode == 'lstm': num_gates = 4
        elif self.mode == 'gru': num_gates = 3
        else: num_gates = 1
        for layer in range(len(self._matrix_init_grids)):
            for direction in range(len(self._matrix_init_grids[0])):
                for param_id in range(len(self._matrix_init_grids[0][0])):
                    matrix_init = self._matrix_init_grids[layer][direction][
                        param_id]
                    bias_init = self._bias_init_grids[layer][direction][
                        param_id]
                    if isinstance(matrix_init, str):
                        matrix_init = getattr(self,
                                              '_{}_init'.format(matrix_init))
                    if isinstance(bias_init, str):
                        bias_init = getattr(self, '_{}_init'.format(bias_init))
                    pseudo_layer_id = layer * self.num_directions + direction
                    packed_id = pseudo_layer_id * 2 + int(param_id / num_gates)
                    matrix_shape = self._matrix_shape[packed_id][:]
                    bias_shape = self._bias_shape[packed_id][:]
                    matrix_shape[0] = bias_shape[0] = int(matrix_shape[0] /
                                                          num_gates)
                    self._set_param(layer_id=pseudo_layer_id,
                                    param_id=param_id,
                                    param_type='matrix',
                                    param=matrix_init(matrix_shape))
                    self._set_param(layer_id=pseudo_layer_id,
                                    param_id=param_id,
                                    param_type='bias',
                                    param=bias_init(bias_shape))
        self._init_params = True