def __init__(self, config, **kwargs):
    super().__init__(**kwargs)
    hidden_size = config.hidden_size
    self.num_heads = config.num_heads
    self.attn_head_size = config.attention_head_size
    qkv_size = self.attn_head_size * self.num_heads
    initializer = get_initializer(config)
    self.query_layer = layers.Dense(qkv_size, name="self/query",
                                    kernel_initializer=initializer)
    self.key_layer = layers.Dense(qkv_size, name="self/key",
                                  kernel_initializer=initializer)
    self.value_layer = layers.Dense(qkv_size, name="self/value",
                                    kernel_initializer=initializer)
    self.attn_dropout = layers.Dropout(config.attention_dropout_prob)
    self.attn_output_layer = layers.Dense(hidden_size, name="output/dense",
                                          kernel_initializer=initializer)
    self.attn_output_dropout = layers.Dropout(config.hidden_dropout_prob,
                                              seed=config.random_seed)
    self.attn_norm_layer = layers.LayerNormalization(name="output/layer_norm",
                                                     axis=-1, epsilon=1e-12)
    self.w_layer = layers.Dense(1, name="self/w")
    self.attention = None
    self.random_seed = config.random_seed
    self.debug = config.debug
    self.debug_save_dir = config.debug_save_dir if config.debug else None

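# Hedged sketch (not this class's actual call() method, which is not shown above):
# a generic, self-contained numpy illustration of the multi-head attention shapes the
# constructor above sets up -- Q/K/V projections of width num_heads * attn_head_size,
# a per-head scaled dot-product, and a projection back to hidden_size. All sizes and
# the project()/split_heads() helpers below are illustrative assumptions.
import numpy as np

batch, seq_len, hidden, num_heads, head_size = 2, 5, 16, 4, 4
rng = np.random.default_rng(0)
x = rng.standard_normal((batch, seq_len, hidden))

def project(t, out_dim):
    # stand-in for a Dense projection such as query_layer / key_layer / value_layer
    w = rng.standard_normal((t.shape[-1], out_dim)) / np.sqrt(t.shape[-1])
    return t @ w

def split_heads(t):
    # [batch, seq, num_heads * head_size] -> [batch, num_heads, seq, head_size]
    return t.reshape(batch, seq_len, num_heads, head_size).transpose(0, 2, 1, 3)

q, k, v = (split_heads(project(x, num_heads * head_size)) for _ in range(3))
scores = q @ k.transpose(0, 1, 3, 2) / np.sqrt(head_size)       # [batch, heads, seq, seq]
weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
weights /= weights.sum(axis=-1, keepdims=True)                   # softmax over keys
context = (weights @ v).transpose(0, 2, 1, 3).reshape(batch, seq_len, -1)
output = project(context, hidden)                                # analogue of "output/dense"
print(output.shape)                                              # (2, 5, 16)
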
def __init__(self, config, **kwargs):
    super().__init__(**kwargs)
    self.bert_encoder = Bert(config)
    initializer = get_initializer(config)
    self.num_classes = config.num_classes
    self.max_seq_length = config.max_seq_length
    self.pooler = layers.Dense(config.hidden_size,
                               kernel_initializer=initializer,
                               name='bert/pooler/dense',
                               activation='tanh')
    self.cls_dropout_layer = layers.Dropout(config.hidden_dropout_prob)
    self.num_choices = config.get('num_choices', 0)
    num_classes = 1 if self.num_choices else self.num_classes
    self.cls_layer = layers.Dense(num_classes,
                                  kernel_initializer=initializer,
                                  name='classifier/dense')
    self.pooled_output = None
    self.attentions = None
    self.encoded_output = None
    self.embeddings = None
    self.logits = None
    if config.use_replace_map:
        self.replace_map = {
            'LayerNorm': 'layer_norm',
            'bert/': 'bert_' + config.task + '/bert/'
        }
    else:
        self.replace_map = {}
    self.data_builder = BertClassifierDataBuilder(config)

def __init__(self, config, **kwargs):
    super().__init__(**kwargs)
    self.hidden_size = config.hidden_size
    intermediate_size = config.intermediate_size
    intermediate_act_fn = get_activation(config.intermediate_act_fn)
    kwargs['name'] = 'attention'
    self.attention_layer = MultiHeadAttention(config, **kwargs)
    if config.get('svd_units', 0) > 0:
        self.intermediate_layer0 = layers.Dense(config.svd_units, name='dense0')
    else:
        self.intermediate_layer0 = None
    self.intermediate_layer = layers.Dense(intermediate_size, name='dense',
                                           activation=intermediate_act_fn)
    if config.get('svd_units', 0) > 0:
        self.output_layer0 = layers.Dense(config.svd_units, name='dense0')
    else:
        self.output_layer0 = None
    self.output_layer = layers.Dense(self.hidden_size, name='dense')
    self.output_dropout = layers.Dropout(config.hidden_dropout_prob,
                                         seed=config.random_seed)
    self.output_norm_layer = layers.LayerNormalization(name="layer_norm",
                                                       axis=-1, epsilon=1e-12)
    self.attention = None
    self.debug_save_dir = config.debug_save_dir if config.debug else None

def __init__(self, size_in, list_size_hidden, size_out,
             lambda_l2=0., ratio_dropout=None):
    if isinstance(list_size_hidden, int):
        list_size_hidden = [list_size_hidden]
    self.size_units = [size_in] + list_size_hidden + [size_out]
    self.lambda_l2 = lambda_l2
    self.params = {}
    self.layers = OrderedDict()
    for i in range(len(self.size_units) - 1):
        affine, weight, bias, relu, dropout = [
            k + str(i + 1) for k in ('Affine', 'W', 'b', 'Relu', 'Dropout')]
        scale = np.sqrt(2.0 / self.size_units[i])
        self.params[weight] = scale * np.random.randn(
            self.size_units[i], self.size_units[i + 1])
        self.params[bias] = np.zeros(self.size_units[i + 1])
        self.layers[affine] = clay.Affine(self.params[weight], self.params[bias])
        if ratio_dropout is not None:
            self.layers[dropout] = clay.Dropout(ratio_dropout)
        if i < len(self.size_units) - 2:
            self.layers[relu] = clay.Relu()
        else:
            self.lastLayer = clay.SoftmaxWithLoss()

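# Hedged sketch (independent of the network class above): a quick numerical check of
# why the constructor above scales each weight draw by np.sqrt(2.0 / fan_in) (He
# initialization). With ReLU activations this keeps the mean squared activation
# roughly constant from layer to layer instead of shrinking toward zero. The layer
# width of 256 and the batch of 1000 inputs are arbitrary example values.
import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((1000, 256))
for _ in range(3):
    w = np.sqrt(2.0 / 256) * rng.standard_normal((256, 256))  # He scale, fan_in = 256
    x = np.maximum(0.0, x @ w)                                 # Affine + ReLU, as in the stack above
    print(round(float((x ** 2).mean()), 3))                    # stays near 1.0 across layers
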
def __init__(self, config, **kwargs):
    super().__init__(**kwargs)
    self.ebert_encoder = Ebert(config)
    initializer = get_initializer(config)
    self.num_classes = config.num_classes
    self.pooler = layers.Dense(config.hidden_size,
                               kernel_initializer=initializer,
                               name='ebert/pooler/dense',
                               activation='tanh')
    self.cls_dropout_layer = layers.Dropout(config.hidden_dropout_prob)
    self.num_choices = config.get('num_choices', 0)
    num_classes = 1 if self.num_choices else self.num_classes
    self.cls_layer = layers.Dense(num_classes,
                                  kernel_initializer=initializer,
                                  name='classifier/dense')
    self.max_first_length = config.max_first_length + 2
    self.max_second_length = config.max_seq_length - self.max_first_length
    self.pooled_output = None
    self.encoded_output = None
    self.embeddings = None
    self.logits = None
    self.first_embeddings = None
    self.second_embeddings = None
    task = config.task
    replace_map = OrderedDict({
        'LayerNorm': 'layer_norm',
        'bert/pooler': 'ebert_{}/ebert/pooler'.format(task),
        'bert/embeddings': 'ebert_{}/ebert/embeddings'.format(task)
    })
    # Upper layers must be replaced first (i.e., longest match).
    layer_key = 'bert/encoder/layer_{}'
    layer_val = 'ebert_{}/ebert/{}_encoder/layer_{}'
    for layer_idx in range(config.sep_layers, config.num_hidden_layers):
        k = layer_key.format(layer_idx)
        replace_map[k] = layer_val.format(task, 'upper', layer_idx)
    for layer_idx in range(config.sep_layers):
        k = layer_key.format(layer_idx)
        replace_map[k] = layer_val.format(task, 'lower', layer_idx)
    if config.use_replace_map:
        self.replace_map = replace_map
    else:
        self.replace_map = {}
    self.data_builder = EbertClassifierDataBuilder(config)

def __init__(self, input_size, hidden_size_list, output_size,
             activation='relu', weight_init_std='relu', weight_decay_lambda=0,
             use_dropout=False, dropout_ratio=0.5, use_batchnorm=False):
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size_list = hidden_size_list
    self.hidden_layer_num = len(hidden_size_list)
    self.use_dropout = use_dropout
    self.weight_decay_lambda = weight_decay_lambda
    self.use_batchnorm = use_batchnorm
    self.params = {}

    # Initialize the weights
    self.__init_weight(weight_init_std)

    # Generate the layers
    activation_layer = {'sigmoid': layers.Sigmoid, 'relu': layers.Relu}
    self.layers = OrderedDict()
    for idx in range(1, self.hidden_layer_num + 1):
        self.layers['Affine' + str(idx)] = layers.Affine(
            self.params['W' + str(idx)], self.params['b' + str(idx)])
        if self.use_batchnorm:
            self.params['gamma' + str(idx)] = np.ones(hidden_size_list[idx - 1])
            self.params['beta' + str(idx)] = np.zeros(hidden_size_list[idx - 1])
            self.layers['BatchNorm' + str(idx)] = layers.BatchNormalization(
                self.params['gamma' + str(idx)], self.params['beta' + str(idx)])
        self.layers['Activation_function' + str(idx)] = activation_layer[activation]()
        if self.use_dropout:
            self.layers['Dropout' + str(idx)] = layers.Dropout(dropout_ratio)
    idx = self.hidden_layer_num + 1
    self.layers['Affine' + str(idx)] = layers.Affine(
        self.params['W' + str(idx)], self.params['b' + str(idx)])
    self.last_layer = layers.SoftmaxWithLoss()

def __init__(self, config, **kwargs):
    super().__init__(**kwargs)
    self.vocab_size = config.vocab_size
    self.embedding_size = config.hidden_size
    self.use_tpu = config.use_tpu
    self.type_vocab_size = config.type_vocab_size
    self.max_position_embeddings = config.max_position_embeddings
    self.hidden_dropout_prob = config.hidden_dropout_prob
    self.embedding_table = None
    self.token_type_table = None
    self.position_table = None
    self.layer_norm = layers.LayerNormalization(name="layer_norm",
                                                axis=-1, epsilon=1e-12)
    self.dropout = layers.Dropout(self.hidden_dropout_prob)

def __init__(self, input_size, hidden_size_list, output_size,
             activation='relu', weight_init_std='relu', weight_decay_lambda=0.0,
             use_dropout=False, dropout_ratio=0.5, use_batchnorm=False):
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size_list = hidden_size_list
    self.hidden_layer_num = len(hidden_size_list)
    self.weight_decay_lambda = weight_decay_lambda
    self.use_dropout = use_dropout
    self.use_batchnorm = use_batchnorm
    self.params = {}

    # weights initialization
    self.__init_weight(weight_init_std)

    # generate layers
    activation_layer = {'sigmoid': layers.Sigmoid, 'relu': layers.Relu}
    self.layers = OrderedDict()
    for idx in range(1, self.hidden_layer_num + 1):
        self.layers['Affine' + str(idx)] = layers.Affine(
            self.params['W' + str(idx)], self.params['b' + str(idx)])
        if self.use_batchnorm:
            self.layers['BatchNorm' + str(idx)] = layers.BatchNormalization(
                self.params['gamma' + str(idx)], self.params['beta' + str(idx)])
        self.layers['Activation' + str(idx)] = activation_layer[activation]()
        if self.use_dropout:
            self.layers['Dropout' + str(idx)] = layers.Dropout(dropout_ratio)

    # the last Affine layer needs no activation or batch norm
    idx = self.hidden_layer_num + 1
    self.layers['Affine' + str(idx)] = layers.Affine(
        self.params['W' + str(idx)], self.params['b' + str(idx)])

    # self.last_layer = layers.SoftmaxCrossEntropy()
    self.last_layer = layers.MSE()

    # dict to save activation layer output
    self.activation_dict = OrderedDict()

def __init__(self, input_size, hidden_size, output_size):
    # initialize parameters
    weight_init_std = 0.01
    self.params = {}
    self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
    self.params['b1'] = np.zeros(hidden_size)
    self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
    self.params['b2'] = np.zeros(output_size)

    # create layers
    self.layers = OrderedDict()
    self.layers['Affine1'] = layers.Affine(self.params['W1'], self.params['b1'])
    self.layers['Relu1'] = layers.Relu()
    self.layers['Dropout1'] = layers.Dropout(drop_ratio=0.3)
    self.layers['Affine2'] = layers.Affine(self.params['W2'], self.params['b2'])
    self.lossLayer = layers.SoftmaxCrossEntropy(class_num=10)

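# Hedged sketch (plain numpy, not using the `layers` module above): the forward pass
# that the stack Affine1 -> Relu1 -> Dropout1 -> Affine2 -> SoftmaxCrossEntropy built
# above computes at inference time, where dropout acts as the identity. The sizes and
# the batch below are arbitrary example values.
import numpy as np

rng = np.random.default_rng(0)
W1 = 0.01 * rng.standard_normal((784, 50)); b1 = np.zeros(50)
W2 = 0.01 * rng.standard_normal((50, 10)); b2 = np.zeros(10)

x = rng.standard_normal((4, 784))             # a batch of 4 flattened inputs
h = np.maximum(0.0, x @ W1 + b1)              # Affine1 + Relu1 (Dropout1 is identity at test time)
scores = h @ W2 + b2                          # Affine2
scores -= scores.max(axis=1, keepdims=True)   # numerically stable softmax
probs = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
print(probs.shape)                            # (4, 10): one class distribution per example
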
def __init__(self, units, dropout_rate=0, **kwargs):
    super().__init__(units, **kwargs)
    self.dropout_layer = layers.Dropout(dropout_rate)

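# Hedged sketch of how a Dense-plus-dropout wrapper like the constructor above might
# be used. The class name DenseWithDropout and the call() body are assumptions; only
# the __init__ appears above, so the forward behavior shown here is illustrative.
import tensorflow as tf
from tensorflow.keras import layers

class DenseWithDropout(layers.Dense):
    def __init__(self, units, dropout_rate=0, **kwargs):
        super().__init__(units, **kwargs)
        self.dropout_layer = layers.Dropout(dropout_rate)

    def call(self, inputs, training=False):
        # Assumed behavior: dense projection, then dropout only while training.
        return self.dropout_layer(super().call(inputs), training=training)

x = tf.random.normal((2, 8))
layer = DenseWithDropout(4, dropout_rate=0.1)
print(layer(x, training=True).shape)  # (2, 4)
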
def __init__(self, input_size, hidden_size_list, output_size,
             activation='relu', weight_init_std='relu', weight_decay_lambda=0,
             use_dropout=False, dropout_ration=0.5, use_batchnorm=False):
    self.input_size = input_size
    self.hidden_size_list = hidden_size_list
    self.hidden_layer_num = len(hidden_size_list)
    self.output_size = output_size
    self.weight_decay_lambda = weight_decay_lambda
    self.use_dropout = use_dropout
    self.use_batchnorm = use_batchnorm
    self.params = {}

    # Initialize the weights
    self.__init_weight(weight_init_std)

    # Generate the layers
    activation_layer = {'sigmoid': layers.Sigmoid, 'relu': layers.Relu}
    self.layers = OrderedDict()

    # < Hidden layers >
    # one block per hidden layer, self.hidden_layer_num in total
    for idx in range(1, self.hidden_layer_num + 1):
        # (1) Affine layer
        self.layers['Affine' + str(idx)] = layers.Affine(
            self.params['W' + str(idx)], self.params['b' + str(idx)])

        # (2) BatchNormalization layer
        if self.use_batchnorm:
            # Initialize the parameters used by this layer's batch normalization:
            # start from the identity transform, i.e. scale 1 (gamma) and shift 0 (beta).
            self.params['gamma' + str(idx)] = np.ones(hidden_size_list[idx - 1])
            self.params['beta' + str(idx)] = np.zeros(hidden_size_list[idx - 1])
            self.layers['BatchNorm' + str(idx)] = layers.BatchNormalization(
                self.params['gamma' + str(idx)], self.params['beta' + str(idx)])

        # (3) Activation function
        self.layers['Activation_function' + str(idx)] = activation_layer[activation]()

        # (4) Dropout layer
        if self.use_dropout:
            self.layers['Dropout' + str(idx)] = layers.Dropout(dropout_ration)

    # < Output Affine layer >
    idx = self.hidden_layer_num + 1
    self.layers['Affine' + str(idx)] = layers.Affine(
        self.params['W' + str(idx)], self.params['b' + str(idx)])

    # < Output layer >
    # Softmax as the output activation, with cross-entropy error as the loss function
    self.last_layer = layers.SoftmaxWithLoss()

def __init__(self,
             input_dim=(1, 28, 28),
             conv_param_1={'filter_num': 16, 'filter_size': 3, 'pad': 1, 'stride': 1},
             conv_param_2={'filter_num': 16, 'filter_size': 3, 'pad': 1, 'stride': 1},
             conv_param_3={'filter_num': 32, 'filter_size': 3, 'pad': 1, 'stride': 1},
             conv_param_4={'filter_num': 32, 'filter_size': 3, 'pad': 2, 'stride': 1},
             conv_param_5={'filter_num': 64, 'filter_size': 3, 'pad': 1, 'stride': 1},
             conv_param_6={'filter_num': 64, 'filter_size': 3, 'pad': 1, 'stride': 1},
             hidden_size=50, output_size=10):
    # Weight initialization ===========
    # Number of connections each neuron has to the neurons in the previous layer.
    pre_node_nums = np.array([
        1 * 3 * 3, 16 * 3 * 3, 16 * 3 * 3, 32 * 3 * 3,
        32 * 3 * 3, 64 * 3 * 3, 64 * 4 * 4, hidden_size
    ])
    weight_init_scales = np.sqrt(2.0 / pre_node_nums)  # He initialization

    self.params = {}
    pre_channel_num = input_dim[0]
    for idx, conv_param in enumerate([conv_param_1, conv_param_2, conv_param_3,
                                      conv_param_4, conv_param_5, conv_param_6]):
        self.params['W' + str(idx + 1)] = weight_init_scales[idx] * np.random.randn(
            conv_param['filter_num'], pre_channel_num,
            conv_param['filter_size'], conv_param['filter_size'])
        self.params['b' + str(idx + 1)] = np.zeros(conv_param['filter_num'])
        pre_channel_num = conv_param['filter_num']
    self.params['W7'] = weight_init_scales[6] * np.random.randn(
        pre_node_nums[6], hidden_size)
    print(self.params['W7'].shape)  # debug print of the fully connected weight shape
    self.params['b7'] = np.zeros(hidden_size)
    self.params['W8'] = weight_init_scales[7] * np.random.randn(
        pre_node_nums[7], output_size)
    self.params['b8'] = np.zeros(output_size)

    # Layer generation ===========
    self.layers = []
    self.layers.append(layers.Convolution(self.params['W1'], self.params['b1'],
                                          conv_param_1['stride'], conv_param_1['pad']))
    self.layers.append(layers.Relu())
    self.layers.append(layers.Convolution(self.params['W2'], self.params['b2'],
                                          conv_param_2['stride'], conv_param_2['pad']))
    self.layers.append(layers.Relu())
    self.layers.append(layers.Pooling(pool_h=2, pool_w=2, stride=2))
    self.layers.append(layers.Convolution(self.params['W3'], self.params['b3'],
                                          conv_param_3['stride'], conv_param_3['pad']))
    self.layers.append(layers.Relu())
    self.layers.append(layers.Convolution(self.params['W4'], self.params['b4'],
                                          conv_param_4['stride'], conv_param_4['pad']))
    self.layers.append(layers.Relu())
    self.layers.append(layers.Pooling(pool_h=2, pool_w=2, stride=2))
    self.layers.append(layers.Convolution(self.params['W5'], self.params['b5'],
                                          conv_param_5['stride'], conv_param_5['pad']))
    self.layers.append(layers.Relu())
    self.layers.append(layers.Convolution(self.params['W6'], self.params['b6'],
                                          conv_param_6['stride'], conv_param_6['pad']))
    self.layers.append(layers.Relu())
    self.layers.append(layers.Pooling(pool_h=2, pool_w=2, stride=2))
    self.layers.append(layers.Affine(self.params['W7'], self.params['b7']))
    self.layers.append(layers.Relu())
    self.layers.append(layers.Dropout(0.5))
    self.layers.append(layers.Affine(self.params['W8'], self.params['b8']))
    self.layers.append(layers.Dropout(0.5))
    self.last_layer = layers.SoftmaxWithLoss()

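# Hedged sketch (independent of the class above): a quick check of why the W7 fan-in
# in pre_node_nums is 64 * 4 * 4, by tracking the spatial size of a 28x28 input
# through the six 3x3 convolutions and three 2x2/stride-2 pooling layers with
# out = (in + 2 * pad - filter) // stride + 1.
def conv_out(size, filt, pad, stride):
    return (size + 2 * pad - filt) // stride + 1

size = 28                                    # MNIST-style 28x28 input
pads = [1, 1, None, 1, 2, None, 1, 1, None]  # None marks a 2x2/stride-2 pooling step
for pad in pads:
    size = size // 2 if pad is None else conv_out(size, 3, pad, 1)
print(size)  # 4, so the flattened feature map feeding W7 has 64 * 4 * 4 = 1024 units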