def forward(self, x):
    self.instance += 1

    # Convolutional network
    x = self.conv1(x)
    x = lbann.Pooling(x,
                      num_dims=2,
                      has_vectors=False,
                      pool_dims_i=2,
                      pool_pads_i=0,
                      pool_strides_i=2,
                      pool_mode='max',
                      name='{0}_pool1_instance{1}'.format(
                          self.name, self.instance))
    x = self.conv2(x)
    x = lbann.Pooling(x,
                      num_dims=2,
                      has_vectors=False,
                      pool_dims_i=2,
                      pool_pads_i=0,
                      pool_strides_i=2,
                      pool_mode='max',
                      name='{0}_pool2_instance{1}'.format(
                          self.name, self.instance))

    # Fully-connected network
    x = self.fc1(x)
    x = lbann.Dropout(x,
                      keep_prob=0.5,
                      name='{0}_drop6_instance{1}'.format(
                          self.name, self.instance))
    x = self.fc2(x)
    x = lbann.Dropout(x,
                      keep_prob=0.5,
                      name='{0}_drop7_instance{1}'.format(
                          self.name, self.instance))
    return self.fc3(x)
def forward(self, x, mask=None):
    """Apply Transformer encoder layer.

    Args:
        x (lbann.Layer): Sequence of input vectors.
        mask (lbann.Layer, optional): Attention mask.

    Returns:
        lbann.Layer: Sequence of output vectors.

    """
    self.instance += 1
    name = f'{self.name}_instance{self.instance}'

    # Self-attention with residual connection
    y = self.attention(x, x, x, mask=mask)
    if self.dropout_prob > 0:
        y = lbann.Dropout(
            y,
            keep_prob=1 - self.dropout_prob,
            name=f'{name}_drop1',
        )
    z = lbann.Sum(x, y, name=f'{name}_sum1')
    z = lbann.InstanceNorm(z, name=f'{name}_norm1')
    x = z

    # Feedforward network with residual connection
    y = lbann.ChannelwiseFullyConnected(
        x,
        weights=self.fc1_weights,
        output_channel_dims=[self.feedforward_dim],
        name=f'{name}_fc1',
    )
    y = lbann.Relu(y, name=f'{name}_relu1')
    if self.dropout_prob > 0:
        y = lbann.Dropout(
            y,
            keep_prob=1 - self.dropout_prob,
            name=f'{name}_drop2',
        )
    y = lbann.ChannelwiseFullyConnected(
        y,
        weights=self.fc2_weights,
        output_channel_dims=[self.embed_dim],
        name=f'{name}_fc2',
    )
    if self.dropout_prob > 0:
        y = lbann.Dropout(
            y,
            keep_prob=1 - self.dropout_prob,
            name=f'{name}_drop3',
        )
    z = lbann.Sum(x, y, name=f'{name}_sum2')
    z = lbann.InstanceNorm(z, name=f'{name}_norm2')
    return z
def construct_model():
    """Construct and return an LBANN model."""
    import lbann
    import lbann.modules

    fc = lbann.modules.FullyConnectedModule
    conv = lbann.modules.Convolution2dModule

    # num_classes, dims, and str_list are assumed to be defined at module scope
    conv1 = conv(20, 3, stride=1, padding=1, name='conv1')
    conv2 = conv(20, 3, stride=1, padding=1, name='conv2')
    fc1 = fc(100, name='fc1')
    fc2 = fc(20, name='fc2')
    fc3 = fc(num_classes, name='fc3')

    # Layer graph
    input = lbann.Input(name='inp_tensor', target_mode='classification')
    inp_slice = lbann.Slice(input,
                            axis=0,
                            slice_points=str_list([0, dims - 1, dims]),
                            name='inp_slice')
    xdata = lbann.Identity(inp_slice)
    ylabel = lbann.Identity(inp_slice, name='gt_y')

    # Reshape the flattened sample to (C, H, W)
    x = lbann.Reshape(xdata, dims='14 13 13')
    x = conv2(conv1(x))
    x = lbann.Reshape(x, dims='3380')
    x = lbann.Dropout(lbann.Relu(fc1(x)), keep_prob=0.5)
    x = lbann.Dropout(fc2(x), keep_prob=0.5)
    pred = lbann.Softmax(fc3(x))
    gt_label = lbann.OneHot(ylabel, size=num_classes)
    loss = lbann.CrossEntropy([pred, gt_label], name='loss')
    acc = lbann.CategoricalAccuracy([pred, gt_label])

    layers = list(lbann.traverse_layer_graph(input))

    # Set up objective function
    weights = set()
    for l in layers:
        weights.update(l.weights)
    obj = lbann.ObjectiveFunction(loss)

    callbacks = [lbann.CallbackPrint(), lbann.CallbackTimer()]

    # Construct model
    num_epochs = 10
    return lbann.Model(num_epochs,
                       weights=weights,
                       layers=layers,
                       metrics=[lbann.Metric(acc, name='accuracy', unit='%')],
                       objective_function=obj,
                       callbacks=callbacks)
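# The graph above relies on a str_list helper (not defined in this snippet) to
# convert Python lists into the space-separated strings that layer attributes
# such as slice_points expect. A minimal sketch, assuming it behaves like
# lbann.util.str_list:
def str_list(values, sep=' '):
    """Join an iterable into a separator-delimited string, e.g. [0, 9, 10] -> '0 9 10'."""
    return sep.join(str(v) for v in values)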
def forward(self, hidden_states, input_tensor):
    hidden_states, hidden_shape = lbann.modules.PytorchLinear(
        hidden_states,
        self.input_shape,
        self.hidden_size,
        weights=_load_pretrained_weights(
            ".".join((self.name, "dense.weight")),
            ".".join((self.name, "dense.bias")),
            load_weights=self.load_weights,
        ),
        name=".".join((self.name, "dense")),
        return_dims=True,
    )
    hidden_states = lbann.Dropout(hidden_states,
                                  keep_prob=self.hidden_dropout_prob)
    hidden_states = lbann.modules.PytorchLayerNorm(
        lbann.Add(hidden_states, input_tensor),
        self.layer_norm_eps,
        hidden_shape,
        weights=_load_pretrained_weights(
            ".".join((self.name, "layernorm.weightbias")),
            load_weights=self.load_weights,
        ),
        name=".".join((self.name, "LayerNorm")),
    )
    return hidden_states
def create_dropout(x, i):
    return lbann.Dropout(x,
                         keep_prob=0.8,
                         name='{0}_drop{1}_instance{2}'.format(
                             self.name, i, self.instance))
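# A minimal standalone sketch of the same naming pattern, using only layers
# that appear in these snippets (lbann.Input, lbann.Dropout). The 'net' prefix,
# the layer count, and the fixed instance number are illustrative assumptions.
import lbann

x = lbann.Input(name='samples')
instance = 1
for i in range(3):
    x = lbann.Dropout(x,
                      keep_prob=0.8,
                      name='net_drop{0}_instance{1}'.format(i, instance))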
# Construct layer graph
input_ = lbann.Input(name='input')
image = lbann.Identity(input_, name='images')
dummy = lbann.Dummy(input_, name='labels')

# Encoder
encode1 = lbann.FullyConnected(image,
                               name="encode1",
                               data_layout="model_parallel",
                               num_neurons=1000,
                               has_bias=True)
relu1 = lbann.Relu(encode1, name="relu1", data_layout="model_parallel")
dropout1 = lbann.Dropout(relu1,
                         name="dropout1",
                         data_layout="model_parallel",
                         keep_prob=0.8)

# Decoder
decode1 = lbann.FullyConnected(dropout1,
                               name="decode1",
                               data_layout="model_parallel",
                               hint_layer=image,
                               has_bias=True)
reconstruction = lbann.Sigmoid(decode1,
                               name="reconstruction",
                               data_layout="model_parallel")
dropout2 = lbann.Dropout(reconstruction,
                         name="dropout2",
                         data_layout="model_parallel",
                         keep_prob=0.8)  # original call was truncated; keep_prob assumed to match dropout1
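# Hedged sketch of how the reconstruction above could be scored: a mean squared
# error between the reconstruction and the input image, wired into an objective
# function. The list-of-parents call style follows the CrossEntropy usage in
# construct_model above; treat this as an assumption, not the model's actual
# objective.
mse = lbann.MeanSquaredError([reconstruction, image], name='mse')
obj = lbann.ObjectiveFunction(mse)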
def forward(
    self,
    input_ids=None,
    token_type_ids=None,
    position_ids=None,
    inputs_embeds=None,
):
    if position_ids is None:
        if input_ids is not None:
            position_ids = create_position_ids_from_input_ids(
                input_ids,
                self.input_shape,
                self.padding_idx,
            )
        else:
            position_ids = self.create_position_ids_from_inputs_embeds(
                inputs_embeds)

    if token_type_ids is None:
        token_type_ids = lbann.Constant(value=0,
                                        num_neurons=str_list(
                                            self.input_shape))

    if inputs_embeds is None:
        inputs_embeds = lbann.Embedding(
            input_ids,
            num_embeddings=self.vocab_size,
            embedding_dim=self.hidden_size,
            padding_idx=self.pad_token_id,
            weights=_load_pretrained_weights(
                ".".join((self.name, "word_embeddings.weight")),
                load_weights=self.load_weights,
            ),
            name=".".join((self.name, "word_embeddings")),
        )

    token_type_embeddings = lbann.Embedding(
        token_type_ids,
        num_embeddings=self.type_vocab_size,
        embedding_dim=self.hidden_size,
        weights=_load_pretrained_weights(
            ".".join((self.name, "token_type_embeddings.weight")),
            load_weights=self.load_weights,
        ),
        name=".".join((self.name, "token_type_embeddings")),
    )

    embeddings = lbann.Add(inputs_embeds, token_type_embeddings)
    if self.position_embedding_type == "absolute":
        position_embeddings = lbann.Embedding(
            position_ids,
            num_embeddings=self.max_position_embeddings,
            embedding_dim=self.hidden_size,
            padding_idx=self.pad_token_id,
            weights=_load_pretrained_weights(
                ".".join((self.name, "position_embeddings.weight")),
                load_weights=self.load_weights,
            ),
            name=".".join((self.name, "position_embeddings")),
        )
        embeddings = lbann.Add(embeddings, position_embeddings)

    embeddings = lbann.modules.PytorchLayerNorm(
        embeddings,
        self.layer_norm_eps,
        self.input_shape + (self.hidden_size, ),
        weights=_load_pretrained_weights(
            ".".join((self.name, "layernorm.weightbias")),
            load_weights=self.load_weights,
        ),
        name=".".join((self.name, "LayerNorm")),
    )
    embeddings = lbann.Dropout(embeddings,
                               keep_prob=self.hidden_dropout_prob)
    return embeddings
def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
):
    mixed_query_layer, query_shape = lbann.modules.PytorchLinear(
        hidden_states,
        self.input_shape,
        self.all_head_size,
        weights=_load_pretrained_weights(
            ".".join((self.name, "query.weight")),
            ".".join((self.name, "query.bias")),
            load_weights=self.load_weights,
        ),
        name=".".join((self.name, "query")),
        return_dims=True,
    )
    query_layer, query_shape = self.transpose_for_scores(
        mixed_query_layer, query_shape)

    key_layer, key_shape = lbann.modules.PytorchLinear(
        hidden_states,
        self.input_shape,
        self.all_head_size,
        weights=_load_pretrained_weights(
            ".".join((self.name, "key.weight")),
            ".".join((self.name, "key.bias")),
            load_weights=self.load_weights,
        ),
        name=".".join((self.name, "key")),
        return_dims=True,
    )
    key_layer, key_shape = self.transpose_for_scores(key_layer, key_shape)

    value_layer, value_shape = lbann.modules.PytorchLinear(
        hidden_states,
        self.input_shape,
        self.all_head_size,
        weights=_load_pretrained_weights(
            ".".join((self.name, "value.weight")),
            ".".join((self.name, "value.bias")),
            load_weights=self.load_weights,
        ),
        name=".".join((self.name, "value")),
        return_dims=True,
    )
    value_layer, value_shape = self.transpose_for_scores(
        value_layer, value_shape)

    # Take the dot product between "query" and "key" to get the raw attention scores.
    key_layer, key_shape = lbann.modules.Permute(key_layer,
                                                 key_shape,
                                                 axes=(0, 1, -1, -2),
                                                 return_dims=True)
    attention_scores, attention_shape = lbann.modules.PytorchMatmul(
        query_layer,
        query_shape,
        key_layer,
        key_shape,
        return_dims=True,
    )
    attention_scores = lbann.Scale(attention_scores,
                                   constant=1 /
                                   math.sqrt(self.attention_head_size))

    if attention_mask is not None:
        # Apply the attention mask (precomputed for all layers in the RobertaModel forward() function)
        attention_scores = lbann.Add(attention_scores, attention_mask)

    # Normalize the attention scores to probabilities.
    attention_scores = lbann.Reshape(
        attention_scores,
        dims=str_list(
            [np.prod(attention_shape[:-1]), attention_shape[-1]]),
    )
    attention_probs = lbann.ChannelwiseSoftmax(attention_scores)
    attention_probs = lbann.Reshape(attention_probs,
                                    dims=str_list(attention_shape))

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = lbann.Dropout(
        attention_probs,
        keep_prob=self.attention_probs_dropout_prob,
    )

    # Mask heads if we want to
    if head_mask is not None:
        attention_probs = lbann.Multiply(attention_probs, head_mask)

    context_layer, context_shape = lbann.modules.PytorchMatmul(
        attention_probs,
        attention_shape,
        value_layer,
        value_shape,
        return_dims=True,
    )
    context_layer, context_shape = lbann.modules.Permute(
        context_layer,
        context_shape,
        axes=(0, 2, 1, 3),
        return_dims=True,
    )
    new_context_layer_shape = context_shape[:-2] + (self.all_head_size, )
    context_layer = lbann.Reshape(context_layer,
                                  dims=str_list(self.input_shape))

    return context_layer
def forward(self, x, memory, src_mask=None, tgt_mask=None):
    """Apply Transformer decoder layer.

    Args:
        x (lbann.Layer): Sequence of input vectors.
        memory (lbann.Layer): Sequence of vectors produced by
            Transformer encoder stack.
        src_mask (lbann.Layer, optional): Attention mask for second
            attention module (attends to both `x` and `memory`).
        tgt_mask (lbann.Layer, optional): Attention mask for first
            attention module (attends only to `x`).

    Returns:
        lbann.Layer: Sequence of output vectors.

    """
    self.instance += 1
    name = f'{self.name}_instance{self.instance}'

    # Self-attention with residual connection
    y = self.attention1(x, x, x, mask=tgt_mask)
    if self.dropout_prob > 0:
        y = lbann.Dropout(
            y,
            keep_prob=1 - self.dropout_prob,
            name=f'{name}_drop1',
        )
    z = lbann.Sum(x, y, name=f'{name}_sum1')
    z = lbann.InstanceNorm(z, name=f'{name}_norm1')
    x = z

    # Attention on encoder output with residual connection
    y = self.attention2(x, memory, memory, mask=src_mask)
    if self.dropout_prob > 0:
        y = lbann.Dropout(
            y,
            keep_prob=1 - self.dropout_prob,
            name=f'{name}_drop2',
        )
    z = lbann.Sum(x, y, name=f'{name}_sum2')
    z = lbann.InstanceNorm(z, name=f'{name}_norm2')
    x = z

    # Feedforward network with residual connection
    y = lbann.ChannelwiseFullyConnected(
        x,
        weights=self.fc1_weights,
        output_channel_dims=[self.feedforward_dim],
        name=f'{name}_fc1',
    )
    y = lbann.Relu(y, name=f'{name}_relu1')
    if self.dropout_prob > 0:
        y = lbann.Dropout(
            y,
            keep_prob=1 - self.dropout_prob,
            name=f'{name}_drop3',
        )
    y = lbann.ChannelwiseFullyConnected(
        y,
        weights=self.fc2_weights,
        output_channel_dims=[self.embed_dim],
        name=f'{name}_fc2',
    )
    if self.dropout_prob > 0:
        y = lbann.Dropout(
            y,
            keep_prob=1 - self.dropout_prob,
            name=f'{name}_drop4',
        )
    z = lbann.Sum(x, y, name=f'{name}_sum3')
    z = lbann.InstanceNorm(z, name=f'{name}_norm3')
    return z
def forward(self, x):
    self.instance += 1

    # Convolutional network
    x = self.conv1(x)
    x = lbann.LocalResponseNormalization(
        x,
        window_width=5,
        lrn_alpha=0.0001,
        lrn_beta=0.75,
        lrn_k=2,
        name='{0}_norm1_instance{1}'.format(self.name, self.instance))
    x = lbann.Pooling(x,
                      num_dims=2,
                      has_vectors=False,
                      pool_dims_i=3,
                      pool_pads_i=0,
                      pool_strides_i=2,
                      pool_mode='max',
                      name='{0}_pool1_instance{1}'.format(
                          self.name, self.instance))
    x = self.conv2(x)
    x = lbann.LocalResponseNormalization(
        x,
        window_width=5,
        lrn_alpha=0.0001,
        lrn_beta=0.75,
        lrn_k=2,
        name='{0}_norm2_instance{1}'.format(self.name, self.instance))
    x = lbann.Pooling(x,
                      num_dims=2,
                      has_vectors=False,
                      pool_dims_i=3,
                      pool_pads_i=0,
                      pool_strides_i=2,
                      pool_mode='max',
                      name='{0}_pool2_instance{1}'.format(
                          self.name, self.instance))
    x = self.conv5(self.conv4(self.conv3(x)))
    x = lbann.Pooling(x,
                      num_dims=2,
                      has_vectors=False,
                      pool_dims_i=3,
                      pool_pads_i=0,
                      pool_strides_i=2,
                      pool_mode='max',
                      name='{0}_pool5_instance{1}'.format(
                          self.name, self.instance))

    # Fully-connected network
    x = self.fc6(x)
    x = lbann.Dropout(x,
                      keep_prob=0.5,
                      name='{0}_drop6_instance{1}'.format(
                          self.name, self.instance))
    x = self.fc7(x)
    x = lbann.Dropout(x,
                      keep_prob=0.5,
                      name='{0}_drop7_instance{1}'.format(
                          self.name, self.instance))
    return self.fc8(x)
def forward(self, x):
    # Alternate fully-connected layers with dropout (keep probability self.kp)
    x = self.track_fc[0](x)
    x = lbann.Dropout(x, keep_prob=self.kp)
    x = self.track_fc[1](x)
    x = lbann.Dropout(x, keep_prob=self.kp)
    x = self.track_fc[2](x)
    return lbann.Dropout(x, keep_prob=self.kp)