def __init__(self, block, num_classes=100, batch_size=32):
    """init"""
    super(ResNet, self).__init__()
    self.batch_size = batch_size
    self.num_classes = num_classes

    self.head = Head()
    self.layer1 = MakeLayer0(block, in_channels=64, out_channels=256, stride=1)
    self.layer2 = MakeLayer1(block, in_channels=256, out_channels=512, stride=2)
    self.layer3 = MakeLayer2(block, in_channels=512, out_channels=1024, stride=2)
    self.layer4 = MakeLayer3(block, in_channels=1024, out_channels=2048, stride=2)

    self.pool = ops.ReduceMean(keep_dims=True)
    self.squeeze = ops.Squeeze(axis=(2, 3))
    self.fc = fc_with_initialize(512 * block.expansion, num_classes)

    # pipeline parallel config
    self.head.pipeline_stage = 0
    self.layer1.pipeline_stage = 0
    self.layer2.pipeline_stage = 0
    self.layer3.pipeline_stage = 1
    self.layer4.pipeline_stage = 1
    self.fc.pipeline_stage = 1
def __init__(self, block, num_classes=100, batch_size=32):
    """init"""
    super(ResNet, self).__init__()
    self.batch_size = batch_size
    self.num_classes = num_classes

    self.conv1 = conv7x7(3, 64, stride=2, padding=0)
    self.bn1 = bn_with_initialize(64)
    self.relu = ops.ReLU()
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")

    self.layer1 = MakeLayer0(block, in_channels=64, out_channels=256, stride=1)
    self.layer2 = MakeLayer1(block, in_channels=256, out_channels=512, stride=2)
    self.layer3 = MakeLayer2(block, in_channels=512, out_channels=1024, stride=2)
    self.layer4 = MakeLayer3(block, in_channels=1024, out_channels=2048, stride=2)

    self.pool = ops.ReduceMean(keep_dims=True)
    self.squeeze = ops.Squeeze(axis=(2, 3))
    self.fc = fc_with_initialize(512 * block.expansion, num_classes)
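# Illustrative sketch (not part of the original model): how the ReduceMean ->
# Squeeze -> Dense tail above collapses NCHW feature maps into per-image features.
# The (32, 2048, 7, 7) shape is an assumed example for the layer4 output.
import numpy as np
import mindspore
import mindspore.ops as ops
from mindspore import Tensor

features = Tensor(np.ones((32, 2048, 7, 7)), mindspore.float32)
pool = ops.ReduceMean(keep_dims=True)
squeeze = ops.Squeeze(axis=(2, 3))

pooled = pool(features, (2, 3))   # global average pooling -> (32, 2048, 1, 1)
flat = squeeze(pooled)            # drop the unit spatial axes -> (32, 2048) for the fc layer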
def variable_recurrent(self, x, h, seq_length, w_ih, w_hh, b_ih, b_hh):
    '''recurrent steps with sequence length'''
    time_step = x.shape[0]
    h_t = h
    if self.is_lstm:
        hidden_size = h[0].shape[-1]
        zero_output = P.ZerosLike()(h_t[0])
    else:
        hidden_size = h.shape[-1]
        zero_output = P.ZerosLike()(h_t)
    seq_length = P.Cast()(seq_length, mindspore.float32)
    seq_length = P.BroadcastTo((hidden_size, -1))(seq_length)
    seq_length = P.Cast()(seq_length, mindspore.int32)
    seq_length = P.Transpose()(seq_length, (1, 0))

    outputs = []
    state_t = h_t
    t = 0
    while t < time_step:
        x_t = x[t:t + 1:1]
        x_t = P.Squeeze(0)(x_t)
        h_t = self.cell(x_t, state_t, w_ih, w_hh, b_ih, b_hh)
        seq_cond = seq_length > t
        if self.is_lstm:
            state_t_0 = P.Select()(seq_cond, h_t[0], state_t[0])
            state_t_1 = P.Select()(seq_cond, h_t[1], state_t[1])
            output = P.Select()(seq_cond, h_t[0], zero_output)
            state_t = (state_t_0, state_t_1)
        else:
            state_t = P.Select()(seq_cond, h_t, state_t)
            output = P.Select()(seq_cond, h_t, zero_output)
        outputs.append(output)
        t += 1
    outputs = P.Stack()(outputs)
    return outputs, state_t
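# Illustrative sketch (not from the original source): the sequence-length masking
# used in variable_recurrent above. A (batch,) length vector is broadcast to
# (batch, hidden) so P.Select can keep the freshly computed state only for rows
# that are still inside their sequence at step t; finished rows keep the old state.
# Shapes and values here are assumed for demonstration.
import numpy as np
import mindspore
from mindspore import Tensor
from mindspore.ops import operations as P

seq_length = Tensor(np.array([3, 1]), mindspore.int32)   # per-sample lengths, batch=2
hidden_size = 4

mask = P.Cast()(seq_length, mindspore.float32)
mask = P.BroadcastTo((hidden_size, -1))(mask)            # (hidden, batch)
mask = P.Cast()(mask, mindspore.int32)
mask = P.Transpose()(mask, (1, 0))                       # (batch, hidden)

t = 1
new_state = Tensor(np.ones((2, hidden_size)), mindspore.float32)
old_state = Tensor(np.zeros((2, hidden_size)), mindspore.float32)
state_t = P.Select()(mask > t, new_state, old_state)     # row 0 updates, row 1 stays frozen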
def construct(self, lstm_outputs, next_ids):
    total_loss = []
    for lstm_output, next_token_id in zip(lstm_outputs, next_ids):
        next_token_id_flat = next_token_id.view((-1, 1))
        if self.training and self.sample_softmax:
            lstm_output = lstm_output.view((-1, self.hidden_size))
            loss = self.sampled_softmax_loss(self.weight, self.bias,
                                             next_token_id_flat, lstm_output)
        else:
            next_token_id_flat = P.Squeeze(1)(next_token_id_flat)
            output_scores = self.matmul(lstm_output, self.weight) + self.bias
            output_scores = output_scores.view((-1, output_scores.shape[-1]))
            loss = self.sparse_softmax_cross_entropy_with_logits(output_scores,
                                                                 next_token_id_flat)
        total_loss.append(self.reduce_mean(loss))
    return 0.5 * (total_loss[0] + total_loss[1])
def recurrent(self, x, h_0, w_ih, w_hh, b_ih, b_hh):
    '''recurrent steps without sequence length'''
    time_step = x.shape[0]
    outputs = []
    t = 0
    h = h_0
    while t < time_step:
        x_t = x[t:t + 1:1]
        x_t = P.Squeeze(0)(x_t)
        h = self.cell(x_t, h, w_ih, w_hh, b_ih, b_hh)
        if self.is_lstm:
            outputs.append(h[0])
        else:
            outputs.append(h)
        t += 1
    outputs = P.Stack()(outputs)
    return outputs, h
def __init__(self, block, layer_nums, in_channels, out_channels,
             strides=(1, 2, 2, 2), num_classes=100):
    super(ResNet, self).__init__()

    if not len(layer_nums) == len(in_channels) == len(out_channels) == 4:
        raise ValueError("the length of layer_nums, in_channels and "
                         "out_channels lists must be 4!")

    self.conv1 = _conv7x7(3, 64, stride=2)
    self.bn1 = _fused_bn(64)
    self.relu = ops.ReLU()
    self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode='same')

    self.layer1 = self._make_layer(block,
                                   layer_nums[0],
                                   in_channel=in_channels[0],
                                   out_channel=out_channels[0],
                                   stride=strides[0])
    self.layer2 = self._make_layer(block,
                                   layer_nums[1],
                                   in_channel=in_channels[1],
                                   out_channel=out_channels[1],
                                   stride=strides[1])
    self.layer3 = self._make_layer(block,
                                   layer_nums[2],
                                   in_channel=in_channels[2],
                                   out_channel=out_channels[2],
                                   stride=strides[2])
    self.layer4 = self._make_layer(block,
                                   layer_nums[3],
                                   in_channel=in_channels[3],
                                   out_channel=out_channels[3],
                                   stride=strides[3])

    self.mean = ops.ReduceMean(keep_dims=True)
    self.end_point = nn.Dense(out_channels[3], num_classes, has_bias=True,
                              weight_init=dense_weight_variable())
    self.squeeze = ops.Squeeze()
    self.cast = ops.Cast()
def __init__(self, config, is_training, use_one_hot_embeddings=False):
    super(BertModel, self).__init__()
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    self.input_mask_from_dataset = config.input_mask_from_dataset
    self.token_type_ids_from_dataset = config.token_type_ids_from_dataset
    self.batch_size = config.batch_size
    self.seq_length = config.seq_length
    self.hidden_size = config.hidden_size
    self.num_hidden_layers = config.num_hidden_layers
    self.embedding_size = config.hidden_size
    self.token_type_ids = None
    self.last_idx = self.num_hidden_layers - 1
    # output_embedding_shape = [self.batch_size, self.seq_length,
    #                           self.embedding_size]
    output_embedding_shape = [-1, self.seq_length, self.embedding_size]

    if not self.token_type_ids_from_dataset:
        self.token_type_ids = initializer(
            "zeros", [self.batch_size, self.seq_length], mstype.int32).to_tensor()

    self.bert_embedding_lookup = EmbeddingLookup(
        vocab_size=config.vocab_size,
        embedding_size=self.embedding_size,
        embedding_shape=output_embedding_shape,
        use_one_hot_embeddings=use_one_hot_embeddings,
        initializer_range=config.initializer_range)

    self.bert_embedding_postprocessor = EmbeddingPostprocessor(
        embedding_size=self.embedding_size,
        embedding_shape=output_embedding_shape,
        use_relative_positions=config.use_relative_positions,
        use_token_type=True,
        token_type_vocab_size=config.type_vocab_size,
        use_one_hot_embeddings=use_one_hot_embeddings,
        initializer_range=0.02,
        max_position_embeddings=config.max_position_embeddings,
        dropout_prob=config.hidden_dropout_prob)

    self.bert_encoder = BertTransformer(
        batch_size=self.batch_size,
        hidden_size=self.hidden_size,
        seq_length=self.seq_length,
        num_attention_heads=config.num_attention_heads,
        num_hidden_layers=self.num_hidden_layers,
        intermediate_size=config.intermediate_size,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        use_one_hot_embeddings=use_one_hot_embeddings,
        initializer_range=config.initializer_range,
        hidden_dropout_prob=config.hidden_dropout_prob,
        use_relative_positions=config.use_relative_positions,
        hidden_act=config.hidden_act,
        compute_type=config.compute_type,
        return_all_encoders=True,
        enable_fused_layernorm=config.enable_fused_layernorm)

    self.cast = ops.Cast()
    self.dtype = config.dtype
    self.cast_compute_type = SaturateCast(dst_type=config.compute_type)
    self.slice = ops.StridedSlice()
    self.squeeze_1 = ops.Squeeze(axis=1)
    self.dense = nn.Dense(self.hidden_size, self.hidden_size,
                          activation="tanh",
                          weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type)
    self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config)
def __init__(self):
    super(SqueezeNet, self).__init__()
    self.squeeze = P.Squeeze()
def __init__(self, config, is_training, use_one_hot_embeddings=False):
    super(BertModel, self).__init__()
    self._bertconfig = BertConfig(
        seq_length=config["max_position_embeddings"],
        vocab_size=config["vocab_size"],
        hidden_size=config["hidden_size"],
        num_hidden_layers=config["num_hidden_layers"],
        num_attention_heads=config["num_attention_heads"],
        intermediate_size=config["hidden_size"] * 4,
        hidden_act=config["hidden_act"],
        hidden_dropout_prob=config["hidden_dropout_prob"],
        attention_probs_dropout_prob=config["attention_probs_dropout_prob"],
        max_position_embeddings=config["max_position_embeddings"],
        type_vocab_size=config["type_vocab_size"],
        initializer_range=config["initializer_range"],
        use_relative_positions=False,
        dtype=mstype.float32,
        compute_type=mstype.float32)

    config = copy.deepcopy(self._bertconfig)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    self.seq_length = config.seq_length
    self.hidden_size = config.hidden_size
    self.num_hidden_layers = config.num_hidden_layers
    self.embedding_size = config.hidden_size
    self.token_type_ids = None
    self.last_idx = self.num_hidden_layers - 1
    output_embedding_shape = [-1, self.seq_length, self.embedding_size]

    self.bert_embedding_lookup = nn.Embedding(
        vocab_size=config.vocab_size,
        embedding_size=self.embedding_size,
        use_one_hot=use_one_hot_embeddings)

    self.bert_embedding_postprocessor = EmbeddingPostprocessor(
        embedding_size=self.embedding_size,
        embedding_shape=output_embedding_shape,
        use_relative_positions=config.use_relative_positions,
        use_token_type=True,
        token_type_vocab_size=config.type_vocab_size,
        use_one_hot_embeddings=use_one_hot_embeddings,
        initializer_range=0.02,
        max_position_embeddings=config.max_position_embeddings,
        dropout_prob=config.hidden_dropout_prob)

    self.bert_encoder = BertTransformer(
        hidden_size=self.hidden_size,
        seq_length=self.seq_length,
        num_attention_heads=config.num_attention_heads,
        num_hidden_layers=self.num_hidden_layers,
        intermediate_size=config.intermediate_size,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        use_one_hot_embeddings=use_one_hot_embeddings,
        initializer_range=config.initializer_range,
        hidden_dropout_prob=config.hidden_dropout_prob,
        use_relative_positions=config.use_relative_positions,
        hidden_act=config.hidden_act,
        compute_type=config.compute_type,
        return_all_encoders=True)

    self.cast = P.Cast()
    self.dtype = config.dtype
    self.cast_compute_type = SaturateCast(dst_type=config.compute_type)
    self.slice = P.StridedSlice()
    self.squeeze_1 = P.Squeeze(axis=1)
    self.dense = nn.Dense(self.hidden_size, self.hidden_size,
                          activation="tanh",
                          weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type)
    self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config)