def train(print_result=True):
    """Run one data-parallel training step and record the loss.

    Args:
        print_result (bool): When truthy, append the step loss to the
            local history list (kept for debugging/inspection).
    """
    # 1. initialize parallel environment
    # (removed `train_data_list2`, which was never written or read)
    train_data_list1 = []
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 3. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)
    # mean-reduced MSE yields a single-element tensor
    assert len(loss) == 1

    if print_result:  # truthiness instead of `is True`
        train_data_list1.append(loss.numpy())
        assert len(train_data_list1)

    loss.backward()
    adam.step()
    adam.clear_grad()
def train():
    """Single data-parallel training step using the legacy dygraph API
    (manual loss scaling and collective gradient aggregation)."""
    # 1. enable dynamic mode
    paddle.disable_static()

    # 2. initialize parallel environment
    dist.init_parallel_env()

    # 3. build model, loss and optimizer on top of DataParallel
    net = LinearNet()
    parallel_net = paddle.DataParallel(net)
    criterion = nn.MSELoss()
    optimizer = opt.Adam(learning_rate=0.001,
                         parameters=parallel_net.parameters())

    # 4. forward / backward / update
    feature = paddle.randn([10, 10], 'float32')
    prediction = parallel_net(feature)
    target = paddle.randn([10, 1], 'float32')
    step_loss = criterion(prediction, target)

    # legacy manual gradient synchronization across trainers
    step_loss = parallel_net.scale_loss(step_loss)
    step_loss.backward()
    parallel_net.apply_collective_grads()

    optimizer.step()
    optimizer.clear_grad()
def __init__(self, reduction='mean', loss_weight=1.0):
    # L2 (MSE) loss wrapper with a configurable reduction and weight.
    # NOTE(review): returning early from __init__ leaves the instance
    # without `_l2_loss`/`loss_weight`/`reduction`; any later use of such
    # an object raises AttributeError. Raising ValueError would be safer —
    # confirm callers before changing.
    # when loss weight less than zero return None
    if loss_weight <= 0:
        return None
    self._l2_loss = nn.MSELoss(reduction)
    self.loss_weight = loss_weight
    self.reduction = reduction
def train(print_result=False):
    """Run one data-parallel training step; return this trainer's rank.

    Args:
        print_result (bool): When truthy, print the trainer rank.

    Returns:
        int: Trainer id taken from the PADDLE_TRAINER_ID env var
        (0 when the variable is not set).
    """
    # 1. initialize parallel environment
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 3. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    # Read the rank once; default to "0" so single-process runs (where the
    # launcher did not export PADDLE_TRAINER_ID) do not crash on int(None).
    rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    if print_result:
        print("Rank:", rank)

    loss.backward()
    adam.step()
    adam.clear_grad()

    return rank
def train(print_result=False):
    """Run one data-parallel training step in dynamic mode.

    Args:
        print_result (bool): When truthy, print the step loss.
    """
    # 1. enable dynamic mode
    paddle.disable_static()

    # 2. initialize parallel environment
    dist.init_parallel_env()

    # 3. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 4. run layer
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    labels = paddle.randn([10, 1], 'float32')
    loss = loss_fn(outputs, labels)

    if print_result:  # truthiness instead of `is True`
        print("loss:", loss.numpy())

    loss.backward()
    adam.step()
    adam.clear_grad()
def train(print_result=True):
    """Data-parallel training over a DataLoader of fake data.

    Args:
        print_result (bool): When truthy, print the loss of every batch.
    """
    # 1. initialize parallel environment (device placement is handled by
    #    the launcher, so no explicit set_device is needed here)
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer)

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())

    # 3. build the input pipeline
    dataset = FakeDataset()
    loader = paddle.io.DataLoader(dataset, batch_size=2, num_workers=2)

    # 4. run layer; DataParallel performs gradient all-reduce automatically,
    #    so no manual scale_loss/apply_collective_grads is required.
    #    (removed the commented-out legacy variants of these lines)
    for inputs, labels in loader:
        outputs = dp_layer(inputs)
        loss = loss_fn(outputs, labels)

        if print_result:  # truthiness instead of `is True`
            print("loss:", loss.numpy())

        loss.backward()
        adam.step()
        adam.clear_grad()
def __init__(self, gan_mode, target_real_label=1.0, target_fake_label=0.0):
    """Initialize the GANLoss class.

    Parameters:
        gan_mode (str) - - the type of GAN objective. It currently
            supports vanilla, lsgan, and wgangp.
        target_real_label (bool) - - label for a real image
        target_fake_label (bool) - - label of a fake image

    Note: Do not use sigmoid as the last layer of Discriminator.
    LSGAN needs no sigmoid. vanilla GANs will handle it with
    BCEWithLogitsLoss.
    """
    super(GANLoss, self).__init__()

    # Constant target tensors compared against discriminator outputs.
    self.real_label = paddle.fluid.dygraph.to_variable(
        np.array(target_real_label))
    self.fake_label = paddle.fluid.dygraph.to_variable(
        np.array(target_fake_label))

    self.gan_mode = gan_mode
    if gan_mode == 'wgangp':
        # WGAN-GP computes its objective directly; no criterion needed.
        self.loss = None
    elif gan_mode == 'lsgan':
        self.loss = nn.MSELoss()
    elif gan_mode == 'vanilla':
        self.loss = nn.BCELoss()
    else:
        raise NotImplementedError('gan mode %s not implemented' % gan_mode)
def __init__(self,
             gan_mode,
             target_real_label=1.0,
             target_fake_label=0.0,
             loss_weight=1.0):
    """ Initialize the GANLoss class.

    Args:
        gan_mode (str): the type of GAN objective. Supported values are
            vanilla, lsgan, wgan, wgangp, hinge and logistic.
        target_real_label (bool): label for a real image
        target_fake_label (bool): label of a fake image
        loss_weight (float): scale factor of the loss; must be positive.

    Note: Do not use sigmoid as the last layer of Discriminator.
    LSGAN needs no sigmoid. vanilla GANs will handle it with
    BCEWithLogitsLoss.
    """
    super(GANLoss, self).__init__()
    # NOTE(review): returning early from __init__ leaves the instance with
    # no attributes at all (gan_mode, loss, ...); later use raises
    # AttributeError. Consider raising ValueError instead — confirm callers.
    # when loss weight less than zero return None
    if loss_weight <= 0:
        return None
    self.target_real_label = target_real_label
    self.target_fake_label = target_fake_label
    self.loss_weight = loss_weight

    self.gan_mode = gan_mode
    if gan_mode == 'lsgan':
        self.loss = nn.MSELoss()
    elif gan_mode == 'vanilla':
        self.loss = nn.BCEWithLogitsLoss()
    elif gan_mode in ['wgan', 'wgangp', 'hinge', 'logistic']:
        # these objectives are computed directly, without a criterion
        self.loss = None
    else:
        raise NotImplementedError('gan mode %s not implemented' % gan_mode)
def __init__(self, mode="l2", **kargs):
    """Distance-loss selector.

    Args:
        mode (str): one of "l1", "l2" or "smooth_l1".
        **kargs: forwarded to the underlying paddle loss constructor.
    """
    super().__init__()
    assert mode in ["l1", "l2", "smooth_l1"]
    # map each mode to its loss class; the assert guarantees the key exists
    loss_classes = {
        "l1": nn.L1Loss,
        "l2": nn.MSELoss,
        "smooth_l1": nn.SmoothL1Loss,
    }
    self.loss_func = loss_classes[mode](**kargs)
def __init__(self, use_target_weight=True):
    """KeyPointMSELoss layer.

    Args:
        use_target_weight (bool): whether to use target weight
    """
    super(KeyPointMSELoss, self).__init__()
    self.use_target_weight = use_target_weight
    # mean-reduced MSE criterion
    self.criterion = nn.MSELoss(reduction='mean')
def __init__(self,
             use_target_weight=True,
             loss_scale=0.5,
             key=None,
             weight=1.0):
    """Keypoint MSE loss configuration.

    Args:
        use_target_weight (bool): whether to use target weight.
        loss_scale (float): multiplier applied to the raw loss.
        key: optional key used by the forward pass to pick its input.
        weight (float): overall weight of this loss term.
    """
    super().__init__()
    # mean-reduced MSE criterion; the remaining args are stored as config
    self.criterion = nn.MSELoss(reduction='mean')
    self.use_target_weight = use_target_weight
    self.loss_scale = loss_scale
    self.key = key
    self.weight = weight
def train():
    """Train two identically-initialized models side by side, logging the
    per-step losses and parameters from rank 0 for comparison."""
    dist.init_parallel_env()
    # 1. initialize parallel environment
    set_seed(2021)

    # 2. build both networks; wrap with DataParallel only in multi-card runs
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer) if dist.get_world_size() > 1 else layer

    layer2 = LinearNet()
    dp_layer2 = paddle.DataParallel(layer2) if dist.get_world_size() > 1 else layer2

    # start the second model from the first model's weights
    dp_layer2.set_state_dict(dp_layer.state_dict())

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())
    adam2 = opt.Adam(learning_rate=0.001, parameters=dp_layer2.parameters())

    # 3. run layer
    print("Start")
    for step in range(10):
        # every rank consumes its own contiguous shard of the batch
        batch_size = 10
        shard = int(batch_size / dist.get_world_size())
        start_no = shard * dist.get_rank()
        end_no = start_no + shard

        inputs = paddle.randn([10, 10], 'float32')[start_no:end_no]
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')[start_no:end_no]
        loss = loss_fn(outputs, labels)
        if dist.get_rank() == 0:
            print("Loss1", loss.numpy()[0])
            print(dp_layer.parameters())

        loss.backward()
        adam.step()
        adam.clear_grad()

        # second model trains on the exact same shard
        outputs = dp_layer2(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        if dist.get_rank() == 0:
            print("Loss2", loss.numpy()[0])
            print(dp_layer2.parameters())

        adam2.step()
        adam2.clear_grad()
def train():
    """Train two identically-initialized networks in lockstep and compare
    their per-step losses via check_data."""
    arr1 = []
    arr2 = []
    dist.init_parallel_env()
    set_seed(2021)

    # build both networks; wrap with DataParallel only in multi-card runs
    layer = LinearNet()
    dp_layer = paddle.DataParallel(layer) if dist.get_world_size() > 1 else layer

    layer2 = LinearNet()
    dp_layer2 = paddle.DataParallel(layer2) if dist.get_world_size() > 1 else layer2

    # start the second model from the first model's weights
    dp_layer2.set_state_dict(dp_layer.state_dict())

    loss_fn = nn.MSELoss()
    adam = opt.Adam(learning_rate=0.001, parameters=dp_layer.parameters())
    adam2 = opt.Adam(learning_rate=0.001, parameters=dp_layer2.parameters())

    for step in range(2):
        # every rank trains on its own contiguous slice of the batch
        batch_size = 10
        shard = int(batch_size / dist.get_world_size())
        start_no = shard * dist.get_rank()
        end_no = start_no + shard

        inputs = paddle.randn([10, 10], 'float32')[start_no:end_no]
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')[start_no:end_no]
        loss = loss_fn(outputs, labels)
        if dist.get_rank() == 0:
            arr1.append(loss.numpy()[0])
        loss.backward()
        adam.step()
        adam.clear_grad()

        # second model trains on the exact same shard
        outputs = dp_layer2(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        if dist.get_rank() == 0:
            arr2.append(loss.numpy()[0])
        adam2.step()
        adam2.clear_grad()

    check_data(arr1, arr2)
def pack_models(path):
    """Train a fresh model on UCIHousing and persist it as an artifact at
    the given path."""
    model = Model()
    criterion = nn.MSELoss()
    optimizer = paddle.optimizer.Adam(parameters=model.parameters())

    housing = paddle.text.datasets.UCIHousing(mode="train")
    loader = paddle.io.DataLoader(housing,
                                  batch_size=BATCH_SIZE,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=2)

    train(model, loader, criterion, optimizer)
    PaddlePaddleModelArtifact("model").pack(model).save(path)
def __init__(self, cfg):
    """Initialize the UGATITModel.

    Parameters:
        cfg (config) -- stores all the experiment flags; needs to be a
            subclass of Dict
    """
    super(UGATITModel, self).__init__(cfg)
    # define networks (both Generators and discriminators)
    # The naming is different from those used in the paper.
    self.nets['genA2B'] = build_generator(cfg.model.generator)
    self.nets['genB2A'] = build_generator(cfg.model.generator)
    init_weights(self.nets['genA2B'])
    init_weights(self.nets['genB2A'])

    # The original code tested `self.is_train` twice back to back; the two
    # blocks are merged into one.
    if self.is_train:
        # define discriminators: global (G) and local (L), one per direction
        self.nets['disGA'] = build_discriminator(cfg.model.discriminator_g)
        self.nets['disGB'] = build_discriminator(cfg.model.discriminator_g)
        self.nets['disLA'] = build_discriminator(cfg.model.discriminator_l)
        self.nets['disLB'] = build_discriminator(cfg.model.discriminator_l)
        init_weights(self.nets['disGA'])
        init_weights(self.nets['disGB'])
        init_weights(self.nets['disLA'])
        init_weights(self.nets['disLB'])

        # define loss functions
        self.BCE_loss = nn.BCEWithLogitsLoss()
        self.L1_loss = nn.L1Loss()
        self.MSE_loss = nn.MSELoss()

        # one optimizer for both generators, one for all discriminators
        self.build_lr_scheduler()
        self.optimizers['optimizer_G'] = build_optimizer(
            cfg.optimizer,
            self.lr_scheduler,
            parameter_list=self.nets['genA2B'].parameters() +
            self.nets['genB2A'].parameters())
        self.optimizers['optimizer_D'] = build_optimizer(
            cfg.optimizer,
            self.lr_scheduler,
            parameter_list=self.nets['disGA'].parameters() +
            self.nets['disGB'].parameters() +
            self.nets['disLA'].parameters() +
            self.nets['disLB'].parameters())
        self.Rho_clipper = RhoClipper(0, 1)
def validation_step(self, batch: list, batch_idx: int) -> dict:
    '''
    One step for validation, which should be called as forward computation.

    Args:
        batch(list[paddle.Tensor]): The one batch data, which contains images and labels.
        batch_idx(int): The index of batch.

    Returns:
        results(dict) : The model outputs, such as metrics.
    '''
    mse_loss = nn.MSELoss()
    N, C, H, W = batch[0].shape
    # use only the first label image of the batch as the style target
    batch[1] = batch[1][0].unsqueeze(0)
    self.setTarget(batch[1])

    y = self(batch[0])
    # numpy round-trip makes an independent copy of the content images
    xc = paddle.to_tensor(batch[0].numpy().copy())

    y = utils.subtract_imagenet_mean_batch(y)
    xc = utils.subtract_imagenet_mean_batch(xc)
    features_y = self.getFeature(y)
    features_xc = self.getFeature(xc)
    # content target: second feature map, frozen as a constant
    f_xc_c = paddle.to_tensor(features_xc[1].numpy(), stop_gradient=True)
    content_loss = mse_loss(features_y[1], f_xc_c)

    batch[1] = utils.subtract_imagenet_mean_batch(batch[1])
    features_style = self.getFeature(batch[1])
    gram_style = [utils.gram_matrix(y) for y in features_style]
    style_loss = 0.
    for m in range(len(features_y)):
        gram_y = utils.gram_matrix(features_y[m])
        # replicate the single style Gram matrix across the batch
        # NOTE(review): gram_s is 4-D after np.tile while gram_y comes from
        # gram_matrix — presumably shapes broadcast/match here; confirm.
        gram_s = paddle.to_tensor(
            np.tile(gram_style[m].numpy(), (N, 1, 1, 1)))
        style_loss += mse_loss(gram_y, gram_s[:N, :, :])
    loss = content_loss + style_loss

    return {
        'loss': loss,
        'metrics': {
            'content gap': content_loss,
            'style gap': style_loss
        }
    }
def test_dygraph_single(self):
    """Smoke-test fleet collective training in dygraph with one trainer."""
    paddle.disable_static()
    fleet.init(is_collective=True)

    layer = LinearNet()
    loss_fn = nn.MSELoss()
    adam = paddle.optimizer.Adam(learning_rate=0.001,
                                 parameters=layer.parameters())

    # wrap optimizer and model for distributed execution
    adam = fleet.distributed_optimizer(adam)
    dp_layer = fleet.distributed_model(layer)

    for _ in range(2):
        inputs = paddle.randn([10, 10], 'float32')
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')
        loss = loss_fn(outputs, labels)
        loss.backward()
        adam.step()
        adam.clear_grad()
def __init__(self,
             balance_loss=True,
             main_loss_type='DiceLoss',
             negative_ratio=3,
             return_origin=False,
             eps=1e-6,
             **kwargs):
    """The BalanceLoss for Differentiable Binarization text detection.

    args:
        balance_loss (bool): whether balance loss or not, default is True
        main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss',
            'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'.
        negative_ratio (int|float): float, default is 3.
        return_origin (bool): whether return unbalanced loss or not,
            default is False.
        eps (float): default is 1e-6.
    """
    super(BalanceLoss, self).__init__()
    self.balance_loss = balance_loss
    self.main_loss_type = main_loss_type
    self.negative_ratio = negative_ratio
    self.return_origin = return_origin
    self.eps = eps

    # factories keyed by type name; only the chosen one is instantiated
    factories = {
        "CrossEntropy": lambda: nn.CrossEntropyLoss(),
        "Euclidean": lambda: nn.MSELoss(),
        "DiceLoss": lambda: DiceLoss(self.eps),
        "BCELoss": lambda: BCELoss(reduction='none'),
        "MaskL1Loss": lambda: MaskL1Loss(self.eps),
    }
    if self.main_loss_type not in factories:
        loss_type = [
            'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss'
        ]
        raise Exception(
            "main_loss_type in BalanceLoss() can only be one of {}".format(
                loss_type))
    self.loss = factories[self.main_loss_type]()
def run_double_hook_in_model(data,
                             label,
                             hook=None,
                             register=False,
                             remove=False):
    # Builds SimpleNet, runs one forward/backward pass with an optional
    # tensor hook, and returns the resulting gradients.
    # NOTE(review): the `return` sits INSIDE the device loop, so only the
    # FIRST entry of self.devices is ever exercised — confirm whether the
    # loop is intentional or results should be collected per device.
    for device in self.devices:
        paddle.seed(self.seed)
        paddle.set_device(device)

        net = SimpleNet(self.in_size, self.out_size)
        loss_fn = nn.MSELoss()

        data = paddle.to_tensor(data)
        label = paddle.to_tensor(label)

        ret1, out = net(data, hook, register, remove)
        loss = loss_fn(out, label)
        loss.backward()

        # gradients of the hooked intermediate and linear1's parameters
        return (ret1.grad.numpy(), net.linear1.weight.grad.numpy(),
                net.linear1.bias.grad.numpy())
def train_paddle_model() -> "LinearModel":
    """Fit a LinearModel on UCIHousing for EPOCH_NUM epochs and return it."""
    set_random_seed(SEED)
    model = LinearModel()
    loss = nn.MSELoss()
    adam = paddle.optimizer.Adam(parameters=model.parameters())

    train_data = paddle.text.datasets.UCIHousing(mode="train")
    loader = paddle.io.DataLoader(
        train_data, batch_size=BATCH_SIZE, shuffle=True, drop_last=True,
        num_workers=2
    )

    model.train()
    for _ in range(EPOCH_NUM):
        # iterate the loader directly; the batch index is not needed
        for feature, label in loader():
            prediction = model(feature)
            step_loss = loss(prediction, label)
            step_loss.backward()
            adam.step()
            adam.clear_grad()
    return model
def __init__(self, conf, info_graph, soc_graph, user_feature, item_feautre):
    # DiffNet recommender: two GCN branches (information + social graphs)
    # over sparse id embeddings plus precomputed review features.
    # NOTE(review): `item_feautre` is a typo of `item_feature`; renaming
    # would break keyword callers, so it is only flagged here.
    super(DiffNet, self).__init__()
    self.conf = conf
    self.user_feature = paddle.to_tensor(user_feature)
    self.item_feature = paddle.to_tensor(item_feautre)

    # the user-item interactions form the infomation graph => info_graph
    self.infomation_gcn_layer = CustomGCNConv(self.conf['gnn_dim'],
                                              self.conf['gnn_dim'],
                                              info_graph)
    # the user-user relations form the social graph => soc_graph
    self.social_gcn_layer = CustomGCNConv(self.conf['gnn_dim'],
                                          self.conf['gnn_dim'], soc_graph)

    # sparse embedding tables for user and item ids
    self.user_embedding = nn.Embedding(self.conf['num_users'],
                                       self.conf['gnn_dim'],
                                       sparse=True)
    self.item_embedding = nn.Embedding(self.conf['num_items'],
                                       self.conf['gnn_dim'],
                                       sparse=True)
    # initialize user_embedding and item_embedding from \mathcal{N}(\mu, \sigma^2)
    # self.user_embedding.weight.set_value(0.1 * np.random.randn(self.conf['num_users'], self.conf['gnn_dim']))
    # self.item_embedding.weight.set_value(0.1 * np.randn(self.conf['num_items'], self.conf['gnn_dim']))

    # project review features down to the GNN embedding size
    self.reduce_dim_layer = nn.Linear(self.conf['review_feature_dim'],
                                      self.conf['gnn_dim'])
    self.mse_loss = nn.MSELoss()
def train():
    """Run a single data-parallel training step and sanity-check the loss."""
    # 1. initialize parallel environment
    dist.init_parallel_env()

    # 2. create data parallel layer & optimizer
    net = LinearNet()
    parallel_net = paddle.DataParallel(net)
    criterion = nn.MSELoss()
    optimizer = opt.Adam(learning_rate=0.001,
                         parameters=parallel_net.parameters())

    # 3. one forward / backward / update step
    feature = paddle.randn([10, 10], 'float32')
    prediction = parallel_net(feature)
    target = paddle.randn([10, 1], 'float32')
    step_loss = criterion(prediction, target)

    step_loss.backward()
    optimizer.step()
    optimizer.clear_grad()

    # mean-reduced MSE yields a single-element tensor
    assert len(step_loss) == 1
def train():
    """Data-parallel training step pinned to the CPU place."""
    # 1. initialize parallel environment (cpu & gpu)
    dist.init_parallel_env()

    # 2. set cpu place
    paddle.set_device('cpu')

    # 3. model, loss and optimizer
    net = LinearNet()
    parallel_net = paddle.DataParallel(net)
    criterion = nn.MSELoss()
    optimizer = opt.Adam(learning_rate=0.001,
                         parameters=parallel_net.parameters())

    # 4. forward / backward / update
    feature = paddle.randn([10, 10], 'float32')
    prediction = parallel_net(feature)
    target = paddle.randn([10, 1], 'float32')
    step_loss = criterion(prediction, target)

    step_loss.backward()
    optimizer.step()
    optimizer.clear_grad()
def __init__(self, gan_mode, target_real_label=1.0, target_fake_label=0.0):
    """Initialize the GANLoss class.

    Parameters:
        gan_mode (str) - - the type of GAN objective. It currently
            supports vanilla, lsgan, and wgangp.
        target_real_label (bool) - - label for a real image
        target_fake_label (bool) - - label of a fake image

    Note: Do not use sigmoid as the last layer of Discriminator.
    LSGAN needs no sigmoid. vanilla GANs will handle it with
    BCEWithLogitsLoss.
    """
    super(GANLoss, self).__init__()
    self.target_real_label = target_real_label
    self.target_fake_label = target_fake_label
    self.gan_mode = gan_mode

    if gan_mode == 'wgangp':
        # WGAN-GP computes its objective directly; no criterion needed.
        self.loss = None
    elif gan_mode == 'lsgan':
        self.loss = nn.MSELoss()
    elif gan_mode == 'vanilla':
        self.loss = BCEWithLogitsLoss()
    else:
        raise NotImplementedError('gan mode %s not implemented' % gan_mode)
# from collections import OrderedDict # net = nn.Sequential(OrderedDict([ # ('linear', nn.Linear(num_inputs, 1)) # ])) print(net) for param in net.parameters(): print(param) # 3.3.4 初始化模型参数 # 设置全局参数初始化 fluid.set_global_initializer(initializer.Uniform(), initializer.Constant()) # 3.3.5 定义损失函数 loss = nn.MSELoss() # 3.3.6 定义优化算法 optimizer = optim.SGD(learning_rate=0.03, parameters=net.parameters()) print(optimizer) # 设置不同自网络的学习率(待修改) # optimizer = optim.SGD([ # {'params': net._sub_layers1.paramaters()}, # {'params': net._sub_layers2.paramaters(), 'lr': 0.01} # ], learning_rate=0.03) # for param_group in optimizer.param_groups: # param_group['lr'] *= 0.1 # 3.3.7 训练模型
def main(args):
    """
    Model training for one epoch and return the average loss and model
    evaluating to monitor pcc.
    """
    paddle.set_device('gpu:{}'.format(args.device)
                      if args.use_cuda else 'cpu')
    logging.info('Load data ...')
    # dataset[1] is used as the training split, dataset[0] as the test split
    dataset = InMemoryDataset(npz_data_path=args.data_path)
    train_ds = Dataset(dataset[1])
    test_ds = Dataset(dataset[0])
    train_loader = train_ds.get_data_loader(batch_size=args.batch_size,
                                            collate_fn=collate_fn)
    test_loader = test_ds.get_data_loader(batch_size=args.batch_size,
                                          collate_fn=collate_fn)
    logging.info("Data loaded.")

    model = CDRModel(args)
    optim = Adam(learning_rate=args.lr, parameters=model.parameters())
    criterion = nn.MSELoss()

    global_step = 0
    best_pcc = 0.0
    os.makedirs(args.output_path, exist_ok=True)
    best_model = os.path.join(args.output_path, 'best_model.pdparams')

    for epoch in range(1, args.epoch_num + 1):
        model.train()
        for idx, batch_data in enumerate(train_loader):
            graphs, mut, gexpr, met, label = batch_data
            g = pgl.Graph.batch(graphs).tensor()
            mut = paddle.to_tensor(mut)
            gexpr = paddle.to_tensor(gexpr)
            met = paddle.to_tensor(met)
            label = paddle.to_tensor(label)

            pred = model([g, mut, gexpr, met])
            # RMSE: square root of the MSE; [0] unwraps the 1-element tensor
            train_loss = paddle.pow(criterion(pred[:, 0], label)[0], 0.5)
            train_loss.backward()
            # Pearson correlation between predictions and labels (monitor)
            train_pcc = pearsonr(pred[:, 0].numpy(), label.numpy())[0]
            optim.step()
            optim.clear_grad()
            global_step += 1

            if global_step % 500 == 0:
                message = "train: epoch %d | step %d | " % (epoch,
                                                            global_step)
                message += "loss %.6f | pcc %.4f" % (train_loss, train_pcc)
                log.info(message)

        # evaluate once per epoch; keep the checkpoint with the best pcc
        result = evaluate(model, test_loader, criterion)
        message = "eval: epoch %d | step %d " % (epoch, global_step)
        for key, value in result.items():
            message += "| %s %.6f" % (key, value)
        log.info(message)

        if best_pcc < result['pcc']:
            best_pcc = result['pcc']
            paddle.save(model.state_dict(), best_model)

    log.info("best evaluating accuracy: %.6f" % best_pcc)
if __name__ == "__main__":
    # Tiny in-memory dataset: weight/height features with a binary label.
    trainset = pd.DataFrame({
        'weight': [133., 160, 152, 120],
        'height': [65., 72, 70, 60],
        'label': [0, 1, 1, 0]
    })
    trainset = GetDataset(trainset)
    # (removed leftover debug calls `trainset.__getitem__(0)` and `exit()`,
    # which made everything below unreachable)
    train_loader = DataLoader(trainset, batch_size=4)

    lr = 0.5
    epochs = 2000
    loss_fn = nn.MSELoss()
    model = ConNet()
    model.train()  # switch to training mode
    optimizer = paddle.optimizer.SGD(parameters=model.parameters(),
                                     learning_rate=lr)  # optimizer

    for epoch in range(epochs):
        for i, data in enumerate(train_loader, 0):
            X, y = data
            y_pred = model(X)
            loss = loss_fn(y_pred, y)
            # Backpropagate and update — the original loop computed the
            # loss but never called backward()/step(), so the model never
            # learned anything.
            loss.backward()
            optimizer.step()
            optimizer.clear_grad()
            if epoch % 100 == 99:
                print("epoch: %d/%d - loss is: %.6f" %
                      (epoch + 1, epochs, float(loss)))
def do_train(agrs):
    # Distill a BiLSTM student from a BERT teacher on the configured task.
    # NOTE(review): the parameter is misspelled `agrs` and never used — the
    # body reads the module-level `args` instead. Renaming it to `args`
    # would be the real fix but could break keyword callers; flagged only.
    train_data_loader, dev_data_loader = create_distill_loader(
        args.task_name,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        n_iter=args.n_iter)

    # optional pretrained word embeddings for the student
    emb_tensor = load_embedding(
        args.vocab_path) if args.use_pretrained_emb else None

    model = BiLSTM(args.emb_dim, args.hidden_size, args.vocab_size,
                   args.output_dim, args.padding_idx, args.num_layers,
                   args.dropout_prob, args.init_scale, emb_tensor)

    if args.optimizer == 'adadelta':
        optimizer = paddle.optimizer.Adadelta(learning_rate=args.lr,
                                              rho=0.95,
                                              parameters=model.parameters())
    else:
        optimizer = paddle.optimizer.Adam(learning_rate=args.lr,
                                          parameters=model.parameters())

    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    klloss = nn.KLDivLoss()

    metric_class = TASK_CLASSES[args.task_name][1]
    metric = metric_class()

    teacher = TeacherModel(model_name=args.model_name,
                           param_path=args.teacher_path)

    print("Start to distill student model.")

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.max_epoch):
        model.train()
        for i, batch in enumerate(train_data_loader):
            # QQP is a sentence-pair task; other tasks are single-sentence
            if args.task_name == 'qqp':
                bert_input_ids, bert_segment_ids, student_input_ids_1, seq_len_1, student_input_ids_2, seq_len_2, labels = batch
            else:
                bert_input_ids, bert_segment_ids, student_input_ids, seq_len, labels = batch

            # Calculate teacher model's forward.
            with paddle.no_grad():
                teacher_logits = teacher.model(bert_input_ids,
                                               bert_segment_ids)

            # Calculate student model's forward.
            if args.task_name == 'qqp':
                logits = model(student_input_ids_1, seq_len_1,
                               student_input_ids_2, seq_len_2)
            else:
                logits = model(student_input_ids, seq_len)

            # distillation loss: CE on gold labels blended with MSE
            # against the teacher's logits
            loss = args.alpha * ce_loss(logits, labels) + (
                1 - args.alpha) * mse_loss(logits, teacher_logits)

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            if i % args.log_freq == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.4f step/s"
                    % (global_step, epoch, i, loss,
                       args.log_freq / (time.time() - tic_train)))
                tic_eval = time.time()
                acc = evaluate(args.task_name, model, metric,
                               dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                tic_train = time.time()
            global_step += 1
def __init__(self):
    """Landmark regression loss; squared errors are kept unreduced."""
    super(LandmarkLoss, self).__init__(name_scope='LandmarkLoss')
    # reduction='none' keeps per-element losses available downstream
    self.square_loss = nn.MSELoss(reduction='none')
    self.keep_ratio = 1.0
def do_train(agrs):
    # Distill a BiLSTM student from a BERT teacher, with checkpoint
    # resume/save support.
    # NOTE(review): the parameter is misspelled `agrs` and never used — the
    # body reads the module-level `args` instead. Renaming it to `args`
    # would be the real fix but could break keyword callers; flagged only.
    device = paddle.set_device(args.device)
    train_data_loader, dev_data_loader = create_distill_loader(
        args.task_name,
        model_name=args.model_name,
        vocab_path=args.vocab_path,
        batch_size=args.batch_size,
        max_seq_length=args.max_seq_length,
        n_iter=args.n_iter,
        whole_word_mask=args.whole_word_mask,
        seed=args.seed)

    model = BiLSTM(args.emb_dim, args.hidden_size, args.vocab_size,
                   args.output_dim, args.vocab_path, args.padding_idx,
                   args.num_layers, args.dropout_prob, args.init_scale,
                   args.embedding_name)

    if args.optimizer == 'adadelta':
        optimizer = paddle.optimizer.Adadelta(learning_rate=args.lr,
                                              rho=0.95,
                                              parameters=model.parameters())
    else:
        optimizer = paddle.optimizer.Adam(learning_rate=args.lr,
                                          parameters=model.parameters())

    ce_loss = nn.CrossEntropyLoss()
    mse_loss = nn.MSELoss()
    klloss = nn.KLDivLoss()

    metric_class = TASK_CLASSES[args.task_name][1]
    metric = metric_class()

    teacher = TeacherModel(model_name=args.model_name,
                           param_path=args.teacher_path)

    print("Start to distill student model.")

    # optionally resume model + optimizer state from a checkpoint prefix
    if args.init_from_ckpt:
        model.set_state_dict(paddle.load(args.init_from_ckpt + ".pdparams"))
        optimizer.set_state_dict(paddle.load(args.init_from_ckpt + ".pdopt"))
        print("Loaded checkpoint from %s" % args.init_from_ckpt)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.max_epoch):
        model.train()
        for i, batch in enumerate(train_data_loader):
            global_step += 1
            # QQP is a sentence-pair task; other tasks are single-sentence
            if args.task_name == 'qqp':
                bert_input_ids, bert_segment_ids, student_input_ids_1, seq_len_1, student_input_ids_2, seq_len_2, labels = batch
            else:
                bert_input_ids, bert_segment_ids, student_input_ids, seq_len, labels = batch

            # Calculate teacher model's forward.
            with paddle.no_grad():
                teacher_logits = teacher.model(bert_input_ids,
                                               bert_segment_ids)

            # Calculate student model's forward.
            if args.task_name == 'qqp':
                logits = model(student_input_ids_1, seq_len_1,
                               student_input_ids_2, seq_len_2)
            else:
                logits = model(student_input_ids, seq_len)

            # distillation loss: CE on gold labels blended with MSE
            # against the teacher's logits
            loss = args.alpha * ce_loss(logits, labels) + (
                1 - args.alpha) * mse_loss(logits, teacher_logits)

            loss.backward()
            optimizer.step()
            optimizer.clear_grad()

            if global_step % args.log_freq == 0:
                print(
                    "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.4f step/s"
                    % (global_step, epoch, i, loss,
                       args.log_freq / (time.time() - tic_train)))
                tic_eval = time.time()
                acc = evaluate(args.task_name, model, metric,
                               dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                tic_train = time.time()
            if global_step % args.save_steps == 0:
                paddle.save(
                    model.state_dict(),
                    os.path.join(args.output_dir,
                                 "step_" + str(global_step) + ".pdparams"))
                paddle.save(
                    optimizer.state_dict(),
                    os.path.join(args.output_dir,
                                 "step_" + str(global_step) + ".pdopt"))