def run(params):
    """Run training.

    Args:
        params: Parameters for training.

    Returns:
        None
    """
    logging.info(f'params: {params}')

    strategy = distribution_utils.get_distribution_strategy(
        params.get('tpu_address'))
    batch_size = distribution_utils.update_batch_size(strategy,
                                                      params['batch_size'])

    with strategy.scope():
        model = model_builder.build_model()
    input_image_size = model.input_shape[1]

    # Build datasets
    train_image_paths, train_scores = data_loader.read_csv(
        params['train_csv'], params['image_dir'], is_training=True)
    validation_image_paths, validation_scores = data_loader.read_csv(
        params['validation_csv'], params['image_dir'], is_training=False)

    train_dataset = data_loader.build_dataset(train_image_paths,
                                              train_scores,
                                              is_training=True,
                                              batch_size=batch_size,
                                              target_size=input_image_size)
    validation_dataset = data_loader.build_dataset(
        validation_image_paths,
        validation_scores,
        is_training=False,
        batch_size=batch_size,
        target_size=input_image_size)

    train_dataset = strategy.experimental_distribute_dataset(train_dataset)
    validation_dataset = strategy.experimental_distribute_dataset(
        validation_dataset)

    loss_fn = loss_builder.build_loss_fn(
        loss_name=params['loss'],
        trainable_variables=model.trainable_variables)

    train(model=model,
          loss_fn=loss_fn,
          strategy=strategy,
          epochs=params['epochs'],
          batch_size=batch_size,
          train_dataset=train_dataset,
          validation_dataset=validation_dataset,
          checkpoint_dir=params['checkpoint_dir'],
          log_dir=params['log_dir'])
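# A hypothetical invocation of run() for reference. Every path and value
# below is a placeholder inferred from the keys the function reads above,
# not taken from the original project.
run({
    'tpu_address': None,            # or a TPU address when training on TPU
    'batch_size': 32,
    'train_csv': 'data/train.csv',
    'validation_csv': 'data/validation.csv',
    'image_dir': 'data/images',
    'loss': 'mse',                  # whatever loss_builder.build_loss_fn accepts
    'epochs': 10,
    'checkpoint_dir': 'checkpoints',
    'log_dir': 'logs',
})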
def train(config):
    set_manual_seed(10)

    """ 1: Clean and tokenize the text, build the vocabulary """
    print("Preparing the batch data ... \n")
    corpus_x, corpus_y, vocab = build_dataset(config)

    """ 2: Compute class weights to mitigate class imbalance """
    class_weights = calcu_class_weights(corpus_y, config)
    config.class_weights = class_weights

    """ 3: Load pre-trained word embeddings """
    embed_matrix = load_embed_matrix(vocab, config)
    config.embed_matrix = embed_matrix

    """ 4: Split the dataset and build batch iterators """
    train_iter, valid_iter, test_iter = batch_generator(
        corpus_x, corpus_y, 0.15, config)

    """ 5: Initialize the model """
    print("Building the textcnn model ... \n")
    model = TextCNN(config)
    print(f'The model has {count_params(model):,} trainable parameters\n')
    model.to(config.device)

    """ 6: Start training the model """
    print("Start the training ... \n")
    init_network(model)
    train_model(config, model, train_iter, valid_iter, test_iter)
def train(config):
    set_manual_seed(10)

    """ 1: Split the dataset and save it """
    print("Preparing the batch data ... \n")
    build_dataset(config)

    """ 2: Compute class weights to mitigate class imbalance """
    class_weights = calcu_class_weights(config)
    config.class_weights = class_weights

    """ 3: Split the dataset and build batch iterators """
    train_iter, valid_iter, test_iter = batch_generator(config)

    """ 4: Initialize the model """
    print("Building the textcnn model ... \n")
    model = TextCNN(config)
    print(f'The model has {count_params(model):,} trainable parameters\n')
    model.to(config.device)

    """ 5: Start training the model """
    print("Start the training ... \n")
    init_network(model)
    train_model(config, model, train_iter, valid_iter, test_iter)
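# Both TextCNN trainers above read hyperparameters off a single `config`
# object. Below is a minimal sketch of such an object; only `device`,
# `class_weights`, and `embed_matrix` appear in the code above, the other
# fields are illustrative assumptions.
from dataclasses import dataclass, field
from typing import Optional

import torch


@dataclass
class Config:
    device: torch.device = field(default_factory=lambda: torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu'))
    batch_size: int = 64
    lr: float = 1e-3
    num_epochs: int = 10
    # Filled in by the training pipeline at run time:
    class_weights: Optional[torch.Tensor] = None
    embed_matrix: Optional[torch.Tensor] = None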
def update(self, gameid):
    # Remove prev_round for updating (so it's the history *at the prev round*)
    prev_round = game_histories[gameid].pop()
    prev_agent = load_agent(prev_round)
    prev_agent.update_model(prev_round['roundNum'], prev_round['cap'])

    # Add it back to the history for future rounds
    game_histories[gameid].append(prev_round)

    # Precompute history captions so we don't have to do it again on every step
    for reduced_cap in build_dataset(prev_round['cap'], prev_agent.dataset_type):
        orig_captions[prev_agent.gameid].append(
            (prev_round['target'], reduced_cap))
def update_model(self, round_num, caption):
    # Remove <start> and <end> if they're part of the caption
    if caption[:7] == '<start>':
        caption = caption[8:-6]

    # Don't update if the caption is empty
    if len(caption.split()) < 1:
        return

    combined_loss = CombinedLoss(self)
    data_loader = get_reduction_loader(
        self.raw_image, self.vocab, self.batch_size, caption,
        self.dataset_type, shuffle=True, num_workers=self.num_workers)

    # Define the optimizer over the decoder parameters
    params = list(self.decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=self.learning_rate)

    # Keep training until we hit the specified number of gradient steps
    steps = 0
    while True:
        for batch in data_loader:
            if steps == self.num_steps:
                break
            loss = combined_loss.compute(batch, steps)
            self.decoder.zero_grad()
            loss.backward()
            optimizer.step()
            steps += 1
        if steps == self.num_steps:
            break

    # After adaptation, add the current trial's data to 'memory' for future rounds
    self.history.append({'target': self.raw_image, 'cap': caption})

    # Precompute history captions so we don't have to do it again on every step
    for reduced_cap in build_dataset(caption, self.dataset_type):
        self.orig_captions.append((self.raw_image, reduced_cap))

    # Save the model checkpoint
    if self.checkpoint:
        ckpt_loc = 'decoder-{}.ckpt'.format(self.gameid)
        torch.save(self.decoder.state_dict(),
                   os.path.join(self.model_path, ckpt_loc))
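# Hypothetical usage sketch: adapt the decoder to one round's caption. The
# `agent` instance and caption text are placeholders; the <start>/<end>
# markers follow the convention stripped at the top of update_model().
agent.update_model(round_num=3, caption='<start> a dog on a couch <end>')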
import datetime
import math

import tensorflow as tf
from matplotlib import pyplot as plt

import config
# Assumed project-local imports for the model and data pipeline
# (module paths are placeholders):
import inception_v3
from prepare_data import build_dataset

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)

# Load model
model = inception_v3.InceptionV3(num_class=config.classes)
model.build(input_shape=(None, config.image_size, config.image_size,
                         config.image_channels))

# Load dataset
train_ds, train_len, test_ds, test_len, valid_ds, valid_len = build_dataset()

# Loss and optimizer
loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
optimizers = tf.keras.optimizers.Adam()

# Metrics
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='train_acc')
valid_loss = tf.keras.metrics.Mean(name='valid_loss')
valid_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='valid_acc')

# Checkpoint
checkpoints_dir = 'checkpoints/'
checkpoint = tf.train.Checkpoint(model=model, optimizers=optimizers)
checkpoint_manager = tf.train.CheckpointManager(checkpoint=checkpoint,
                                                directory=checkpoints_dir,
                                                max_to_keep=1)
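# A minimal training-step sketch for the setup above (not part of the
# original snippet): the standard tf.function/GradientTape pattern wiring
# together the loss object, optimizer, and metrics defined earlier.
@tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizers.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_acc(labels, predictions)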
import os

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Project-local helpers assumed to be importable from the surrounding repo:
# cli_def, build_network, build_dataset, init_weights, compute_loss_accuracy,
# MLRSNet, MetaSGD.


def meta_train():
    args = cli_def().parse_args()
    print(args)

    network = args.network
    dataset = args.dataset
    batch_size = args.batch_size
    lr = args.lr
    num_epoch = args.num_epoch

    if not os.path.isdir('result'):
        os.mkdir('result')
    save_path = './result/meta-train_' + network + '_' + dataset

    tr_loss = []
    t_loss = []
    tr_acc = []
    t_acc = []
    lr_save = []

    # We are using cuda for training - no point trying out on CPU for ResNet
    device = torch.device("cuda")

    if dataset == 'cifar10':
        num_classes = 10
    elif dataset == 'cifar100':
        num_classes = 100

    model = build_network(network, num_classes)
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.params()])))
    model.to(device).apply(init_weights)

    mlr_snet = MLRSNet(1, 50).to(device)
    print(mlr_snet)

    criterion = nn.CrossEntropyLoss().to(device)
    best_val_accuracy = 0.0
    num_meta = 1000

    train_data, meta_data, test_data = build_dataset(dataset, num_meta,
                                                     batch_size)
    print(len(train_data), len(meta_data), len(test_data))

    train_loss, train_acc = compute_loss_accuracy(model, train_data,
                                                  criterion, device)
    print('Initial training loss is %.3f' % train_loss)

    gamma = (train_loss**0.5 * np.log(train_loss * num_classes) /
             num_classes**0.25) / 4
    print('Gamma is %.3f' % gamma)

    optimizer_vnet = torch.optim.Adam(mlr_snet.params(), lr=lr,
                                      weight_decay=1e-4)
    # lr=1 is a placeholder: the effective lr is overwritten every step by
    # the meta-learned schedule below.
    optimizer = optim.SGD(model.params(), lr=1, momentum=args.momentum,
                          weight_decay=args.wd)

    meta_data_iter = iter(meta_data)
    for epoch in range(num_epoch):
        train_correct = 0
        train_loss = 0
        for i, (inputs, labels) in enumerate(train_data):
            model.train()
            mlr_snet.reset_lstm(keep_states=(epoch + i) > 0, device=device)
            inputs, labels = inputs.to(device), labels.to(device)

            # Every t_val steps, run a bi-level update of MLR-SNet on a copy
            # of the model.
            if (i + 1) % args.t_val == 0:
                meta_model = build_network(network, num_classes)
                meta_model.to(device)
                meta_model.load_state_dict(model.state_dict())
                meta_model.train()

                outputs = meta_model(inputs)
                loss = criterion(outputs, labels)
                loss = loss.unsqueeze(0)
                meta_model.zero_grad()
                grads = torch.autograd.grad(loss, (meta_model.params()),
                                            create_graph=True)
                lr_ = mlr_snet(loss.unsqueeze(0))

                optimizer_metamodel = MetaSGD(meta_model)
                optimizer_metamodel.load_state_dict(optimizer.state_dict())
                optimizer_metamodel.step(lr=lr_ * gamma, grad=grads)
                del grads

                try:
                    inputs_val, targets_val = next(meta_data_iter)
                except StopIteration:
                    meta_data_iter = iter(meta_data)
                    inputs_val, targets_val = next(meta_data_iter)
                inputs_val = inputs_val.to(device)
                targets_val = targets_val.to(device)

                y_g_hat = meta_model(inputs_val)
                l_g_meta = criterion(y_g_hat, targets_val.long())

                optimizer_vnet.zero_grad()
                l_g_meta.backward()
                optimizer_vnet.step()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Query MLR-SNet for this step's learning rate and install it.
            with torch.no_grad():
                new_lr = mlr_snet(loss.unsqueeze(0))
            new_lr = float(new_lr.data) * gamma
            lr_save.append(new_lr)
            for group in optimizer.param_groups:
                group['lr'] = new_lr

            train_loss += loss.item() * labels.size(0)
            train_pred = outputs.argmax(1)
            train_correct += train_pred.eq(labels).sum().item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_acc = 100.0 * (train_correct / len(train_data.dataset))
        val_loss, val_acc = compute_loss_accuracy(model, test_data,
                                                  criterion, device)

        tr_loss.append(train_loss / len(train_data.dataset))
        t_loss.append(val_loss)
        tr_acc.append(train_acc)
        t_acc.append(val_acc)

        torch.save(
            {
                'train_acc': tr_acc,
                'test_acc': t_acc,
                'train_loss': tr_loss,
                'test_loss': t_loss,
                'lr': lr_save
            }, save_path)

        print('train loss is : %.4f' % (train_loss / len(train_data.dataset)))
        print('test loss is: %.4f' % val_loss)

        if val_acc > best_val_accuracy:
            best_val_accuracy = val_acc
            torch.save(mlr_snet.state_dict(),
                       './result/mlr_snet %d.pth' % (epoch + 1))

        print('train_accuracy at epoch :{} is : {}'.format(epoch, train_acc))
        print('val_accuracy at epoch :{} is : {}'.format(epoch, val_acc))
        print('best val_accuracy is : {}'.format(best_val_accuracy))

        cur_lr = 0.0
        for param_group in optimizer.param_groups:
            cur_lr = param_group['lr']
        print('learning_rate after epoch :{} is : {}'.format(epoch, cur_lr))
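# A plausible sketch of the compute_loss_accuracy helper used above (the
# original implementation is not shown): average loss and percent accuracy
# over a standard classification loader.
import torch


def compute_loss_accuracy(model, loader, criterion, device):
    model.eval()
    total_loss, correct, count = 0.0, 0, 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            total_loss += criterion(outputs, labels).item() * labels.size(0)
            correct += outputs.argmax(1).eq(labels).sum().item()
            count += labels.size(0)
    return total_loss / count, 100.0 * correct / count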
def train():
    """ 1: Load the dataset and convert samples and labels to ids """
    if os.path.isfile(config.data_proc_file):
        with open(config.data_proc_file, "rb") as f:
            train_data, dev_data, test_data = pickle.load(f)
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
            emb_matrix = pickle.load(f)
        logger.info("%i / %i / %i sentences in train / dev / test." %
                    (len(train_data), len(dev_data), len(test_data)))
    else:
        (train_data, dev_data, test_data, char_to_id, tag_to_id, id_to_tag,
         emb_matrix) = build_dataset()

    """ 2: Generate batches of training data """
    train_manager = BatchManager(train_data, config.batch_size)
    dev_manager = BatchManager(dev_data, config.batch_size)
    test_manager = BatchManager(test_data, config.batch_size)

    model = NERLSTM_CRF(config, char_to_id, tag_to_id, emb_matrix)
    model.train()
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=config.lr,
                           weight_decay=config.weight_decay)

    """ 3: Use early stopping to prevent overfitting """
    total_batch = 0
    dev_best_f1 = float('-inf')
    last_improve = 0
    flag = False
    start_time = time.time()

    logger.info("Start training the model ...")
    for epoch in range(config.max_epoch):
        logger.info('Epoch [{}/{}]'.format(epoch + 1, config.max_epoch))
        for index, batch in enumerate(train_manager.iter_batch(shuffle=True)):
            optimizer.zero_grad()

            """ Compute the loss and backpropagate """
            _, char_ids, seg_ids, tag_ids, mask = batch
            loss = model.log_likelihood(char_ids, seg_ids, tag_ids, mask)
            loss.backward()

            """ Clip gradients to a maximum norm of 5 """
            nn.utils.clip_grad_norm_(parameters=model.parameters(),
                                     max_norm=config.clip)
            optimizer.step()

            if total_batch % config.steps_check == 0:
                model.eval()
                dev_f1, dev_loss = evaluate(model, dev_manager, id_to_tag)

                """ Use dev F1 as the early-stopping metric """
                if dev_f1 > dev_best_f1:
                    evaluate(model, test_manager, id_to_tag, test=True)
                    dev_best_f1 = dev_f1
                    torch.save(model,
                               os.path.join(config.save_dir, "medical_ner.ckpt"))
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {} | Dev Loss: {:.4f} | Dev F1-macro: {:.4f} | Time: {} | {}'
                logger.info(msg.format(total_batch, dev_loss, dev_f1,
                                       time_dif, improve))
                model.train()

            total_batch += 1
            if total_batch - last_improve > config.require_improve:
                """ Dev F1 has not risen for more than 5000 batches; stop training """
                logger.info("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
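# A minimal sketch of the get_time_dif helper used above (assumed; the
# original implementation is not shown): elapsed wall-clock time since
# start_time, rounded to whole seconds for log readability.
import time
from datetime import timedelta


def get_time_dif(start_time):
    end_time = time.time()
    return timedelta(seconds=int(round(end_time - start_time)))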