def main():
    # Load GloVe embeddings
    util.print_info('Loading GloVe embeddings.')
    w2v_util.load_embedding_dict(model_path=PARAM_GLOVE_PATH, normalize_digits=True)

    # Special symbol dictionary
    spec_w2vec = w2v_util.create_spec_w2vec(n_dim=w2v_util.embedd_dim)

    # Add the special dictionary to the GloVe embeddings
    w2v_util.embedd_dict.update(spec_w2vec)

    # Load dataset
    util.print_info('Loading dataset.')
    dataset = data_loader.load_dataset(
        dir_conll=PARAM_TRAIN_CONLL,
        is_sentence_lev=PARAM_SENTENCE_LEVEL
    )

    # Conduct training
    train(
        documents=dataset['documents'],
        tag2int=dataset['tag2int']
    )
def main():
    # Load GloVe embeddings
    util.print_info('Loading GloVe embeddings.')
    w2v_util.load_embedding_dict(model_path=PARAM_GLOVE_PATH, normalize_digits=True)

    # Special symbol dictionary
    spec_w2vec = w2v_util.create_spec_w2vec(n_dim=w2v_util.embedd_dim)

    # Add the special dictionary to the GloVe embeddings
    w2v_util.embedd_dict.update(spec_w2vec)

    gpu = -1

    # Read back the hyperparameters logged during training and rebuild the model
    param2value = pd.read_csv(filepath_or_buffer=PARAM_FILE,
                              sep=',',
                              header=None,
                              index_col=0).to_dict()[1]
    model = load_model(model_file=MODEL_FILE,
                       n_units=int(param2value['units']),
                       in_size=int(param2value['in_size']),
                       out_size_bio=int(param2value['out_size_bio']),
                       out_size_tag=int(param2value['out_size_tag']),
                       blstm_stack=int(param2value['BiLSTM stack']),
                       lossfun=param2value['loss_function'],
                       gpu=gpu)

    global test_documents
    test_conll_documents = [
        DataClass.ConllDocument(
            sentence_id='0',
            token_list=tokens,
            bio_list=['O' for _ in tokens],                        # Dummy
            tag_list=[DataClass.ConllTag.NONE for _ in tokens]     # Dummy
        )
        for tokens in test_documents
    ]

    result = test(model=model,
                  documents=test_conll_documents,
                  tag2int={
                      DataClass.ConllTag.Fact: 5,
                      DataClass.ConllTag.Testimony: 4,
                      DataClass.ConllTag.Policy: 3,
                      DataClass.ConllTag.Rhetorical: 2,
                      DataClass.ConllTag.Value: 1,
                      DataClass.ConllTag.NONE: 0
                  },
                  batch_size=16,
                  gpu=gpu)

    # Print each document with predicted argument components bracketed as "[ tokens ](Tag)"
    for bio, tag, tokens in zip(result['bio_list'], result['tag_list'], test_documents):
        bio, tag = correct_bios_tags(bios=bio, tags=tag)
        stock_tokens = []
        is_in_boundary = False
        for i, (b, t, token) in enumerate(zip(bio, tag, tokens)):
            if b == 'B':
                stock_tokens.append('[')
                is_in_boundary = True
            if b == 'O' and is_in_boundary:
                stock_tokens.append(']({})'.format(
                    str(tag[i - 1]).split('.')[-1]))
                is_in_boundary = False
            stock_tokens.append(token)
        print(' '.join(stock_tokens))
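# Illustrative sketch (not part of the original scripts): how the span-bracketing loop
# above turns BIO labels plus component tags into bracketed text. The toy tokens and
# labels below are made up; in the real pipeline they come from test() and
# correct_bios_tags(), and the loop assumes every span is closed by a following 'O'
# before the sequence ends.
def _demo_bracketing():
    tokens = ['We', 'should', 'ban', 'smoking', 'because', 'it', 'harms', 'others', '.']
    bio = ['O', 'O', 'B', 'I', 'O', 'B', 'I', 'I', 'O']
    tag = ['NONE', 'NONE', 'Policy', 'Policy', 'NONE', 'Value', 'Value', 'Value', 'NONE']

    stock_tokens = []
    is_in_boundary = False
    for i, (b, token) in enumerate(zip(bio, tokens)):
        if b == 'B':                      # a new argument component starts here
            stock_tokens.append('[')
            is_in_boundary = True
        if b == 'O' and is_in_boundary:   # the component ended on the previous token
            stock_tokens.append(']({})'.format(tag[i - 1]))
            is_in_boundary = False
        stock_tokens.append(token)
    # -> "We should [ ban smoking ](Policy) because [ it harms others ](Value) ."
    return ' '.join(stock_tokens)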
def train(documents: List[DataClass.ConllDocument], tag2int):
    PARAM_LOGGER = io_util.get_logger(PARAM_LOGDIR + '/' + PARAM_LOG, clear=True)
    TEST_LOGGER = io_util.get_logger(PARAM_LOGDIR + '/' + TEST_LOG, clear=True)
    EVAL_LOGGER = io_util.get_logger(PARAM_LOGDIR + '/' + EVAL_LOG, clear=True)

    util.print_info('Using GPU No.: {}'.format(PARAM_GPU))
    util.print_info('log_dir: {}'.format(PARAM_LOGDIR))

    # Log hyperparameters as 'key,value' rows
    PARAM_LOGGER('batch_size,{}'.format(PARAM_BATCH_SIZE))
    PARAM_LOGGER('units,{}'.format(PARAM_N_UNITS))
    PARAM_LOGGER('dropout,{}'.format(PARAM_DROPOUT))
    PARAM_LOGGER('gpu,{}'.format(PARAM_GPU))
    PARAM_LOGGER('BiLSTM stack,{}'.format(PARAM_BLSTM))
    PARAM_LOGGER('loss_function,{}'.format(PARAM_LOSSFUN))
    PARAM_LOGGER('epoch,{}'.format(PARAM_EPOCH))
    PARAM_LOGGER('max_length,{}'.format(PARAM_MAX_LENGTH))
    PARAM_LOGGER('self_train,{}'.format(PARAM_SELF_TRAIN))
    PARAM_LOGGER('only_bio,{}'.format(PARAM_ONLY_BIO))
    PARAM_LOGGER('only_tag,{}'.format(PARAM_ONLY_TAG))
    PARAM_LOGGER('use_option_features,{}'.format(PARAM_USE_OPTION_FEATURES))

    # Split documents into train / dev / test
    data_dict = util.get_conll_documents_train_test_validation(
        documents=documents,
        tag2int=tag2int,
        max_length=PARAM_MAX_LENGTH,
        use_option_features=PARAM_USE_OPTION_FEATURES,
        train_rate=1 if PARAM_SELF_TRAIN else .8,
        dev_rate=PARAM_DEV_RATE,
        rnd_state=None
    )
    train_data = data_dict['train']
    test_data = data_dict['test']
    valid_data = data_dict['dev']

    in_size = len(train_data[0]['source'][0])
    PARAM_LOGGER('in_size,%d' % in_size)
    PARAM_LOGGER('train_data,%d' % len(train_data))
    PARAM_LOGGER('valid_data,%d' % len(valid_data))
    PARAM_LOGGER('test_data,%d' % len(test_data))
    PARAM_LOGGER('out_size_bio,%d' % 3)
    PARAM_LOGGER('out_size_tag,%d' % len(tag2int))

    model = create_model(in_size=in_size, out_size_bio=3, out_size_tag=len(tag2int))
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)

    train_iter = chainer.iterators.SerialIterator(train_data, PARAM_BATCH_SIZE)
    updater = BLCUpdater(
        train_iterator=train_iter,
        model=model,
        optimizer=optimizer,
        device=PARAM_GPU
    )
    trainer = training.Trainer(updater, (PARAM_EPOCH, 'epoch'), out=PARAM_LOGDIR)
    trainer.extend(extensions.LogReport(trigger=(5, 'iteration'), log_name=TRAIN_LOG))
    trainer.extend(extensions.PrintReport(
        ['epoch', 'iteration', 'main/loss', 'elapsed_time']),
        trigger=(5, 'iteration'))
    trainer.extend(extensions.ProgressBar(update_interval=1))

    sorted_tag2int = sorted(tag2int.items(), key=lambda x: x[1])

    def record_scores(trainer):
        # Evaluate on the held-out test split and append one CSV row to the eval log
        eval_dict = evaluate(
            model=model,
            data=test_data,
            batch_size=128,
            tag2int=tag2int,
            device=updater.device
        )
        ps_bio, rs_bio, fs_bio, fscore_dict_bio = eval_dict['bio']
        ps_tag, rs_tag, fs_tag, fscore_dict_tag = eval_dict['tag']

        def round(np_scalar, size=3):
            return np.round(np_scalar, size)

        csv_out = [
            trainer.updater.epoch,
            trainer.updater.iteration,
            round(ps_bio[0], size=4), round(rs_bio[0], size=4), round(fs_bio[0], size=4),
            round(ps_bio[1], size=4), round(rs_bio[1], size=4), round(fs_bio[1], size=4),
            round(ps_bio[2], size=4), round(rs_bio[2], size=4), round(fs_bio[2], size=4),
            round(fscore_dict_bio['micro'], size=4),
            round(fscore_dict_bio['macro'], size=4),
        ]
        for t, i in sorted_tag2int:
            csv_out.append(round(ps_tag[i], size=4))
            csv_out.append(round(rs_tag[i], size=4))
            csv_out.append(round(fs_tag[i], size=4))
        csv_out += [
            round(fscore_dict_tag['micro'], size=4),
            round(fscore_dict_tag['macro'], size=4),
        ]
        formstr = ''.join('{},' * len(csv_out))[:-1]
        EVAL_LOGGER(formstr.format(*csv_out))

    # Write the CSV header of the evaluation log
    EVAL_LOGGER(
        'epoch,' +
        'iter,' +
        'B-prec,B-rec,B-f1,' +
        'I-prec,I-rec,I-f1,' +
        'O-prec,O-rec,O-f1,' +
        'BIO-micro-f1,' +
        'BIO-macro-f1,' +
        ''.join(['{0}-prec,{0}-rec,{0}-f1,'.format(t) for t, i in sorted_tag2int]) +
        'tag-micro-f1,' +
        'tag-macro-f1'
    )

    valid_saver = SaveRestore(filename=None)
    current_validation_f1 = -float('inf')

    # Once per epoch: track the best validation macro-F1; snapshot the model when
    # self-training, otherwise record test scores
    @chainer.training.make_extension(trigger=(1, 'epoch'))
    def validation(trainer):
        nonlocal valid_saver
        nonlocal current_validation_f1
        eval_dict = evaluate(
            model=model,
            data=valid_data,
            batch_size=64,
            tag2int=tag2int,
            device=updater.device
        )
        if PARAM_ONLY_TAG:
            ps_tag, rs_tag, fs_tag, fscore_dict_tag = eval_dict['tag']
            new_validation_f1 = fscore_dict_tag['macro']
        else:
            ps_bio, rs_bio, fs_bio, fscore_dict_bio = eval_dict['bio']
            new_validation_f1 = fscore_dict_bio['macro']

        if new_validation_f1 > current_validation_f1:
            current_validation_f1 = new_validation_f1
            if PARAM_SELF_TRAIN:
                valid_saver(trainer=trainer)
                chainer.serializers.save_npz(PARAM_LOGDIR + '/best_model', model)
                util.print_info('Saved best trainer, model/ epoch: {}, iter:{}'.format(
                    trainer.updater.epoch,
                    trainer.updater.iteration,
                ))
            else:
                record_scores(trainer=trainer)
                util.print_info('Recorded scores/ epoch: {}, iter:{}'.format(
                    trainer.updater.epoch,
                    trainer.updater.iteration,
                ))

    trainer.extend(validation)

    # Every 50 iterations: log gold vs. predicted BIO/tag sequences for a few random test items
    @chainer.training.make_extension(trigger=(50, 'iteration'))
    def test(trainer):
        if PARAM_SELF_TRAIN:
            return
        TEST_LOGGER('-- iter: {0} --'.format(trainer.updater.iteration))
        for _ in range(3):
            test_trg = random.choice(test_data)
            true_bios = test_trg['bio']
            true_tags = test_trg['tag']
            converted_data = batch_convert([test_trg], updater.device)
            predict_bios, predict_tags = model.test(
                source=converted_data['source'],
            )
            predict_bios = predict_bios[0]
            predict_tags = predict_tags[0]
            assert len(true_bios) == len(predict_bios) == len(true_tags) == len(predict_tags)
            TEST_LOGGER('bio y: {0}'.format(true_bios))
            TEST_LOGGER('bio ȳ: {0}'.format(predict_bios))
            TEST_LOGGER('tag y: {0}'.format(true_tags))
            TEST_LOGGER('tag ȳ: {0}'.format(predict_tags))
            TEST_LOGGER('')

    trainer.extend(test)

    trainer.run()

    PARAM_LOGGER('train_finished,{}'.format(True))
    return
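# Illustrative sketch (assumption, not in the original file): reloading the 'best_model'
# snapshot written by the validation extension above. save_npz only stores parameters,
# not the architecture, so this assumes create_model() is called with the same in_size
# and tag2int that were used during training.
def load_best_model(logdir, in_size, tag2int):
    model = create_model(in_size=in_size, out_size_bio=3, out_size_tag=len(tag2int))
    chainer.serializers.load_npz(logdir + '/best_model', model)
    return model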
def run(self):
    torch.cuda.empty_cache()
    starttime = time.time()

    if self.gpu_id is not None:
        # cudnn.benchmark improves training speed when input sizes do not change
        # https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936
        # It selects the best algorithms as the training iterates over the dataset
        # I found no big difference between True and False, but it also doesn't hurt, so enable it
        # cudnn.benchmark = True  # disabled for deterministic behavior
        pass

    config = self.config
    config_id = config["config_id"]
    n_lm = config["n_lm"]

    make_deterministic(config['random_seed'])
    torch.autograd.set_detect_anomaly(True)  # This makes debugging much easier

    jitterTransform = transforms.ColorJitter(brightness=0.4,
                                             contrast=0.4,
                                             saturation=0.4,
                                             hue=0.1)

    # TODO store these values in h5 files
    normMean, normStd = FaceLandmarksTrainingData.TRAIN_MEAN, FaceLandmarksTrainingData.TRAIN_STD
    normTransform = transforms.Normalize(normMean, normStd)

    rot_angle = float(config['augment_rotation'])
    rotation_augmentation = RandomRotation(min_angle=-1 * rot_angle,
                                           max_angle=rot_angle,
                                           retain_scale=False,
                                           rotate_landmarks="same")

    trainTransform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(jitterTransform),
        ImageAndLabelTransform(RandomHorizontalFlip()),
        ImageAndLabelTransform(rotation_augmentation),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normTransform)
    ])

    testTransform = transforms.Compose([
        ImageTransform(transforms.ToPILImage()),
        ImageTransform(transforms.ToTensor()),
        ImageTransform(normTransform)
    ])

    # Note: Reading takes only ~0.2s, so it is okay to do this again whenever main.py is called
    # No need to read in trainer.py and pass results here
    with h5py.File(self.data, 'r') as f:
        train_dataset = FaceLandmarksTrainingData(f, transform=trainTransform, n_lm=n_lm)
        val_dataset = FaceLandmarksAllTestData(f, transform=testTransform, n_lm=n_lm)
        easy_d = FaceLandmarksEasyTestData(f, transform=testTransform, n_lm=n_lm)
        hard_d = FaceLandmarksHardTestData(f, transform=testTransform, n_lm=n_lm)

    print("GPU %d.%d" % (self.gpu_id, self.sub_gpu_id),
          "Data: %s" % self.data,
          "Train %d Test %d" % (len(train_dataset), len(val_dataset)))

    dataloader_params = {
        'batch_size': config['batch_size'],
        'pin_memory': self.gpu_id is not None,
        'num_workers': 8
    }

    train_loader = DataLoader(train_dataset, shuffle=True, **dataloader_params)
    val_loader = DataLoader(val_dataset, shuffle=False, **dataloader_params)
    easy = DataLoader(easy_d, shuffle=False, **dataloader_params)
    hard = DataLoader(hard_d, shuffle=False, **dataloader_params)

    net = self.create_net(config)
    _, trainable_parameters, _ = count_parameters(net)
    self.to_gpu(net)
    net.train()  # Put net into train mode

    params = [
        {"params": net.hourglass.parameters()},
        {"params": net.regressor.parameters()},
    ]

    if config["predict_distances_weight"] > 0:
        # generate ground truth distances: dist_gt[b, i, j] is the signed (x, y) offset
        # from landmark i to landmark j
        y = torch.stack([x["landmarks"] for x in train_dataset])
        bs = y.shape[0]
        n_lm = y.shape[1]
        dist_gt = torch.zeros(bs, n_lm, n_lm, 2)
        dist_gt[:, :, :, 0] = y[:, :, 0].view(bs, 1, -1) - y[:, :, 0].view(bs, -1, 1)
        dist_gt[:, :, :, 1] = y[:, :, 1].view(bs, 1, -1) - y[:, :, 1].view(bs, -1, 1)

    optimizer = optim.Adam(params, lr=config['lr'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        'min',
        patience=config['lr_scheduler_patience'],
        verbose=True,
        factor=config['lr_decay_factor'])

    early_stopping_patience = config['lr_scheduler_patience'] * 2 + 1
    early_stopping_max_ratio = 0.975
    should_stop = EarlyStopping(patience=early_stopping_patience,
                                max_ratio=early_stopping_max_ratio,
                                verbose=False)

    loss_function = self.get_loss_function(config['regression'], config['loss_function'])

    # Benchmark categories: easy/hard test sets with 49 or 68 landmarks, plus their averages
    category_calculator = {
        "e49": lambda metrics: metrics["e49"],
        "h49": lambda metrics: metrics["h49"],
        "e68": lambda metrics: metrics["e68"],
        "h68": lambda metrics: metrics["h68"],
        "49": lambda metrics: (metrics["e49"] + metrics["h49"]) / 2,
        "68": lambda metrics: (metrics["e68"] + metrics["h68"]) / 2,
        "e": lambda metrics: (metrics["e49"] + metrics["e68"]) / 2,
        "h": lambda metrics: (metrics["h49"] + metrics["h68"]) / 2,
        "all": lambda metrics: (metrics["e49"] + metrics["h49"] + metrics["e68"] + metrics["h68"]) / 4
    }
    categories = category_calculator.keys()
    best_epoch = {k: 0 for k in categories}
    lowest_error = {k: np.Inf for k in categories}

    epoch_train_losses = []
    epoch_val_losses = []

    # Only store models that are better than these values to save storage
    storage_thresholds = {"e49": 2.1, "h49": 3.4, "e68": 2.7, "h68": 4.5}
    storage_thresholds["49"] = category_calculator["49"](storage_thresholds)
    storage_thresholds["68"] = category_calculator["68"](storage_thresholds)
    storage_thresholds["e"] = category_calculator["e"](storage_thresholds)
    storage_thresholds["h"] = category_calculator["h"](storage_thresholds)
    storage_thresholds["all"] = category_calculator["all"](storage_thresholds)

    loss_history = {}
    metric_history = []

    dist_loss_fct = nn.L1Loss()

    epochs = config['n_epoch']
    for epoch in range(epochs):
        epoch_start_time = time.time()
        net.train()

        epoch_train_loss = 0
        epoch_sample_count = 0

        for sample in train_loader:
            x = self.to_gpu(sample['image'].float())
            y = self.to_gpu(sample['landmarks'].float())

            if config["predict_distances_weight"] > 0:
                indices = self.to_gpu(sample['index'])
                dist_y = self.to_gpu(dist_gt[indices])

            epoch_sample_count += x.shape[0]

            optimizer.zero_grad()
            coords, heatmaps, var, unnormalized_heatmaps = net(x)

            loss = loss_function(coords, heatmaps, y)
            epoch_train_loss += loss.float().data.item()

            if config["normalize_loss"]:
                if loss.detach().data.item() > 0:
                    loss = loss / loss.detach()

            if config["predict_distances_weight"] > 0:
                bs = x.shape[0]
                distance_pred = torch.zeros(bs, n_lm, n_lm, 2)
                distance_pred[:, :, :, 0] = coords[:, :, 0].view(bs, 1, -1) - coords[:, :, 0].view(bs, -1, 1)
                distance_pred[:, :, :, 1] = coords[:, :, 1].view(bs, 1, -1) - coords[:, :, 1].view(bs, -1, 1)
                distance_pred = self.to_gpu(distance_pred)
                dist_loss = dist_loss_fct(distance_pred, dist_y)
                loss = loss + config["predict_distances_weight"] * dist_loss / dist_loss.detach()
            else:
                dist_loss = 0

            if torch.isnan(loss):
                print_info("ERROR! Invalid loss (nan). Aborting training for config %d in epoch %d" % (config_id, epoch))
                raise LossException("loss was nan in config %d, epoch %d" % (config_id, epoch))
            if torch.isinf(loss):
                print_info("ERROR! Invalid loss (inf). Aborting training for config %d in epoch %d" % (config_id, epoch))
                raise LossException("loss was inf in config %d, epoch %d" % (config_id, epoch))

            loss.backward()
            optimizer.step()
        #### end batch

        epoch_train_loss /= epoch_sample_count  # normalize loss by images that were processed

        val_loss = self.evaluate_model(val_loader, net, loss_function)
        scheduler.step(val_loss)

        epoch_train_losses.append(epoch_train_loss)
        epoch_val_losses.append(val_loss)

        loss_history[epoch] = {
            'train': epoch_train_losses[-1],
            'val': epoch_val_losses[-1]
        }

        epoch_end_time = time.time()
        epoch_duration = epoch_end_time - epoch_start_time

        metrics = benchmark(net, easy, hard, self.gpu_id)

        all_metrics = {}
        for category, calculator in category_calculator.items():
            error = calculator(metrics)
            all_metrics[category] = error

            if error < lowest_error[category] and error < 1000:  # 100000 is the error for with outline when HG only has 49LM
                lowest_error[category] = error
                best_epoch[category] = epoch

                if error < storage_thresholds[category]:
                    torch.save({
                        'model': 'pe_hourglass',
                        'epoch': epoch + 1,
                        'state_dict': net.state_dict(),
                        'val_loss': val_loss,
                        'config': config,
                        'category': category,
                        'metrics': all_metrics
                    }, os.path.join(self.model_dir, "%d_best_%s.torch" % (config_id, category)))

        metric_history.append(all_metrics)

        print("GPU %d.%d" % (self.gpu_id, self.sub_gpu_id),
              "| conf", config_id,
              '| %03d/%03d' % (epoch + 1, epochs),
              '| %ds' % (int(epoch_duration)),
              '| train %0.6f' % epoch_train_losses[-1],
              '| val %0.6f' % epoch_val_losses[-1],
              '| dist %0.6f' % float(dist_loss),
              '| e68 %0.2f [B %0.2f]' % (metrics["e68"], lowest_error['e68']),
              '| h68 %0.2f [B %0.2f]' % (metrics["h68"], lowest_error['h68']),
              '| e49 %0.2f [B %0.2f]' % (metrics["e49"], lowest_error['e49']),
              '| h49 %0.2f [B %0.2f]' % (metrics["h49"], lowest_error['h49']),
              )

        if should_stop(val_loss):
            epochs = epoch + 1
            print_info("EarlyStopping (patience = %d, max_ratio=%f) criterion returned true in epoch %d. Stop training"
                       % (should_stop.patience, should_stop.max_ratio, epochs))
            break

    endtime = time.time()

    # Write a loss plot to CONFIG_ID_loss_plot.png in the output directory
    # TODO tensorboardX in addition to matplotlib?
    x = np.array(range(epochs))
    plt.plot(x, np.array(epoch_train_losses), 'r', label='Train Loss')
    plt.plot(x, np.array(epoch_val_losses), 'b', label='Val Loss')
    plt.xlabel("Epochs")
    plt.ylabel("Avg. Train and Val Loss")
    plt.title("Variation of train and Val loss with epochs")
    plt.legend(loc='best')
    plt.savefig(os.path.join(self.plot_dir, "%d_loss_plot.png" % config_id))
    plt.close()

    training_duration = int(endtime - starttime)

    best_epochs = {"best_%s_epoch" % k: v for k, v in best_epoch.items()}
    best_errors = {"best_%s" % k: v for k, v in lowest_error.items()}

    results = {
        "config_id": config_id,
        'dataset': self.data,
        "gpu_id": self.gpu_id,
        "duration_seconds": training_duration,
        "last_epoch": epochs,  # is different from n_epoch in case of early stopping
        "trainable_parameters": trainable_parameters,
        **self.config,
        "optimizer_name": optimizer.__class__.__name__,
        **best_epochs,
        "training_loss_last_epoch": epoch_train_losses[-1],
        **best_errors
    }

    # Write results to CONFIG_ID_result.json in the output directory
    with open(os.path.join(self.result_dir, "%d_result.json" % config_id), "w") as f:
        to_write = {
            **results,
            'loss_history': loss_history,
            'metric_history': metric_history
        }
        json.dump(to_write, f, indent=4)

    torch.cuda.empty_cache()

    return results
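# Illustrative sketch (not part of the trainer class): the broadcasting trick used above
# to build the (bs, n_lm, n_lm, 2) distance tensors. Entry [b, i, j, 0] is x_j - x_i
# (and [b, i, j, 1] is y_j - y_i), i.e. the signed offset from landmark i to landmark j;
# the L1 distance loss compares these ground-truth offsets against the offsets between
# predicted coordinates. The helper and demo names below are made up for illustration.
import torch


def pairwise_offsets(y):
    # y: (bs, n_lm, 2) landmark coordinates
    bs, n_lm, _ = y.shape
    dist = torch.zeros(bs, n_lm, n_lm, 2)
    dist[:, :, :, 0] = y[:, :, 0].view(bs, 1, -1) - y[:, :, 0].view(bs, -1, 1)
    dist[:, :, :, 1] = y[:, :, 1].view(bs, 1, -1) - y[:, :, 1].view(bs, -1, 1)
    return dist


def _demo_pairwise_offsets():
    y = torch.tensor([[[0., 0.], [3., 4.]]])  # one sample, two landmarks
    d = pairwise_offsets(y)
    assert torch.equal(d[0, 0, 1], torch.tensor([3., 4.]))    # offset from landmark 0 to landmark 1
    assert torch.equal(d[0, 1, 0], torch.tensor([-3., -4.]))  # and the reverse direction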