def fit_one_cycle(learn: Learner, cyc_len: int, max_lr: Union[Floats, slice] = defaults.lr,
                  moms: Tuple[float, float] = (0.95, 0.85), div_factor: float = 25.,
                  pct_start: float = 0.3, final_div: float = None, wd: float = None,
                  callbacks: Optional[CallbackList] = None, tot_epochs: int = None,
                  start_epoch: int = None, teachers: Optional[list] = None) -> None:
    "Fit a model following the 1cycle policy."
    max_lr = learn.lr_range(max_lr)
    callbacks = listify(callbacks)
    callbacks.append(OneCycleScheduler(learn, max_lr, moms=moms, div_factor=div_factor,
                                       pct_start=pct_start, final_div=final_div,
                                       tot_epochs=tot_epochs, start_epoch=start_epoch))
    learn.fit(cyc_len, max_lr, wd=wd, callbacks=callbacks, teachers=teachers)
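# A minimal, hypothetical sketch (not part of the original code) of the learning-rate
# envelope that OneCycleScheduler applies in fit_one_cycle above: the rate climbs from
# max_lr/div_factor up to max_lr over the first pct_start fraction of training, then
# anneals down to a much smaller final value. The helper name and the fallback final
# divisor below are illustrative assumptions, not the library's API.
def one_cycle_lr_bounds(max_lr: float = 3e-5,
                        div_factor: float = 25.,
                        final_div: float = None) -> tuple:
    # start-of-warm-up, peak, and approximate end-of-cycle learning rates
    if final_div is None:
        final_div = div_factor * 1e4  # assumed fallback when final_div is not given
    return max_lr / div_factor, max_lr, max_lr / final_div

# e.g. one_cycle_lr_bounds(3e-5) -> (1.2e-06, 3e-05, 1.2e-10)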
def run_ner(lang: str = 'eng',
            log_dir: str = 'logs',
            task: str = NER,
            batch_size: int = 1,
            epochs: int = 1,
            dataset: str = 'data/conll-2003/',
            loss: str = 'cross',
            max_seq_len: int = 128,
            do_lower_case: bool = False,
            warmup_proportion: float = 0.1,
            rand_seed: int = None,
            ds_size: int = None,
            data_bunch_path: str = 'data/conll-2003/db',
            tuned_learner: str = None,
            do_train: bool = False,
            do_eval: bool = False,
            save: bool = False,
            nameX: str = 'ner',
            mask: tuple = ('s', 's')):
    name = "_".join(map(str, [nameX, task, lang, mask[0], mask[1], loss,
                              batch_size, max_seq_len, do_train, do_eval]))
    log_dir = Path(log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    init_logger(log_dir, name)

    if rand_seed:
        random.seed(rand_seed)
        np.random.seed(rand_seed)
        torch.manual_seed(rand_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(rand_seed)

    trainset = dataset + lang + '/train.txt'
    devset = dataset + lang + '/dev.txt'
    testset = dataset + lang + '/test.txt'

    bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased'
    print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}')
    model = BertForTokenClassification.from_pretrained(bert_model,
                                                       num_labels=len(VOCAB),
                                                       cache_dir='bertm')
    if tuned_learner:
        print('Loading pretrained learner: ', tuned_learner)
        model.bert.load_state_dict(torch.load(tuned_learner))

    model = torch.nn.DataParallel(model)
    model_lr_group = bert_layer_list(model)
    layers = len(model_lr_group)
    kwargs = {'max_seq_len': max_seq_len, 'ds_size': ds_size, 'mask': mask}

    train_dl = DataLoader(dataset=NerDataset(trainset, bert_model, train=True, **kwargs),
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=partial(pad, train=True))
    dev_dl = DataLoader(dataset=NerDataset(devset, bert_model, **kwargs),
                        batch_size=batch_size,
                        shuffle=False,
                        collate_fn=pad)
    test_dl = DataLoader(dataset=NerDataset(testset, bert_model, **kwargs),
                         batch_size=batch_size,
                         shuffle=False,
                         collate_fn=pad)
    data = DataBunch(train_dl=train_dl,
                     valid_dl=dev_dl,
                     test_dl=test_dl,
                     collate_fn=pad,
                     path=Path(data_bunch_path))

    train_opt_steps = int(len(train_dl.dataset) / batch_size) * epochs
    optim = BertAdam(model.parameters(),
                     lr=0.01,
                     warmup=warmup_proportion,
                     t_total=train_opt_steps)

    loss_fun = ner_loss_func if loss == 'cross' else partial(ner_loss_func, zero=True)
    metrics = [Conll_F1()]

    learn = Learner(data,
                    model,
                    BertAdam,
                    loss_func=loss_fun,
                    metrics=metrics,
                    true_wd=False,
                    layer_groups=model_lr_group,
                    path='learn' + nameX)
    learn.opt = OptimWrapper(optim)

    lrm = 1.6
    # select set of starting lrs
    lrs_eng = [0.01, 5e-4, 3e-4, 3e-4, 1e-5]
    lrs_deu = [0.01, 5e-4, 5e-4, 3e-4, 2e-5]
    startlr = lrs_eng if lang == 'eng' else lrs_deu

    results = [['epoch', 'lr', 'f1', 'val_loss', 'train_loss', 'train_losses']]

    if do_train:
        # gradual unfreezing: train the head first, then progressively unfreeze
        # deeper layer groups with discriminative learning rates
        learn.freeze()
        learn.fit_one_cycle(1, startlr[0], moms=(0.8, 0.7))
        learn.freeze_to(-3)
        lrs = learn.lr_range(slice(startlr[1] / (1.6**15), startlr[1]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
        learn.freeze_to(-6)
        lrs = learn.lr_range(slice(startlr[2] / (1.6**15), startlr[2]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
        learn.freeze_to(-12)
        lrs = learn.lr_range(slice(startlr[3] / (1.6**15), startlr[3]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
        learn.unfreeze()
        lrs = learn.lr_range(slice(startlr[4] / (1.6**15), startlr[4]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))

    if do_eval:
        res = learn.validate(test_dl, metrics=metrics)
        met_res = [f'{m.__name__}: {r}' for m, r in zip(metrics, res[1:])]
        print(f'Validation on TEST SET:\nloss {res[0]}, {met_res}')
        results.append(['val', '-', res[1], res[0], '-', '-'])

    with open(log_dir / (name + '.csv'), 'a') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows(results)
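# Hypothetical sketch of what the slice(startlr / 1.6**15, startlr) calls above amount
# to: lr_range spreads learning rates between the two slice endpoints across the layer
# groups (geometrically in this sketch), so earlier BERT layers take smaller steps than
# the classification head. The helper name and the group count are illustrative
# assumptions, not the exact fastai implementation.
import numpy as np

def discriminative_lrs(max_lr: float, mult: float = 1.6, n_groups: int = 16) -> np.ndarray:
    # lowest lr = max_lr / mult**(n_groups - 1), highest lr = max_lr
    return np.geomspace(max_lr / mult**(n_groups - 1), max_lr, num=n_groups)

# e.g. discriminative_lrs(5e-4)[0] is roughly 5e-4 / 1.6**15, i.e. about 4.3e-7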
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                 "bert-base-multilingual-cased", "bert-base-chinese"])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")
    parser.add_argument("--epochs", type=int, default=3,
                        help="Number of epochs to train for")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size", default=16, type=int,
                        help="Total batch size for training.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 is set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--learning_rate", default=3e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed', type=int, default=None,
                        help="Random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs
    print(samples_per_epoch)

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    model = torch.nn.DataParallel(model)

    # Prepare optimizer
    optimizer = BertAdam

    train_dataloader = DataLoader(
        PregeneratedData(args.pregenerated_data, tokenizer, args.epochs, args.train_batch_size),
        batch_size=args.train_batch_size,
    )
    data = DataBunch(train_dataloader, train_dataloader)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)

    def loss(x, y):
        return x.mean()

    learn = Learner(data,
                    model,
                    optimizer,
                    loss_func=loss,
                    true_wd=False,
                    path='learn',
                    layer_groups=bert_layer_list(model))

    lr = args.learning_rate
    layers = len(bert_layer_list(model))
    lrs = learn.lr_range(slice(lr / (2.6**4), lr))
    for epoch in range(args.epochs):
        learn.fit_one_cycle(1, lrs, wd=0.01)
        # save model at the halfway point
        if epoch == args.epochs // 2:
            savem = learn.model.module.bert if hasattr(learn.model, 'module') else learn.model.bert
            output_model_file = args.output_dir / f"pytorch_fastai_model_{args.bert_model}_{epoch}.bin"
            torch.save(savem.state_dict(), str(output_model_file))
            print(f'Saved bert to {output_model_file}')

    savem = learn.model.module.bert if hasattr(learn.model, 'module') else learn.model.bert
    output_model_file = args.output_dir / f"pytorch_fastai_model_{args.bert_model}_{args.epochs}.bin"
    torch.save(savem.state_dict(), str(output_model_file))
    print(f'Saved bert to {output_model_file}')
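# Hedged sketch of how the encoder weights saved by main() above can be fed back into the
# NER fine-tuning stage, mirroring the tuned_learner branch of run_ner: only the BERT
# encoder state dict is restored, while the token-classification head starts fresh.
# Assumptions: the pytorch_pretrained_bert package used elsewhere in this code, plus an
# illustrative weights path and label count.
import torch
from pytorch_pretrained_bert import BertForTokenClassification

def load_tuned_bert(weights_path: str = 'pytorch_fastai_model_bert-base-cased_3.bin',
                    num_labels: int = 9):
    model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)
    # restore the fine-tuned encoder weights produced by the LM training loop
    model.bert.load_state_dict(torch.load(weights_path, map_location='cpu'))
    return model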
def run_ner(lang: str = 'eng',
            log_dir: str = 'logs',
            task: str = NER,
            batch_size: int = 1,
            lr: float = 5e-5,
            epochs: int = 1,
            dataset: str = 'data/conll-2003/',
            loss: str = 'cross',
            max_seq_len: int = 128,
            do_lower_case: bool = False,
            warmup_proportion: float = 0.1,
            grad_acc_steps: int = 1,
            rand_seed: int = None,
            fp16: bool = False,
            loss_scale: float = None,
            ds_size: int = None,
            data_bunch_path: str = 'data/conll-2003/db',
            bertAdam: bool = False,
            freez: bool = False,
            one_cycle: bool = False,
            discr: bool = False,
            lrm: float = 2.6,
            div: int = None,
            tuned_learner: str = None,
            do_train: bool = False,
            do_eval: bool = False,
            save: bool = False,
            name: str = 'ner',
            mask: tuple = ('s', 's')):
    name = "_".join(map(str, [name, task, lang, mask[0], mask[1], loss,
                              batch_size, lr, max_seq_len, do_train, do_eval]))
    log_dir = Path(log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    init_logger(log_dir, name)

    if rand_seed:
        random.seed(rand_seed)
        np.random.seed(rand_seed)
        torch.manual_seed(rand_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(rand_seed)

    trainset = dataset + lang + '/train.txt'
    devset = dataset + lang + '/dev.txt'
    testset = dataset + lang + '/test.txt'

    bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased'
    print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}')
    model = BertForTokenClassification.from_pretrained(bert_model,
                                                       num_labels=len(VOCAB),
                                                       cache_dir='bertm')
    model = torch.nn.DataParallel(model)
    model_lr_group = bert_layer_list(model)
    layers = len(model_lr_group)
    kwargs = {'max_seq_len': max_seq_len, 'ds_size': ds_size, 'mask': mask}

    train_dl = DataLoader(dataset=NerDataset(trainset, bert_model, train=True, **kwargs),
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=partial(pad, train=True))
    dev_dl = DataLoader(dataset=NerDataset(devset, bert_model, **kwargs),
                        batch_size=batch_size,
                        shuffle=False,
                        collate_fn=pad)
    test_dl = DataLoader(dataset=NerDataset(testset, bert_model, **kwargs),
                         batch_size=batch_size,
                         shuffle=False,
                         collate_fn=pad)
    data = DataBunch(train_dl=train_dl,
                     valid_dl=dev_dl,
                     test_dl=test_dl,
                     collate_fn=pad,
                     path=Path(data_bunch_path))

    loss_fun = ner_loss_func if loss == 'cross' else partial(ner_loss_func, zero=True)
    metrics = [Conll_F1()]

    learn = Learner(data,
                    model,
                    BertAdam,
                    loss_func=loss_fun,
                    metrics=metrics,
                    true_wd=False,
                    layer_groups=None if not freez else model_lr_group,
                    path='learn')

    # initialise BertAdam optimiser
    train_opt_steps = int(len(train_dl.dataset) / batch_size) * epochs
    optim = BertAdam(model.parameters(),
                     lr=lr,
                     warmup=warmup_proportion,
                     t_total=train_opt_steps)
    if bertAdam:
        learn.opt = OptimWrapper(optim)
    else:
        print("No Bert Adam")

    # load fine-tuned learner
    if tuned_learner:
        print('Loading pretrained learner: ', tuned_learner)
        learn.load(tuned_learner)

    # Uncomment to graph learning rate plot
    # learn.lr_find()
    # learn.recorder.plot(skip_end=15)

    # set lr (discriminative learning rates)
    if div:
        layers = div
    lrs = lr if not discr else learn.lr_range(slice(lr / lrm**layers, lr))

    results = [['epoch', 'lr', 'f1', 'val_loss', 'train_loss', 'train_losses']]

    if do_train:
        for epoch in range(epochs):
            if freez:
                lay = (layers // (epochs - 1)) * epoch * -1
                if lay == 0:
                    print('Freeze')
                    learn.freeze()
                elif lay == layers:
                    print('unfreeze')
                    learn.unfreeze()
                else:
                    print('freeze2')
                    learn.freeze_to(lay)
                print('Freezing layers ', lay, ' of ', layers)

            # fit the Learner, i.e. train the model
            if one_cycle:
                learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
            else:
                learn.fit(1, lrs)

            results.append([
                epoch,
                lrs,
                learn.recorder.metrics[0][0],
                learn.recorder.val_losses[0],
                np.array(learn.recorder.losses).mean(),
                learn.recorder.losses,
            ])

            if save:
                m_path = learn.save(f"{lang}_{epoch}_model", return_path=True)
                print(f'Saved model to {m_path}')

    if save:
        learn.export(f'{lang}.pkl')

    if do_eval:
        res = learn.validate(test_dl, metrics=metrics)
        met_res = [f'{m.__name__}: {r}' for m, r in zip(metrics, res[1:])]
        print(f'Validation on TEST SET:\nloss {res[0]}, {met_res}')
        results.append(['val', '-', res[1], res[0], '-', '-'])

    with open(log_dir / (name + '.csv'), 'a') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows(results)
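# Small runnable sketch (illustrative values only) of the freezing schedule computed
# inside the training loop above: lay = (layers // (epochs - 1)) * epoch * -1 moves the
# freeze point deeper into the network each epoch, from a fully frozen encoder
# (lay == 0, handled by learn.freeze()) down to freeze_to(-layers), where every layer
# group is trainable.
def freeze_schedule(layers: int = 12, epochs: int = 4) -> list:
    # the returned negative indices are the per-epoch freeze_to() arguments
    return [(layers // (epochs - 1)) * epoch * -1 for epoch in range(epochs)]

# e.g. freeze_schedule(12, 4) -> [0, -4, -8, -12]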