    model = nemo_nlp.huggingface.BERT(**config)
else:
    model = nemo_nlp.huggingface.BERT(
        pretrained_model_name=args.pretrained_bert_model)

model.restore_from(args.bert_checkpoint)
hidden_size = model.local_parameters["hidden_size"]

# uses [CLS] token for classification (the first token)
if args.task_name == 'sts-b':
    pooler = nemo_nlp.SequenceRegression(hidden_size=hidden_size)
    glue_loss = MSELoss()
else:
    pooler = nemo_nlp.SequenceClassifier(hidden_size=hidden_size,
                                         num_classes=num_labels,
                                         log_softmax=False)
    glue_loss = CrossEntropyLoss()


def create_pipeline(max_seq_length=args.max_seq_length,
                    batch_size=args.batch_size,
                    local_rank=args.local_rank,
                    num_gpus=args.num_gpus,
                    evaluate=False,
                    processor=task_processors[0]):
    data_layer = 'GlueDataLayerClassification'
    if output_mode == 'regression':
        data_layer = 'GlueDataLayerRegression'
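# ----------------------------------------------------------------------------
# Illustrative sketch (not from the original script): in plain PyTorch, the
# task-dependent head above boils down to pooling the first ([CLS]) token and
# attaching either a 1-unit regression layer with MSELoss (STS-B) or an N-way
# classifier with CrossEntropyLoss. All names and sizes here are assumptions.
import torch
import torch.nn as nn

hidden_size, num_labels, batch = 768, 3, 8            # assumed sizes
hidden_states = torch.randn(batch, 128, hidden_size)  # [batch, seq, hidden]
cls_vector = hidden_states[:, 0]                      # [CLS] is the first token

task_is_regression = True  # True for an STS-B-like task
if task_is_regression:
    head = nn.Linear(hidden_size, 1)
    loss = nn.MSELoss()(head(cls_vector).squeeze(-1), torch.rand(batch))
else:
    head = nn.Linear(hidden_size, num_labels)
    loss = nn.CrossEntropyLoss()(head(cls_vector),
                                 torch.randint(0, num_labels, (batch,)))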
    d_model=args.d_model,
    num_heads=args.num_heads,
    d_inner=args.d_inner,
    max_seq_length=args.max_seq_length,
    hidden_act="gelu")

""" create necessary modules for the whole pretraining pipeline, namely
data layers, BERT encoder, and MLM and NSP loss functions
"""
mlm_classifier = nemo_nlp.TokenClassifier(args.d_model,
                                          num_classes=tokenizer.vocab_size,
                                          num_layers=1,
                                          log_softmax=True)
mlm_loss_fn = nemo_nlp.MaskedLanguageModelingLossNM()

nsp_classifier = nemo_nlp.SequenceClassifier(args.d_model,
                                             num_classes=2,
                                             num_layers=2,
                                             log_softmax=True)
nsp_loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss()

bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)

# tie weights of MLM softmax layer and embedding layer of the encoder
mlm_classifier.mlp.last_linear_layer.weight = \
    bert_model.bert.embeddings.word_embeddings.weight


def create_pipeline(data_file, max_seq_length, mask_probability,
                    short_seq_prob, batch_size):
    data_layer = nemo_nlp.BertPretrainingDataLayer(tokenizer,
                                                   data_file,
                                                   max_seq_length,
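# ----------------------------------------------------------------------------
# Illustrative sketch (not from the original script): the weight tying above
# has the same effect as sharing one Parameter between the input embedding and
# the MLM output projection in plain PyTorch. Names and sizes are assumptions.
import torch.nn as nn

vocab_size, d_model = 30522, 768                  # assumed sizes
embedding = nn.Embedding(vocab_size, d_model)     # encoder input embeddings
mlm_projection = nn.Linear(d_model, vocab_size, bias=False)  # MLM output layer

# Point both modules at the same weight tensor; gradients from the MLM head
# and from the embedding lookups now accumulate into a single parameter.
mlm_projection.weight = embedding.weight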
""" Load the pretrained BERT parameters See the list of pretrained models, call: nemo_nlp.huggingface.BERT.list_pretrained_models() """ pretrained_bert_model = nemo_nlp.huggingface.BERT( pretrained_model_name=args.pretrained_bert_model) hidden_size = pretrained_bert_model.local_parameters["hidden_size"] tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model) data_desc = SentenceClassificationDataDesc( args.dataset_name, args.data_dir, args.do_lower_case) # Create sentence classification loss on top classifier = nemo_nlp.SequenceClassifier(hidden_size=hidden_size, num_classes=data_desc.num_labels, dropout=args.fc_dropout) loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss() def create_pipeline(num_samples=-1, batch_size=32, num_gpus=1, local_rank=0, mode='train'): nf.logger.info(f"Loading {mode} data...") data_file = f'{data_desc.data_dir}/{mode}.tsv' shuffle = args.shuffle_data if mode == 'train' else False data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
if args.bert_checkpoint is not None:
    bert_model.restore_from(args.bert_checkpoint)

""" create necessary modules for the whole pretraining pipeline, namely
data layers, BERT encoder, and MLM and NSP loss functions
"""
mlm_classifier = nemo_nlp.BertTokenClassifier(args.hidden_size,
                                              num_classes=args.vocab_size,
                                              activation=args.hidden_act,
                                              log_softmax=True)
mlm_loss_fn = nemo_nlp.MaskedLanguageModelingLossNM()

if not args.only_mlm_loss:
    nsp_classifier = nemo_nlp.SequenceClassifier(args.hidden_size,
                                                 num_classes=2,
                                                 num_layers=2,
                                                 activation='tanh',
                                                 log_softmax=False)
    nsp_loss_fn = nemo.backends.pytorch.common.CrossEntropyLoss()
    bert_loss = nemo_nlp.LossAggregatorNM(num_inputs=2)

# tie weights of MLM softmax layer and embedding layer of the encoder
if (mlm_classifier.mlp.last_linear_layer.weight.shape !=
        bert_model.bert.embeddings.word_embeddings.weight.shape):
    raise ValueError("Final classification layer does not match embedding "
                     "layer.")
mlm_classifier.mlp.last_linear_layer.weight = \
    bert_model.bert.embeddings.word_embeddings.weight
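# ----------------------------------------------------------------------------
# Illustrative sketch (not from the original script): conceptually, the masked
# LM loss computed by the modules above is a token-level cross-entropy that
# only counts the masked positions, selected by an output mask. All names and
# sizes below are assumptions.
import torch
import torch.nn.functional as F

batch, seq_len, vocab_size = 4, 16, 30522                    # assumed sizes
logits = torch.randn(batch, seq_len, vocab_size)             # MLM head output
target_ids = torch.randint(0, vocab_size, (batch, seq_len))  # original tokens
output_mask = (torch.rand(batch, seq_len) < 0.15).float()    # 1 at masked slots

per_token_loss = F.cross_entropy(logits.view(-1, vocab_size),
                                 target_ids.view(-1),
                                 reduction='none').view(batch, seq_len)
mlm_loss = (per_token_loss * output_mask).sum() / output_mask.sum().clamp(min=1)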
def sentence_classification(args):
    # TODO: construct name of experiment based on args
    """
    name = construct_name(
        args.exp_name,
        args.lr,
        args.batch_size,
        args.num_epochs,
        args.weight_decay,
        args.optimizer)
    work_dir = name
    if args.work_dir:
        work_dir = os.path.join(args.work_dir, name)
    """
    # Instantiate neural modules
    nf = NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=args.work_dir,
        create_tb_writer=True,
        files_to_copy=[__file__],
        add_time_to_log_dir=True)

    # Pre-trained BERT
    tokenizer = BertTokenizer.from_pretrained(args.pretrained_bert_model)
    if args.bert_checkpoint is None:
        bert = nemo_nlp.BERT(pretrained_model_name=args.pretrained_bert_model)
        # save bert config for inference after fine-tuning
        bert_config = bert.config.to_dict()
        with open(args.work_dir + '/' + args.pretrained_bert_model +
                  '_config.json', 'w+') as json_file:
            json.dump(bert_config, json_file)
    else:
        if args.bert_config is not None:
            with open(args.bert_config) as json_file:
                bert_config = json.load(json_file)
        bert = nemo_nlp.BERT(**bert_config)
        bert.restore_from(args.bert_checkpoint)

    # MLP
    bert_hidden_size = bert.local_parameters['hidden_size']
    mlp = nemo_nlp.SequenceClassifier(
        hidden_size=bert_hidden_size,
        num_classes=args.num_classes,
        num_layers=args.num_layers,
        log_softmax=False,
        dropout=args.dropout)
    # TODO: save mlp/all model configs (bake in to Neural Module?)
    if args.mlp_checkpoint:
        mlp.restore_from(args.mlp_checkpoint)

    # Loss function for classification
    loss_fn = CrossEntropyLoss()

    # Data layers, pipelines, and callbacks
    callbacks = []  # callbacks depend on files present

    if args.train_file:
        if args.preproc:
            train_data_layer = preproc_data_layer.PreprocBertSentenceClassificationDataLayer(
                input_file=args.train_file,
                shuffle=True,
                num_samples=args.num_samples,  # lower for dev, -1 for all dataset
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)
        else:
            train_data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
                input_file=args.train_file,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                shuffle=True,
                num_samples=args.num_samples,  # lower for dev, -1 for all dataset
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)

        train_logits, train_loss, steps_per_epoch, train_labels = create_pipeline(
            nf, train_data_layer, bert, mlp, loss_fn)

        train_callback = nemo.core.SimpleLossLoggerCallback(
            tensors=[train_loss, train_logits],
            print_func=lambda x: nf.logger.info(
                f'Train loss: {str(np.round(x[0].item(), 3))}'),
            tb_writer=nf.tb_writer,
            get_tb_values=lambda x: [["train_loss", x[0]]],
            step_freq=steps_per_epoch)
        callbacks.append(train_callback)

        if args.num_checkpoints != 0:
            ckpt_callback = nemo.core.CheckpointCallback(
                folder=nf.checkpoint_dir,
                epoch_freq=args.save_epoch_freq,
                step_freq=args.save_step_freq,
                checkpoints_to_keep=args.num_checkpoints)
            callbacks.append(ckpt_callback)

    if args.eval_file:
        if args.preproc:
            eval_data_layer = preproc_data_layer.PreprocBertSentenceClassificationDataLayer(
                input_file=args.eval_file,
                shuffle=False,
                num_samples=args.num_samples,
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)
        else:
            eval_data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
                input_file=args.eval_file,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                shuffle=False,
                num_samples=args.num_samples,
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)

        eval_logits, eval_loss, _, eval_labels = create_pipeline(
            nf, eval_data_layer, bert, mlp, loss_fn)
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[eval_logits, eval_labels],
            user_iter_callback=lambda x, y: eval_iter_callback(
                x, y, eval_data_layer),
            user_epochs_done_callback=lambda x: eval_epochs_done_callback(
                x, f'{nf.work_dir}/graphs'),
            tb_writer=nf.tb_writer,
            eval_step=steps_per_epoch)
        callbacks.append(eval_callback)

    if args.inference_file:
        if args.preproc:
            inference_data_layer = preproc_data_layer.PreprocBertSentenceClassificationDataLayer(
                input_file=args.inference_file,
                shuffle=False,
                num_samples=args.num_samples,
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)
        else:
            inference_data_layer = nemo_nlp.BertSentenceClassificationDataLayer(
                input_file=args.inference_file,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                shuffle=False,
                num_samples=args.num_samples,
                batch_size=args.batch_size,
                num_workers=0,
                local_rank=args.local_rank)

        # TODO: Finish inference
        inference_callback = None

    # Training, eval and inference
    if args.train_file:
        lr_policy_fn = get_lr_policy(
            args.lr_policy,
            total_steps=args.num_epochs * steps_per_epoch,
            warmup_ratio=args.lr_warmup_proportion)

        nf.train(
            tensors_to_optimize=[train_loss],
            callbacks=callbacks,
            lr_policy=lr_policy_fn,
            optimizer=args.optimizer_kind,
            optimization_params={'num_epochs': args.num_epochs,
                                 'lr': args.lr})
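# ----------------------------------------------------------------------------
# Illustrative sketch (not from the original script): a warmup-then-decay
# schedule of the kind selected via get_lr_policy above can be written as a
# plain function of the global step. The linear decay after warmup is an
# assumption; the actual policy chosen by args.lr_policy may differ.
def warmup_linear_decay(base_lr, step, total_steps, warmup_ratio):
    warmup_steps = int(total_steps * warmup_ratio)
    if step < warmup_steps:
        # ramp the learning rate up linearly from 0 to base_lr
        return base_lr * step / max(1, warmup_steps)
    # then decay linearly back to 0 over the remaining steps
    remaining = max(1, total_steps - warmup_steps)
    return base_lr * max(0.0, (total_steps - step) / remaining)

# example: lr halfway through training with a 10% warmup
# warmup_linear_decay(2e-5, 5000, 10000, 0.1)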