punct_classifier = TokenClassifier(hidden_size=bert_model.hidden_size,
                                   num_classes=len(punct_label_ids),
                                   dropout=CLASSIFICATION_DROPOUT,
                                   num_layers=PUNCT_NUM_FC_LAYERS,
                                   name='Punctuation')

capit_classifier = TokenClassifier(hidden_size=bert_model.hidden_size,
                                   num_classes=len(capit_label_ids),
                                   dropout=CLASSIFICATION_DROPOUT,
                                   name='Capitalization')

# If you don't want to use weighted loss for the punctuation task, use class_weights=None
punct_label_freqs = train_data_layer.dataset.punct_label_frequencies
class_weights = calc_class_weights(punct_label_freqs)

# define losses
punct_loss = CrossEntropyLossNM(logits_ndim=3, weight=class_weights)
capit_loss = CrossEntropyLossNM(logits_ndim=3)
task_loss = LossAggregatorNM(num_inputs=2)

input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, punct_labels, capit_labels = train_data_layer()

hidden_states = bert_model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)

punct_logits = punct_classifier(hidden_states=hidden_states)
capit_logits = capit_classifier(hidden_states=hidden_states)

punct_loss = punct_loss(logits=punct_logits, labels=punct_labels, loss_mask=loss_mask)
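# The snippet above stops at the punctuation loss. A plausible continuation (a sketch
# mirroring the multi-task create_pipeline example below, not verbatim from the source)
# computes the capitalization loss and aggregates both losses into a single training loss:
capit_loss = capit_loss(logits=capit_logits, labels=capit_labels, loss_mask=loss_mask)
task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss)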
def create_pipeline(
    pad_label=args.none_label,
    max_seq_length=args.max_seq_length,
    batch_size=args.batch_size,
    num_gpus=args.num_gpus,
    mode='train',
    punct_label_ids=None,
    capit_label_ids=None,
    ignore_extra_tokens=args.ignore_extra_tokens,
    ignore_start_end=args.ignore_start_end,
    overwrite_processed_files=args.overwrite_processed_files,
    dropout=args.fc_dropout,
    punct_num_layers=args.punct_num_fc_layers,
    capit_num_layers=args.capit_num_fc_layers,
    classifier=PunctCapitTokenClassifier,
):
    logging.info(f"Loading {mode} data...")
    shuffle = args.shuffle_data if mode == 'train' else False

    text_file = f'{args.data_dir}/text_{mode}.txt'
    label_file = f'{args.data_dir}/labels_{mode}.txt'

    if not (os.path.exists(text_file) and os.path.exists(label_file)):
        raise FileNotFoundError(
            f'{text_file} or {label_file} not found. '
            'The data should be split into 2 files: text.txt and labels.txt. '
            'Each line of the text.txt file contains text sequences, where words '
            'are separated with spaces. The labels.txt file contains '
            'corresponding labels for each word in text.txt, the labels are '
            'separated with spaces. Each line of the files should follow the format: '
            '[WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and '
            '[LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).')

    data_layer = PunctuationCapitalizationDataLayer(
        tokenizer=tokenizer,
        text_file=text_file,
        label_file=label_file,
        pad_label=pad_label,
        punct_label_ids=punct_label_ids,
        capit_label_ids=capit_label_ids,
        max_seq_length=max_seq_length,
        batch_size=batch_size,
        shuffle=shuffle,
        ignore_extra_tokens=ignore_extra_tokens,
        ignore_start_end=ignore_start_end,
        overwrite_processed_files=overwrite_processed_files,
        num_workers=args.num_workers,
        pin_memory=args.enable_pin_memory,
    )

    (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, punct_labels, capit_labels) = data_layer()

    if mode == 'train':
        punct_label_ids = data_layer.dataset.punct_label_ids
        capit_label_ids = data_layer.dataset.capit_label_ids

        class_weights = None
        if args.use_weighted_loss_punct:
            logging.info("Using weighted loss for punctuation task")
            punct_label_freqs = data_layer.dataset.punct_label_frequencies
            class_weights = calc_class_weights(punct_label_freqs)

        classifier = classifier(
            hidden_size=hidden_size,
            punct_num_classes=len(punct_label_ids),
            capit_num_classes=len(capit_label_ids),
            dropout=dropout,
            punct_num_layers=punct_num_layers,
            capit_num_layers=capit_num_layers,
        )

        punct_loss = CrossEntropyLossNM(logits_ndim=3, weight=class_weights)
        capit_loss = CrossEntropyLossNM(logits_ndim=3)
        task_loss = LossAggregatorNM(
            num_inputs=2, weights=[args.punct_loss_weight, 1.0 - args.punct_loss_weight])

    hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)

    punct_logits, capit_logits = classifier(hidden_states=hidden_states)

    if mode == 'train':
        punct_loss = punct_loss(logits=punct_logits, labels=punct_labels, loss_mask=loss_mask)
        capit_loss = capit_loss(logits=capit_logits, labels=capit_labels, loss_mask=loss_mask)
        task_loss = task_loss(loss_1=punct_loss, loss_2=capit_loss)

        steps_per_epoch = len(data_layer) // (batch_size * num_gpus)

        losses = [task_loss, punct_loss, capit_loss]
        logits = [punct_logits, capit_logits]
        return losses, logits, steps_per_epoch, punct_label_ids, capit_label_ids, classifier
    else:
        tensors_to_evaluate = [punct_logits, capit_logits, punct_labels, capit_labels, subtokens_mask]
        return tensors_to_evaluate, data_layer
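# Hedged usage sketch of create_pipeline above (the 'dev' split name and variable names
# are assumptions for illustration): the training call builds the classifier and label
# maps, which are then reused when building the evaluation pipeline.
losses, train_logits, steps_per_epoch, punct_label_ids, capit_label_ids, classifier = create_pipeline()
eval_tensors, eval_data_layer = create_pipeline(
    mode='dev',
    punct_label_ids=punct_label_ids,
    capit_label_ids=capit_label_ids,
    classifier=classifier,
)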
def create_pipeline(
    pad_label=args.none_label,
    max_seq_length=args.max_seq_length,
    batch_size=args.batch_size,
    num_gpus=args.num_gpus,
    mode='train',
    batches_per_step=args.batches_per_step,
    label_ids=None,
    ignore_extra_tokens=args.ignore_extra_tokens,
    ignore_start_end=args.ignore_start_end,
    use_cache=args.use_cache,
    dropout=args.fc_dropout,
    num_layers=args.num_fc_layers,
    classifier=TokenClassifier,
):
    logging.info(f"Loading {mode} data...")
    shuffle = args.shuffle_data if mode == 'train' else False

    text_file = f'{args.data_dir}/text_{mode}.txt'
    label_file = f'{args.data_dir}/labels_{mode}.txt'

    if not (os.path.exists(text_file) and os.path.exists(label_file)):
        raise FileNotFoundError(
            f'{text_file} or {label_file} not found. '
            'The data should be split into 2 files: text.txt and labels.txt. '
            'Each line of the text.txt file contains text sequences, where words '
            'are separated with spaces. The labels.txt file contains '
            'corresponding labels for each word in text.txt, the labels are '
            'separated with spaces. Each line of the files should follow the format: '
            '[WORD] [SPACE] [WORD] [SPACE] [WORD] (for text.txt) and '
            '[LABEL] [SPACE] [LABEL] [SPACE] [LABEL] (for labels.txt).')

    data_layer = BertTokenClassificationDataLayer(
        tokenizer=tokenizer,
        text_file=text_file,
        label_file=label_file,
        pad_label=pad_label,
        label_ids=label_ids,
        max_seq_length=max_seq_length,
        batch_size=batch_size,
        shuffle=shuffle,
        ignore_extra_tokens=ignore_extra_tokens,
        ignore_start_end=ignore_start_end,
        use_cache=use_cache,
    )

    (input_ids, input_type_ids, input_mask, loss_mask, subtokens_mask, labels) = data_layer()

    if mode == 'train':
        label_ids = data_layer.dataset.label_ids

        class_weights = None
        if args.use_weighted_loss:
            logging.info("Using weighted loss")
            label_freqs = data_layer.dataset.label_frequencies
            class_weights = calc_class_weights(label_freqs)

        classifier = classifier(hidden_size=hidden_size,
                                num_classes=len(label_ids),
                                dropout=dropout,
                                num_layers=num_layers)

        task_loss = CrossEntropyLossNM(logits_ndim=3, weight=class_weights)

    hidden_states = model(input_ids=input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)

    logits = classifier(hidden_states=hidden_states)

    if mode == 'train':
        loss = task_loss(logits=logits, labels=labels, loss_mask=loss_mask)
        steps_per_epoch = len(data_layer) // (batch_size * num_gpus * batches_per_step)
        tensors_to_evaluate = [loss, logits]
        return tensors_to_evaluate, loss, steps_per_epoch, label_ids, classifier
    else:
        tensors_to_evaluate = [logits, labels, subtokens_mask]
        return tensors_to_evaluate, data_layer
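# For intuition only: calc_class_weights balances rare labels by upweighting them before
# they are passed to CrossEntropyLossNM. A minimal inverse-frequency sketch (an assumption
# for illustration, not NeMo's actual implementation) could look like this:
def inverse_frequency_weights(label_freqs):
    """Illustrative inverse-frequency weighting over per-class counts."""
    total = sum(label_freqs)
    num_classes = len(label_freqs)
    # Rare classes get weights > 1, frequent classes get weights < 1.
    return [total / (num_classes * max(freq, 1)) for freq in label_freqs]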
vocab_size = len(data_desc.vocab)
encoder = EncoderRNN(vocab_size, args.emb_dim, args.hid_dim, args.dropout, args.n_layers)

decoder = TRADEGenerator(
    data_desc.vocab,
    encoder.embedding,
    args.hid_dim,
    args.dropout,
    data_desc.slots,
    len(data_desc.gating_dict),
    teacher_forcing=args.teacher_forcing,
)

gate_loss_fn = CrossEntropyLossNM(logits_ndim=3)
ptr_loss_fn = MaskedLogLoss()
total_loss_fn = LossAggregatorNM(num_inputs=2)


def create_pipeline(num_samples, batch_size, num_gpus, input_dropout, data_prefix, is_training):
    logging.info(f"Loading {data_prefix} data...")
    shuffle = args.shuffle_data if is_training else False

    data_layer = MultiWOZDataLayer(
        abs_data_dir,
        data_desc.domains,
        all_domains=data_desc.all_domains,
        vocab=data_desc.vocab,
        slots=data_desc.slots,
hidden_size = pretrained_bert_model.hidden_size

data_desc = JointIntentSlotDataDesc(data_dir=args.data_dir,
                                    none_slot_label=args.none_slot_label,
                                    pad_label=args.pad_label)

# Create the intent and slot classification head on top of the encoder
classifier = JointIntentSlotClassifier(hidden_size=hidden_size,
                                       num_intents=data_desc.num_intents,
                                       num_slots=data_desc.num_slots,
                                       dropout=args.fc_dropout)

if args.class_balancing == 'weighted_loss':
    # To tackle imbalanced classes, you may use weighted loss
    intent_loss_fn = CrossEntropyLossNM(logits_ndim=2, weight=data_desc.intent_weights)
    slot_loss_fn = CrossEntropyLossNM(logits_ndim=3, weight=data_desc.slot_weights)
else:
    intent_loss_fn = CrossEntropyLossNM(logits_ndim=2)
    slot_loss_fn = CrossEntropyLossNM(logits_ndim=3)

total_loss_fn = LossAggregatorNM(
    num_inputs=2, weights=[args.intent_loss_weight, 1.0 - args.intent_loss_weight])


def create_pipeline(num_samples=-1,
                    batch_size=32,
                    data_prefix='train',
                    is_training=True,
    vocab_size=output_vocab_size,
    attn_score_dropout=args.decoder_attn_score_dropout,
    max_seq_length=args.max_seq_length,
    embedding_dropout=args.decoder_embedding_dropout,
    hidden_act=args.decoder_hidden_act,
    use_full_attention=args.use_full_attention,
)

logits = nemo_nlp.nm.trainables.TokenClassifier(hidden_size,
                                                num_classes=output_vocab_size,
                                                num_layers=1,
                                                log_softmax=False,
                                                dropout=0.1)

loss_fn = CrossEntropyLossNM(logits_ndim=3)
loss_eval_metric = CrossEntropyLossNM(logits_ndim=3, reduction='none')

if args.command == "infer":
    beam_search = nemo_nlp.nm.trainables.BeamSearchTranslatorNM(
        decoder=decoder,
        log_softmax=logits,
        max_seq_length=args.max_seq_length,
        beam_size=args.beam_size,
        length_penalty=args.length_penalty,
        bos_token=tokenizer.bos_id,
        pad_token=tokenizer.pad_id,
        eos_token=tokenizer.eos_id,
    )

# tie all embeddings weights
def test_simple_vc_trainer():
    # Train a sample model with test data

    # Create neural factory
    model_dir = os.path.join(get_data_folder(), ".test_model")
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU, checkpoint_dir=model_dir)

    # Generate dataset
    bam = os.path.join(get_data_folder(), "small_bam.bam")
    labels = os.path.join(get_data_folder(), "candidates.vcf.gz")
    vcf_loader = VCFReader(vcf=labels, bams=[bam], is_fp=False)

    # Neural Network
    alexnet = AlexNet(num_input_channels=1, num_output_logits=3)

    # Create train DAG
    dataset_train = ReadPileupDataLoader(ReadPileupDataLoader.Type.TRAIN, [vcf_loader],
                                         batch_size=32, shuffle=True)
    vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
    vz_labels, encoding = dataset_train()
    vz = alexnet(encoding=encoding)
    vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)

    # Create evaluation DAG using same dataset as training
    dataset_eval = ReadPileupDataLoader(ReadPileupDataLoader.Type.EVAL, [vcf_loader],
                                        batch_size=32, shuffle=False)
    vz_ce_loss_eval = CrossEntropyLossNM(logits_ndim=2)
    vz_labels_eval, encoding_eval = dataset_eval()
    vz_eval = alexnet(encoding=encoding_eval)
    vz_loss_eval = vz_ce_loss_eval(logits=vz_eval, labels=vz_labels_eval)

    # Logger callback
    logger_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[vz_loss, vz, vz_labels],
        step_freq=1,
    )

    evaluator_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[vz_loss_eval, vz_eval, vz_labels_eval],
        user_iter_callback=eval_iter_callback,
        user_epochs_done_callback=eval_epochs_done_callback,
        eval_step=1,
    )

    # Checkpointing models through NeMo callback
    checkpoint_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir,
        load_from_folder=None,
        # Checkpointing frequency in steps
        step_freq=-1,
        # Checkpointing frequency in epochs
        epoch_freq=1,
        # Number of checkpoints to keep
        checkpoints_to_keep=1,
        # If True, CheckpointCallback will raise an Error if restoring fails
        force_load=False)

    # Invoke the "train" action.
    nf.train(
        [vz_loss],
        callbacks=[logger_callback, checkpoint_callback, evaluator_callback],
        optimization_params={
            "num_epochs": 1,
            "lr": 0.001
        },
        optimizer="adam")

    assert os.path.exists(os.path.join(model_dir, "AlexNet-EPOCH-1.pt"))
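# Hedged follow-up sketch (not part of the original test): once the checkpoint exists,
# the evaluation DAG built above could be run against it with the factory's infer action.
# The tensor list and checkpoint directory are taken from the test; treat this as illustrative.
evaluated = nf.infer(tensors=[vz_eval, vz_labels_eval], checkpoint_dir=model_dir)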
def train(args):
    """Train a sample model with test data."""
    # Create neural factory as per NeMo requirements.
    nf = nemo.core.NeuralModuleFactory(
        placement=nemo.core.neural_factory.DeviceType.GPU)

    model = create_model()

    # Create train DAG
    train_dataset = HDFDataLoader(args.train_hdf,
                                  batch_size=32,
                                  shuffle=True,
                                  num_workers=args.threads,
                                  tensor_keys=["encodings", "labels"],
                                  tensor_dims=[('B', 'C', 'H', 'W'), tuple('B')],
                                  tensor_neural_types=[
                                      ReadPileupNeuralType(),
                                      VariantZygosityNeuralType()
                                  ])
    vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
    encoding, vz_labels = train_dataset()
    vz = model(encoding=encoding)
    vz_loss = vz_ce_loss(logits=vz, labels=vz_labels)

    callbacks = []

    # Logger callback
    loggercallback = nemo.core.SimpleLossLoggerCallback(
        tensors=[vz_loss],
        step_freq=5,
        print_func=lambda x: logging.info(f'Train Loss: {x[0].item()}'),
    )
    callbacks.append(loggercallback)

    # Checkpointing models through NeMo callback
    checkpointcallback = nemo.core.CheckpointCallback(
        folder=args.model_dir,
        load_from_folder=None,
        # Checkpointing frequency in steps
        step_freq=-1,
        # Checkpointing frequency in epochs
        epoch_freq=1,
        # Number of checkpoints to keep
        checkpoints_to_keep=1,
        # If True, CheckpointCallback will raise an Error if restoring fails
        force_load=False)
    callbacks.append(checkpointcallback)

    # Create eval DAG if eval files are available
    if args.eval_hdf:
        eval_dataset = HDFDataLoader(args.eval_hdf,
                                     batch_size=32,
                                     shuffle=False,
                                     num_workers=args.threads,
                                     tensor_keys=["encodings", "labels"],
                                     tensor_dims=[('B', 'C', 'H', 'W'), tuple('B')],
                                     tensor_neural_types=[
                                         ReadPileupNeuralType(),
                                         VariantZygosityNeuralType()
                                     ])
        eval_vz_ce_loss = CrossEntropyLossNM(logits_ndim=2)
        eval_encoding, eval_vz_labels = eval_dataset()
        eval_vz = model(encoding=eval_encoding)
        eval_vz_loss = eval_vz_ce_loss(logits=eval_vz, labels=eval_vz_labels)

        # Add evaluation callback
        evaluator_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[eval_vz_loss, eval_vz, eval_vz_labels],
            user_iter_callback=eval_iter_callback,
            user_epochs_done_callback=eval_epochs_done_callback,
            eval_step=100,
            eval_epoch=1,
            eval_at_start=False,
        )
        callbacks.append(evaluator_callback)

    # Invoke the "train" action.
    nf.train([vz_loss],
             callbacks=callbacks,
             optimization_params={
                 "num_epochs": args.epochs,
                 "lr": 0.001
             },
             optimizer="adam")
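# Hedged entry-point sketch for invoking train(args). Flag names are assumptions derived
# only from the attributes the function reads above; this is not necessarily the script's
# actual CLI.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Train a variant zygosity model.")
    parser.add_argument("--train-hdf", dest="train_hdf", required=True)
    parser.add_argument("--eval-hdf", dest="eval_hdf", default=None)
    parser.add_argument("--threads", type=int, default=4)
    parser.add_argument("--model-dir", dest="model_dir", default="./model")
    parser.add_argument("--epochs", type=int, default=1)
    train(parser.parse_args())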