Example #1
def _build_train_valid_test_datasets(
    data_prefix,
    data_impl,
    splits_string,
    train_valid_test_num_samples,
    max_seq_length,
    masked_lm_prob,
    short_seq_prob,
    seed,
    skip_warmup,
    binary_head,
    max_seq_length_dec,
    dataset_type='standard_bert',
):

    if dataset_type not in DSET_TYPES:
        raise ValueError("Invalid dataset_type: ", dataset_type)

    # Indexed dataset.
    indexed_dataset = get_indexed_dataset_(data_prefix, data_impl, skip_warmup)

    if dataset_type == DSET_TYPE_ICT:
        args = get_args()
        title_dataset = get_indexed_dataset_(args.titles_data_path, data_impl,
                                             skip_warmup)

    # Get start and end indices of train/valid/test into doc-idx
    # Note that doc-idx is designed to be num-docs + 1 so we can
    # easily iterate over it.
    total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1
    splits = get_train_valid_test_split_(splits_string, total_num_of_documents)

    # Print stats about the splits.
    logging.info(' > dataset split:')

    def print_split_stats(name, index):
        logging.info('    {}:'.format(name))
        logging.info('     document indices in [{}, {}) total of {} '
                     'documents'.format(splits[index], splits[index + 1],
                                        splits[index + 1] - splits[index]))
        start_index = indexed_dataset.doc_idx[splits[index]]
        end_index = indexed_dataset.doc_idx[splits[index + 1]]
        logging.info('     sentence indices in [{}, {}) total of {} '
                     'sentences'.format(start_index, end_index,
                                        end_index - start_index))

    print_split_stats('train', 0)
    print_split_stats('validation', 1)
    print_split_stats('test', 2)

    def build_dataset(index, name):
        from nemo.collections.nlp.data.language_modeling.megatron.bert_dataset import BertDataset
        from nemo.collections.nlp.data.language_modeling.megatron.t5_dataset import T5Dataset

        dataset = None
        if splits[index + 1] > splits[index]:
            # Get the pointer to the original doc-idx so we can set it later.
            doc_idx_ptr = indexed_dataset.get_doc_idx()
            # Slice the doc-idx
            start_index = splits[index]
            # Add +1 so we can index into the dataset to get the upper bound.
            end_index = splits[index + 1] + 1
            # New doc_idx view.
            indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index])
            # Build the dataset accordingly.
            kwargs = dict(
                name=name,
                data_prefix=data_prefix,
                num_epochs=None,
                max_num_samples=train_valid_test_num_samples[index],
                max_seq_length=max_seq_length,
                seed=seed,
            )

            if dataset_type == DSET_TYPE_T5:
                dataset = T5Dataset(
                    indexed_dataset=indexed_dataset,
                    masked_lm_prob=masked_lm_prob,
                    max_seq_length_dec=max_seq_length_dec,
                    short_seq_prob=short_seq_prob,
                    **kwargs,
                )
            elif dataset_type == DSET_TYPE_BERT:
                dataset = BertDataset(
                    indexed_dataset=indexed_dataset,
                    masked_lm_prob=masked_lm_prob,
                    short_seq_prob=short_seq_prob,
                    binary_head=binary_head,
                    **kwargs,
                )
            else:
                raise NotImplementedError(
                    "Dataset type not fully implemented.")

            # Set the original pointer so dataset remains the main dataset.
            indexed_dataset.set_doc_idx(doc_idx_ptr)
            # Checks.
            assert indexed_dataset.doc_idx[0] == 0
            assert indexed_dataset.doc_idx.shape[0] == (
                total_num_of_documents + 1)
        return dataset

    train_dataset = build_dataset(0, 'train')
    valid_dataset = build_dataset(1, 'valid')
    test_dataset = build_dataset(2, 'test')

    return (train_dataset, valid_dataset, test_dataset)
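
A minimal sketch of how this builder is typically invoked, assuming the module-level dataset-type constants and a data prefix produced by the Megatron preprocessing scripts; every argument value below is illustrative only.

# Hypothetical call for BERT-style pretraining data (paths and sizes are placeholders).
train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
    data_prefix='my_corpus_text_sentence',
    data_impl='mmap',
    splits_string='949,50,1',
    train_valid_test_num_samples=[1000000, 10000, 1000],
    max_seq_length=512,
    masked_lm_prob=0.15,
    short_seq_prob=0.1,
    seed=1234,
    skip_warmup=True,
    binary_head=True,
    max_seq_length_dec=None,
    dataset_type='standard_bert',
)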
Example #2
def _apply_filters(manifest,
                   manifest_out,
                   max_cer,
                   max_wer,
                   max_edge_cer,
                   max_len_diff_ratio,
                   max_dur=-1,
                   original_duration=0):
    """ Filters out samples that do not satisfy specified threshold values and saves remaining samples to manifest_out"""
    remaining_duration = 0
    segmented_duration = 0
    with open(manifest, "r") as f, open(manifest_out, "w") as f_out:
        for line in f:
            item = json.loads(line)
            cer = item["CER"]
            wer = item["WER"]
            len_diff_ratio = item["len_diff_ratio"]
            duration = item["duration"]
            segmented_duration += duration
            if (cer <= max_cer and wer <= max_wer
                    and len_diff_ratio <= max_len_diff_ratio
                    and item["end_CER"] <= max_edge_cer
                    and item["start_CER"] <= max_edge_cer and
                (max_dur == -1 or (max_dur > -1 and duration < max_dur))):
                remaining_duration += duration
                f_out.write(json.dumps(item) + "\n")

    logging.info("-" * 50)
    logging.info("Threshold values:")
    logging.info(f"max WER, %: {max_wer}")
    logging.info(f"max CER, %: {max_cer}")
    logging.info(f"max edge CER, %: {max_edge_cer}")
    logging.info(f"max Word len diff: {max_len_diff_ratio}")
    logging.info(f"max Duration, s: {max_dur}")
    logging.info("-" * 50)

    remaining_duration = remaining_duration / 60
    original_duration = original_duration / 60
    segmented_duration = segmented_duration / 60
    logging.info(f"Original audio dur: {round(original_duration, 2)} min")
    logging.info(
        f"Segmented duration: {round(segmented_duration, 2)} min ({round(100 * segmented_duration / original_duration, 2)}% of original audio)"
    )
    logging.info(
        f"Retained {round(remaining_duration, 2)} min ({round(100*remaining_duration/original_duration, 2)}% of original or {round(100 * remaining_duration / segmented_duration, 2)}% of segmented audio)."
    )
    logging.info(f"Retained data saved to {manifest_out}")
Example #3
    def __init__(self,
                 model_name,
                 config,
                 vocab_file,
                 model_parallel_size=None,
                 model_parallel_rank=None):

        super().__init__()

        self._model_parallel_size = model_parallel_size
        self._model_parallel_rank = model_parallel_rank
        self._restore_path = None
        self._app_state = None
        self._model_name = model_name

        if not os.path.exists(vocab_file):
            raise ValueError(f'Vocab file not found at {vocab_file}')

        config["vocab_file"] = vocab_file
        config['tokenizer_type'] = 'BertWordPieceLowerCase'
        config['lazy_mpu_init'] = True
        config['onnx_safe'] = True

        # if 'model_parallel_size' in config:
        if self._model_parallel_size is not None:
            app_state = AppState()
            self._app_state = app_state

            # must be set for model parallel megatron-lm
            os.environ["WORLD_SIZE"] = str(app_state.world_size)
            os.environ["RANK"] = str(self._model_parallel_rank)

            # used to set model_parallel_size in megatron-lm argparser
            def _update_model_parallel_arg(parser):
                parser.set_defaults(
                    model_parallel_size=self._model_parallel_size)
                return parser

            extra_args_provider = _update_model_parallel_arg
        else:
            extra_args_provider = None

        # Initialize part of Megatron global state that is needed for its constructor.
        # We set the 'lazy_mpu_init' flag so that Megatron does only the initialization that does not depend
        # on DDP being initialized yet (and we don't want Megatron to initialize DDP itself either)
        # and to return a hook for us to call after PTL has torch.distributed initialized.
        # We call this hook during .forward
        # TODO: can we call this hook using the PTL hook .setup()
        self._lazy_init_fn = initialize_megatron(
            extra_args_provider=extra_args_provider,
            args_defaults=config,
            ignore_unknown_args=True)

        # read Megatron arguments back
        args = get_args()
        logging.info(f'Megatron-lm argparse args: {args}')

        self.language_model, self._language_model_key = get_language_model(
            attention_mask_func=bert_attention_mask_func,
            num_tokentypes=2,
            add_pooler=False)

        self.config = OmegaConf.create(config)
        # key used for checkpoints
        self._hidden_size = self.language_model.hidden_size
Example #4
    def setup(self, stage: str) -> None:
        """ PTL hook that is called after DDP is initialized.
            Called at the beginning of fit and test.

        Args:
            stage (str): either 'fit' or 'test'
        """
        # TODO: implement model parallel for test stage
        if stage == 'fit':
            # set find_unused_parameters to True by default for NLP models
            if isinstance(self.trainer.accelerator.training_type_plugin,
                          DDPPlugin):
                self.trainer.accelerator.training_type_plugin._ddp_kwargs[
                    'find_unused_parameters'] = True

            # adds self.bert_model config to .nemo file
            if hasattr(self, 'bert_model') and self.bert_model is not None:
                self.register_bert_model()

            app_state = AppState()

            if app_state.model_parallel_size is not None:

                if app_state.model_parallel_group is None:
                    self.init_model_parallel(app_state.global_rank,
                                             app_state.world_size)

                # mpu grad clipping needs parameters to have the attribute model_parallel
                parameters = self._trainer.get_model().parameters()
                for p in parameters:
                    if not hasattr(p, 'model_parallel'):
                        p.model_parallel = False

                # Update PTL trainer to use our configure_ddp
                self._trainer.accelerator_backend.ddp_plugin.configure_ddp = self.configure_ddp
                # Update PTL trainer to use our _clip_gradients
                self._trainer.accelerator_backend._clip_gradients = self._clip_gradients
                self._trainer.checkpoint_connector = NLPCheckpointConnector(
                    self._trainer)

                # Configure checkpointing for model parallel
                if app_state.create_checkpoint_callback:
                    # global rank 0 is configured by exp_manager
                    if not is_global_rank_zero() and app_state.data_parallel_rank == 0:
                        configure_checkpointing(
                            self._trainer,
                            app_state.log_dir,
                            app_state.checkpoint_name,
                            app_state.checkpoint_callback_params,
                        )

                if isinstance(self.bert_model, MegatronBertEncoder):
                    self.bert_model.complete_lazy_init()

                    # model parallel checkpoints need to be restored after torch.distributed is initialized
                    if self._trainer.resume_from_checkpoint is not None:
                        # update path based on model parallel rank
                        filepath = self._trainer.resume_from_checkpoint
                        dirname = os.path.dirname(os.path.dirname(filepath))
                        basename = os.path.basename(filepath)
                        filepath = f'{dirname}/mp_rank_{app_state.model_parallel_rank:02d}/{basename}'
                        self._trainer.resume_from_checkpoint = filepath
                        logging.info(
                            f'Resuming training from checkpoint {self._trainer.resume_from_checkpoint}'
                        )
                        # need to set checkpoint version for megatron-lm
                        checkpoint_version = torch.load(
                            self._trainer.resume_from_checkpoint).get(
                                'checkpoint_version', None)
                        if checkpoint_version is not None:
                            set_checkpoint_version(checkpoint_version)
                        else:
                            logging.warning(
                                'Megatron-lm checkpoint version not found. Setting checkpoint_version to 0.'
                            )
                            set_checkpoint_version(0)
                    else:
                        logging.info(
                            f"Restoring from pretrained model parallel checkpoint: {self.bert_model._restore_path}"
                        )
                        self.bert_model.restore_weights(
                            self.bert_model._restore_path)

                    logging.info(
                        "Replacing sampler with model parallel sampler")
                    mp_sampler = torch.utils.data.distributed.DistributedSampler(
                        self._train_dl.dataset,
                        num_replicas=app_state.data_parallel_size,
                        rank=app_state.data_parallel_rank,
                    )
                    mp_dl = self._trainer.replace_sampler(
                        self._train_dl, mp_sampler)
                    self._train_dl = mp_dl
                else:
                    raise NotImplementedError(
                        f'The BERT encoder: {self.bert_model} does not support model parallelism yet.'
                    )
            else:
                # Megatron without model parallelism
                self.complete_megatron_init()
        else:
            # testing stage
            self.complete_megatron_init()
Example #5
def main():
    args = parse_args()
    neural_factory = nemo.core.NeuralModuleFactory(
        optimization_level=args.amp_opt_level,
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
    )

    use_cache = True
    if args.local_rank is not None:
        logging.info("Doing ALL GPU")
        use_cache = False

    # Create text to spectrogram model
    if args.spec_model == "tacotron2":
        yaml = YAML(typ="safe")
        with open(args.spec_model_config) as file:
            tacotron2_params = yaml.load(file)
        spec_neural_modules = create_NMs(args.spec_model_config,
                                         labels=tacotron2_params['labels'],
                                         decoder_infer=False)
        infer_tensors = create_infer_dags(
            neural_factory=neural_factory,
            neural_modules=spec_neural_modules,
            tacotron2_config_file=args.spec_model_config,
            tacotron2_params=tacotron2_params,
            infer_dataset=args.eval_dataset,
            infer_batch_size=args.batch_size,
            labels=tacotron2_params['labels'],
        )

    logging.info("Running Tacotron 2")
    # Run tacotron 2
    evaluated_tensors = neural_factory.infer(
        tensors=infer_tensors,
        checkpoint_dir=args.spec_model_load_dir,
        cache=use_cache,
        offload_to_cpu=True,
    )

    def get_D(alignment, true_len):
        D = np.array([0 for _ in range(np.shape(alignment)[1])])

        for i in range(np.shape(alignment)[0]):
            max_index = alignment[i].tolist().index(alignment[i].max())
            D[max_index] = D[max_index] + 1

        assert D.sum() == alignment.shape[0]
        assert D.sum() == true_len

        return D

    # Save durations.
    alignments_dir = pathlib.Path(args.durations_dir)
    alignments_dir.mkdir(exist_ok=True)
    k = -1
    for alignments, mel_lens, text_lens in zip(
            tqdm.tqdm(evaluated_tensors[2]),
            evaluated_tensors[3],
            evaluated_tensors[4],
    ):
        for alignment, mel_len, text_len in zip(alignments, mel_lens,
                                                text_lens):
            alignment = alignment.cpu().numpy()
            mel_len = mel_len.cpu().numpy().item()
            text_len = text_len.cpu().numpy().item()
            dur = get_D(alignment[:mel_len, :text_len], mel_len)
            k += 1
            np.save(alignments_dir / f'{k}.npy', dur, allow_pickle=False)
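
A small standalone check of the duration-extraction logic in get_D above, rewritten here as a hypothetical snippet so it can run outside main(): each text token receives one count per mel frame whose attention argmax lands on it.

import numpy as np

# Toy alignment: 4 mel frames attending over 3 text tokens.
alignment = np.array([
    [0.9, 0.1, 0.0],
    [0.8, 0.2, 0.0],
    [0.1, 0.7, 0.2],
    [0.0, 0.2, 0.8],
])
durations = np.zeros(alignment.shape[1], dtype=int)
for i in range(alignment.shape[0]):
    durations[alignment[i].argmax()] += 1
print(durations)  # -> [2 1 1]: 2 frames for token 0, 1 frame each for tokens 1 and 2
assert durations.sum() == alignment.shape[0]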
Example #6
def vad_tune_threshold_on_dev(params,
                              vad_pred,
                              groundtruth_RTTM,
                              result_file="res",
                              vad_pred_method="frame",
                              focus_metric="DetER"):
    """
    Tune thresholds on dev set. Return best thresholds which gives the lowest detection error rate (DetER) in thresholds.
    Args:
        params (dict): dictionary of parameter to be tuned on.
        vad_pred_method (str): suffix of prediction file. Use to locate file. Should be either in "frame", "mean" or "median".
        vad_pred_dir (str): directory of vad predictions or a file contains the paths of them
        groundtruth_RTTM_dir (str): directory of groundtruch rttm files or a file contains the paths of them.
        focus_metric (str): metrics we care most when tuning threshold. Should be either in "DetER", "FA", "MISS"
    Returns:
        best_threhsold (float): threshold that gives lowest DetER.
    """
    min_score = 100
    all_perf = {}
    try:
        check_if_param_valid(params)
    except Exception as e:
        raise ValueError(f"Please check if the parameters are valid: {e}")

    paired_filenames, groundtruth_RTTM_dict, vad_pred_dict = pred_rttm_map(
        vad_pred, groundtruth_RTTM, vad_pred_method)
    metric = detection.DetectionErrorRate()
    params_grid = get_parameter_grid(params)

    for param in params_grid:
        # perform binarization and filtering according to param and write the result to an RTTM-like table
        vad_table_dir = generate_vad_segment_table(vad_pred,
                                                   param,
                                                   shift_length_in_sec=0.01,
                                                   num_workers=20)

        # add reference and hypothesis to metrics
        for filename in paired_filenames:
            groundtruth_RTTM_file = groundtruth_RTTM_dict[filename]
            vad_table_filepath = os.path.join(vad_table_dir, filename + ".txt")
            reference, hypothesis = vad_construct_pyannote_object_per_file(
                vad_table_filepath, groundtruth_RTTM_file)
            metric(reference, hypothesis)  # accumulation

        # delete tmp table files
        shutil.rmtree(vad_table_dir, ignore_errors=True)

        report = metric.report(display=False)
        DetER = report.iloc[[-1]][('detection error rate', '%')].item()
        FA = report.iloc[[-1]][('false alarm', '%')].item()
        MISS = report.iloc[[-1]][('miss', '%')].item()

        assert (
            focus_metric == "DetER" or focus_metric == "FA"
            or focus_metric == "MISS"
        ), "Metric we care most should be only in 'DetER', 'FA'or 'MISS'!"
        all_perf[str(param)] = {
            'DetER (%)': DetER,
            'FA (%)': FA,
            'MISS (%)': MISS
        }
        logging.info(f"parameter {param}, {all_perf[str(param)] }")

        score = all_perf[str(param)][focus_metric + ' (%)']

        del report
        metric.reset()  # reset internal accumulator

        # save results for analysis
        with open(result_file + ".txt", "a", encoding='utf-8') as fp:
            fp.write(f"{param}, {all_perf[str(param)] }\n")

        if score < min_score:
            best_threshold = param
            optimal_scores = all_perf[str(param)]
            min_score = score

    return best_threshold, optimal_scores
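
A hedged tuning example; the grid keys below are illustrative and must match whatever check_if_param_valid and generate_vad_segment_table expect, and the paths are placeholders.

params = {
    "onset": [0.4, 0.5, 0.6],
    "offset": [0.3, 0.4, 0.5],
}
best_threshold, scores = vad_tune_threshold_on_dev(
    params,
    vad_pred="vad_frame/",           # directory of frame-level VAD predictions
    groundtruth_RTTM="rttm_files/",  # directory of reference RTTM files
    result_file="tuning_results",
    vad_pred_method="frame",
    focus_metric="DetER",
)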
Example #7
 def on_epoch_start(self):
     if self.global_rank is None or self.global_rank == 0:
         logging.info(f"Starting epoch {self.epoch_num}")
         self._last_epoch_start = time.time()
Example #8
def get_nmt_tokenizer(
    library: str = 'yttm',
    model_name: Optional[str] = None,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
    r2l: Optional[bool] = False,
    legacy: Optional[bool] = False,
):
    """
    Args:
        model_name: if using a pretrained model from NeMo, HuggingFace, or Megatron
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        special_tokens: dict of special tokens
        vocab_file: path to vocab file
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
        bpe_dropout: (only supported by YTTM tokenizer) BPE dropout tries to corrupt the standard segmentation procedure
            of BPE to help model better learn word compositionality and become robust to segmentation errors.
            It has empirically been shown to improve inference time BLEU scores.
        r2l: Whether to return subword IDs from right to left
    """
    if special_tokens is None:
        special_tokens_dict = {}
    else:
        special_tokens_dict = special_tokens

    if (library != 'byte-level') and (model_name is None
                                      and not os.path.isfile(tokenizer_model)):
        raise ValueError("No Tokenizer path provided or file does not exist!")

    if library == 'yttm':
        logging.info(
            f'Getting YouTokenToMeTokenizer with model: {tokenizer_model} with r2l: {r2l}.'
        )
        return YouTokenToMeTokenizer(model_path=tokenizer_model,
                                     bpe_dropout=bpe_dropout,
                                     r2l=r2l)
    elif library == 'huggingface':
        logging.info(
            f'Getting HuggingFace AutoTokenizer with pretrained_model_name: {model_name}'
        )
        return AutoTokenizer(
            pretrained_model_name=model_name,
            vocab_file=vocab_file,
            merges_file=merges_file,
            **special_tokens_dict,
            use_fast=use_fast,
        )
    elif library == 'sentencepiece':
        logging.info(f'Getting SentencePiece with model: {tokenizer_model}')
        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
            model_path=tokenizer_model, legacy=legacy)
    elif library == 'byte-level':
        logging.info(f'Using byte-level tokenization')
        return ByteLevelTokenizer(special_tokens_dict)
    elif library == 'regex':
        logging.info(f'Using regex tokenization')
        return RegExTokenizer().load_tokenizer(tokenizer_model)
    elif library == 'megatron':
        if model_name in megatron_tokenizer_model_map:
            model_name = megatron_tokenizer_model_map[model_name]
        logging.info(
            f'Getting Megatron tokenizer for pretrained model name: {model_name} and custom vocab file: {vocab_file}'
        )
        return get_tokenizer(tokenizer_name=model_name,
                             vocab_file=vocab_file,
                             merges_file=merges_file)
    else:
        raise NotImplementedError(
            'Currently we only support "yttm", "huggingface", "sentencepiece", "megatron", "byte-level", and "regex" tokenizer '
            'libraries.')
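
A hedged example of requesting tokenizers from the helper above; the model file and pretrained names are placeholders.

# YouTokenToMe BPE tokenizer from a trained model file.
yttm_tokenizer = get_nmt_tokenizer(
    library='yttm',
    tokenizer_model='tokenizer.BPE.32K.model',
    bpe_dropout=0.1,
)

# HuggingFace tokenizer selected by pretrained model name.
hf_tokenizer = get_nmt_tokenizer(
    library='huggingface',
    model_name='bert-base-cased',
    use_fast=True,
)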
Example #9
def get_tokenizer(
    tokenizer_name: str,
    tokenizer_model: Optional[str] = None,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    special_tokens: Optional[Dict[str, str]] = None,
    use_fast: Optional[bool] = False,
    bpe_dropout: Optional[float] = 0.0,
):
    """
    Args:
        tokenizer_name: sentencepiece or pretrained model from the hugging face list,
            for example: bert-base-cased
            To see the list of all HuggingFace pretrained models, use:
            nemo_nlp.modules.common.get_huggingface_pretrained_lm_models_list()
        tokenizer_model: tokenizer model file of sentencepiece or youtokentome
        special_tokens: dict of special tokens
        vocab_file: path to vocab file
        use_fast: (only for HuggingFace AutoTokenizer) set to True to use fast HuggingFace tokenizer
        bpe_dropout: (only supported by YTTM tokenizer) BPE dropout tries to corrupt the standard segmentation
            procedure of BPE to help the model better learn word compositionality and become robust to
            segmentation errors. It has empirically been shown to improve inference time BLEU scores.
    """
    if special_tokens is None:
        special_tokens_dict = {}
    else:
        special_tokens_dict = special_tokens

    if 'megatron' in tokenizer_name:
        if not HAVE_APEX:
            raise ImportError(
                "Apex was not found. Please see the NeMo README for installation instructions: https://github.com/NVIDIA/NeMo#megatron-gpt."
            )
        if vocab_file is None:
            vocab_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_vocab_file(
                tokenizer_name)
            merges_file = nemo.collections.nlp.modules.common.megatron.megatron_utils.get_megatron_merges_file(
                tokenizer_name)
        tokenizer_name = get_megatron_tokenizer(tokenizer_name)

    if tokenizer_name == 'sentencepiece':
        return nemo.collections.common.tokenizers.sentencepiece_tokenizer.SentencePieceTokenizer(
            model_path=tokenizer_model,
            special_tokens=special_tokens,
            legacy=True)
    elif tokenizer_name == 'yttm':
        return YouTokenToMeTokenizer(model_path=tokenizer_model,
                                     bpe_dropout=bpe_dropout)
    elif tokenizer_name == 'word':
        return WordTokenizer(vocab_file=vocab_file, **special_tokens_dict)
    elif tokenizer_name == 'char':
        return CharTokenizer(vocab_file=vocab_file, **special_tokens_dict)
    elif tokenizer_name == 'regex':
        return RegExTokenizer().load_tokenizer(tokenizer_model)

    logging.info(
        f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_name}, vocab_file: {vocab_file}, "
        f"special_tokens_dict: {special_tokens_dict}, and use_fast: {use_fast}"
    )
    return AutoTokenizer(
        pretrained_model_name=tokenizer_name,
        vocab_file=vocab_file,
        merges_file=merges_file,
        **special_tokens_dict,
        use_fast=use_fast,
    )
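
Likewise, a short hedged sketch of two common branches of get_tokenizer; the model file path is a placeholder.

# SentencePiece tokenizer from a trained model file.
sp_tokenizer = get_tokenizer(
    tokenizer_name='sentencepiece',
    tokenizer_model='spm_32k.model',
)

# Any other name falls through to the HuggingFace AutoTokenizer branch.
bert_tokenizer = get_tokenizer(tokenizer_name='bert-base-uncased')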
Example #10
            audio_name = os.path.basename(vad_file).split('.')[0]
            for line in lines:
                vad_out = line.strip().split()
                start, dur, activity = float(vad_out[0]), float(vad_out[1]) - float(vad_out[0]), vad_out[2]
                dur = float("{:.3f}".format(dur))
                start = float("{:.3f}".format(start))
                if activity.lower() == 'speech':
                    audio_path = os.path.join(audio_directory, audio_name + '.wav')
                    meta = {"audio_filepath": audio_path, "offset": start, "duration": dur, "label": 'UNK'}
                    json.dump(meta, outfile)
                    outfile.write("\n")

            f.close()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--vad_directory", help="path to vad directory", type=str, required=True)
    parser.add_argument(
        "--audio_directory",
        help="path to audio directory of audio files for which vad was computed",
        type=str,
        required=True,
    )
    parser.add_argument("--manifest_file", help="output manifest file name", type=str, required=True)

    args = parser.parse_args()
    vad_directory, audio_directory, manifest_file = (args.vad_directory, args.audio_directory, args.manifest_file)
    write_manifest(vad_directory, audio_directory, manifest_file)
    logging.info("wrote {} file from vad output files present in {}".format(manifest_file, vad_directory))
Example #11
def main(cfg: DictConfig) -> None:
    logging.info(f'\nConfig Params:\n{OmegaConf.to_yaml(cfg)}')
    try:
        plugin = NLPDDPPlugin()
    except (ImportError, ModuleNotFoundError):
        plugin = None

    trainer = pl.Trainer(plugins=plugin, **cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))

    if not cfg.model.train_ds.file_path:
        raise ValueError(
            "'train_ds.file_path' need to be set for the training!")

    model = TextClassificationModel(cfg.model, trainer=trainer)
    logging.info(
        "==========================================================================================="
    )
    logging.info('Starting training...')
    trainer.fit(model)
    logging.info('Training finished!')
    logging.info(
        "==========================================================================================="
    )

    if cfg.model.nemo_path:
        # '.nemo' file contains the last checkpoint and the params to initialize the model
        model.save_to(cfg.model.nemo_path)
        logging.info(
            f'Model is saved into `.nemo` file: {cfg.model.nemo_path}')

    # We evaluate the trained model on the test set if test_ds is set in the config file
    if cfg.model.test_ds.file_path:
        logging.info(
            "==========================================================================================="
        )
        logging.info(
            "Starting the testing of the trained model on test set...")
        trainer.test(model=model, ckpt_path=None, verbose=False)
        logging.info("Testing finished!")
        logging.info(
            "==========================================================================================="
        )

    # perform inference on a list of queries.
    if "infer_samples" in cfg.model and cfg.model.infer_samples:
        logging.info(
            "==========================================================================================="
        )
        logging.info("Starting the inference on some sample queries...")

        # max_seq_length=512 is the maximum length BERT supports.
        results = model.classifytext(queries=cfg.model.infer_samples,
                                     batch_size=16,
                                     max_seq_length=512)
        logging.info(
            'The prediction results of some sample queries with the trained model:'
        )
        for query, result in zip(cfg.model.infer_samples, results):
            logging.info(f'Query : {query}')
            logging.info(f'Predicted label: {result}')

        logging.info("Inference finished!")
        logging.info(
            "==========================================================================================="
        )
Example #12
def get_megatron_lm_model(
    pretrained_model_name: str,
    config_dict: Optional[dict] = None,
    config_file: Optional[str] = None,
    checkpoint_file: Optional[str] = None,
    vocab_file: Optional[str] = None,
) -> Tuple[MegatronBertEncoder, str]:
    """
    Returns MegatronBertEncoder and a default or user specified path to the checkpoint file

    Args:
        pretrained_model_name: model name from MEGATRON_CONFIG_MAP
            for example: megatron-bert-cased
        config_dict: model configuration parameters
        config_file: path to model configuration file. Takes precedence over config_dict if both supplied.
        checkpoint_file: path to checkpoint file or directory if using model parallel.
        vocab_file: path to vocab file

    Returns:
        model: MegatronBertEncoder
        checkpoint_file: path to checkpoint file or directory
    """
    config = None
    # get default config and checkpoint
    if config_file:
        with open(config_file) as f:
            config = json.load(f)
            # replace dashes with underscores in config keys
            fixed_config = {}
            for key in config.keys():
                fixed_key = key.replace("-", "_")
                if fixed_key == 'max_seq_length':
                    fixed_key = 'max_position_embeddings'
                fixed_config[fixed_key] = config[key]
            # 'vocab_size" no longer used.
            if 'vocab_size' in fixed_config:
                fixed_config.pop('vocab_size')
            config = fixed_config
    elif config_dict:
        config = config_dict
    elif pretrained_model_name in get_megatron_lm_models_list():
        config = get_megatron_config(pretrained_model_name)
    else:
        raise ValueError(f"{pretrained_model_name} is not supported")

    if config is None:
        raise ValueError(
            f"config_file or config_dict is required for {pretrained_model_name}"
        )

    if not checkpoint_file:
        checkpoint_file = get_megatron_checkpoint(pretrained_model_name)

    if not vocab_file:
        vocab_file = get_megatron_vocab_file(pretrained_model_name)

    app_state = AppState()
    if app_state.model_parallel_size is not None and app_state.model_parallel_rank is not None:
        # model parallel already known from .nemo restore
        model_parallel_size = app_state.model_parallel_size
        model_parallel_rank = app_state.model_parallel_rank
    elif os.path.isdir(checkpoint_file):
        # starting training from megatron-lm checkpoint
        mp_ranks = glob.glob(os.path.join(checkpoint_file, 'mp_rank*'))
        model_parallel_size = len(mp_ranks)
        app_state.model_parallel_size = model_parallel_size
        logging.info((f'restore_path: {checkpoint_file} is a directory. '
                      f'Assuming megatron model parallelism with '
                      f'model_parallel_size: {model_parallel_size}'))
        # try to get local rank from the environment
        local_rank = None
        try:
            local_rank = int(os.environ['LOCAL_RANK'])
        except (KeyError, ValueError):
            logging.info('Environment variable LOCAL_RANK not set')
        if local_rank is not None:
            app_state.local_rank = local_rank
        else:
            # if local_rank is None then we are on the main process
            local_rank = 0
        model_parallel_rank = compute_model_parallel_rank(
            local_rank, model_parallel_size)
        app_state.model_parallel_rank = model_parallel_rank
    else:
        model_parallel_size = None
        model_parallel_rank = None

    model = MegatronBertEncoder(
        model_name=pretrained_model_name,
        config=config,
        vocab_file=vocab_file,
        model_parallel_size=model_parallel_size,
        model_parallel_rank=model_parallel_rank,
    )

    return model, checkpoint_file
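
A hedged usage sketch, assuming the pretrained name is present in MEGATRON_CONFIG_MAP so that the default config, checkpoint and vocab can be resolved automatically.

# Resolve config, checkpoint and vocab for a named Megatron BERT model;
# pass config_file / checkpoint_file / vocab_file explicitly to override the defaults.
model, checkpoint_file = get_megatron_lm_model(
    pretrained_model_name='megatron-bert-345m-uncased',
)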
Example #13
import os

from torch.hub import _get_torch_home

from nemo.collections.nlp.modules.common.megatron.megatron_bert import MegatronBertEncoder
from nemo.utils import AppState, logging

__all__ = [
    "get_megatron_lm_model",
    "get_megatron_lm_models_list",
    "get_megatron_checkpoint",
    "is_lower_cased_megatron",
    "get_megatron_tokenizer",
]

torch_home = _get_torch_home()

if not isinstance(torch_home, str):
    logging.info("Torch home not found, caching megatron in cwd")
    torch_home = os.getcwd()

MEGATRON_CACHE = os.path.join(torch_home, "megatron")

CONFIGS = {
    "345m": {
        "hidden_size": 1024,
        "num_attention_heads": 16,
        "num_layers": 24,
        "max_position_embeddings": 512
    }
}

MEGATRON_CONFIG_MAP = {
    "megatron-bert-345m-uncased": {
Example #14
def get_samples_mapping(indexed_dataset, data_prefix, num_epochs,
                        max_num_samples, max_seq_length, short_seq_prob, seed,
                        name, binary_head):
    """Get a list that maps a sample index to a starting sentence index, end sentence index, and length"""

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0 and not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        assert indexed_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        logging.info(
            ' > building samples index mapping for {} ...'.format(name))
        # First compile and then import.

        try:
            if is_global_rank_zero():
                compile_helper()
            from nemo.collections.nlp.data.language_modeling.megatron import helpers
        except Exception:
            raise Exception('Could not compile helpers.')
        samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
            indexed_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length,
            short_seq_prob,
            seed,
            verbose,
            2 if binary_head else 1,
        )
        logging.info(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        logging.info(
            ' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        logging.info(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))
    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(
        counts, group=parallel_state.get_data_parallel_group())
    torch.distributed.all_reduce(
        counts, group=parallel_state.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size() // torch.distributed.get_world_size(
            group=parallel_state.get_tensor_model_parallel_group()))

    # Load indexed dataset.
    logging.info(
        ' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename,
                              allow_pickle=True,
                              mmap_mode='r')
    logging.info(
        '    loaded indexed file in {:3.3f} seconds'.format(time.time() -
                                                            start_time))
    logging.info('    total number of samples: {}'.format(
        samples_mapping.shape[0]))

    return samples_mapping
Example #15
def main():
    parser = ArgumentParser()
    parser.add_argument("--vad_model",
                        type=str,
                        default="MatchboxNet-VAD-3x2",
                        required=False,
                        help="Pass: '******'")
    parser.add_argument(
        "--dataset",
        type=str,
        required=True,
        help=
        "Path of json file of evaluation data. Audio files should have unique names.",
    )
    parser.add_argument("--out_dir",
                        type=str,
                        default="vad_frame",
                        help="Dir of your vad outputs")
    parser.add_argument("--time_length", type=float, default=0.63)
    parser.add_argument("--shift_length", type=float, default=0.01)
    args = parser.parse_args()

    torch.set_grad_enabled(False)

    if args.vad_model.endswith('.nemo'):
        logging.info(f"Using local VAD model from {args.vad_model}")
        vad_model = EncDecClassificationModel.restore_from(
            restore_path=args.vad_model)
    else:
        logging.info(f"Using NGC cloud VAD model {args.vad_model}")
        vad_model = EncDecClassificationModel.from_pretrained(
            model_name=args.vad_model)

    if not os.path.exists(args.out_dir):
        os.mkdir(args.out_dir)

    # setup_test_data
    vad_model.setup_test_data(
        test_data_config={
            'vad_stream': True,
            'sample_rate': 16000,
            'manifest_filepath': args.dataset,
            'labels': [
                'infer',
            ],
            'num_workers': 20,
            'shuffle': False,
            'time_length': args.time_length,
            'shift_length': args.shift_length,
            'trim_silence': False,
        })

    vad_model = vad_model.to(device)
    vad_model.eval()

    data = []
    for line in open(args.dataset, 'r'):
        file = json.loads(line)['audio_filepath'].split("/")[-1]
        data.append(file.split(".wav")[0])
    print(f"Inference on {len(data)} audio files/json lines!")

    time_unit = int(args.time_length / args.shift_length)
    trunc = int(time_unit / 2)
    trunc_l = time_unit - trunc
    all_len = 0

    for i, test_batch in enumerate(vad_model.test_dataloader()):
        if i == 0:
            status = 'start' if data[i] == data[i + 1] else 'single'
        elif i == len(data) - 1:
            status = 'end' if data[i] == data[i - 1] else 'single'
        else:
            if data[i] != data[i - 1] and data[i] == data[i + 1]:
                status = 'start'
            elif data[i] == data[i - 1] and data[i] == data[i + 1]:
                status = 'next'
            elif data[i] == data[i - 1] and data[i] != data[i + 1]:
                status = 'end'
            else:
                status = 'single'
        print(data[i], status)

        test_batch = [x.to(device) for x in test_batch]
        with autocast():
            log_probs = vad_model(input_signal=test_batch[0],
                                  input_signal_length=test_batch[1])
            probs = torch.softmax(log_probs, dim=-1)
            pred = probs[:, 1]

            if status == 'start':
                to_save = pred[:-trunc]
            elif status == 'next':
                to_save = pred[trunc:-trunc_l]
            elif status == 'end':
                to_save = pred[trunc_l:]
            else:
                to_save = pred
            all_len += len(to_save)

            outpath = os.path.join(args.out_dir, data[i] + ".frame")
            with open(outpath, "a") as fout:
                for f in range(len(to_save)):
                    fout.write('{0:0.4f}\n'.format(to_save[f]))

        del test_batch
        if status == 'end' or status == 'single':
            print(f"Overall length of prediction of {data[i]} is {all_len}!")
            all_len = 0
Example #16
def process_jarvis_datasets(infold,
                            outfold,
                            modes=['train', 'test', 'dev'],
                            do_lower_case=False,
                            ignore_prev_intent=False):
    """ process and convert Jarvis datasets into NeMo's BIO format
    """
    dataset_name = "jarvis"
    if if_exist(outfold, ['dict.intents.csv', 'dict.slots.csv']):
        logging.info(DATABASE_EXISTS_TMP.format(dataset_name, outfold))
        return outfold

    logging.info(f'Processing {dataset_name} dataset and storing at {outfold}')

    os.makedirs(outfold, exist_ok=True)

    outfiles = {}
    intents_list = {}
    slots_list = {}
    slots_list_all = {}

    outfiles['dict_intents'] = open(f'{outfold}/dict.intents.csv',
                                    'w',
                                    encoding='utf-8')
    outfiles['dict_slots'] = open(f'{outfold}/dict.slots.csv',
                                  'w',
                                  encoding='utf-8')

    outfiles['dict_slots'].write('O\n')
    slots_list["O"] = 0
    slots_list_all["O"] = 0

    for mode in modes:
        if if_exist(outfold, [f'{mode}.tsv']):
            logging.info(
                MODE_EXISTS_TMP.format(mode, dataset_name, outfold, mode))
            continue

        if not if_exist(infold, [f'{mode}.tsv']):
            logging.info(f'{mode} mode of {dataset_name}'
                         f' is skipped as it was not found.')
            continue

        outfiles[mode] = open(os.path.join(outfold, mode + '.tsv'),
                              'w',
                              encoding='utf-8')
        outfiles[mode].write('sentence\tlabel\n')
        outfiles[mode + '_slots'] = open(f'{outfold}/{mode}_slots.tsv',
                                         'w',
                                         encoding='utf-8')

        queries = open(f'{infold}/{mode}.tsv', 'r',
                       encoding='utf-8').readlines()

        for i, query in enumerate(queries):
            line_splits = query.strip().split("\t")
            if len(line_splits) == 3:
                intent_str, slot_tags_str, sentence = line_splits
            else:
                intent_str, sentence = line_splits
                slot_tags_str = ""

            if intent_str not in intents_list:
                intents_list[intent_str] = len(intents_list)
                outfiles['dict_intents'].write(f'{intent_str}\n')

            if ignore_prev_intent:
                start_token = 2
            else:
                start_token = 1

            if do_lower_case:
                sentence = sentence.lower()
            sentence_cld = " ".join(sentence.strip().split()[start_token:-1])
            outfiles[mode].write(f'{sentence_cld}\t'
                                 f'{str(intents_list[intent_str])}\n')

            slot_tags_list = []
            if slot_tags_str.strip():
                slot_tags = slot_tags_str.strip().split(",")
                for st in slot_tags:
                    if not st.strip():
                        continue
                    [start_i, end_i, slot_name] = st.strip().split(":")
                    slot_tags_list.append(
                        [int(start_i), int(end_i), slot_name])
                    if slot_name not in slots_list:
                        slots_list[slot_name] = len(slots_list)
                        slots_list_all[f'B-{slot_name}'] = len(slots_list_all)
                        slots_list_all[f'I-{slot_name}'] = len(slots_list_all)
                        outfiles['dict_slots'].write(f'B-{slot_name}\n')
                        outfiles['dict_slots'].write(f'I-{slot_name}\n')

            slot_tags_list.sort(key=lambda x: x[0])
            slots = []
            processed_index = 0
            for tag_start, tag_end, tag_str in slot_tags_list:
                if tag_start > processed_index:
                    words_list = sentence[processed_index:tag_start].strip(
                    ).split()
                    slots.extend([str(slots_list_all['O'])] * len(words_list))
                words_list = sentence[tag_start:tag_end].strip().split()
                slots.append(str(slots_list_all[f'B-{tag_str}']))
                slots.extend([str(slots_list_all[f'I-{tag_str}'])] *
                             (len(words_list) - 1))
                processed_index = tag_end

            if processed_index < len(sentence):
                words_list = sentence[processed_index:].strip().split()
                slots.extend([str(slots_list_all['O'])] * len(words_list))

            slots = slots[1:-1]
            slot = ' '.join(slots)
            outfiles[mode + '_slots'].write(slot + '\n')

        outfiles[mode + '_slots'].close()
        outfiles[mode].close()

    outfiles['dict_slots'].close()
    outfiles['dict_intents'].close()

    return outfold
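
For illustration, a hedged call that converts a raw Jarvis export into the BIO-formatted output directory; the paths are placeholders.

out_dir = process_jarvis_datasets(
    infold='jarvis_raw_data',
    outfold='jarvis_processed',
    modes=['train', 'test', 'dev'],
    do_lower_case=True,
    ignore_prev_intent=False,
)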
Example #17
    # Loss.
    nll_loss = NLLLoss()

    # Create a training graph.
    with NeuralGraph(operation_mode=OperationMode.training) as training_graph:
        _, img, tgt, _ = dl()
        feat_map = cnn(inputs=img)
        res_img = reshaper(inputs=feat_map)
        logits = ffn(inputs=res_img)
        pred = nl(inputs=logits)
        loss = nll_loss(predictions=pred, targets=tgt)
        # Set output - that output will be used for training.
        training_graph.outputs["loss"] = loss

    # Display the graph summary.
    logging.info(training_graph.summary())

    # SimpleLossLoggerCallback will print loss values to console.
    callback = SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: logging.info(f'Training Loss: {str(x[0].item())}'
                                          ))

    # Invoke the "train" action.
    nf.train(
        training_graph=training_graph,
        callbacks=[callback],
        optimization_params={
            "num_epochs": 10,
            "lr": 0.001
        },
    )
Example #18
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--spkr_model",
        type=str,
        default="ecapa_tdnn",
        required=True,
        help="Pass your trained .nemo model",
    )
    parser.add_argument("--train_manifest",
                        type=str,
                        required=True,
                        help="path to train manifest file to match labels")
    parser.add_argument("--test_manifest",
                        type=str,
                        required=True,
                        help="path to test manifest file to perform inference")
    parser.add_argument("--batch_size", type=int, default=32)
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.spkr_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.spkr_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(
            restore_path=args.spkr_model)
    else:
        logging.error(f"Please pass a trained .nemo file")
        sys.exit()

    labels = []
    with open(args.train_manifest, 'rb') as f:
        lines = f.readlines()
        for line in lines:
            line = line.strip()
            item = json.loads(line)
            labels.append(item['label'])

    labels_map = sorted(set(labels))
    label2id, id2label = {}, {}
    for label_id, label in enumerate(labels_map):
        label2id[label] = label_id
        id2label[label_id] = label

    speaker_model.setup_test_data(
        test_data_layer_params={
            'sample_rate': 16000,
            'manifest_filepath': args.test_manifest,
            'labels': labels_map,
            'batch_size': args.batch_size,
            'trim_silence': False,
            'shuffle': False,
        })
    if can_gpu:
        speaker_model = speaker_model.cuda()
    speaker_model.eval()

    speaker_model.test_dataloader()
    all_labels = []
    all_logits = []
    for test_batch in tqdm(speaker_model.test_dataloader()):
        if can_gpu:
            test_batch = [x.cuda() for x in test_batch]
        with autocast():
            audio_signal, audio_signal_len, labels, _ = test_batch
            logits, _ = speaker_model.forward(
                input_signal=audio_signal,
                input_signal_length=audio_signal_len)

            all_logits.extend(logits.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    all_logits, true_labels = np.asarray(all_logits), np.asarray(all_labels)
    infer_labels = all_logits.argmax(axis=1)

    out_manifest = os.path.basename(
        args.test_manifest).split('.')[0] + '_infer.json'
    out_manifest = os.path.join(os.path.dirname(args.test_manifest),
                                out_manifest)
    with open(args.test_manifest, 'rb') as f1, open(out_manifest, 'w') as f2:
        lines = f1.readlines()
        for idx, line in enumerate(lines):
            line = line.strip()
            item = json.loads(line)
            item['infer'] = id2label[infer_labels[idx]]
            json.dump(item, f2)
            f2.write('\n')

    logging.info(
        "Inference labels have been written to {} manifest file".format(
            out_manifest))
Example #19
 def on_action_start(self):
     if self.global_rank is None or self.global_rank == 0:
         logging.info("Starting .....")
         self._start_time = time.time()
Example #20
    def __init__(
        self,
        *,
        audio_tar_filepaths: Union[str, List[str]],
        manifest_filepath: Union[str, List[str]],
        labels: List[str],
        featurizer,
        shuffle_n: int = 0,
        min_duration: Optional[float] = 0.1,
        max_duration: Optional[float] = None,
        trim: bool = False,
        shard_strategy: str = "scatter",
        global_rank: int = 0,
        world_size: int = 0,
        is_regression_task: bool = False,
    ):
        self.collection = collections.ASRSpeechLabel(
            manifests_files=manifest_filepath,
            min_duration=min_duration,
            max_duration=max_duration,
            index_by_file_id=True,  # Must set this so the manifest lines can be indexed by file ID
        )

        self.file_occurence = count_occurence(self.collection.mapping)

        self.featurizer = featurizer
        self.trim = trim

        self.labels = labels if labels else self.collection.uniq_labels
        self.num_classes = len(self.labels)

        self.label2id, self.id2label = {}, {}
        for label_id, label in enumerate(self.labels):
            self.label2id[label] = label_id
            self.id2label[label_id] = label

        for idx in range(len(self.labels[:5])):
            logging.debug(" label id {} and its mapped label {}".format(
                idx, self.id2label[idx]))

        valid_shard_strategies = ['scatter', 'replicate']
        if shard_strategy not in valid_shard_strategies:
            raise ValueError(
                f"`shard_strategy` must be one of {valid_shard_strategies}")

        if isinstance(audio_tar_filepaths, str):
            # Replace opening brace-like tokens ('(', '[', '<', '_OP_') with '{'
            brace_keys_open = ['(', '[', '<', '_OP_']
            for bkey in brace_keys_open:
                if bkey in audio_tar_filepaths:
                    audio_tar_filepaths = audio_tar_filepaths.replace(
                        bkey, "{")

            # Replace closing brace-like tokens (')', ']', '>', '_CL_') with '}'
            brace_keys_close = [')', ']', '>', '_CL_']
            for bkey in brace_keys_close:
                if bkey in audio_tar_filepaths:
                    audio_tar_filepaths = audio_tar_filepaths.replace(
                        bkey, "}")

        # Check for distributed and partition shards accordingly
        if world_size > 1:
            if isinstance(audio_tar_filepaths, str):
                # Brace expand
                audio_tar_filepaths = list(
                    braceexpand.braceexpand(audio_tar_filepaths))

            if shard_strategy == 'scatter':
                logging.info(
                    "All tarred dataset shards will be scattered evenly across all nodes."
                )

                if len(audio_tar_filepaths) % world_size != 0:
                    logging.warning(
                        f"Number of shards in tarred dataset ({len(audio_tar_filepaths)}) is not divisible "
                        f"by number of distributed workers ({world_size}).")

                begin_idx = (len(audio_tar_filepaths) //
                             world_size) * global_rank
                end_idx = begin_idx + (len(audio_tar_filepaths) // world_size)
                audio_tar_filepaths = audio_tar_filepaths[begin_idx:end_idx]
                logging.info(
                    "Partitioning tarred dataset: process (%d) taking shards [%d, %d)",
                    global_rank, begin_idx, end_idx)

            elif shard_strategy == 'replicate':
                logging.info(
                    "All tarred dataset shards will be replicated across all nodes."
                )

            else:
                raise ValueError(
                    f"Invalid shard strategy! Allowed values are: {valid_shard_strategies}"
                )

        # Put together WebDataset
        self._dataset = wd.WebDataset(urls=audio_tar_filepaths,
                                      nodesplitter=None)

        if shuffle_n > 0:
            self._dataset = self._dataset.shuffle(shuffle_n)
        else:
            logging.info(
                "WebDataset will not shuffle files within the tar files.")

        self._dataset = (
            self._dataset.rename(audio=VALID_FILE_FORMATS, key='__key__')
            .to_tuple('audio', 'key')
            .pipe(self._filter)
            .map(f=self._build_sample)
        )
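The 'scatter' branch above hands each worker a contiguous, equally sized slice of the expanded shard list; when the shard count is not divisible by the world size, the remainder is silently dropped. A minimal standalone sketch of that arithmetic (the helper name is illustrative, not part of NeMo):

def scatter_shards(shard_paths, global_rank, world_size):
    # Each worker takes len(shard_paths) // world_size consecutive shards.
    shards_per_worker = len(shard_paths) // world_size
    begin_idx = shards_per_worker * global_rank
    end_idx = begin_idx + shards_per_worker
    return shard_paths[begin_idx:end_idx]

# Example: 10 shards over 4 workers -> 2 shards per worker, the last 2 shards are unused.
print(scatter_shards([f"audio_{i}.tar" for i in range(10)], global_rank=1, world_size=4))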
Example No. 21
    def _register_vocab_from_tokenizer(
        self,
        vocab_file_config_path: str = 'tokenizer.vocab_file',
        vocab_dict_config_path: str = 'tokenizer_vocab_dict.json',
        cfg: DictConfig = None,
    ):
        """Creates vocab file from tokenizer if vocab file is None.

        Args:
            vocab_file_config_path: path to the vocab_file in the config
            vocab_dict_config_path: path to the vocab_dict in the config
            cfg: tokenizer config
        """
        if self.tokenizer is None:
            raise ValueError(
                'Instantiate self.tokenizer before registering vocab from it.')
        else:
            if isinstance(self.tokenizer, AutoTokenizer):
                # extract vocab from tokenizer
                vocab_dict = self.tokenizer.tokenizer.get_vocab()

                # for fast and slow tokenizer vocabularies compatibility
                vocab_dict = dict(
                    sorted(vocab_dict.items(), key=lambda item: item[1]))

                # get hash of vocab_dict to create a unique directory to write vocab_dict and vocab_file
                m = hashlib.md5()
                if 'tokenizer_name' in cfg:
                    if cfg.tokenizer_name is not None:
                        # different pretrained models with the same vocab will have different hash
                        m.update(cfg.tokenizer_name.encode())
                # get string representation of vocab_dict
                vocab_dict_str = json.dumps(vocab_dict,
                                            sort_keys=True).encode()
                m.update(vocab_dict_str)
                vocab_dict_hash = m.hexdigest()

                hash_path = os.path.join(NEMO_NLP_TMP, vocab_dict_hash)
                os.makedirs(hash_path, exist_ok=True)

                vocab_json_src = os.path.join(hash_path,
                                              vocab_dict_config_path)

                with open(vocab_json_src, 'w', encoding='utf-8') as f:
                    f.write(
                        json.dumps(vocab_dict, indent=2, sort_keys=True) +
                        '\n')
                self.register_artifact(config_path=vocab_dict_config_path,
                                       src=vocab_json_src)
                # create vocab file
                vocab_file_src = os.path.join(hash_path,
                                              vocab_file_config_path)
                with open(vocab_file_src, 'w', encoding='utf-8') as f:
                    for key in vocab_dict:
                        f.write(key + '\n')

                cfg.vocab_file = vocab_file_src
                self.register_artifact(config_path=vocab_file_config_path,
                                       src=vocab_file_src)
            else:
                logging.info(
                    f'Registering tokenizer vocab for {self.tokenizer} is not yet supported. Please override this method if needed.'
                )
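A self-contained sketch of the hashing scheme above: the cache directory name is the MD5 of an optional tokenizer name plus the JSON dump of the id-sorted vocabulary, so identical vocabularies resolve to the same directory. The helper name and sample vocabulary are illustrative.

import hashlib
import json

def vocab_cache_dirname(vocab_dict, tokenizer_name=None):
    m = hashlib.md5()
    if tokenizer_name is not None:
        # Different pretrained models with the same vocab still get different hashes.
        m.update(tokenizer_name.encode())
    vocab_dict = dict(sorted(vocab_dict.items(), key=lambda item: item[1]))
    m.update(json.dumps(vocab_dict, sort_keys=True).encode())
    return m.hexdigest()

print(vocab_cache_dirname({"[PAD]": 0, "hello": 1, "world": 2}, tokenizer_name="bert-base-uncased"))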
Example No. 22
def check_resume(
    trainer: 'pytorch_lightning.Trainer',
    log_dir: str,
    resume_past_end: bool = False,
    resume_ignore_no_checkpoint: bool = False,
):
    """Checks that resume=True was used correctly with the arguments pass to exp_manager. Sets
    trainer.resume_from_checkpoint as necessary.

    Returns:
        log_dir (Path): the log_dir
        exp_dir (str): the base exp_dir without name nor version
        name (str): The name of the experiment
        version (str): The version of the experiment

    Raises:
        NotFoundError: If resume is True, resume_ignore_no_checkpoint is False, and checkpoints could not be found.
        ValueError: If resume is True, and there were more than 1 checkpoint could found.
    """
    if not log_dir:
        raise ValueError(f"Resuming requires the log_dir {log_dir} to be passed to exp_manager")

    checkpoint_dir = Path(Path(log_dir) / "checkpoints")
    checkpoint = None
    end_checkpoints = list(checkpoint_dir.glob("*end.ckpt"))
    end_checkpoints.extend(list(checkpoint_dir.glob("*.nemo")))
    last_checkpoints = list(checkpoint_dir.glob("*last.ckpt"))
    if not checkpoint_dir.exists():
        if resume_ignore_no_checkpoint:
            logging.warning(
                f"There was no checkpoint folder at checkpoint_dir: {checkpoint_dir}. Training from scratch."
            )
            return
        else:
            raise NotFoundError(f"There was no checkpoint folder at checkpoint_dir :{checkpoint_dir}. Cannot resume.")
    elif len(end_checkpoints) > 0:
        if resume_past_end:
            if len(end_checkpoints) > 1:
                raise ValueError(f"Multiple multiple checkpoints {end_checkpoints} that matches *end.ckpt.")
            logging.info(f"Resuming from {end_checkpoints[0]}")
            checkpoint = end_checkpoints[0]
        else:
            raise ValueError(
                f"Found {end_checkpoints[0]} indicating that the last training run has already completed."
            )
    elif not len(last_checkpoints) > 0:
        if resume_ignore_no_checkpoint:
            logging.warning(f"There were no checkpoints found in {checkpoint_dir}. Training from scratch.")
            return
        else:
            raise NotFoundError(f"There were no checkpoints found in {checkpoint_dir}. Cannot resume.")
    elif len(last_checkpoints) > 1:
        raise ValueError(f"Multiple multiple checkpoints {last_checkpoints} that matches *last.ckpt.")
    else:
        logging.info(f"Resuming from {last_checkpoints[0]}")
        checkpoint = last_checkpoints[0]

    trainer.resume_from_checkpoint = str(checkpoint)

    if is_global_rank_zero():
        # Check to see if any files exist that need to be moved
        files_to_move = []
        for child in Path(log_dir).iterdir():
            if child.is_file():
                files_to_move.append(child)

        if len(files_to_move) > 0:
            # Move old files to a new folder
            other_run_dirs = Path(log_dir).glob("run_*")
            run_count = 0
            for fold in other_run_dirs:
                if fold.is_dir():
                    run_count += 1
            new_run_dir = Path(Path(log_dir) / f"run_{run_count}")
            new_run_dir.mkdir()
            for _file in files_to_move:
                move(str(_file), str(new_run_dir))
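The rank-zero block above rotates any loose files in log_dir into a fresh run_<N> folder before resuming, where N counts the run_* directories that already exist. A standalone sketch of that rotation, with an illustrative function name:

from pathlib import Path
from shutil import move

def rotate_loose_files(log_dir):
    log_dir = Path(log_dir)
    files_to_move = [child for child in log_dir.iterdir() if child.is_file()]
    if not files_to_move:
        return None
    # Number the new folder after the run_* directories that already exist.
    run_count = sum(1 for d in log_dir.glob("run_*") if d.is_dir())
    new_run_dir = log_dir / f"run_{run_count}"
    new_run_dir.mkdir()
    for _file in files_to_move:
        move(str(_file), str(new_run_dir))
    return new_run_dir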
Example No. 23
def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--pretrained_model",
        type=str,
        default="titanet_large",
        required=False,
        help="Pass your trained .nemo model",
    )
    parser.add_argument(
        "--finetune_config_file",
        type=str,
        required=True,
        help="path to the speakernet config YAML file used to load the train/validation datasets and the trainer parameters",
    )

    parser.add_argument(
        "--freeze_encoder",
        type=bool,
        required=False,
        default=True,
        help="True if the speakernet encoder parameters need to be frozen while finetuning",
    )

    args = parser.parse_args()

    if args.pretrained_model.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.pretrained_model}")
        speaker_model = EncDecSpeakerLabelModel.restore_from(
            restore_path=args.pretrained_model)
    elif args.pretrained_model.endswith('.ckpt'):
        logging.info(
            f"Using local speaker model from checkpoint {args.pretrained_model}"
        )
        speaker_model = EncDecSpeakerLabelModel.load_from_checkpoint(
            checkpoint_path=args.pretrained_model)
    else:
        logging.info("Using pretrained speaker recognition model from NGC")
        speaker_model = EncDecSpeakerLabelModel.from_pretrained(
            model_name=args.pretrained_model)

    finetune_config = OmegaConf.load(args.finetune_config_file)

    if 'test_ds' in finetune_config.model and finetune_config.model.test_ds is not None:
        finetune_config.model.test_ds = None
        logging.warning("Removing test ds")

    speaker_model.setup_finetune_model(finetune_config.model)
    finetune_trainer = pl.Trainer(**finetune_config.trainer)
    speaker_model.set_trainer(finetune_trainer)

    _ = exp_manager(finetune_trainer, finetune_config.get('exp_manager', None))
    speaker_model.setup_optimization(finetune_config.optim)

    if args.freeze_encoder:
        for param in speaker_model.encoder.parameters():
            param.requires_grad = False

    finetune_trainer.fit(speaker_model)
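Freezing the encoder above simply turns off gradient updates for its parameters while the rest of the model keeps training. A minimal sketch with a toy torch module (the module and layer names are illustrative):

import torch.nn as nn

class TinySpeakerModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Linear(16, 8)
        self.decoder = nn.Linear(8, 4)

model = TinySpeakerModel()
for param in model.encoder.parameters():
    param.requires_grad = False  # encoder weights stay fixed during finetuning

trainable = [name for name, p in model.named_parameters() if p.requires_grad]
print(trainable)  # only the decoder parameters remain trainable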
Example No. 24
def exp_manager(trainer: 'pytorch_lightning.Trainer', cfg: Optional[Union[DictConfig, Dict]] = None) -> Path:
    """
    exp_manager is a helper function used to manage folders for experiments. It follows the pytorch lightning paradigm
    of exp_dir/model_or_experiment_name/version. If the lightning trainer has a logger, exp_manager will get exp_dir,
    name, and version from the logger. Otherwise it will use the exp_dir and name arguments to create the logging
    directory. exp_manager also allows for explicit folder creation via explicit_log_dir.
    The version will be a datetime string or an integer. Note, exp_manager does not handle versioning on slurm
    multi-node runs. The datetime version can be disabled by setting use_datetime_version to False.
    It optionally creates TensorBoardLogger, WandBLogger, and ModelCheckpoint objects from pytorch lightning. It copies
    sys.argv and git information, if available, to the logging directory. It creates a log file for each process to log
    their output into.
    exp_manager additionally has a resume feature which can be used to continue training from the constructed log_dir.

    Args:
        trainer (pytorch_lightning.Trainer): The lightning trainer.
        cfg (DictConfig, dict): Can have the following keys:
            - explicit_log_dir (str, Path): Can be used to override exp_dir/name/version folder creation. Defaults to
                None, which will use exp_dir, name, and version to construct the logging directory.
            - exp_dir (str, Path): The base directory to create the logging directory. Defaults to None, which logs to
                ./nemo_experiments.
            - name (str): The name of the experiment. Defaults to None which turns into "default" via name = name or
                "default".
            - version (str): The version of the experiment. Defaults to None which uses either a datetime string or
                lightning's TensorboardLogger system of using version_{int}.
            - use_datetime_version (bool): Whether to use a datetime string for version. Defaults to True.
            - resume_if_exists (bool): Whether this experiment is resuming from a previous run. If True, it sets
                trainer.resume_from_checkpoint so that the trainer should auto-resume. exp_manager will move files
                under log_dir to log_dir/run_{int}. Defaults to False.
            - resume_past_end (bool): exp_manager errors out if resume_if_exists is True and a checkpoint matching
                *end.ckpt is found, indicating that a previous training run fully completed. Setting resume_past_end to
                True disables this behaviour and loads the *end.ckpt instead. Defaults to False.
            - resume_ignore_no_checkpoint (bool): exp_manager errors out if resume_if_exists is True and no checkpoint
                could be found. This behaviour can be disabled, in which case exp_manager will print a message and
                continue without restoring, by setting resume_ignore_no_checkpoint to True. Defaults to False.
            - create_tensorboard_logger (bool): Whether to create a tensorboard logger and attach it to the pytorch
                lightning trainer. Defaults to True.
            - summary_writer_kwargs (dict): A dictionary of kwargs that can be passed to lightning's TensorboardLogger
                class. Note that log_dir is passed by exp_manager and cannot exist in this dict. Defaults to None.
            - create_wandb_logger (bool): Whether to create a Weights and Biases logger and attach it to the pytorch
                lightning trainer. Defaults to False.
            - wandb_logger_kwargs (dict): A dictionary of kwargs that can be passed to lightning's WandBLogger
                class. Note that name and project are required parameters if create_wandb_logger is True.
                Defaults to None.
            - create_checkpoint_callback (bool): Whether to create a ModelCheckpoint callback and attach it to the
                pytorch lightning trainer. The ModelCheckpoint saves the top 3 models with the best "val_loss", the most
                recent checkpoint under *last.ckpt, and the final checkpoint after training completes under *end.ckpt.
                Defaults to True.
            - files_to_copy (list): A list of files to copy to the experiment logging directory. Defaults to None which
                copies no files.

    Returns:
        log_dir (Path): The final logging directory where logging files are saved. Usually the concatenation of
            exp_dir, name, and version.
    """
    if cfg is None:
        logging.error("exp_manager did not receive a cfg argument. It will be disabled.")
        return

    # Ensure passed cfg is compliant with ExpManagerConfig
    schema = OmegaConf.structured(ExpManagerConfig)
    if isinstance(cfg, dict):
        cfg = OmegaConf.create(cfg)
    elif not isinstance(cfg, DictConfig):
        raise ValueError(f"cfg was type: {type(cfg)}. Expected either a dict or a DictConfig")
    cfg = OmegaConf.create(OmegaConf.to_container(cfg, resolve=True))
    cfg = OmegaConf.merge(schema, cfg)

    error_checks(trainer, cfg)  # Ensures that trainer options are compliant with NeMo and exp_manager arguments

    log_dir, exp_dir, name, version = get_log_dir(
        trainer=trainer,
        exp_dir=cfg.exp_dir,
        name=cfg.name,
        version=cfg.version,
        explicit_log_dir=cfg.explicit_log_dir,
        use_datetime_version=cfg.use_datetime_version,
    )
    if cfg.resume_if_exists:
        check_resume(trainer, log_dir, cfg.resume_past_end, cfg.resume_ignore_no_checkpoint)

    checkpoint_name = name
    # If name returned from get_log_dir is "", use cfg.name for checkpointing
    if checkpoint_name is None or checkpoint_name == '':
        checkpoint_name = cfg.name or "default"
    cfg.name = name  # Used for configure_loggers so that the log_dir is properly set even if name is ""
    cfg.version = version

    # Create the logging directory if it does not exist
    os.makedirs(log_dir, exist_ok=True)  # Cannot limit creation to global zero as all ranks write to own log file
    logging.info(f'Experiments will be logged at {log_dir}')
    trainer._default_root_dir = log_dir

    # Handle Loggers by creating file and handle DEBUG statements
    # Note: trainer.global_rank and trainer.is_global_zero are not set until trainer.fit, so have to hack around it
    global_rank = trainer.node_rank * trainer.num_gpus + trainer.local_rank
    log_file = log_dir / f'nemo_log_globalrank-{global_rank}_localrank-{trainer.local_rank}.txt'
    logging.add_file_handler(log_file)
    logging.rank = global_rank

    # For some reason, LearningRateLogger requires trainer to have a logger. Safer to create logger on all ranks
    # not just global rank 0.
    if cfg.create_tensorboard_logger or cfg.create_wandb_logger:
        configure_loggers(
            trainer,
            exp_dir,
            cfg.name,
            cfg.version,
            cfg.create_tensorboard_logger,
            cfg.summary_writer_kwargs,
            cfg.create_wandb_logger,
            cfg.wandb_logger_kwargs,
        )

    if is_global_rank_zero():
        if cfg.create_checkpoint_callback:
            configure_checkpointing(trainer, log_dir, checkpoint_name, cfg.checkpoint_callback_params)

        # Move files_to_copy to folder and add git information if present
        if cfg.files_to_copy:
            for _file in cfg.files_to_copy:
                copy(Path(_file), log_dir)

        # Create files for cmd args and git info
        with open(log_dir / 'cmd-args.log', 'w') as _file:
            _file.write(" ".join(sys.argv))

        # Try to get git hash
        git_repo, git_hash = get_git_hash()
        if git_repo:
            with open(log_dir / 'git-info.log', 'w') as _file:
                _file.write(f'commit hash: {git_hash}')
                _file.write(get_git_diff())

        # Add err_file logging to global_rank zero
        logging.add_err_file_handler(log_dir / 'nemo_error_log.txt')

        # Add lightning file logging to global_rank zero
        add_filehandlers_to_pl_logger(log_dir / 'lightning_logs.txt', log_dir / 'nemo_error_log.txt')

    return log_dir
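An illustrative cfg for exp_manager assembled from the keys documented above; the values are examples only, and the call is commented out because it needs a live pytorch_lightning.Trainer:

exp_manager_cfg = {
    "exp_dir": "./nemo_experiments",       # base directory for the logging directory
    "name": "speaker_finetune",            # experiment name
    "use_datetime_version": True,          # version folders named by datetime
    "resume_if_exists": False,
    "create_tensorboard_logger": True,
    "create_wandb_logger": False,
    "create_checkpoint_callback": True,
    "files_to_copy": ["train.py"],         # copied into the logging directory
}
# log_dir = exp_manager(trainer, exp_manager_cfg)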
Example No. 25
    def __init__(
        self,
        audio_tar_filepaths: Union[str, List[str]],
        manifest_filepath: str,
        parser: Callable,
        sample_rate: int,
        int_values: bool = False,
        augmentor: Optional[
            'nemo.collections.asr.parts.perturb.AudioAugmentor'] = None,
        shuffle_n: int = 0,
        min_duration: Optional[float] = None,
        max_duration: Optional[float] = None,
        max_utts: int = 0,
        trim: bool = False,
        bos_id: Optional[int] = None,
        eos_id: Optional[int] = None,
        add_misc: bool = False,
        pad_id: int = 0,
        shard_strategy: str = "scatter",
        global_rank: int = 0,
        world_size: int = 0,
    ):
        self.collection = collections.ASRAudioText(
            manifests_files=manifest_filepath.split(','),
            parser=parser,
            min_duration=min_duration,
            max_duration=max_duration,
            max_number=max_utts,
            index_by_file_id=True,  # Must set this so the manifest lines can be indexed by file ID
        )

        self.featurizer = WaveformFeaturizer(sample_rate=sample_rate,
                                             int_values=int_values,
                                             augmentor=augmentor)
        self.trim = trim
        self.eos_id = eos_id
        self.bos_id = bos_id
        self.pad_id = pad_id
        self._add_misc = add_misc

        valid_shard_strategies = ['scatter', 'replicate']
        if shard_strategy not in valid_shard_strategies:
            raise ValueError(
                f"`shard_strategy` must be one of {valid_shard_strategies}")

        if isinstance(audio_tar_filepaths, str):
            # Replace open-brace tokens ('(', '[', '<', '_OP_') with '{'
            brace_keys_open = ['(', '[', '<', '_OP_']
            for bkey in brace_keys_open:
                if bkey in audio_tar_filepaths:
                    audio_tar_filepaths = audio_tar_filepaths.replace(
                        bkey, "{")

            # Replace close-brace tokens (')', ']', '>', '_CL_') with '}'
            brace_keys_close = [')', ']', '>', '_CL_']
            for bkey in brace_keys_close:
                if bkey in audio_tar_filepaths:
                    audio_tar_filepaths = audio_tar_filepaths.replace(
                        bkey, "}")

        # Check for distributed and partition shards accordingly
        if world_size > 1:
            if isinstance(audio_tar_filepaths, str):
                # Brace expand
                audio_tar_filepaths = list(
                    braceexpand.braceexpand(audio_tar_filepaths))

            if shard_strategy == 'scatter':
                logging.info(
                    "All tarred dataset shards will be scattered evenly across all nodes."
                )

                if len(audio_tar_filepaths) % world_size != 0:
                    logging.warning(
                        f"Number of shards in tarred dataset ({len(audio_tar_filepaths)}) is not divisible "
                        f"by number of distributed workers ({world_size}).")

                begin_idx = (len(audio_tar_filepaths) //
                             world_size) * global_rank
                end_idx = begin_idx + (len(audio_tar_filepaths) // world_size)
                audio_tar_filepaths = audio_tar_filepaths[begin_idx:end_idx]
                logging.info(
                    "Partitioning tarred dataset: process (%d) taking shards [%d, %d)",
                    global_rank, begin_idx, end_idx)

            elif shard_strategy == 'replicate':
                logging.info(
                    "All tarred dataset shards will be replicated across all nodes."
                )

            else:
                raise ValueError(
                    f"Invalid shard strategy! Allowed values are: {valid_shard_strategies}"
                )

        # Put together WebDataset
        self._dataset = wd.WebDataset(audio_tar_filepaths)

        if shuffle_n > 0:
            self._dataset = self._dataset.shuffle(shuffle_n)
        else:
            logging.info(
                "WebDataset will not shuffle files within the tar files.")

        self._dataset = (
            self._dataset.rename(audio='wav', key='__key__')
            .to_tuple('audio', 'key')
            .pipe(self._filter)
            .map(f=self._build_sample)
        )
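A standalone sketch of the brace normalization above: the placeholder tokens '(', '[', '<', '_OP_' become '{' and ')', ']', '>', '_CL_' become '}', so shard ranges survive shells or launchers that mangle literal braces. The helper name and sample path are illustrative.

def normalize_braces(path_spec):
    for token in ('(', '[', '<', '_OP_'):
        path_spec = path_spec.replace(token, '{')
    for token in (')', ']', '>', '_CL_'):
        path_spec = path_spec.replace(token, '}')
    return path_spec

print(normalize_braces("audio__OP_0..127_CL_.tar"))  # -> audio_{0..127}.tar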
Example No. 26
    def _setup_dataloader_from_config(self, config: DictConfig):

        OmegaConf.set_struct(config, False)
        config.is_regression_task = self.is_regression_task
        OmegaConf.set_struct(config, True)

        if 'augmentor' in config:
            augmentor = process_augmentations(config['augmentor'])
        else:
            augmentor = None

        featurizer = WaveformFeaturizer(
            sample_rate=config['sample_rate'], int_values=config.get('int_values', False), augmentor=augmentor
        )
        shuffle = config['shuffle']

        # Instantiate tarred dataset loader or normal dataset loader
        if config.get('is_tarred', False):
            if ('tarred_audio_filepaths' in config and config['tarred_audio_filepaths'] is None) or (
                'manifest_filepath' in config and config['manifest_filepath'] is None
            ):
                logging.warning(
                    "Could not load dataset as `manifest_filepath` is None or "
                    f"`tarred_audio_filepaths` is None. Provided config : {config}"
                )
                return None

            if 'vad_stream' in config and config['vad_stream']:
                logging.warning("VAD inference does not support tarred dataset now")
                return None

            shuffle_n = config.get('shuffle_n', 4 * config['batch_size']) if shuffle else 0
            dataset = audio_to_label_dataset.get_tarred_classification_label_dataset(
                featurizer=featurizer,
                config=OmegaConf.to_container(config),
                shuffle_n=shuffle_n,
                global_rank=self.global_rank,
                world_size=self.world_size,
            )
            shuffle = False
            batch_size = config['batch_size']
            collate_func = dataset.collate_fn

        else:
            if 'manifest_filepath' in config and config['manifest_filepath'] is None:
                logging.warning(f"Could not load dataset as `manifest_filepath` is None. Provided config : {config}")
                return None

            if 'vad_stream' in config and config['vad_stream']:
                logging.info("Perform streaming frame-level VAD")
                dataset = audio_to_label_dataset.get_speech_label_dataset(
                    featurizer=featurizer, config=OmegaConf.to_container(config)
                )
                batch_size = 1
                collate_func = dataset.vad_frame_seq_collate_fn
            else:
                dataset = audio_to_label_dataset.get_classification_label_dataset(
                    featurizer=featurizer, config=OmegaConf.to_container(config)
                )
                batch_size = config['batch_size']
                collate_func = dataset.collate_fn

        return torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            collate_fn=collate_func,
            drop_last=config.get('drop_last', False),
            shuffle=shuffle,
            num_workers=config.get('num_workers', 0),
            pin_memory=config.get('pin_memory', False),
        )
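An illustrative non-tarred dataset config for the method above; only keys that this method reads directly are shown (the downstream dataset builders may require more, such as 'labels'), and the paths and values are examples:

example_ds_config = {
    "manifest_filepath": "data/train_manifest.json",
    "sample_rate": 16000,
    "int_values": False,
    "batch_size": 32,
    "shuffle": True,
    "is_tarred": False,
    "num_workers": 4,
    "pin_memory": True,
    "drop_last": False,
}
# loader = model._setup_dataloader_from_config(OmegaConf.create(example_ds_config))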
Example No. 27
def get_label_ids(
    label_file: str,
    is_training: bool = False,
    pad_label: str = 'O',
    label_ids_dict: Dict[str, int] = None,
    get_weights: bool = True,
):
    """
    Generates a str-to-int label mapping for training data, or checks the correctness of the provided
    label_ids_dict for non-training files (or whenever label_ids_dict is specified).

    Args:
        label_file: the path of the label file to process
        is_training: indicates whether the label_file is used for training
        pad_label: token used for padding
        label_ids_dict: str label name to int ids mapping. Required for non-training data.
            If specified, the check that all labels from label_file are present in label_ids_dict will be performed.
            For training data, if label_ids_dict is None, a new mapping will be generated from label_file.
        get_weights: set to True to calculate class weights, required for Weighted Loss.
    """
    if not os.path.exists(label_file):
        raise ValueError(f'File {label_file} was not found.')

    logging.info(f'Processing {label_file}')
    if not is_training and label_ids_dict is None:
        raise ValueError(
            'For non-training data, the label_ids_dict created during preprocessing of the training data '
            'should be provided')

    # collect all labels from the label_file
    data_dir = os.path.dirname(label_file)
    unique_labels = {pad_label}  # a set containing the pad label itself, not its individual characters
    all_labels = []
    with open(label_file, 'r') as f:
        for line in f:
            line = line.strip().split()
            all_labels.extend(line)
            unique_labels.update(line)

    # check that all labels from label_file are present in the specified label_ids_dict
    # or generate label_ids_dict from data (for training only)
    save_label_ids = True
    if label_ids_dict:
        logging.info(f'Using provided labels mapping {label_ids_dict}')
        save_label_ids = False
        for name in unique_labels:
            if name not in label_ids_dict:
                raise ValueError(
                    f'{name} class from {label_file} not found in the provided mapping: {label_ids_dict}'
                )
    else:
        label_ids_dict = {pad_label: 0}
        if pad_label in unique_labels:
            unique_labels.remove(pad_label)
        for label in sorted(unique_labels):
            label_ids_dict[label] = len(label_ids_dict)

    label_ids_filename = os.path.join(data_dir, 'label_ids.csv')
    if is_training and save_label_ids:
        with open(label_ids_filename, 'w') as f:
            labels, _ = zip(
                *sorted(label_ids_dict.items(), key=lambda x: x[1]))
            f.write('\n'.join(labels))
        logging.info(
            f'Labels mapping {label_ids_dict} saved to : {label_ids_filename}')

    # calculate label statistics
    base_name = os.path.splitext(os.path.basename(label_file))[0]
    stats_file = os.path.join(data_dir, f'{base_name}_label_stats.tsv')
    if os.path.exists(stats_file) and not is_training and not get_weights:
        logging.info(f'{stats_file} found, skipping stats calculation.')
    else:
        all_labels = [label_ids_dict[label] for label in all_labels]
        logging.info(f'Three most popular labels in {label_file}:')
        total_labels, label_frequencies, max_id = get_label_stats(
            all_labels, stats_file)
        logging.info(
            f'Total labels: {total_labels}. Label frequencies - {label_frequencies}'
        )

    if get_weights:
        class_weights_pkl = os.path.join(data_dir, f'{base_name}_weights.p')
        if os.path.exists(class_weights_pkl):
            with open(class_weights_pkl, 'rb') as f:
                class_weights = pickle.load(f)
            logging.info(f'Class weights restored from {class_weights_pkl}')
        else:
            class_weights_dict = get_freq_weights(label_frequencies)
            logging.info(f'Class Weights: {class_weights_dict}')
            class_weights = fill_class_weights(class_weights_dict, max_id)

            with open(class_weights_pkl, 'wb') as f:
                pickle.dump(class_weights, f)
            logging.info(f'Class weights saved to {class_weights_pkl}')
    else:
        class_weights = None

    return label_ids_dict, label_ids_filename, class_weights
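A minimal sketch of the mapping construction above for the training branch: the pad label gets id 0 and the remaining labels are assigned ids in sorted order. The label set is illustrative.

unique_labels = {"O", "B-PER", "I-PER", "B-LOC"}
pad_label = "O"
label_ids_dict = {pad_label: 0}
unique_labels.discard(pad_label)
for label in sorted(unique_labels):
    label_ids_dict[label] = len(label_ids_dict)
print(label_ids_dict)  # {'O': 0, 'B-LOC': 1, 'B-PER': 2, 'I-PER': 3}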
Example No. 28
    def change_vocabulary(self,
                          new_vocabulary: List[str],
                          decoding_cfg: Optional[DictConfig] = None):
        """
        Changes the vocabulary used during the RNNT decoding process. Use this method when fine-tuning a pre-trained model.
        This method changes only the decoder and leaves the encoder and pre-processing modules unchanged. For example, you
        would use it to reuse a pretrained encoder when fine-tuning on data in another language, or when the model needs to
        learn capitalization, punctuation and/or special characters.

        Args:
            new_vocabulary: list with new vocabulary. Must contain at least 2 elements. Typically, \
                this is target alphabet.
            decoding_cfg: A config for the decoder, which is optional. If the decoding type
                needs to be changed (from say Greedy to Beam decoding etc), the config can be passed here.

        Returns: None

        """
        if self.joint.vocabulary == new_vocabulary:
            logging.warning(
                f"Old {self.joint.vocabulary} and new {new_vocabulary} match. Not changing anything."
            )
        else:
            if new_vocabulary is None or len(new_vocabulary) == 0:
                raise ValueError(
                    f'New vocabulary must be a non-empty list of chars, but got: {new_vocabulary}'
                )

            joint_config = self.joint.to_config_dict()
            new_joint_config = copy.deepcopy(joint_config)
            new_joint_config['vocabulary'] = new_vocabulary
            new_joint_config['num_classes'] = len(new_vocabulary)
            del self.joint
            self.joint = EncDecRNNTModel.from_config_dict(new_joint_config)

            decoder_config = self.decoder.to_config_dict()
            new_decoder_config = copy.deepcopy(decoder_config)
            new_decoder_config.vocab_size = len(new_vocabulary)
            del self.decoder
            self.decoder = EncDecRNNTModel.from_config_dict(new_decoder_config)

            del self.loss
            self.loss = RNNTLoss(
                num_classes=self.joint.num_classes_with_blank - 1)

            if decoding_cfg is None:
                # Assume same decoding config as before
                decoding_cfg = self.cfg.decoding

            self.decoding = RNNTDecoding(
                decoding_cfg=decoding_cfg,
                decoder=self.decoder,
                joint=self.joint,
                vocabulary=self.joint.vocabulary,
            )

            self.wer = RNNTWER(
                decoding=self.decoding,
                batch_dim_index=self.wer.batch_dim_index,
                use_cer=self.wer.use_cer,
                log_prediction=self.wer.log_prediction,
                dist_sync_on_step=True,
            )

            # Setup fused Joint step
            if self.joint.fuse_loss_wer:
                self.joint.set_loss(self.loss)
                self.joint.set_wer(self.wer)

            # Update config
            with open_dict(self.cfg.joint):
                self.cfg.joint = new_joint_config

            with open_dict(self.cfg.decoder):
                self.cfg.decoder = new_decoder_config

            with open_dict(self.cfg.decoding):
                self.cfg.decoding = decoding_cfg

            logging.info(
                f"Changed the decoder to output the {self.joint.vocabulary} vocabulary."
            )
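A minimal sketch of the config-swap pattern above using OmegaConf: deep-copy the old decoder config, update its vocabulary size to match the new alphabet, and rebuild the module from the new config. The field names here are illustrative, not the exact NeMo decoder schema.

import copy
from omegaconf import OmegaConf

old_decoder_cfg = OmegaConf.create({"vocab_size": 29, "pred_hidden": 640})
new_vocabulary = list("abcdefghijklmnopqrstuvwxyz '")

new_decoder_cfg = copy.deepcopy(old_decoder_cfg)
new_decoder_cfg.vocab_size = len(new_vocabulary)
print(OmegaConf.to_yaml(new_decoder_cfg))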
Example No. 29
def convert(local_rank, rank, world_size, args):

    app_state = AppState()
    app_state.data_parallel_rank = 0
    tensor_model_parallel_size = args.tensor_model_parallel_size
    num_nodes = world_size // args.gpus_per_node
    pipeline_model_parallel_size = world_size // args.tensor_model_parallel_size
    assert args.pipeline_model_parallel_size == pipeline_model_parallel_size

    trainer = Trainer(devices=args.gpus_per_node, accelerator='gpu', num_nodes=num_nodes)

    # TODO: reach out to PTL For an API-safe local rank override
    trainer.accelerator.training_type_plugin._local_rank = local_rank

    app_state.pipeline_model_parallel_size = args.pipeline_model_parallel_size
    app_state.tensor_model_parallel_size = args.tensor_model_parallel_size
    app_state.model_parallel_size = app_state.tensor_model_parallel_size * app_state.pipeline_model_parallel_size

    parallel_state.initialize_model_parallel(
        tensor_model_parallel_size_=app_state.tensor_model_parallel_size,
        pipeline_model_parallel_size_=app_state.pipeline_model_parallel_size,
    )

    app_state.pipeline_model_parallel_rank = parallel_state.get_pipeline_model_parallel_rank()
    app_state.tensor_model_parallel_rank = parallel_state.get_tensor_model_parallel_rank()

    pipeline_rank = rank // tensor_model_parallel_size
    tensor_rank = app_state.tensor_model_parallel_rank
    assert pipeline_rank == app_state.pipeline_model_parallel_rank

    if tensor_model_parallel_size is not None and tensor_model_parallel_size > 1 and pipeline_model_parallel_size == 1:
        # inject model parallel rank
        checkpoint_path = os.path.join(args.checkpoint_folder, f'mp_rank_{tensor_rank:02d}', args.checkpoint_name)
    elif tensor_model_parallel_size is not None and pipeline_model_parallel_size > 1:
        checkpoint_path = os.path.join(
            args.checkpoint_folder, f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}', args.checkpoint_name
        )
    else:
        checkpoint_path = os.path.join(args.checkpoint_folder, args.checkpoint_name)
    logging.info(f"loading checkpoint {checkpoint_path}")

    if args.model_type == 'gpt':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronGPTModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    elif args.model_type == 'bert':
        ## this dictionary is used to rename the model parameters
        name_translate = {}
        name_translate['transformer'] = 'encoder'
        name_translate['.attention.'] = '.self_attention.'
        # nemo megatron doesn't have _for_head key
        name_translate['word_embeddings_for_head'] = 'word_embeddings'
        checkpoint, consumed, steps, version = load_from_checkpoint(
            MegatronBertModel,
            checkpoint_path,
            hparams_file=args.hparams_file,
            trainer=trainer,
            translator=name_translate,
            strict=False,
        )
    else:
        raise NotImplemented("{} is not supported".format(args.model_type))

    if torch.distributed.is_initialized():
        torch.distributed.barrier()

    if args.output_ckpt_file_path:
        filepath = args.output_ckpt_file_path
        base_dir = pathlib.Path(filepath).parent
        filename_str = pathlib.Path(filepath).name
        suffix = '.ckpt'
        content = {}
        if consumed is not None:
            content['consumed'] = consumed
        else:
            content['consumed'] = 0
        if steps is not None:
            content['steps'] = steps
        else:
            content['steps'] = 0
        filename = filename_str.format(**content) + suffix
        checkpoint_path_output = inject_model_parallel_rank(os.path.join(base_dir, filename))
        trainer.accelerator.training_type_plugin.checkpoint_io.save_checkpoint(checkpoint, checkpoint_path_output)
        logging.info(f'NeMo model checkpoint files saved to: {args.output_ckpt_file_path}')

    if args.nemo_file_path:
        if args.model_type == 'gpt':
            model = load_model(MegatronGPTModel, checkpoint, strict=False, trainer=trainer)
        elif args.model_type == 'bert':
            model = load_model(MegatronBertModel, checkpoint, strict=False, trainer=trainer)
        else:
            raise NotImplemented("{} is not supported".format(args.model_type))

        # verify tensor parallel rank id and pipeline parallel rank id matches
        assert app_state.data_parallel_size == 1
        assert app_state.tensor_model_parallel_size == tensor_model_parallel_size
        assert app_state.tensor_model_parallel_rank == tensor_rank
        assert app_state.pipeline_model_parallel_size == pipeline_model_parallel_size
        assert app_state.pipeline_model_parallel_rank == pipeline_rank
        model._save_restore_connector = NLPSaveRestoreConnector()
        model.save_to(args.nemo_file_path)
        logging.info(f'NeMo model saved to: {args.nemo_file_path}')
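A standalone sketch of the checkpoint path selection above: tensor-parallel-only runs read from mp_rank_XX folders, pipeline-parallel runs from mp_rank_XX_YYY folders, and unparallelized runs from the folder root. The helper name is illustrative.

import os

def model_parallel_ckpt_path(folder, name, tensor_rank, pipeline_rank, tp_size, pp_size):
    if tp_size is not None and tp_size > 1 and pp_size == 1:
        return os.path.join(folder, f"mp_rank_{tensor_rank:02d}", name)
    if tp_size is not None and pp_size > 1:
        return os.path.join(folder, f"mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}", name)
    return os.path.join(folder, name)

print(model_parallel_ckpt_path("checkpoints", "model_optim_rng.pt", 1, 2, tp_size=2, pp_size=4))
# -> checkpoints/mp_rank_01_002/model_optim_rng.pt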
Example No. 30
    def restore_weights(self, restore_path: str):
        """Restores module/model's weights.
           For model parallel checkpoints the directory structure
           should be restore_path/mp_rank_0X/model_optim_rng.pt

        Args:
            restore_path (str): restore_path should be a file, or a directory if using model parallel
        """
        self._restore_path = restore_path
        if os.path.isfile(restore_path):
            logging.info(
                f'restore_path: {restore_path} is a file. Assuming no megatron model parallelism'
            )
            state_dict = torch.load(restore_path, map_location='cpu')
            if 'checkpoint_version' in state_dict:
                if state_dict['checkpoint_version'] is not None:
                    set_checkpoint_version(state_dict['checkpoint_version'])
            else:
                logging.warning(
                    'Megatron-lm checkpoint version not found. Setting checkpoint_version to 0.'
                )
                set_checkpoint_version(0)
            # to load from Megatron pretrained checkpoint
            if 'model' in state_dict:
                self.language_model.load_state_dict(
                    state_dict['model'][self._language_model_key])
            else:
                self.load_state_dict(state_dict)
            logging.info(f"weights restored from {restore_path}")
        elif os.path.isdir(restore_path):
            # TODO: need to refactor this so we're not repeating code

            # need model parallel groups to restore model parallel checkpoints
            if model_parallel_is_initialized():
                model_parallel_rank = torch.distributed.get_rank(
                    group=get_model_parallel_group())
                mp_restore_path = f'{restore_path}/mp_rank_{model_parallel_rank:02d}/model_optim_rng.pt'
                logging.info(
                    f'Restoring model parallel checkpoint from: {mp_restore_path}'
                )
                state_dict = torch.load(mp_restore_path, map_location='cpu')
                if 'checkpoint_version' in state_dict:
                    if state_dict['checkpoint_version'] is not None:
                        set_checkpoint_version(
                            state_dict['checkpoint_version'])
                else:
                    logging.warning(
                        'Megatron-lm checkpoint version not found. Setting checkpoint_version to 0.'
                    )
                    set_checkpoint_version(0)
                # to load from Megatron pretrained checkpoint
                if 'model' in state_dict:
                    self.language_model.load_state_dict(
                        state_dict['model'][self._language_model_key])
                else:
                    self.load_state_dict(state_dict)
            else:
                logging.info(
                    'torch.distributed is not initialized yet. Will not restore the model parallel checkpoint.'
                )
        else:
            logging.error(
                f'restore_path: {restore_path} must be a file or directory.')
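A minimal sketch of the checkpoint-version fallback used above: take checkpoint_version from the loaded state dict when it is present and not None, otherwise default to 0. The helper name is illustrative.

import torch

def read_checkpoint_version(restore_path):
    # Load on CPU and read the Megatron checkpoint version, defaulting to 0 when absent.
    state_dict = torch.load(restore_path, map_location='cpu')
    version = state_dict.get('checkpoint_version')
    return version if version is not None else 0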