# Imports assumed for this snippet (NeMo 1.x APIs).
from argparse import ArgumentParser

import pytorch_lightning as pl
import torch
from omegaconf import OmegaConf

from nemo.collections.asr.models import ExtractSpeakerEmbeddingsModel
from nemo.utils import logging


def main():
    parser = ArgumentParser()
    parser.add_argument(
        "--manifest",
        type=str,
        required=True,
        help="Path to manifest file",
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default='speakerverification_speakernet',
        required=False,
        help="Path to a .nemo speaker verification model file used to extract "
        "embeddings; if not passed, the SpeakerNet-M model is downloaded from NGC "
        "and used to extract embeddings",
    )
    parser.add_argument(
        "--embedding_dir",
        type=str,
        default='./',
        required=False,
        help="Path to the directory where embeddings are stored (default: './')",
    )
    args = parser.parse_args()
    torch.set_grad_enabled(False)

    if args.model_path.endswith('.nemo'):
        logging.info(f"Using local speaker model from {args.model_path}")
        speaker_model = ExtractSpeakerEmbeddingsModel.restore_from(
            restore_path=args.model_path)
    elif args.model_path.endswith('.ckpt'):
        speaker_model = ExtractSpeakerEmbeddingsModel.load_from_checkpoint(
            checkpoint_path=args.model_path)
    else:
        speaker_model = ExtractSpeakerEmbeddingsModel.from_pretrained(
            model_name="speakerverification_speakernet")
        logging.info("Using pretrained speaker verification model from NGC")

    num_gpus = 1 if torch.cuda.is_available() else 0
    if not num_gpus:
        logging.warning(
            "Running model on CPU; for faster performance it is advised to use at least one NVIDIA GPU"
        )

    trainer = pl.Trainer(gpus=num_gpus, accelerator=None)

    test_config = OmegaConf.create(
        dict(
            manifest_filepath=args.manifest,
            sample_rate=16000,
            labels=None,
            batch_size=1,
            shuffle=False,
            time_length=20,
            embedding_dir=args.embedding_dir,
        ))
    speaker_model.setup_test_data(test_config)
    trainer.test(speaker_model)
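
The --manifest argument points at a NeMo-style manifest: one JSON object per line with audio_filepath, duration and label fields. A minimal sketch of building one (the file paths and speaker labels below are made up for illustration):

import json

# One manifest entry per utterance; label is the speaker identity.
entries = [
    {"audio_filepath": "/data/spk1_utt1.wav", "duration": 20.0, "label": "spk1"},
    {"audio_filepath": "/data/spk2_utt1.wav", "duration": 20.0, "label": "spk2"},
]
with open("manifest.json", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")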
Example 2
    def _init_speaker_model(self):
        model_path = self._cfg.diarizer.speaker_embeddings.model_path
        if model_path is not None and model_path.endswith('.nemo'):
            self._speaker_model = ExtractSpeakerEmbeddingsModel.restore_from(model_path)
            logging.info("Speaker Model restored locally from {}".format(model_path))
        else:
            if model_path not in get_available_model_names(ExtractSpeakerEmbeddingsModel):
                logging.warning(
                    "requested model name {} is not available among pretrained models; "
                    "using speakerdiarization_speakernet instead".format(model_path)
                )
                model_path = "speakerdiarization_speakernet"
            logging.info("Loading pretrained {} model from NGC".format(model_path))
            self._speaker_model = ExtractSpeakerEmbeddingsModel.from_pretrained(model_name=model_path)

        self._speaker_dir = os.path.join(self._out_dir, 'speaker_outputs')
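
The fallback branch checks the requested name against the models published on NGC. A quick way to inspect those names (a sketch; list_available_models() is the standard NeMo classmethod, and the exact set of names depends on the installed NeMo version):

from nemo.collections.asr.models import ExtractSpeakerEmbeddingsModel

# Each PretrainedModelInfo entry carries the model name and its NGC location.
for info in ExtractSpeakerEmbeddingsModel.list_available_models():
    print(info.pretrained_model_name)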
Example 3
    def __init__(self, cfg: DictConfig):
        cfg = model_utils.convert_model_config_to_dict_config(cfg)
        # Convert config to support Hydra 1.0+ instantiation
        cfg = model_utils.maybe_update_config_version(cfg)
        self._cfg = cfg
        self._out_dir = self._cfg.diarizer.out_dir
        if not os.path.exists(self._out_dir):
            os.mkdir(self._out_dir)

        # init vad model
        self.has_vad_model = False
        self.has_vad_model_to_save = False

        self._speaker_manifest_path = self._cfg.diarizer.speaker_embeddings.oracle_vad_manifest
        self.AUDIO_RTTM_MAP = None
        self.paths2audio_files = self._cfg.diarizer.paths2audio_files

        if self._cfg.diarizer.vad.model_path is not None:
            self._init_vad_model()
            self._vad_dir = os.path.join(self._out_dir, 'vad_outputs')
            self._vad_out_file = os.path.join(self._vad_dir, "vad_out.json")
            shutil.rmtree(self._vad_dir, ignore_errors=True)
            os.makedirs(self._vad_dir)

        # init speaker model
        self._speaker_model = ExtractSpeakerEmbeddingsModel.restore_from(
            self._cfg.diarizer.speaker_embeddings.model_path
        )
        self._num_speakers = self._cfg.diarizer.num_speakers
        self._speaker_dir = os.path.join(self._out_dir, 'speaker_outputs')

        self._device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
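
Every key this constructor reads lives under the diarizer section of the config. A minimal sketch of that section with placeholder values (only the keys accessed above are shown):

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "diarizer": {
        "out_dir": "./diarizer_outputs",          # placeholder path
        "num_speakers": 2,
        "paths2audio_files": ["/data/session1.wav"],
        "vad": {"model_path": None},              # set a .nemo path to enable the VAD branch
        "speaker_embeddings": {
            "model_path": "/models/speakerverification_speakernet.nemo",
            "oracle_vad_manifest": "/data/oracle_vad.json",
        },
    }
})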

Example 4

# Imports assumed for this snippet (NeMo 1.x, PyTorch Lightning, Hydra/OmegaConf).
# Helpers such as write_target_manifest, label_frames, cluster_embeddings,
# merge_frames, get_performance_metrics and get_der are project-local functions.
import glob
import json
import os
import pickle
import warnings

import numpy as np
import pytorch_lightning as pl
import torch
from omegaconf import DictConfig, OmegaConf

from nemo.collections.asr.models import ExtractSpeakerEmbeddingsModel


def main(cfg: DictConfig) -> None:
    os.chdir('/home/lucas/PycharmProjects/NeMo_SpeakerVerification')
    cuda = 1 if torch.cuda.is_available() else 0
    model = ExtractSpeakerEmbeddingsModel.from_pretrained(model_name='SpeakerNet_verification')
    #SpeakerNet_verification = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name="SpeakerNet_verification")
    if cfg.audio.num_target_tracks > -1:
        audio_tracks = glob.glob(cfg.audio.target_path, recursive=True)[:cfg.audio.num_target_tracks]
    else:
        audio_tracks = glob.glob(cfg.audio.target_path, recursive=True)

    # Text file for logging per-track clustering DER results
    der_log = open('/home/lucas/PycharmProjects/NeMo_SpeakerVerification/Txt_outs/der_cluster_noiseless.txt', 'w')

    if os.path.exists(os.path.join(os.getcwd(), 'manifest_files', 'target.json')):
        os.remove(os.path.join(os.getcwd(), 'manifest_files', 'target.json'))


    # Write target-speaker manifest entries and check that every audio track has a matching verification file
    for track in audio_tracks:
        agent = track[track.find('-') + 1:track.find('.')]
        agent_samples = glob.glob(cfg.audio.verification_path + agent + '.wav', recursive=True)
        if len(agent_samples) > 0:
            write_target_manifest(audio_path=agent_samples[0], length=cfg.audio.verification_length, manifest_file='target.json', agent=agent)
            # write_track_manifest(audio_path=track, frame_list=frame_list, manifest_file='track_manifest.json')
            #model.setup_test_data(write_target_manifest(audio_path=agent_samples[0], length=cfg.audio.verification_length, manifest_file='target.json',agent=agent))
            #trainer = pl.Trainer(gpus=cuda, accelerator=None)
            #trainer.test(model)
        else:
            warnings.warn('Verification audio for {} not found'.format(agent))
    test_config = OmegaConf.create(dict(
        manifest_filepath=os.path.join(os.getcwd(), 'manifest_files', 'target.json'),
        sample_rate=16000,
        labels=None,
        batch_size=1,
        shuffle=False,
        embedding_dir='./',  # alternatively: os.path.join(os.getcwd(), 'embeddings')
    ))
    model.setup_test_data(test_config)
    trainer = pl.Trainer(gpus=cuda)
    trainer.test(model)

    test_config = OmegaConf.create(dict(
        manifest_filepath=os.path.join(os.getcwd(), 'manifest_files', 'track_manifest.json'),
        sample_rate=16000,
        labels=None,
        batch_size=16,
        shuffle=False,
        embedding_dir='./',  # alternatively: os.path.join(os.getcwd(), 'embeddings')
        num_workers=4,
    ))

    if os.path.exists(os.path.join(os.getcwd(), 'manifest_files', 'track_manifest.json')):
        os.remove(os.path.join(os.getcwd(), 'manifest_files', 'track_manifest.json'))

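    # First pass: write one manifest entry per analysis frame for every
    # (window_length, step_length) combination, then extract embeddings for all
    # of them in a single trainer.test() run over the combined manifest.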
    for window_length in cfg.audio.window_length:
        for step_length in cfg.audio.step_length:
            for track in audio_tracks:
                agent = track[track.find('-') + 1:track.find('.')]
                agent_samples = glob.glob(cfg.audio.verification_path + agent + '.wav', recursive=True)
                rttm = glob.glob(cfg.audio.rttm_path + track[track.rfind('/') + 1:track.rfind('.')] + '.rttm',
                                 recursive=False)[0]
                #print(agent_samples)
                if len(agent_samples) > 0:
                    label_path = track[track.rfind('/') + 1:track.find('.wav')] + '.labs'
                    frame_list, speaker_df = label_frames(label_path=os.path.join(cfg.audio.label_path, label_path),
                                                      window_size=window_length,
                                                      step_size=float(window_length*step_length))
                    write_track_manifest(audio_path=track, frame_list=frame_list, manifest_file='track_manifest.json', window_length=window_length, step_length=step_length)
    model.setup_test_data(test_config)
    trainer = pl.Trainer(gpus=cuda)
    trainer.test(model)
    track_manifest = [json.loads(line) for line in
                      open(os.path.join(os.getcwd(), 'manifest_files', 'track_manifest.json'))]
    with open(os.path.join(os.getcwd(),'embeddings','track_manifest_embeddings.pkl'), 'rb') as f:
        data = pickle.load(f).items()
        all_track_embeddings = [emb for _, emb in data]
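
    # Second pass: the pickled embeddings dict is assumed to preserve manifest
    # order, so slices of all_track_embeddings line up with manifest indices for
    # each (track, window_length, step_length) combination.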
    for window_length in cfg.audio.window_length:
        for step_length in cfg.audio.step_length:
            for track in audio_tracks:
                agent = track[track.find('-') + 1:track.find('.')]
                agent_samples = glob.glob(cfg.audio.verification_path + agent + '.wav', recursive=True)
                rttm = glob.glob(cfg.audio.rttm_path + track[track.rfind('/') + 1:track.rfind('.')] + '.rttm',
                                 recursive=False)[0]
                # print(agent_samples)
                if len(agent_samples) > 0:
                    label_path = track[track.rfind('/') + 1:track.find('.wav')] + '.labs'
                    frame_list, speaker_df = label_frames(label_path=os.path.join(cfg.audio.label_path, label_path),
                                                          window_size=window_length,
                                                          step_size=float(window_length * step_length))
                    indices = [track_manifest.index(item) for item in track_manifest
                               if item['audio_filepath'] == track
                               and item["duration"] == window_length
                               and item["step_length"] == step_length]
                    print(indices)
                    embeddings = all_track_embeddings[min(indices):max(indices) + 1]
                    cluster_outputs = cluster_embeddings(agent=agent, track=track, window_length=window_length, step_length=step_length, track_embedding=embeddings)
                    #print(len(cluster_outputs))
                    #print(speaker_df.describe())
                    coverage, purity = get_performance_metrics(speaker_df, np.array(cluster_outputs))
                    print("The results for {} -> Coverage {} / Purity {}".format(track, coverage, purity))
                    annotation = merge_frames(outputs=cluster_outputs, frame_list=frame_list)
                    der = get_der(cfg=cfg, rttm=rttm, output_annotations=annotation)
                    print('THE DER IS {}'.format(der))
                    der_log.write('{} \t {} \t {} \t {} \t {} \t {} \n'.format(track, window_length, step_length, coverage,
                                                                     purity, der))

    der_log.close()
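
Each line of the DER log is tab-separated: track, window length, step length, coverage, purity and DER. A sketch of loading it for analysis (using pandas is an assumption about how the file might be consumed, not part of the original script):

import pandas as pd

# The write() call above pads fields with ' \t ', hence the regex separator.
results = pd.read_csv(
    '/home/lucas/PycharmProjects/NeMo_SpeakerVerification/Txt_outs/der_cluster_noiseless.txt',
    sep=r'\s*\t\s*',
    engine='python',
    names=['track', 'window_length', 'step_length', 'coverage', 'purity', 'der'],
)
print(results.sort_values('der').head())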