Example #1
def init_model():
    # gpu configuration
    toolkits.initialize_GPU(args)

    import model

    # construct the data generator.
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'min_slice': 720,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please type in the model to load')

    return network_eval
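A minimal usage sketch for the helper above. The `args` namespace is an assumption, with fields pieced together from the other examples (notably the Expando config in Example #4); none of this is from the original snippet:

# Hypothetical driver for init_model(); every flag below is an assumption
# modeled on the argument fields the other examples read from `args`.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--gpu', default='', type=str)
parser.add_argument('--resume', default='pretrained/weights.h5', type=str)
parser.add_argument('--net', default='resnet34s', type=str)
parser.add_argument('--loss', default='softmax', type=str)
parser.add_argument('--vlad_cluster', default=8, type=int)
parser.add_argument('--ghost_cluster', default=2, type=int)
parser.add_argument('--bottleneck_dim', default=512, type=int)
parser.add_argument('--aggregation_mode', default='gvlad', type=str)
args = parser.parse_args()

network_eval = init_model()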
Example #2
def extract_features(paths, args):
    # GPU configuration
    toolkits.initialize_GPU(args)

    network_eval = model.vggvox_resnet2d_icassp(
        input_dim=PARAMS["dim"], num_class=PARAMS["n_classes"], mode="eval", args=args
    )
    network_eval.load_weights(os.path.join(args.resume), by_name=True)

    num_paths = len(paths)
    feats = np.zeros((num_paths, PARAMS["feat_dim"]))

    for i, path in enumerate(tqdm(paths)):
        specs = ut.load_data(
            path,
            win_length=PARAMS["win_length"],
            sr=PARAMS["sampling_rate"],
            hop_length=PARAMS["hop_length"],
            n_fft=PARAMS["nfft"],
            spec_len=PARAMS["spec_len"],
            mode="eval",
        )
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        feats[i] = network_eval.predict(specs)

    return feats
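This example reads a module-level PARAMS dict that is not shown. A plausible definition, assembled from the params dicts in the surrounding examples (feat_dim matching the 512-dimensional embedding the uisrnn examples assume), would be:

# Assumed PARAMS for extract_features(); values mirror the other examples.
PARAMS = {
    'dim': (257, None, 1),
    'nfft': 512,
    'spec_len': 250,
    'win_length': 400,
    'hop_length': 160,
    'n_classes': 5994,
    'sampling_rate': 16000,
    'normalize': True,
    'feat_dim': 512,  # embedding size; matches observation_dim in Examples #4/#11
}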
Example #3
def main(args):

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    # ==================================
    #       Get Train/Val.
    # ==================================
    
    total_list = [os.path.join(args.data_path, file) for file in os.listdir(args.data_path)]
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1),
              'nfft': 512,
              'min_slice': 720,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    feats = []
    for ID in unique_list:
        specs = preprocess.load_data(ID, split=False,
                                     win_length=params['win_length'],
                                     sr=params['sampling_rate'],
                                     hop_length=params['hop_length'],
                                     n_fft=params['nfft'],
                                     min_slice=params['min_slice'])
        specs = np.expand_dims(np.expand_dims(specs[0], 0), -1)

        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)[:,0,:]
    preprocess.similar(feats)
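preprocess.similar() is not shown here. As a hedged stand-in, pairwise cosine similarity over the embedding matrix (which the dot-product scoring in Example #12 suggests is L2-normalized) can be sketched as:

# Hypothetical stand-in for preprocess.similar(); assumes L2-normalized rows.
import numpy as np

def similar(feats):
    # For unit-norm embeddings, cosine similarity is a plain dot product.
    sim = np.dot(feats, feats.T)  # (N, N) similarity matrix
    print(sim)
    return sim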
Example #4
def diarize(segments, sr=16000, win_len=400, hop_len=160, embedding_per_sec=1.0, overlap_rate=0.1):
    logger.debug("[Speaker diarization] Initializing models")
    # Initialize ghostvlad
    toolkits.initialize_GPU(Expando({"gpu": ""}))
    ghostvlad_model = model.vggvox_resnet2d_icassp(input_dim=(257, None, 1),
                                                   num_class=5994,
                                                   mode="eval",
                                                   args=Expando({"net": "resnet34s",
                                                                 "loss": "softmax",
                                                                 "vlad_cluster": 8,
                                                                 "ghost_cluster": 2,
                                                                 "bottleneck_dim": 512,
                                                                 "aggregation_mode": "gvlad"}))
    ghostvlad_model.load_weights("ghostvlad/pretrained/weights.h5", by_name=True)

    # Initialize uisrnn
    sys.argv = sys.argv[:1]
    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnn_model = uisrnn.UISRNN(model_args)
    uisrnn_model.load("uisrnn/pretrained/saved_model.uisrnn_benchmark")

    logger.debug("[Speaker diarization] Calculating utterance features")
    utterances_spec = prepare_ghostvlad_data(segments, sr, win_len, hop_len, embedding_per_sec, overlap_rate)
    feats = []
    for spec in utterances_spec:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = ghostvlad_model.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)

    logger.debug("[Speaker diarization] Clustering utterance features")
    labels = uisrnn_model.predict(feats, inference_args)

    logger.debug("[Speaker diarization] Tagging segments speakers")
    embedding_duration = (1/embedding_per_sec) * (1.0 - overlap_rate)
    labels_count = len(labels)
    current = 0
    for segment in segments:
        begin_index = math.floor(current/embedding_duration)
        current += segment.end-segment.begin
        end_index = math.ceil(current/embedding_duration)
        segment_labels = [labels[index] for index in range(begin_index, min(end_index, labels_count))]
        if len(segment_labels) > 0:
            segment.speaker = max(segment_labels, key=segment_labels.count)
        else:
            segment.speaker = 999
    return segments
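A hedged usage sketch for diarize(). The Segment container here is an assumption, inferred only from the begin/end/speaker attributes the function touches:

# Hypothetical Segment type and driver; not part of the original example.
class Segment:
    def __init__(self, begin, end):
        self.begin, self.end = begin, end  # times in seconds
        self.speaker = None                # filled in by diarize()

segments = [Segment(0.0, 2.5), Segment(2.5, 4.0), Segment(4.0, 7.2)]
for seg in diarize(segments):
    print(seg.begin, seg.end, seg.speaker)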
Example #5
def load_model():
    toolkits.initialize_GPU(args)
    global network_eval
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please type in the model to load')
Example #6
    def __init__(self):
        netConfig = {
            'net': 'resnet34s',
            'ghost_cluster': 2,
            'vlad_cluster': 8,
            'bottleneck_dim': 512,
            'aggregation_mode': 'gvlad',
            'loss': 'softmax',
            'dim': (257, None, 1),
            'n_classes': 5994,
        }
        netConfig = namedtuple("NetConfig",
                               netConfig.keys())(*netConfig.values())

        self.net = model.vggvox_resnet2d_icassp(input_dim=netConfig.dim,
                                                num_class=netConfig.n_classes,
                                                mode='eval',
                                                args=netConfig)
        self.net.load_weights(
            '../model/gvlad_softmax/resnet34_vlad8_ghost2_bdim512_deploy/weights.h5',
            by_name=True)
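    # Hedged sketch: the constructor above is all the original shows; an
    # embedding method such a wrapper would plausibly expose (the name
    # `embed` is an assumption, not from the source).
    def embed(self, spec):
        # spec: a (1, 257, T, 1) spectrogram, as built elsewhere in these examples.
        return self.net.predict(spec)[0]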
Example #7
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    #   get speaker id from folder name
    totalList = [os.path.join(dataPath, file) for file in os.listdir(dataPath)]
    uniqueList = np.unique(totalList)
    speakerList = [extractSpeakerId(u) for u in uniqueList]

    #   get audio file for each speaker
    speakerAudioDict = {}
    for speaker in speakerList:

        #   root path
        rootPath = os.path.join(dataPath, speaker)

        #   get list of files
        fileList = getListOfFiles(rootPath)

        #   add to dict
        speakerAudioDict[speaker] = fileList

    #   get embedding for each audio of speaker
    speakerToFeatureDict = {}
    for speaker in speakerList:

        # construct the data generator.
        params = {
            'dim': (257, None, 1),
            'nfft': 512,
            'min_slice': 720,
            'win_length': 400,
            'hop_length': 160,
            'n_classes': 5994,
            'sampling_rate': 16000,
            'normalize': True,
        }

        network_eval = model.vggvox_resnet2d_icassp(
            input_dim=params['dim'],
            num_class=params['n_classes'],
            mode='eval',
            args=args)

        # ==> load pre-trained model ???
        if args.resume:
            # ==> get real_model from arguments input,
            # load the model if the imag_model == real_model.
            if os.path.isfile(args.resume):
                network_eval.load_weights(os.path.join(args.resume),
                                          by_name=True)
                print('==> successfully loading model {}.'.format(args.resume))
            else:
                raise IOError("==> no checkpoint found at '{}'".format(
                    args.resume))
        else:
            raise IOError('==> please type in the model to load')

        feats = []
        for ID in speakerAudioDict[speaker]:
            specs = preprocess.load_data(ID,
                                         split=False,
                                         win_length=params['win_length'],
                                         sr=params['sampling_rate'],
                                         hop_length=params['hop_length'],
                                         n_fft=params['nfft'],
                                         min_slice=params['min_slice'])
            specs = np.expand_dims(np.expand_dims(specs[0], 0), -1)

            v = network_eval.predict(specs)
            feats += [v]
        speakerToFeatureDict[speaker] = feats

    #   save to file
    with open('speaker_data.pickle', 'wb') as handle:
        pickle.dump(speakerToFeatureDict,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
Example #8
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    import generator

    # ==================================
    #       Get Train/Val.
    # ==================================
    trnlist, trnlb = toolkits.get_hike_datalist(
        meta_paths=args.train_meta_data_path,
        data_paths=args.train_data_path,
        mode=model_config['loss'])
    vallist, vallb = toolkits.get_hike_datalist(
        meta_paths=args.val_meta_data_path,
        data_paths=args.val_data_path,
        mode=model_config['loss'])

    input_length = int(args.audio_length * 25)
    num_class = len(score_rule)
    # construct the data generator.
    params = {
        'dim': (513, input_length, 1),
        'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
        'nfft': 1024,
        'spec_len': input_length,
        'win_length': 1024,
        'hop_length': 640,
        'n_classes': num_class,
        'sampling_rate': 16000,
        'batch_size': model_config['batch_size'],
        'shuffle': True,
        'normalize': True,
        'loss': model_config['loss'],
        'data_format': args.data_format
    }

    # Datasets
    partition = {'train': trnlist.flatten(), 'val': vallist.flatten()}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    wandb.init(project='vgg_speaker')
    trn_gen = generator.DataGenerator(partition['train'], labels['train'],
                                      **params)
    val_gen = generator.DataGenerator(partition['val'], labels['val'],
                                      **params)
    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train',
                                           args=model_config)
    # # val data
    # val_data = [params['mp_pooler'].apply_async(ut.load_data,
    #                                 args=(ID, params['win_length'], params['sampling_rate'], params['hop_length'],
    #                                       params['nfft'], params['spec_len'], 'train', args.data_format)) for ID in partition['val']]
    # val_data = np.expand_dims(np.array([p.get() for p in val_data]), -1)

    # ==> load pre-trained model ???
    print(keras.backend.tensorflow_backend._get_available_gpus())

    if args.resume:
        print("Attempting to load", args.resume)
        if os.path.isfile(args.resume):
            network.load_weights(os.path.join(args.resume),
                                 by_name=True,
                                 skip_mismatch=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise ValueError("==> no checkpoint found at '{}'".format(
                args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} '
          'loss: {}, aggregation: {}, ohemlevel: {}'.format(
              args.gpu, len(partition['train']), np.max(labels['train']),
              model_config['loss'], model_config['aggregation_mode'],
              model_config['ohem_level']))

    model_path, log_path = set_path(args, model_config)
    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    # tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=0, write_graph=True, write_images=False,
    #                                           update_freq=model_config['batch_size'] * 16)
    callbacks = [
        keras.callbacks.ModelCheckpoint(
            os.path.join(model_path, 'weights-{epoch:02d}-{loss:.3f}.h5'),
            monitor='loss',
            mode='min',
            save_best_only=True,
            period=20,
        ), normal_lr,
        WandbCallback()
    ]

    if model_config['ohem_level'] > 1:  # online hard negative mining will be used
        candidate_steps = int(
            len(partition['train']) // model_config['batch_size'])
        iters_per_epoch = int(
            len(partition['train']) //
            (model_config['ohem_level'] * model_config['batch_size']))

        ohem_generator = generator.OHEM_generator(
            network, trn_gen, candidate_steps, model_config['ohem_level'],
            model_config['batch_size'], params['dim'], params['n_classes'])

        A = ohem_generator.next()  # for some reason, I need to warm up the generator

        network.fit_generator(generator.OHEM_generator(
            network, trn_gen, iters_per_epoch, model_config['ohem_level'],
            model_config['batch_size'], params['dim'], params['n_classes']),
                              steps_per_epoch=iters_per_epoch,
                              epochs=model_config['epochs'],
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)

    else:
        # Both loss settings ('mse' and otherwise) train with the same
        # fit configuration, so a single call covers both cases.
        network.fit_generator(trn_gen,
                              steps_per_epoch=int(
                                  len(partition['train']) //
                                  model_config['batch_size']),
                              epochs=model_config['epochs'],
                              max_queue_size=10,
                              validation_data=val_gen,
                              validation_freq=1,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)
Example #9
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    # ==================================
    #       Get Train/Val.
    # ==================================
    print('==> calculating test({}) data lists...'.format(args.test_type))

    publicTest = pd.read_csv("/content/VoveDataset/public-test.csv")

    list1 = addPath(np.array(publicTest["audio_1"]))
    list2 = addPath(np.array(publicTest["audio_2"]))

    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)
    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            result_path = "/content/VGG-Speaker-Recognition/result"
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    feats = []
    for ID in pbar(unique_list):
        specs = ut.load_data(ID,
                             win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)

        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)
    np.save("/content/feats.npy", feats)
Example #10
# ===========================================
def wav2spec(wav):
    # Double the signal by appending its mirror image so short clips
    # still produce enough STFT frames.
    wav = np.append(wav, wav[::-1])
    wav = wav.astype(np.float64)  # np.float is removed in recent NumPy
    linear_spect = librosa.stft(wav, n_fft=512, win_length=400, hop_length=160).T
    mag, _ = librosa.magphase(linear_spect)
    spec_mag = mag.T  # (257 freq bins, T frames)
    # Normalize each time frame by its mean/std across frequency bins.
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag, 0, keepdims=True)
    specs = (spec_mag - mu) / (std + 1e-5)
    specs = np.expand_dims(np.expand_dims(specs, 0), -1)  # (1, 257, T, 1)
    return specs

if __name__ == '__main__':
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Uncomment if a GPU is available; it will speed up feature extraction.
    network_eval = model.vggvox_resnet2d_icassp(input_dim=(257, None, 1), num_class=5994, mode='eval')
    network_eval.load_weights(os.path.join('model/weights.h5'), by_name=True)
    my_model = Mymodel(network_eval)

    wav1_path = "audio/spk1_1.wav"
    wav2_path = "audio/spk2_2.wav"
    audio1, sr = sf.read(wav1_path)
    audio2, sr = sf.read(wav2_path)
    spec1 = wav2spec(audio1)
    spec2 = wav2spec(audio2)
    t0 = time.time()
    feat1 = my_model.get_feats(spec1)
    t1 = time.time()
    print("{} 语音时长: {}s,提取该语音所需时间: {} s".format(wav1_path, len(audio1)/sr, t1-t0))
    feat2 = my_model.get_feats(spec2)
    print("{} 语音时长: {}s,提取该语音所需时间: {} s".format(wav2_path, len(audio2)/sr, time.time()-t1))
Example #11
    # gpu configuration
    toolkits.initialize_GPU(args)
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)
    while True:
        print("Start speaking")
        myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
        print(type(myrecording))
        sd.wait()  # wait until recording is finished
        print("Finished recording")
        write('wavs/output12.wav', fs, myrecording)
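        # The original loop stops after writing the recording; a plausible
        # continuation, mirroring Example #20's embedding-and-clustering
        # pipeline (load_data is assumed to be in scope; this whole block
        # is an assumption, not original code).
        specs, intervals = load_data('wavs/output12.wav',
                                     embedding_per_second=1.0,
                                     overlap_rate=0.5)
        feats = []
        for spec in specs:
            spec = np.expand_dims(np.expand_dims(spec, 0), -1)
            feats += [network_eval.predict(spec)]
        feats = np.array(feats)[:, 0, :].astype(float)
        labels = uisrnnModel.predict(feats, inference_args)
        print(labels)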
Example #12
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    # ==================================
    #       Get Train/Val.
    # ==================================
    print('==> calculating test({}) data lists...'.format(args.test_type))

    if args.test_type == 'normal':
        verify_list = np.loadtxt('../meta/voxceleb1_veri_test.txt', str)
    elif args.test_type == 'hard':
        verify_list = np.loadtxt('../meta/voxceleb1_veri_test_hard.txt', str)
    elif args.test_type == 'extend':
        verify_list = np.loadtxt('../meta/voxceleb1_veri_test_extended.txt',
                                 str)
    else:
        raise IOError('==> unknown test type.')

    verify_lb = np.array([int(i[0]) for i in verify_list])
    list1 = np.array([os.path.join(args.data_path, i[1]) for i in verify_list])
    list2 = np.array([os.path.join(args.data_path, i[2]) for i in verify_list])

    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {
        'dim': (257, 100, 1),
        'nfft': 512,
        'spec_len': 100,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            result_path = set_result_path(args)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    for c, ID in enumerate(unique_list):
        if c % 50 == 0:
            print('Finish extracting features for {}/{}th wav.'.format(
                c, total_length))
        specs = ut.load_data(ID,
                             win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)

        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)

    # ==> compute the pair-wise similarity.
    for c, (p1, p2) in enumerate(zip(list1, list2)):
        ind1 = np.where(unique_list == p1)[0][0]
        ind2 = np.where(unique_list == p2)[0][0]

        v1 = feats[ind1, 0]
        v2 = feats[ind2, 0]

        scores += [np.sum(v1 * v2)]
        labels += [verify_lb[c]]
        print('scores : {}, gt : {}'.format(scores[-1], verify_lb[c]))

    scores = np.array(scores)
    labels = np.array(labels)

    np.save(os.path.join(result_path, 'prediction_scores.npy'), scores)
    np.save(os.path.join(result_path, 'groundtruth_labels.npy'), labels)

    eer, thresh = toolkits.calculate_eer(labels, scores)
    print('==> model : {}, EER: {}'.format(args.resume, eer))
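toolkits.calculate_eer is not shown in these snippets. A common way to compute an equal error rate from scores and labels, offered here as a hedged stand-in rather than the repo's actual implementation:

# Hedged stand-in for toolkits.calculate_eer using scikit-learn's ROC curve.
import numpy as np
from sklearn.metrics import roc_curve

def calculate_eer(labels, scores):
    fpr, tpr, thresholds = roc_curve(labels, scores)
    fnr = 1 - tpr
    idx = np.nanargmin(np.abs(fnr - fpr))  # operating point where FPR ~= FNR
    return (fpr[idx] + fnr[idx]) / 2, thresholds[idx]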
Example #13
def main(args):
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    _ = tf.Session(config=config)
    # ==================================
    #       Get Train/Val.
    # ==================================
    trnlist, trnlb = utils.get_voxceleb2_datalist(path=args.train_list)
    vallist, vallb = utils.get_voxceleb2_datalist(path=args.val_list)

    # construct the data generator.
    params = {'dim': (257, 250, 1),
              'mp_pooler': utils.set_mp(processes=args.multiprocess),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': args.n_classes,
              'sampling_rate': 16000,
              'batch_size': args.batch_size,
              'shuffle': True,
              'normalize': True,
              }

    # Datasets
    partition = {'train': trnlist.flatten(), 'val': vallist.flatten()}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    trn_gen = generator.DataGenerator(partition['train'], labels['train'], **params)
    val_gen = generator.DataGenerator(partition['val'], labels['val'], **params)
    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train', args=args)

    # ==> load pre-trained model
    mgpu = len(keras.backend.tensorflow_backend._get_available_gpus())
    initial_epoch = 0
    if args.resume:
        if os.path.isfile(args.resume):
            if mgpu == 1:
                network.load_weights(os.path.join(args.resume))
            else:
                network.layers[mgpu + 1].load_weights(os.path.join(args.resume))
            initial_epoch = int(os.path.basename(args.resume).split('-')[1])
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            print("==> no checkpoint found at '{}'".format(args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} loss: {}, aggregation: {}'
          .format(args.gpu, len(partition['train']), np.max(labels['train']), args.loss, args.aggregation_mode))

    model_path, log_path = set_path(args)
    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=0, write_graph=True, write_images=False,
                                              update_freq=args.batch_size * 16)
    callbacks = [keras.callbacks.ModelCheckpoint(os.path.join(model_path, 'weights-{epoch:02d}-{acc:.3f}.h5'),
                                                 monitor='loss',
                                                 mode='min',
                                                 save_best_only=True),
                 normal_lr, tbcallbacks]

    network.fit_generator(generator=trn_gen,
                          steps_per_epoch=int(len(partition['train']) // args.batch_size),
                          epochs=args.epochs,
                          initial_epoch=initial_epoch,
                          max_queue_size=10,
                          callbacks=callbacks,
                          use_multiprocessing=True,
                          validation_data=val_gen,
                          workers=4,
                          verbose=1)
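step_decay is referenced by the LearningRateScheduler above but never defined in these snippets. A typical stepwise Keras schedule of that shape, offered as an assumption rather than the repo's exact curve:

# Hypothetical step_decay for keras.callbacks.LearningRateScheduler.
def step_decay(epoch, initial_lr=0.001, drop=0.1, epochs_per_drop=10):
    # Cut the learning rate by `drop` every `epochs_per_drop` epochs.
    return initial_lr * (drop ** (epoch // epochs_per_drop))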
Example #14
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    # ==================================
    #       Get Train/Val.
    # ==================================
    vallist, vallb = toolkits.get_hike_datalist(
        meta_paths=args.test_meta_data_path,
        data_paths=args.test_data_path,
        mode=model_config['loss'])
    _, valscore = toolkits.get_hike_datalist(
        meta_paths=args.test_meta_data_path,
        data_paths=args.test_data_path,
        mode='mse')

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    num_class = len(score_rule)
    input_length = int(args.audio_length * 25)
    params = {
        'dim': (513, None, 1),
        'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
        'nfft': 1024,
        'spec_len': input_length,
        'win_length': 1024,
        'hop_length': 640,
        'n_classes': num_class,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=model_config)
    # ==> load pre-trained model ???
    if args.resume:
        # ==> get real_model from arguments input,
        # load the model if the imag_model == real_model.
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume),
                                      by_name=True,
                                      skip_mismatch=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    v = []
    for ID in vallist:
        val_data = ut.load_data(ID, params['win_length'],
                                params['sampling_rate'], params['hop_length'],
                                params['nfft'], params['spec_len'], 'test',
                                args.data_format)
        info = network_eval.predict(np.expand_dims(val_data, (0, -1)))
        v += info.tolist()
    v = np.array(v)

    print('val data shape {}'.format(v.shape))
    if model_config['loss'] == 'mse':
        v = v.T[0] * 10 + 5
        vallb = vallb * 10 + 5
        metric = np.square(np.subtract(v, vallb)).mean()
        print('mse: ', metric)
        v_test = np.vstack([v, vallb]).astype('float').T
        df = np.hstack([vallist.reshape(-1, 1), v_test])
        df = pd.DataFrame(data=df,
                          columns=['content', 'score_predict', 'score_true'])
    else:
        valscore = valscore * 10 + 5
        v_predict = ((v < 0.5) * 1)[:, 0]
        metric = sum(v_predict == vallb) / len(vallb)
        print('confusion matrix: ', confusion_matrix(vallb, v_predict))
        print('accuracy ', metric)
        v_test = np.hstack([v,
                            vallb.reshape(-1, 1),
                            valscore.reshape(-1, 1)]).astype('float')
        df = np.hstack([vallist.reshape(-1, 1), v_test])
        df = pd.DataFrame(data=df,
                          columns=[
                              'content', 'prob_0', 'prob_1', 'true_label',
                              'score_true'
                          ])

    date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    df.to_csv(
        os.path.join(args.save_dir,
                     '{}_{}_{}.csv'.format(date, model_config['loss'],
                                           metric)))
Example #15
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    import generator

    # ==================================
    #       Get Train/Val.
    # ==================================

    trnlist, trnlb, l2i = toolkits.load_from_kaldi_dir(args,
                                                       "train",
                                                       min_len=300)
    vallist, vallb, _ = toolkits.load_from_kaldi_dir(args,
                                                     "val",
                                                     min_len=300,
                                                     label2idx=l2i)
    if args.cmvn:
        cmvn_stats = kaldiio.load_mat(args.cmvn)
        mean_stats = cmvn_stats[0, :-1]
        count = cmvn_stats[0, -1]
        offset = np.expand_dims(mean_stats, 0) / count
        print("offset", offset)
        CMVN = offset

    else:
        CMVN = None

    if args.post_cmvn:
        cmvn_stats = kaldiio.load_mat(args.post_cmvn)
        mean_stats = cmvn_stats[0, :-1]
        count = cmvn_stats[0, -1]
        offset = np.expand_dims(mean_stats, 0) / count
        print("offset", offset)
        POSTCMVN = offset

    else:
        POSTCMVN = None

    # construct the data generator.
    params = {
        'dim': (args.dim, 300, 1),
        'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
        'nfft': 512,
        'spec_len': 300,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 8,
        'sampling_rate': 16000,
        'tandem': args.tandem,
        'batch_size': args.batch_size,
        'shuffle': True,
        'normalize': False,
        'cmvn': CMVN,
        'postcmvn': POSTCMVN
    }

    # Datasets
    partition = {'train': trnlist, 'val': vallist}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    trn_gen = generator.DataGenerator(partition['train'], labels['train'],
                                      **params)
    val_gen = generator.DataGenerator(partition['val'], labels['val'],
                                      **params)
    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train',
                                           args=args)
    # ==> load pre-trained model ???
    mgpu = len(keras.backend.tensorflow_backend._get_available_gpus())

    if args.resume:
        print("Attempting to load", args.resume)
        if os.path.isfile(args.resume):
            if mgpu == 1:
                # by_name=True, skip_mismatch=True
                # https://github.com/WeidiXie/VGG-Speaker-Recognition/issues/46
                network.load_weights(os.path.join(args.resume),
                                     by_name=True,
                                     skip_mismatch=True)
            else:
                network.layers[mgpu + 1].load_weights(
                    os.path.join(args.resume))
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            print("==> no checkpoint found at '{}'".format(args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} '
          'loss: {}, aggregation: {}, ohemlevel: {}'.format(
              args.gpu, len(partition['train']), np.max(labels['train']),
              args.loss, args.aggregation_mode, args.ohem_level))

    model_path, log_path = set_path(args)
    with open(os.path.join(model_path, 'label2idx'), 'w') as f:
        for key in l2i.keys():
            f.write(key + ' ' + str(l2i[key]) + '\n')

    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path,
                                              histogram_freq=0,
                                              write_graph=True,
                                              write_images=False,
                                              update_freq=args.batch_size * 16)
    callbacks = [
        keras.callbacks.ModelCheckpoint(os.path.join(
            model_path, 'weights-{epoch:02d}-{val_loss:.3f}.h5'),
                                        monitor='val_loss',
                                        mode='min',
                                        save_best_only=True), normal_lr,
        tbcallbacks
    ]

    if args.ohem_level > 1:  # online hard negative mining will be used
        candidate_steps = int(len(partition['train']) // args.batch_size)
        iters_per_epoch = int(
            len(partition['train']) // (args.ohem_level * args.batch_size))

        ohem_generator = generator.OHEM_generator(
            network, trn_gen, candidate_steps, args.ohem_level,
            args.batch_size, params['dim'], params['n_classes'])

        A = ohem_generator.next()  # for some reason, I need to warm up the generator

        network.fit_generator(generator.OHEM_generator(
            network, trn_gen, iters_per_epoch, args.ohem_level,
            args.batch_size, params['dim'], params['n_classes']),
                              steps_per_epoch=iters_per_epoch,
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)

    else:
        network.fit_generator(trn_gen,
                              validation_data=val_gen,
                              steps_per_epoch=int(
                                  len(partition['train']) // args.batch_size),
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=True,
                              workers=12,
                              verbose=1)
Example #16
def main(args):
    # Reduce GPU memory usage (allow growth on demand)
    config = tensorflow.ConfigProto()
    config.gpu_options.allow_growth = True
    _ = tensorflow.Session(config=config)
    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': args.n_classes,
        'sampling_rate': 16000,
        'normalize': True
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    # ==> load pre-trained model
    network_eval.load_weights(os.path.join(args.resume), by_name=True)
    print('==> successfully loading model {}.'.format(args.resume))

    start = time.time()
    # Extract the features of the first utterance
    specs1 = utils.load_data(args.audio1_path,
                             win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
    specs1 = np.expand_dims(np.expand_dims(specs1, 0), -1)
    feature1 = network_eval.predict(specs1)[0]

    # Extract the features of the second utterance
    specs2 = utils.load_data(args.audio2_path,
                             win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
    specs2 = np.expand_dims(np.expand_dims(specs2, 0), -1)
    feature2 = network_eval.predict(specs2)[0]
    end = time.time()

    dist = np.dot(feature1, feature2.T)
    if dist > 0.8:
        print("%s 和 %s 为同一个人,相似度为:%f,平均预测时间:%dms" %
              (args.audio1_path, args.audio2_path, dist,
               round((end - start) * 1000) / 2))
    else:
        print("%s 和 %s 不是同一个人,相似度仅为:%f,平均预测时间:%dms" %
              (args.audio1_path, args.audio2_path, dist,
               round((end - start) * 1000) / 2))
Example #17
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)

    #specs1,interval1 = load_data(r'wavs/REC20190716140159.wav', embedding_per_second=1.2, overlap_rate=0.4)
    #mapTable1,keys1 =genMap(interval1)
    mapTable, keys = genMap(intervals)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]


# =============================================================================
#     for spec1 in specs1:
#         spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
#         v = network_eval.predict(spec1)
#         feats += [v]
# =============================================================================
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    #print(len(feats),'00000000')
    #predicted_label = uisrnnModel.predict(feats, inference_args)

    # silhouette score
    # =============================================================================
    #     sli=[]
    #     fromsel=[]
    #     li=[]
    #     knum=[]
    #     for i in range(10):
    #         li=[]
    #         range_n_clusters = list (range(2,5))
    #         for n_clusters in range_n_clusters:
    #             clusterer = KMeans(n_clusters=n_clusters)
    #             preds = clusterer.fit_predict(feats)
    #             centers = clusterer.cluster_centers_
    #
    #             score = silhouette_score (feats, preds, metric='euclidean')
    #             print ("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))
    #             li.append([n_clusters,score,clusterer,centers])
    #     # =============================================================================
    #     #     print([float(str(i[1])[:4]) for i in li])
    #     #     kvalue=(max([float(str(i[1])[:4]) for i in li]))
    #     #     for i in range(len(li)):
    #     #         if kvalue==float(str(li[i][1])[:4]):
    #     #             true_k=li[i][0]
    #     #             break
    #     # =============================================================================
    #         maxi=li[0][1]
    #         for i in range(1,len(li)):
    #             if li[i][1]-maxi>=0.005:
    #                 maxi=li[i][1]
    #         for i in li:
    #             if i[1]==maxi:
    #                 true_k=i[0]
    #     # =============================================================================
    #     #     maxi=max([i[1] for i in li])
    #     #     for i in li:
    #     #         if i[1]==maxi:
    #     #             true_k=i[0]
    #     # =============================================================================
    #         fromsel.append(li[true_k-2])
    #         print(true_k)
    #         knum.append(true_k)
    #     kval=(max(set(knum), key=knum.count))
    #     print(kval)
    # =============================================================================

    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.95,
                                  gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)

    # =============================================================================
    #     clusters = KMeans(n_clusters=40, init='k-means++', max_iter=100, n_init=1, random_state = 0)
    #     clusters.fit(feats)
    #     tsne = TSNEVisualizer()
    #     tsne.fit(feats, ["c{}".format(c) for c in clusters.labels_])
    #     tsne.poof()
    # =============================================================================

    global no_speakers
    no_speakers = len(set(predicted_label))
    #print(predicted_label,'**************************')
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
Example #18
def extract_embeddings(input_path=time_100_emp_train, mode='train'):
    toolkits.initialize_GPU(args)
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    if args.resume:
        weight_path = os.path.join(base_path, args.resume)
        if os.path.isfile(weight_path):
            print('loading graph')
            network_eval.load_weights(weight_path, by_name=True)
        else:
            return 'Issue with loading graph'
    else:
        return 'Pre-trained graph is required'

    if mode == 'train':
        audio_files = [
            filename for filename in Path(input_path).rglob('*.wav')
        ]
        total_files = len(audio_files) * 10
        working_file = 0
        emb_store = {}
        for audio in audio_files:
            print(f'processing {os.path.basename(os.path.dirname(audio))} ')
            specs = ut.load_data_aug(audio,
                                     win_length=params['win_length'],
                                     sr=params['sampling_rate'],
                                     hop_length=params['hop_length'],
                                     n_fft=params['nfft'],
                                     spec_len=params['spec_len'],
                                     mode='eval')
            count_file = 0
            for sample in specs:
                print(f'Augmentation count is {count_file}')
                print(f'Processing file {working_file} of {total_files}')
                sample_spec = np.expand_dims(np.expand_dims(sample, 0), -1)
                class_label = os.path.basename(os.path.dirname(audio))
                v = network_eval.predict(sample_spec)

                old_data = []
                if class_label in emb_store.keys():
                    pre_data = emb_store.get(class_label)
                    pre_data.append(v[0])
                    old_data = pre_data
                else:
                    old_data.append(v[0])
                emb_store[class_label] = old_data

                count_file += 1
                working_file += 1
                logging.info(f'For {audio} label stored is {class_label}')

        with open('../data/training_features_augmented.pickle',
                  'wb') as handle:
            pickle.dump(emb_store, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        specs = ut.load_data(input_path,
                             win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'],
                             mode='eval')
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        vector_embedding = network_eval.predict(specs)[0]
        return vector_embedding
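For completeness, a short hedged sketch of reading back the pickle written in the 'train' branch above:

# Hedged sketch: reload the embedding store written by extract_embeddings().
import pickle

with open('../data/training_features_augmented.pickle', 'rb') as handle:
    emb_store = pickle.load(handle)  # {class_label: [512-dim embeddings, ...]}
print({k: len(v) for k, v in emb_store.items()})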
Example #19
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    # ==================================
    #       Get Train/Val.
    # ==================================
    print('Calculating test data lists...')
    
    # AI project list file
    if args.test_type == 'ai':
        verify_list = np.loadtxt('model/meta/sets.txt', str)
    else:
        raise IOError('Unknown test type.')

    verify_lb = np.array([int(i[0]) for i in verify_list])
    list1 = np.array([os.path.join(args.data_path, i[1]) for i in verify_list])
    list2 = np.array([os.path.join(args.data_path, i[2]) for i in verify_list])

    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load pre-trained model
    if args.resume:        
        # Load pretrained weight
        if os.path.isfile('model/src/weights.h5'):
            network_eval.load_weights('model/src/weights.h5', by_name=True)
            print('Successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("No checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('Please type in the model to load')

    print('\nStart testing...')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    for c, ID in enumerate(unique_list):
        specs = ut.load_data(ID, win_length=params['win_length'], sr=params['sampling_rate'],
                             hop_length=params['hop_length'], n_fft=params['nfft'],
                             spec_len=params['spec_len'], mode='eval')
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
    
        v = network_eval.predict(specs)
        feats += [v]
    
    feats = np.array(feats)

    allscores = []
    match = []
    nomatch = []

    # ==> compute the pair-wise similarity.
    print("Model 1 scores")
    for c, (p1, p2) in enumerate(zip(list1, list2)):
        ind1 = np.where(unique_list == p1)[0][0]
        ind2 = np.where(unique_list == p2)[0][0]

        v1 = feats[ind1, 0]
        v2 = feats[ind2, 0]

        scores += [np.sum(v1*v2)]
        labels += [verify_lb[c]]

        if c != 0 and verify_lb[c] == 1:
            match.append(scores[-1])
        elif verify_lb[c] == 0:
            nomatch.append(scores[-1])

        allscores.append(scores[-1])
        print('Score : {}'.format(scores[-1]))
    
    # For evaluation
    # match = [str(x) for x in match]
    # nomatch = [str(x) for x in nomatch]

    # with open("./eval/result.txt", "a") as w:
    #     matches = ','.join(match)
    #     nomatches = ','.join(nomatch)
    #     w.write(matches+'\n')
    #     w.write(nomatches+'\n')

    with open("result1.pickle", "wb") as w:
        pickle.dump(scores, w)
Example #20
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    # for spk,timeDicts in speakerSlice.items():
    #     print('========= ' + str(spk) + ' =========')
    #     for timeDict in timeDicts:
    #         s = timeDict['start']
    #         e = timeDict['stop']
    #         s = fmtTime(s)  # change point moves to the center of the slice
    #         e = fmtTime(e)
    #         print(s+' ==> '+e)
    # p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    # p.draw()
    # p.plot.show()
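    # `speech_reg` is assumed to be the SpeechRecognition package
    # (import speech_recognition as speech_reg); AudioSegment comes from pydub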
    speech_r = speech_reg.Recognizer()
    sound = AudioSegment.from_wav(wav_path)
    for spk in speakerSlice.keys():
        print('========= ' + str(spk) + ' =========')
        for item_dict in speakerSlice[spk]:
            audio_seg = sound[item_dict['start']:item_dict['stop']]
            s = item_dict['start']
            e = item_dict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)
            filename = 'speaker' + str(spk) + '-' + str(
                item_dict['start'] / 1000) + '-' + str(
                    item_dict['stop'] / 1000) + '.wav'
            audio_seg.export(filename, format="wav")
            audio = speech_reg.AudioFile(filename)
            # words=speech_reg.AudioData(audio_seg,sample_rate=fs,sample_width=2)
            with audio as source:
                words = speech_r.record(source)
                try:
                    res = speech_r.recognize_google(words)
                except speech_reg.UnknownValueError:
                    try:
                        res = speech_r.recognize_sphinx(words)
                    except speech_reg.UnknownValueError:
                        res = ''
                item_dict.update({'content': res})
            print(res)

    return speakerSlice
Example #21
0
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model

    params = {
        'dim': (513, None, 1),
        'n_fft': 1024,
        'win_length': 1024,
        'hop_length': 640,
        'n_classes': 2,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    # ==> load pre-trained model.
    if args.resume:
        # load the weights from the checkpoint supplied via --resume
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume),
                                      by_name=True,
                                      skip_mismatch=True)
            print('==> successfully loaded model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please specify a model checkpoint to load')

    print('==> start testing.')
    if args.data_path.endswith(('.wav', '.m4a', '.mp3')):
        wav, sr_ret = librosa.load(args.data_path,
                                   sr=params['sampling_rate'],
                                   offset=5)
        linear_spect = ut.lin_spectogram_from_wav(wav, params['hop_length'],
                                                  params['win_length'],
                                                  params['n_fft'])
        print('sample_rate is ', sr_ret)
    elif args.data_path.endswith('.npy'):
        linear_spect = np.load(args.data_path)
    else:
        raise IOError('Unsupported input: expected .wav/.m4a/.mp3 audio or a .npy spectrogram')

    mag, _ = librosa.magphase(linear_spect)  # magnitude
    spec_mag = mag.T
    mu = np.mean(spec_mag, 0, keepdims=True)
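    # scaling by 1e5 before np.std (then dividing it back out) is mathematically
    # a no-op; presumably it guards against float32 underflow on near-silent input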
    std = np.std(spec_mag * (10**5), 0, keepdims=True) / (10**5)
    spec_mag = (spec_mag - mu) / (std + 1e-3)
    spec_mag = np.expand_dims(spec_mag, (0, -1))
    print(spec_mag.shape)
    if args.loss == 'regression':
        v = network_eval.predict(spec_mag) * 10 + 5
        print('the predicted score is: {}'.format(v))
    else:
        v = network_eval.predict(spec_mag)
        print(v)
Example #22
0
def main():
    params = {
        'dim': (257, None, 1),
        'n_fft': 512,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 2,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    # ==> load pre-trained model.
    if args.resume:
        # load the weights from the checkpoint supplied via --resume
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume),
                                      by_name=True,
                                      skip_mismatch=True)
            print('==> successfully loaded model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please specify a model checkpoint to load')

    print('==> start testing.')

    audio_clip_dir = args.data_path
    clipname_list = os.listdir(audio_clip_dir)

    with open('../../meta_data_100.json', 'r') as f:
        meta_data = json.load(f)

    new_meta_data = []
    for id, score in meta_data:
        # find all corresponding audio clip
        id_audioclip_name_list = [i for i in clipname_list if id in i]
        clip_v = []
        for id_audioclip_name in id_audioclip_name_list:
            wav, sr_ret = librosa.load(os.path.join(audio_clip_dir,
                                                    id_audioclip_name),
                                       sr=params['sampling_rate'])
            linear_spect = ut.lin_spectogram_from_wav(wav,
                                                      params['hop_length'],
                                                      params['win_length'],
                                                      params['n_fft'])

            mag, _ = librosa.magphase(linear_spect)  # magnitude
            spec_mag = mag.T
            mu = np.mean(spec_mag, 0, keepdims=True)
            std = np.std(spec_mag * (10**5), 0, keepdims=True) / (10**5)
            spec_mag = (spec_mag - mu) / (std + 1e-3)
            spec_mag = np.expand_dims(spec_mag, (0, -1))
            v = network_eval.predict(spec_mag) * 10 + 5
            v = round(v[0][0], 2).astype('float')
            clip_v.append(v)

        if len(id_audioclip_name_list) != 0:
            if sum(clip_v) / len(clip_v) < 3:
                print('{} is selected, its predicted score is {}'.format(
                    id,
                    sum(clip_v) / len(clip_v)))
                new_meta_data.append((id, score, sum(clip_v) / len(clip_v)))

            # if abs(v-score) < 1.2:
            #     new_meta_data.append((id_audioclip_name, score, v, id))
            #     print('{} is selected, its rule score is {}, and its predicted score is {}'.format(id_audioclip_name, score, v))

    with open('meta_data_low.json', 'w') as f:
        json.dump(new_meta_data, f)
Example #23
0
def main(wav_path,
         embedding_per_second=1.0,
         overlap_rate=0.5,
         exportFile=None,
         expectedSpeakers=2):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # ms between successive embeddings
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # map slice times back to the original wav (which still contains silence)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e
    n_speakers = len(speakerSlice)
    print('N-Speakers:', n_speakers)
    global speaker_final
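    # `pdb` is assumed to be pydub's AudioSegment imported under that alias;
    # start from one empty segment per detected speaker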
    speaker_final = [pdb.empty()] * n_speakers
    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            diarization_try(wav_path, s / 1000, e / 1000, spk)
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    # Find the Top n Speakers
    speaker_final.sort(key=lambda speaker: speaker.duration_seconds,
                       reverse=True)
    speaker_final = speaker_final[0:expectedSpeakers]

    # Export the Files
    for itr, speaker in enumerate(speaker_final):
        write_path = exportFile + "_speaker" + str(itr) + ".wav"
        speaker.export(write_path, format="wav")

    del speaker_final
Example #24
0
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    # ==================================
    #       Get Train/Val.
    # ==================================

    total_list = [
        os.path.join(args.data_path, file)
        for file in os.listdir(args.data_path)
    ]
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'min_slice': 720,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval',
                                                args=args)

    # ==> load pre-trained model.
    if args.resume:
        # load the weights from the checkpoint supplied via --resume
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loaded model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(
                args.resume))
    else:
        raise IOError('==> please specify a model checkpoint to load')

    print('==> start testing.')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.

    train_cluster_id = []
    train_sequence = []
    SRC_PATH = r'/data/dataset/SpkWav120'

    wavDir = os.listdir(SRC_PATH)
    wavDir.sort()
    for i, spkDir in enumerate(wavDir):  # Each speaker's directory
        spk = spkDir  # speaker name
        wavPath = os.path.join(SRC_PATH, spkDir, 'audio')
        print('Processing speaker({}) : {}'.format(i, spk))

        for wav in os.listdir(wavPath):  # wavfile

            utter_path = os.path.join(wavPath, wav)
            feats = []
            specs = load_data(utter_path,
                              split=True,
                              win_length=params['win_length'],
                              sr=params['sampling_rate'],
                              hop_length=params['hop_length'],
                              n_fft=params['nfft'],
                              min_slice=params['min_slice'])
            if (len(specs) < 1):
                continue
            for spec in specs:
                spec = np.expand_dims(np.expand_dims(spec, 0), -1)
                v = network_eval.predict(spec)
                feats += [v]

            feats = np.array(feats)[:, 0, :]  # [splits, embedding dim]

            train_cluster_id.append([spk] * feats.shape[0])
            train_sequence.append(feats)

    np.savez('training_data',
             train_sequence=train_sequence,
             train_cluster_id=train_cluster_id)
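
A minimal sketch, not part of the original example, of consuming the
training_data.npz written above with the uisrnn package (the three-tuple
returned by uisrnn.parse_arguments is used as in the snippets above):

import numpy as np
import uisrnn

data = np.load('training_data.npz', allow_pickle=True)
train_sequence = list(data['train_sequence'])
train_cluster_id = list(data['train_cluster_id'])

model_args, training_args, _ = uisrnn.parse_arguments()
model_args.observation_dim = train_sequence[0].shape[1]  # embedding dim
model = uisrnn.UISRNN(model_args)
model.fit(train_sequence, train_cluster_id, training_args)
model.save('uisrnn_model.pth')  # hypothetical output path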
Example #25
0
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    import generator
    import keras

    # ==================================
    #       Get Train/Val.
    # ==================================
    trnlist, trnlb = toolkits.get_voxceleb2_datalist(
        args, path='../meta/vox2_train_wav.txt')
    vallist, vallb = toolkits.get_voxceleb2_datalist(
        args, path='../meta/vox2_val_wav.txt')

    # construct the data generator.
    params = {
        'dim': (257, 250, 1),
        'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'batch_size': args.batch_size,
        'shuffle': True,
        'normalize': True,
    }

    # Datasets
    partition = {'train': trnlist.flatten(), 'val': vallist.flatten()}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    trn_gen = generator.DataGenerator(partition['train'], labels['train'],
                                      **params)
    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train',
                                           args=args)

    # ==> load pre-trained model.
    mgpu = len(keras.backend.tensorflow_backend._get_available_gpus())
    if args.resume:
        if os.path.isfile(args.resume):
            if mgpu == 1:
                network.load_weights(os.path.join(args.resume))
            else:
                network.layers[mgpu + 1].load_weights(os.path.join(args.resume))
            print('==> successfully loaded model {}.'.format(args.resume))
        else:
            print("==> no checkpoint found at '{}'".format(args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} '
          'loss: {}, aggregation: {}, ohemlevel: {}'.format(
              args.gpu, len(partition['train']), np.max(labels['train']),
              args.loss, args.aggregation_mode, args.ohem_level))

    model_path, log_path = set_path(args)
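    # step_decay (the learning-rate schedule) and set_path are helpers defined
    # elsewhere in the training script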
    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path,
                                              histogram_freq=0,
                                              write_graph=True,
                                              write_images=False,
                                              update_freq=args.batch_size * 16)
    callbacks = [
        keras.callbacks.ModelCheckpoint(os.path.join(
            model_path, 'weights-{epoch:02d}-{acc:.3f}.h5'),
                                        monitor='loss',
                                        mode='min',
                                        save_best_only=True), normal_lr,
        tbcallbacks
    ]

    if args.ohem_level > 1:  # online hard negative mining will be used
        candidate_steps = int(len(partition['train']) // args.batch_size)
        iters_per_epoch = int(
            len(partition['train']) // (args.ohem_level * args.batch_size))

        ohem_generator = generator.OHEM_generator(
            network, trn_gen, candidate_steps, args.ohem_level,
            args.batch_size, params['dim'], params['n_classes'])

        # one warm-up draw is needed before handing the generator to fit_generator
        A = ohem_generator.next()

        network.fit_generator(generator.OHEM_generator(
            network, trn_gen, iters_per_epoch, args.ohem_level,
            args.batch_size, params['dim'], params['n_classes']),
                              steps_per_epoch=iters_per_epoch,
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)

    else:
        network.fit_generator(trn_gen,
                              steps_per_epoch=int(
                                  len(partition['train']) // args.batch_size),
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)
Example #26
0
def main(wav_path, check, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
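    # verification-by-diarization: if a reference file (`check`) is supplied,
    # its embeddings are appended to the test embeddings; if UIS-RNN still
    # predicts the same number of speakers, the reference voice matches one
    # of the speakers already found in wav_path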
    if check != '':
        specs1, interval1 = load_data(check,
                                      embedding_per_second=1.2,
                                      overlap_rate=0.4)
        mapTable1, keys1 = genMap(interval1)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    featss = np.array(feats)[:, 0, :].astype(float)
    predicted_label = uisrnnModel.predict(featss, inference_args)
    total_speaker = len(set(predicted_label))
    global no_speakers
    print("predicted_label: %s" % predicted_label)
    no_speakers = len(set(predicted_label))
    print('total no of speakers', no_speakers)
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # ms between successive embeddings
    if check != '':
        for spec1 in specs1:
            spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
            v = network_eval.predict(spec1)
            feats += [v]
        featss = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
        print("=====================")
        print(feats)
        print(featss)
        print("=====================")
        predicted_label2 = uisrnnModel.predict(featss, inference_args)
        check_speaker = len(set(predicted_label2))
        print("predicted_label2: %s" % predicted_label2)
        if total_speaker == check_speaker:
            print('same speaker')
            print('speaker detected as ' + str(predicted_label2[-1]))
        else:
            print('not the same speaker')
        speakerSlice2 = arrangeResult(predicted_label2, time_spec_rate)
        print("=============speakerSlice2===============")
        # map slice times back to the original wav (which still contains silence)
        for spk, timeDicts in speakerSlice2.items():
            for tid, timeDict in enumerate(timeDicts):
                s = 0
                e = 0
                for i, key in enumerate(keys):
                    if (s != 0 and e != 0):
                        break
                    if (s == 0 and key > timeDict['start']):
                        offset = timeDict['start'] - keys[i - 1]
                        s = mapTable[keys[i - 1]] + offset
                    if (e == 0 and key > timeDict['stop']):
                        offset = timeDict['stop'] - keys[i - 1]
                        e = mapTable[keys[i - 1]] + offset

                speakerSlice2[spk][tid]['start'] = s
                speakerSlice2[spk][tid]['stop'] = e

        for spk, timeDicts in speakerSlice2.items():
            print('========= ' + str(spk) + ' =========')
            for timeDict in timeDicts:
                s = timeDict['start']
                e = timeDict['stop']
                s = fmtTime(s)  # change point moves to the center of the slice
                e = fmtTime(e)
                print(s + ' ==> ' + e)
        print("=============speakerSlice2===============")
        #print(predicted_label,'**************************')
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # map slice times back to the original wav (which still contains silence)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)
Example #27
0
def main():

    # gpu configuration
    toolkits.initialize_GPU(args)

    import model
    # ==================================
    #       Get Train/Val.
    # ==================================
    
    total_list = [os.path.join(args.data_path, file) for file in os.listdir(args.data_path)]
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1),
              'nfft': 512,
              'min_slice': 720,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load pre-trained model.
    if args.resume:
        # load the weights from the checkpoint supplied via --resume
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loaded model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please specify a model checkpoint to load')

    # The feature extraction process has to be done sample-by-sample,
    # because each sample is of different lengths.

    SRC_PATH = r'/data/dataset/SpkWav120'
    SRC_PATH = r'./ghostvlad/SRC_PATH' # bencq path
    print(SRC_PATH)
    path_spk_tuples = prepare_data(SRC_PATH)
    train_sequence = []
    train_cluster_id = []

    CNT = 7000  # number of synthetic merged utterances to build
    for epoch in range(CNT):  # randomly sample utterances from the whole wav pool
        # a merged utterance contains 10-19 single utterances
        splits_count = np.random.randint(10, 20, 1)  # low, high (exclusive), size
        path_spks = random.sample(path_spk_tuples, splits_count[0])
        utterance_specs, utterance_speakers = load_data(path_spks, min_win_time=500, max_win_time=1600)
        feats = []
        for spec in utterance_specs:
            spec = np.expand_dims(np.expand_dims(spec, 0), -1)
            v = network_eval.predict(spec)
            feats += [v]

        feats = np.array(feats)[:,0,:]  # [splits, embedding dim]
        train_sequence.append(feats)
        train_cluster_id.append(utterance_speakers)
        print("epoch:{}, utterance length: {}, speakers: {}".format(epoch, len(utterance_speakers), len(path_spks)))

    np.savez('training_data', train_sequence=train_sequence, train_cluster_id=train_cluster_id)
Example #28
0
    def __init__(self):
        self.filename2embedding = {}

        arguments = "--net resnet34s --gpu 0 --ghost_cluster 2 --vlad_cluster 8 --loss softmax " \
                    "--resume " \
                    "/media/ben/datadrive/Software/VGG-Speaker-Recognition/model/gvlad_softmax" \
                    "/2020-11-15_resnet34s_bs16_adam_lr0.001_vlad8_ghost2_bdim512_ohemlevel0" \
                    "/weights-42-0.931.h5 --data_path " \
                    "/media/ben/datadrive/Zalo/voice-verification/Train-Test-Data/dataset/".split()

        ZALO_TEST = "/media/ben/datadrive/Zalo/voice-verification/vgg_db_files/val_trials.txt"

        parser = argparse.ArgumentParser()
        # set up training configuration.
        parser.add_argument("--gpu", default="", type=str)
        parser.add_argument("--resume", default="", type=str)
        parser.add_argument("--batch_size", default=16, type=int)
        parser.add_argument("--data_path", default="/media/weidi/2TB-2/datasets/voxceleb1/wav",
                            type=str)
        # set up network configuration.
        parser.add_argument("--net", default="resnet34s", choices=["resnet34s", "resnet34l"],
                            type=str)
        parser.add_argument("--ghost_cluster", default=2, type=int)
        parser.add_argument("--vlad_cluster", default=8, type=int)
        parser.add_argument("--bottleneck_dim", default=512, type=int)
        parser.add_argument("--aggregation_mode", default="gvlad", choices=["avg", "vlad", "gvlad"],
                            type=str)
        # set up learning rate, training loss and optimizer.
        parser.add_argument("--loss", default="softmax", choices=["softmax", "amsoftmax"], type=str)
        parser.add_argument("--test_type", default="normal", choices=["normal", "hard", "extend"],
                            type=str)
        global args
        args = parser.parse_args(arguments)


        # gpu configuration
        toolkits.initialize_GPU(args)

        # ==================================
        #       Get Train/Val.
        # ==================================
        print("==> Initialising inference engine...".format(args.test_type))

       # ==================================
        #       Get Model
        # ==================================
        # construct the data generator.
        self.params = {"dim": (257, None, 1),
                  "nfft": 512,
                  "spec_len": 250,
                  "win_length": 400,
                  "hop_length": 160,
                  "n_classes": 5994,
                  "sampling_rate": 16000,
                  "normalize": True,
                  }

        self.network_eval = model.vggvox_resnet2d_icassp(input_dim=self.params["dim"],
                                                    num_class=self.params["n_classes"],
                                                    mode="eval", args=args)

        # ==> load pre-trained model.
        if args.resume:
            # load the weights from the checkpoint supplied via --resume
            if os.path.isfile(args.resume):
                self.network_eval.load_weights(os.path.join(args.resume), by_name=True)
                print("==> successfully loaded model {}.".format(args.resume))
            else:
                raise IOError("==> no checkpoint found at '{}'".format(args.resume))
        else:
            raise IOError("==> please specify a model checkpoint to load")

        print("==> start testing.")
Example #29
0
def dia_audio(wav_path, embedding_per_second=0.3, overlap_rate=0.33):

    # gpu configuration
    #toolkits.initialize_GPU(args)

    params = {'dim': (257, None, 1),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)
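    # SAVED_MODEL_NAME and args are assumed to be module-level globals in this
    # variant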

    specs, intervals = load_data(
        wav_path, embedding_per_second=embedding_per_second, overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000*(1.0/embedding_per_second) * \
        (1.0-overlap_rate)  # ms between successive embeddings
    center_duration = int(1000*(1.0/embedding_per_second)//2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # map slice times back to the original wav (which still contains silence)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i-1]
                    s = mapTable[keys[i-1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i-1]
                    e = mapTable[keys[i-1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    # per-speaker time-range printing is disabled in this variant:
    # for spk, timeDicts in speakerSlice.items():
    #     print('========= ' + str(spk) + ' =========')
    #     for timeDict in timeDicts:
    #         s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
    #         e = fmtTime(timeDict['stop'])
    #         print(s + ' ==> ' + e)

#     p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
#     p.draw()
#     p.plot.show()
    return speakerSlice
Example #30
0
def main(wav_path,
         embedding_per_second=1.0,
         n_classes=5994,
         overlap_rate=0.5,
         plot_results=True):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': n_classes,  # from the function argument (default 5994)
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))
    # print('mapTable, keys', mapTable, keys)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        # print('v',v.shape)
        #print('feats', feats.shape)

        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # ms between successive embeddings
    print('time_spec_rate', time_spec_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)
    # map slice times back to the original wav (which still contains silence)
    for spk, timeDicts in speakerSlice.items():
        print(spk, timeDicts)
        for tid, timeDict in enumerate(timeDicts):
            print(tid, timeDict)
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    print('offset', offset)
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

                print('i,s,e')
                print(i, s, e, tid, spk)
            print('>>>>>', i, s, e, tid, spk)
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assignments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            start = timeDict['start']
            end = timeDict['stop']
            start = fmtTime(
                start)  # change point moves to the center of the slice
            end = fmtTime(end)
            print(start + ' ==> ' + end)
            speaker_assignments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assignments, time_spec_rate