def init_model():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # construct the data generator.
    params = {'dim': (257, None, 1), 'nfft': 512, 'min_slice': 720,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')
    return network_eval
def extract_features(paths, args):
    # GPU configuration
    toolkits.initialize_GPU(args)

    network_eval = model.vggvox_resnet2d_icassp(input_dim=PARAMS["dim"],
                                                num_class=PARAMS["n_classes"],
                                                mode="eval", args=args)
    network_eval.load_weights(os.path.join(args.resume), by_name=True)

    num_paths = len(paths)
    feats = np.zeros((num_paths, PARAMS["feat_dim"]))
    for i, path in enumerate(tqdm(paths)):
        specs = ut.load_data(path,
                             win_length=PARAMS["win_length"],
                             sr=PARAMS["sampling_rate"],
                             hop_length=PARAMS["hop_length"],
                             n_fft=PARAMS["nfft"],
                             spec_len=PARAMS["spec_len"],
                             mode="eval")
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        feats[i] = network_eval.predict(specs)
    return feats
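# extract_features() above references a module-level PARAMS dict that is not
# defined in this snippet. A plausible definition, inferred from the params
# dicts used elsewhere in this file (the 512-dim embedding matches the
# bottleneck_dim used with the pretrained weights) -- a sketch, not the
# original constant:
PARAMS = {
    "dim": (257, None, 1),   # spectrogram input shape (freq bins, time, channels)
    "nfft": 512,
    "spec_len": 250,
    "win_length": 400,
    "hop_length": 160,
    "n_classes": 5994,       # VoxCeleb2 speaker count
    "sampling_rate": 16000,
    "feat_dim": 512,         # output embedding dimension (bottleneck_dim)
}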
def main(args):
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #       Get Train/Val.
    # ==================================
    total_list = [os.path.join(args.data_path, file)
                  for file in os.listdir(args.data_path)]
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1), 'nfft': 512, 'min_slice': 720,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load the pre-trained model.
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # Feature extraction has to be done sample-by-sample,
    # because each sample has a different length.
    feats = []
    for ID in unique_list:
        specs = preprocess.load_data(ID, split=False,
                                     win_length=params['win_length'],
                                     sr=params['sampling_rate'],
                                     hop_length=params['hop_length'],
                                     n_fft=params['nfft'],
                                     min_slice=params['min_slice'])
        specs = np.expand_dims(np.expand_dims(specs[0], 0), -1)
        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)[:, 0, :]
    preprocess.similar(feats)
def diarize(segments, sr=16000, win_len=400, hop_len=160,
            embedding_per_sec=1.0, overlap_rate=0.1):
    logger.debug("[Speaker diarization] Initializing models")

    # Initialize ghostvlad
    toolkits.initialize_GPU(Expando({"gpu": ""}))
    ghostvlad_model = model.vggvox_resnet2d_icassp(
        input_dim=(257, None, 1), num_class=5994, mode="eval",
        args=Expando({"net": "resnet34s", "loss": "softmax", "vlad_cluster": 8,
                      "ghost_cluster": 2, "bottleneck_dim": 512,
                      "aggregation_mode": "gvlad"}))
    ghostvlad_model.load_weights("ghostvlad/pretrained/weights.h5", by_name=True)

    # Initialize uisrnn
    sys.argv = sys.argv[:1]
    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnn_model = uisrnn.UISRNN(model_args)
    uisrnn_model.load("uisrnn/pretrained/saved_model.uisrnn_benchmark")

    logger.debug("[Speaker diarization] Calculating utterance features")
    utterances_spec = prepare_ghostvlad_data(segments, sr, win_len, hop_len,
                                             embedding_per_sec, overlap_rate)
    feats = []
    for spec in utterances_spec:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = ghostvlad_model.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)

    logger.debug("[Speaker diarization] Clustering utterance features")
    labels = uisrnn_model.predict(feats, inference_args)

    logger.debug("[Speaker diarization] Tagging segments speakers")
    embedding_duration = (1 / embedding_per_sec) * (1.0 - overlap_rate)
    labels_count = len(labels)
    current = 0
    for segment in segments:
        begin_index = math.floor(current / embedding_duration)
        current += segment.end - segment.begin
        end_index = math.ceil(current / embedding_duration)
        segment_labels = [labels[index] for index in
                          range(begin_index, min(end_index, labels_count))]
        if len(segment_labels) > 0:
            segment.speaker = max(segment_labels, key=segment_labels.count)
        else:
            segment.speaker = 999
    return segments
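# Expando is used above as a lightweight attribute bag but is not defined in
# this snippet. A minimal stand-in (an assumption -- the original class may
# differ) that exposes dict entries as attributes, e.g. Expando({"gpu": ""}).gpu:
class Expando:
    def __init__(self, entries):
        # copy the dict keys into instance attributes
        self.__dict__.update(entries)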
def load_model():
    toolkits.initialize_GPU(args)
    global network_eval
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)
    # ==> load the pre-trained model.
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')
def __init__(self):
    netConfig = {
        'net': 'resnet34s',
        'ghost_cluster': 2,
        'vlad_cluster': 8,
        'bottleneck_dim': 512,
        'aggregation_mode': 'gvlad',
        'loss': 'softmax',
        'dim': (257, None, 1),
        'n_classes': 5994,
    }
    # expose the dict entries as attributes, since the model expects an args object
    netConfig = namedtuple("NetConfig", netConfig.keys())(*netConfig.values())

    self.net = model.vggvox_resnet2d_icassp(input_dim=netConfig.dim,
                                            num_class=netConfig.n_classes,
                                            mode='eval', args=netConfig)
    self.net.load_weights(os.path.join(
        '../model/gvlad_softmax/resnet34_vlad8_ghost2_bdim512_deploy/weights.h5'),
        by_name=True)
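# A sketch of how this wrapper might be used to embed one utterance; the
# wav2spec() helper defined later in this file produces the (1, 257, T, 1)
# input the network expects. The class name (SpeakerEncoder) and the example
# path are illustrative assumptions, not from the original code.
import soundfile as sf

encoder = SpeakerEncoder()          # assumes the __init__ above belongs to such a class
audio, sr = sf.read("audio/spk1_1.wav")
embedding = encoder.net.predict(wav2spec(audio))[0]   # 512-dim utterance embedding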
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)

    # get speaker id from folder name
    totalList = [os.path.join(dataPath, file) for file in os.listdir(dataPath)]
    uniqueList = np.unique(totalList)
    speakerList = [extractSpeakerId(u) for u in uniqueList]

    # get the audio files for each speaker
    speakerAudioDict = {}
    for speaker in speakerList:
        rootPath = os.path.join(dataPath, speaker)
        fileList = getListOfFiles(rootPath)
        speakerAudioDict[speaker] = fileList

    # construct the network once, outside the speaker loop
    # (the original rebuilt it and reloaded the weights for every speaker).
    params = {'dim': (257, None, 1), 'nfft': 512, 'min_slice': 720,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}
    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    # get an embedding for each audio file of each speaker
    speakerToFeatureDict = {}
    for speaker in speakerList:
        feats = []
        for ID in speakerAudioDict[speaker]:
            specs = preprocess.load_data(ID, split=False,
                                         win_length=params['win_length'],
                                         sr=params['sampling_rate'],
                                         hop_length=params['hop_length'],
                                         n_fft=params['nfft'],
                                         min_slice=params['min_slice'])
            specs = np.expand_dims(np.expand_dims(specs[0], 0), -1)
            v = network_eval.predict(specs)
            feats += [v]
        speakerToFeatureDict[speaker] = feats

    # save to file
    with open('speaker_data.pickle', 'wb') as handle:
        pickle.dump(speakerToFeatureDict, handle, protocol=pickle.HIGHEST_PROTOCOL)
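# A sketch (not part of the original script) of reading the pickled
# embeddings back and averaging them into one enrollment vector per speaker;
# the L2-normalized mean is a common choice for cosine-similarity scoring.
import pickle
import numpy as np

with open('speaker_data.pickle', 'rb') as handle:
    speakerToFeatureDict = pickle.load(handle)

centroids = {}
for speaker, feats in speakerToFeatureDict.items():
    mean_vec = np.mean(np.array(feats)[:, 0, :], axis=0)   # entries are (1, 512)
    centroids[speaker] = mean_vec / np.linalg.norm(mean_vec)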
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model
    import generator

    # ==================================
    #       Get Train/Val.
    # ==================================
    trnlist, trnlb = toolkits.get_hike_datalist(
        meta_paths=args.train_meta_data_path,
        data_paths=args.train_data_path,
        mode=model_config['loss'])
    vallist, vallb = toolkits.get_hike_datalist(
        meta_paths=args.val_meta_data_path,
        data_paths=args.val_data_path,
        mode=model_config['loss'])

    input_length = int(args.audio_length * 25)
    num_class = len(score_rule)

    # construct the data generator.
    params = {'dim': (513, input_length, 1),
              'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
              'nfft': 1024, 'spec_len': input_length, 'win_length': 1024,
              'hop_length': 640, 'n_classes': num_class, 'sampling_rate': 16000,
              'batch_size': model_config['batch_size'], 'shuffle': True,
              'normalize': True, 'loss': model_config['loss'],
              'data_format': args.data_format}

    # Datasets
    partition = {'train': trnlist.flatten(), 'val': vallist.flatten()}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    wandb.init(project='vgg_speaker')
    trn_gen = generator.DataGenerator(partition['train'], labels['train'], **params)
    val_gen = generator.DataGenerator(partition['val'], labels['val'], **params)

    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train', args=model_config)

    # ==> load the pre-trained model, if given.
    print(keras.backend.tensorflow_backend._get_available_gpus())
    if args.resume:
        print("Attempting to load", args.resume)
        if os.path.isfile(args.resume):
            network.load_weights(os.path.join(args.resume),
                                 by_name=True, skip_mismatch=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise ValueError("==> no checkpoint found at '{}'".format(args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} '
          'loss: {}, aggregation: {}, ohemlevel: {}'.format(
              args.gpu, len(partition['train']), np.max(labels['train']),
              model_config['loss'], model_config['aggregation_mode'],
              model_config['ohem_level']))

    model_path, log_path = set_path(args, model_config)
    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    callbacks = [
        keras.callbacks.ModelCheckpoint(
            os.path.join(model_path, 'weights-{epoch:02d}-{loss:.3f}.h5'),
            monitor='loss', mode='min', save_best_only=True, period=20),
        normal_lr,
        WandbCallback()
    ]

    if model_config['ohem_level'] > 1:  # online hard negative mining will be used
        candidate_steps = int(len(partition['train']) // model_config['batch_size'])
        iters_per_epoch = int(len(partition['train']) //
                              (model_config['ohem_level'] * model_config['batch_size']))

        ohem_generator = generator.OHEM_generator(
            network, trn_gen, candidate_steps, model_config['ohem_level'],
            model_config['batch_size'], params['dim'], params['n_classes'])
        A = ohem_generator.next()  # for some reason, the generator needs a warm-up

        network.fit_generator(
            generator.OHEM_generator(network, trn_gen, iters_per_epoch,
                                     model_config['ohem_level'],
                                     model_config['batch_size'],
                                     params['dim'], params['n_classes']),
            steps_per_epoch=iters_per_epoch,
            epochs=model_config['epochs'],
            max_queue_size=10,
            callbacks=callbacks,
            use_multiprocessing=False,
            workers=1,
            verbose=1)
    else:
        # the original had identical branches for 'mse' and the other losses,
        # so a single call suffices.
        network.fit_generator(
            trn_gen,
            steps_per_epoch=int(len(partition['train']) // model_config['batch_size']),
            epochs=model_config['epochs'],
            max_queue_size=10,
            validation_data=val_gen,
            validation_freq=1,
            callbacks=callbacks,
            use_multiprocessing=False,
            workers=1,
            verbose=1)
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #       Get Train/Val.
    # ==================================
    print('==> calculating test({}) data lists...'.format(args.test_type))
    publicTest = pd.read_csv("/content/VoveDataset/public-test.csv")
    list1 = addPath(np.array(publicTest["audio_1"]))
    list2 = addPath(np.array(publicTest["audio_2"]))

    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1), 'nfft': 512, 'spec_len': 250,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load the pre-trained model.
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            result_path = "/content/VGG-Speaker-Recognition/result"
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # Feature extraction has to be done sample-by-sample,
    # because each sample has a different length.
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    for c, ID in enumerate(pbar(unique_list)):
        specs = ut.load_data(ID, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'], mode='eval')
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)
    np.save("/content/feats.npy", feats)
# ===========================================
def wav2spec(wav):
    # mirror-pad the waveform, then compute a normalized magnitude spectrogram
    wav = np.append(wav, wav[::-1])
    wav = wav.astype(float)
    linear_spect = librosa.stft(wav, n_fft=512, win_length=400, hop_length=160).T
    mag, _ = librosa.magphase(linear_spect)
    spec_mag = mag.T
    # mean/variance normalization
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag, 0, keepdims=True)
    specs = (spec_mag - mu) / (std + 1e-5)
    specs = np.expand_dims(np.expand_dims(specs, 0), -1)
    return specs


if __name__ == '__main__':
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Uncomment if a GPU is available; it speeds up feature extraction.
    network_eval = model.vggvox_resnet2d_icassp(input_dim=(257, None, 1),
                                                num_class=5994, mode='eval')
    network_eval.load_weights(os.path.join('model/weights.h5'), by_name=True)
    my_model = Mymodel(network_eval)

    wav1_path = "audio/spk1_1.wav"
    wav2_path = "audio/spk2_2.wav"
    audio1, sr = sf.read(wav1_path)
    audio2, sr = sf.read(wav2_path)
    spec1 = wav2spec(audio1)
    spec2 = wav2spec(audio2)

    t0 = time.time()
    feat1 = my_model.get_feats(spec1)
    t1 = time.time()
    print("{} audio duration: {}s, feature extraction took: {} s".format(
        wav1_path, len(audio1) / sr, t1 - t0))
    feat2 = my_model.get_feats(spec2)
    print("{} audio duration: {}s, feature extraction took: {} s".format(
        wav2_path, len(audio2) / sr, time.time() - t1))
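# The snippets in this file score a pair of utterances with the inner product
# of their embeddings (np.sum(v1 * v2) / np.dot(...)), which equals cosine
# similarity when the network outputs are L2-normalized. A small standalone
# sketch of that scoring step; the 0.8 decision threshold is the one used by
# the comparison script later in this file, not a universal constant, and
# the (1, 512) feature shape is assumed from network_eval.predict.
import numpy as np

def same_speaker(feat1, feat2, threshold=0.8):
    # feat1/feat2: (1, 512) embeddings; returns (score, decision)
    score = float(np.dot(feat1[0], feat2[0]))
    return score, score > threshold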
# gpu configuration
toolkits.initialize_GPU(args)

params = {'dim': (257, None, 1), 'nfft': 512, 'spec_len': 250,
          'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
          'sampling_rate': 16000, 'normalize': True}

network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                               num_class=params['n_classes'],
                                               mode='eval', args=args)
network_eval.load_weights(args.resume, by_name=True)

model_args, _, inference_args = uisrnn.parse_arguments()
model_args.observation_dim = 512
uisrnnModel = uisrnn.UISRNN(model_args)
uisrnnModel.load(SAVED_MODEL_NAME)

while True:
    print("Start speaking")
    myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
    print(type(myrecording))
    sd.wait()  # wait until the recording is finished
    print("Finished recording")
    write('wavs/output12.wav', fs, myrecording)
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #       Get Train/Val.
    # ==================================
    print('==> calculating test({}) data lists...'.format(args.test_type))
    if args.test_type == 'normal':
        verify_list = np.loadtxt('../meta/voxceleb1_veri_test.txt', str)
    elif args.test_type == 'hard':
        verify_list = np.loadtxt('../meta/voxceleb1_veri_test_hard.txt', str)
    elif args.test_type == 'extend':
        verify_list = np.loadtxt('../meta/voxceleb1_veri_test_extended.txt', str)
    else:
        raise IOError('==> unknown test type.')

    verify_lb = np.array([int(i[0]) for i in verify_list])
    list1 = np.array([os.path.join(args.data_path, i[1]) for i in verify_list])
    list2 = np.array([os.path.join(args.data_path, i[2]) for i in verify_list])

    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, 100, 1), 'nfft': 512, 'spec_len': 100,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load the pre-trained model.
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            result_path = set_result_path(args)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # Feature extraction has to be done sample-by-sample,
    # because each sample has a different length.
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    for c, ID in enumerate(unique_list):
        if c % 50 == 0:
            print('Finish extracting features for {}/{}th wav.'.format(c, total_length))
        specs = ut.load_data(ID, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'], mode='eval')
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        v = network_eval.predict(specs)
        feats += [v]

    feats = np.array(feats)

    # ==> compute the pair-wise similarity.
    for c, (p1, p2) in enumerate(zip(list1, list2)):
        ind1 = np.where(unique_list == p1)[0][0]
        ind2 = np.where(unique_list == p2)[0][0]
        v1 = feats[ind1, 0]
        v2 = feats[ind2, 0]
        scores += [np.sum(v1 * v2)]
        labels += [verify_lb[c]]
        print('scores : {}, gt : {}'.format(scores[-1], verify_lb[c]))

    scores = np.array(scores)
    labels = np.array(labels)
    np.save(os.path.join(result_path, 'prediction_scores.npy'), scores)
    np.save(os.path.join(result_path, 'groundtruth_labels.npy'), labels)

    eer, thresh = toolkits.calculate_eer(labels, scores)
    print('==> model : {}, EER: {}'.format(args.resume, eer))
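# toolkits.calculate_eer is external to this snippet; a common way to compute
# the equal error rate from labels and scores, as a sketch assuming
# scikit-learn is available (the repo's own implementation may differ):
import numpy as np
from sklearn.metrics import roc_curve

def calculate_eer(labels, scores):
    # EER is the ROC operating point where the false-accept rate (FPR)
    # equals the false-reject rate (1 - TPR).
    fpr, tpr, thresholds = roc_curve(labels, scores, pos_label=1)
    fnr = 1 - tpr
    idx = np.nanargmin(np.abs(fnr - fpr))
    return (fpr[idx] + fnr[idx]) / 2, thresholds[idx]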
def main(args):
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    _ = tf.Session(config=config)

    # ==================================
    #       Get Train/Val.
    # ==================================
    trnlist, trnlb = utils.get_voxceleb2_datalist(path=args.train_list)
    vallist, vallb = utils.get_voxceleb2_datalist(path=args.val_list)

    # construct the data generator.
    params = {'dim': (257, 250, 1),
              'mp_pooler': utils.set_mp(processes=args.multiprocess),
              'nfft': 512, 'spec_len': 250, 'win_length': 400,
              'hop_length': 160, 'n_classes': args.n_classes,
              'sampling_rate': 16000, 'batch_size': args.batch_size,
              'shuffle': True, 'normalize': True}

    # Datasets
    partition = {'train': trnlist.flatten(), 'val': vallist.flatten()}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    trn_gen = generator.DataGenerator(partition['train'], labels['train'], **params)
    val_gen = generator.DataGenerator(partition['val'], labels['val'], **params)

    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train', args=args)

    # ==> load pre-trained model
    mgpu = len(keras.backend.tensorflow_backend._get_available_gpus())
    initial_epoch = 0
    if args.resume:
        if os.path.isfile(args.resume):
            if mgpu == 1:
                network.load_weights(os.path.join(args.resume))
            else:
                network.layers[mgpu + 1].load_weights(os.path.join(args.resume))
            initial_epoch = int(os.path.basename(args.resume).split('-')[1])
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            print("==> no checkpoint found at '{}'".format(args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} loss: {}, aggregation: {}'
          .format(args.gpu, len(partition['train']), np.max(labels['train']),
                  args.loss, args.aggregation_mode))

    model_path, log_path = set_path(args)
    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=0,
                                              write_graph=True, write_images=False,
                                              update_freq=args.batch_size * 16)
    callbacks = [keras.callbacks.ModelCheckpoint(
                     os.path.join(model_path, 'weights-{epoch:02d}-{acc:.3f}.h5'),
                     monitor='loss', mode='min', save_best_only=True),
                 normal_lr, tbcallbacks]

    network.fit_generator(generator=trn_gen,
                          steps_per_epoch=int(len(partition['train']) // args.batch_size),
                          epochs=args.epochs,
                          initial_epoch=initial_epoch,
                          max_queue_size=10,
                          callbacks=callbacks,
                          use_multiprocessing=True,
                          validation_data=val_gen,
                          workers=4,
                          verbose=1)
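# step_decay is referenced above (and in the other training scripts here)
# but not defined in these snippets. A sketch of one stepwise learning-rate
# schedule of the kind keras.callbacks.LearningRateScheduler expects; the
# base rate and milestone epochs are assumptions, not the repo's values.
def step_decay(epoch):
    # drop the learning rate by 10x at fixed epoch milestones
    base_lr, milestones = 0.001, [30, 60]
    lr = base_lr
    for milestone in milestones:
        if epoch >= milestone:
            lr *= 0.1
    return lr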
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #       Get Train/Val.
    # ==================================
    vallist, vallb = toolkits.get_hike_datalist(
        meta_paths=args.test_meta_data_path,
        data_paths=args.test_data_path,
        mode=model_config['loss'])
    _, valscore = toolkits.get_hike_datalist(
        meta_paths=args.test_meta_data_path,
        data_paths=args.test_data_path,
        mode='mse')

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    num_class = len(score_rule)
    input_length = int(args.audio_length * 25)
    params = {'dim': (513, None, 1),
              'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
              'nfft': 1024, 'spec_len': input_length, 'win_length': 1024,
              'hop_length': 640, 'n_classes': num_class,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=model_config)

    # ==> load the pre-trained model.
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume),
                                      by_name=True, skip_mismatch=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    v = []
    for ID in vallist:
        val_data = ut.load_data(ID, params['win_length'], params['sampling_rate'],
                                params['hop_length'], params['nfft'],
                                params['spec_len'], 'test', args.data_format)
        info = network_eval.predict(np.expand_dims(val_data, (0, -1)))
        v += info.tolist()
    v = np.array(v)
    print('val data shape {}'.format(v.shape))

    if model_config['loss'] == 'mse':
        # map the normalized predictions and labels back to the 0-10 score scale
        v = v.T[0] * 10 + 5
        vallb = vallb * 10 + 5
        metric = np.square(np.subtract(v, vallb)).mean()
        print('mse: ', metric)
        v_test = np.vstack([v, vallb]).astype('float').T
        df = np.hstack([vallist.reshape(-1, 1), v_test])
        df = pd.DataFrame(data=df, columns=['content', 'score_predict', 'score_true'])
    else:
        valscore = valscore * 10 + 5
        v_predict = ((v < 0.5) * 1)[:, 0]
        metric = sum(v_predict == vallb) / len(vallb)
        print('confusion matrix: ', confusion_matrix(vallb, v_predict))
        print('accuracy ', metric)
        v_test = np.hstack([v, vallb.reshape(-1, 1),
                            valscore.reshape(-1, 1)]).astype('float')
        df = np.hstack([vallist.reshape(-1, 1), v_test])
        df = pd.DataFrame(data=df,
                          columns=['content', 'prob_0', 'prob_1',
                                   'true_label', 'score_true'])

    date = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
    df.to_csv(os.path.join(args.save_dir,
                           '{}_{}_{}.csv'.format(date, model_config['loss'], metric)))
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model
    import generator

    # ==================================
    #       Get Train/Val.
    # ==================================
    trnlist, trnlb, l2i = toolkits.load_from_kaldi_dir(args, "train", min_len=300)
    vallist, vallb, _ = toolkits.load_from_kaldi_dir(args, "val", min_len=300,
                                                     label2idx=l2i)

    if args.cmvn:
        cmvn_stats = kaldiio.load_mat(args.cmvn)
        mean_stats = cmvn_stats[0, :-1]
        count = cmvn_stats[0, -1]
        offset = np.expand_dims(mean_stats, 0) / count
        print("offset", offset)
        CMVN = offset
    else:
        CMVN = None

    if args.post_cmvn:
        cmvn_stats = kaldiio.load_mat(args.post_cmvn)
        mean_stats = cmvn_stats[0, :-1]
        count = cmvn_stats[0, -1]
        offset = np.expand_dims(mean_stats, 0) / count
        print("offset", offset)
        POSTCMVN = offset
    else:
        POSTCMVN = None

    # construct the data generator.
    params = {'dim': (args.dim, 300, 1),
              'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
              'nfft': 512, 'spec_len': 300, 'win_length': 400,
              'hop_length': 160, 'n_classes': 8, 'sampling_rate': 16000,
              'tandem': args.tandem, 'batch_size': args.batch_size,
              'shuffle': True, 'normalize': False,
              'cmvn': CMVN, 'postcmvn': POSTCMVN}

    # Datasets
    partition = {'train': trnlist, 'val': vallist}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    trn_gen = generator.DataGenerator(partition['train'], labels['train'], **params)
    val_gen = generator.DataGenerator(partition['val'], labels['val'], **params)

    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train', args=args)

    # ==> load the pre-trained model, if given.
    mgpu = len(keras.backend.tensorflow_backend._get_available_gpus())
    if args.resume:
        print("Attempting to load", args.resume)
        if os.path.isfile(args.resume):
            if mgpu == 1:
                # by_name=True, skip_mismatch=True
                # https://github.com/WeidiXie/VGG-Speaker-Recognition/issues/46
                network.load_weights(os.path.join(args.resume),
                                     by_name=True, skip_mismatch=True)
            else:
                network.layers[mgpu + 1].load_weights(os.path.join(args.resume))
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            print("==> no checkpoint found at '{}'".format(args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} '
          'loss: {}, aggregation: {}, ohemlevel: {}'.format(
              args.gpu, len(partition['train']), np.max(labels['train']),
              args.loss, args.aggregation_mode, args.ohem_level))

    model_path, log_path = set_path(args)
    with open(os.path.join(model_path, 'label2idx'), 'w') as f:
        for key in l2i.keys():
            f.write(key + ' ' + str(l2i[key]) + '\n')

    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=0,
                                              write_graph=True, write_images=False,
                                              update_freq=args.batch_size * 16)
    callbacks = [keras.callbacks.ModelCheckpoint(
                     os.path.join(model_path, 'weights-{epoch:02d}-{val_loss:.3f}.h5'),
                     monitor='val_loss', mode='min', save_best_only=True),
                 normal_lr, tbcallbacks]

    if args.ohem_level > 1:  # online hard negative mining will be used
        candidate_steps = int(len(partition['train']) // args.batch_size)
        iters_per_epoch = int(len(partition['train']) //
                              (args.ohem_level * args.batch_size))

        ohem_generator = generator.OHEM_generator(
            network, trn_gen, candidate_steps, args.ohem_level,
            args.batch_size, params['dim'], params['n_classes'])
        A = ohem_generator.next()  # for some reason, the generator needs a warm-up

        network.fit_generator(
            generator.OHEM_generator(network, trn_gen, iters_per_epoch,
                                     args.ohem_level, args.batch_size,
                                     params['dim'], params['n_classes']),
            steps_per_epoch=iters_per_epoch,
            epochs=args.epochs,
            max_queue_size=10,
            callbacks=callbacks,
            use_multiprocessing=False,
            workers=1,
            verbose=1)
    else:
        network.fit_generator(trn_gen,
                              validation_data=val_gen,
                              steps_per_epoch=int(len(partition['train']) // args.batch_size),
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=True,
                              workers=12,
                              verbose=1)
def main(args):
    # limit GPU memory usage
    config = tensorflow.ConfigProto()
    config.gpu_options.allow_growth = True
    _ = tensorflow.Session(config=config)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1), 'nfft': 512, 'spec_len': 250,
              'win_length': 400, 'hop_length': 160, 'n_classes': args.n_classes,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)
    # ==> load pre-trained model
    network_eval.load_weights(os.path.join(args.resume), by_name=True)
    print('==> successfully loading model {}.'.format(args.resume))

    start = time.time()
    # extract features for the first utterance
    specs1 = utils.load_data(args.audio1_path, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'], mode='eval')
    specs1 = np.expand_dims(np.expand_dims(specs1, 0), -1)
    feature1 = network_eval.predict(specs1)[0]

    # extract features for the second utterance
    specs2 = utils.load_data(args.audio2_path, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'], mode='eval')
    specs2 = np.expand_dims(np.expand_dims(specs2, 0), -1)
    feature2 = network_eval.predict(specs2)[0]
    end = time.time()

    dist = np.dot(feature1, feature2.T)
    if dist > 0.8:
        print("%s and %s are the same speaker, similarity: %f, average prediction time: %dms"
              % (args.audio1_path, args.audio2_path, dist,
                 round((end - start) * 1000) / 2))
    else:
        print("%s and %s are not the same speaker, similarity is only %f, average prediction time: %dms"
              % (args.audio1_path, args.audio2_path, dist,
                 round((end - start) * 1000) / 2))
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {'dim': (257, None, 1), 'nfft': 512, 'spec_len': 250,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]

    # Alternatives tried here and left commented out in the original:
    # uisrnnModel.predict(feats, inference_args), and choosing k for KMeans
    # via the silhouette score (with a TSNE visualization of the clusters).
    # Spectral clustering is what is actually used:
    clusterer = SpectralClusterer(min_clusters=2, max_clusters=100,
                                  p_percentile=0.95, gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)

    global no_speakers
    no_speakers = len(set(predicted_label))

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)  # one speaker embedding every ? ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # the change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
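# How the timing math above works, as a quick worked example: with
# embedding_per_second=1.0 and overlap_rate=0.5, time_spec_rate =
# 1000 * (1 / 1.0) * (1 - 0.5) = 500, i.e. one embedding label covers 500 ms
# of the silence-removed audio. A minimal fmtTime-style helper -- an
# assumption, since the original helper is defined elsewhere:
def fmt_ms(millis):
    # format milliseconds as MM:SS.mmm
    seconds, ms = divmod(int(millis), 1000)
    minutes, seconds = divmod(seconds, 60)
    return '{:02d}:{:02d}.{:03d}'.format(minutes, seconds, ms)

print(fmt_ms(125750))  # -> 02:05.750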
def extract_embeddings(input_path=time_100_emp_train, mode='train'):
    toolkits.initialize_GPU(args)

    params = {'dim': (257, None, 1), 'nfft': 512, 'spec_len': 250,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)
    if args.resume:
        weight_path = os.path.join(base_path, args.resume)
        if os.path.isfile(weight_path):
            print('loading graph')
            network_eval.load_weights(weight_path, by_name=True)
        else:
            return 'Issue with loading graph'
    else:
        return 'Pre-trained graph is required'

    if mode == 'train':
        audio_files = [filename for filename in Path(input_path).rglob('*.wav')]
        total_files = len(audio_files) * 10
        working_file = 0
        emb_store = {}
        for audio in audio_files:
            print(f'processing {os.path.basename(os.path.dirname(audio))}')
            specs = ut.load_data_aug(audio, win_length=params['win_length'],
                                     sr=params['sampling_rate'],
                                     hop_length=params['hop_length'],
                                     n_fft=params['nfft'],
                                     spec_len=params['spec_len'], mode='eval')
            count_file = 0
            for sample in specs:
                print(f'Augmentation count is {count_file}')
                print(f'Processing file {working_file} of {total_files}')
                sample_spec = np.expand_dims(np.expand_dims(sample, 0), -1)
                class_label = os.path.basename(os.path.dirname(audio))
                v = network_eval.predict(sample_spec)
                # append the embedding to this label's list
                if class_label in emb_store:
                    emb_store[class_label].append(v[0])
                else:
                    emb_store[class_label] = [v[0]]
                count_file += 1
                working_file += 1
            logging.info(f'For {audio} label stored is {class_label}')
        with open('../data/training_features_augmented.pickle', 'wb') as handle:
            pickle.dump(emb_store, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        specs = ut.load_data(input_path, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'], mode='eval')
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        vector_embedding = network_eval.predict(specs)[0]
        return vector_embedding
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #       Get Train/Val.
    # ==================================
    print('Calculating test data lists...')

    # AI project list file
    if args.test_type == 'ai':
        verify_list = np.loadtxt('model/meta/sets.txt', str)
    else:
        raise IOError('Unknown test type.')

    verify_lb = np.array([int(i[0]) for i in verify_list])
    list1 = np.array([os.path.join(args.data_path, i[1]) for i in verify_list])
    list2 = np.array([os.path.join(args.data_path, i[2]) for i in verify_list])

    total_list = np.concatenate((list1, list2))
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1), 'nfft': 512, 'spec_len': 250,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load pre-trained weights
    if args.resume:
        if os.path.isfile('model/src/weights.h5'):
            network_eval.load_weights('model/src/weights.h5', by_name=True)
            print('Successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("No checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('Please type in the model to load')

    print('\nStart testing...')

    # Feature extraction has to be done sample-by-sample,
    # because each sample has a different length.
    total_length = len(unique_list)
    feats, scores, labels = [], [], []
    for c, ID in enumerate(unique_list):
        specs = ut.load_data(ID, win_length=params['win_length'],
                             sr=params['sampling_rate'],
                             hop_length=params['hop_length'],
                             n_fft=params['nfft'],
                             spec_len=params['spec_len'], mode='eval')
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        v = network_eval.predict(specs)
        feats += [v]
    feats = np.array(feats)

    allscores = []
    match = []
    nomatch = []

    # ==> compute the pair-wise similarity.
    print("Model 1 scores")
    for c, (p1, p2) in enumerate(zip(list1, list2)):
        ind1 = np.where(unique_list == p1)[0][0]
        ind2 = np.where(unique_list == p2)[0][0]
        v1 = feats[ind1, 0]
        v2 = feats[ind2, 0]
        scores += [np.sum(v1 * v2)]
        labels += [verify_lb[c]]
        if c != 0 and verify_lb[c] == 1:
            match.append(scores[-1])
        elif verify_lb[c] == 0:
            nomatch.append(scores[-1])
        allscores.append(scores[-1])
        print('Score : {}'.format(scores[-1]))

    # For evaluation
    # match = [str(x) for x in match]
    # nomatch = [str(x) for x in nomatch]
    # with open("./eval/result.txt", "a") as w:
    #     matches = ','.join(match)
    #     nomatches = ','.join(nomatch)
    #     w.write(matches + '\n')
    #     w.write(nomatches + '\n')

    with open("result1.pickle", "wb") as w:
        pickle.dump(scores, w)
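# A sketch (not in the original script) of picking a decision threshold from
# the match/nomatch score lists collected above: sweep the observed scores
# as candidate thresholds and keep the one with the best pair accuracy.
def best_threshold(match, nomatch):
    candidates = sorted(match + nomatch)
    best_t, best_acc = 0.0, 0.0
    for t in candidates:
        # matches should score at or above t, non-matches below it
        correct = sum(s >= t for s in match) + sum(s < t for s in nomatch)
        acc = correct / float(len(match) + len(nomatch))
        if acc > best_acc:
            best_t, best_acc = t, acc
    return best_t, best_acc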
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {'dim': (257, None, 1), 'nfft': 512, 'spec_len': 250,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)  # one speaker embedding every ? ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    # transcribe each speaker turn, falling back to the offline Sphinx
    # recognizer when the Google recognizer fails.
    speech_r = speech_reg.Recognizer()
    sound = AudioSegment.from_wav(wav_path)
    for spk in speakerSlice.keys():
        print('========= ' + str(spk) + ' =========')
        for item_dict in speakerSlice[spk]:
            audio_seg = sound[item_dict['start']:item_dict['stop']]
            s = fmtTime(item_dict['start'])  # the change point moves to the center of the slice
            e = fmtTime(item_dict['stop'])
            print(s + ' ==> ' + e)
            filename = ('speaker' + str(spk) + '-' + str(item_dict['start'] / 1000)
                        + '-' + str(item_dict['stop'] / 1000) + '.wav')
            audio_seg.export(filename, format="wav")
            audio = speech_reg.AudioFile(filename)
            with audio as source:
                words = speech_r.record(source)
            try:
                res = speech_r.recognize_google(words)
            except speech_reg.UnknownValueError:
                try:
                    res = speech_r.recognize_sphinx(words)
                except speech_reg.UnknownValueError:
                    res = ''
            item_dict.update({'content': res})
            print(res)
    return speakerSlice
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    params = {'dim': (513, None, 1), 'n_fft': 1024, 'win_length': 1024,
              'hop_length': 640, 'n_classes': 2, 'sampling_rate': 16000,
              'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load the pre-trained model.
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume),
                                      by_name=True, skip_mismatch=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    if sum([args.data_path.endswith(i) for i in ['.wav', '.m4a', '.mp3']]) == 1:
        wav, sr_ret = librosa.load(args.data_path, sr=params['sampling_rate'], offset=5)
        linear_spect = ut.lin_spectogram_from_wav(wav, params['hop_length'],
                                                  params['win_length'],
                                                  params['n_fft'])
        print('sample_rate is ', sr_ret)
    elif args.data_path.endswith('.npy'):
        linear_spect = np.load(args.data_path)
    else:
        raise IOError('wrong input format')

    mag, _ = librosa.magphase(linear_spect)  # magnitude
    spec_mag = mag.T
    # mean/variance normalization (std computed on a scaled copy to avoid underflow)
    mu = np.mean(spec_mag, 0, keepdims=True)
    std = np.std(spec_mag * (10 ** 5), 0, keepdims=True) / (10 ** 5)
    spec_mag = (spec_mag - mu) / (std + 1e-3)
    spec_mag = np.expand_dims(spec_mag, (0, -1))
    print(spec_mag.shape)

    if args.loss == 'regression':
        # map the normalized prediction back to the 0-10 score scale
        v = network_eval.predict(spec_mag) * 10 + 5
        print('the predicted score is: {}'.format(v))
    else:
        v = network_eval.predict(spec_mag)
        print(v)
def main():
    params = {'dim': (257, None, 1), 'n_fft': 512, 'win_length': 400,
              'hop_length': 160, 'n_classes': 2, 'sampling_rate': 16000,
              'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load the pre-trained model.
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume),
                                      by_name=True, skip_mismatch=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    audio_clip_dir = args.data_path
    clipname_list = os.listdir(audio_clip_dir)
    with open('../../meta_data_100.json', 'r') as f:
        meta_data = json.load(f)

    new_meta_data = []
    for id, score in meta_data:
        # find all corresponding audio clips
        id_audioclip_name_list = [i for i in clipname_list if id in i]
        clip_v = []
        for id_audioclip_name in id_audioclip_name_list:
            wav, sr_ret = librosa.load(os.path.join(audio_clip_dir, id_audioclip_name),
                                       sr=params['sampling_rate'])
            linear_spect = ut.lin_spectogram_from_wav(wav, params['hop_length'],
                                                      params['win_length'],
                                                      params['n_fft'])
            mag, _ = librosa.magphase(linear_spect)  # magnitude
            spec_mag = mag.T
            mu = np.mean(spec_mag, 0, keepdims=True)
            std = np.std(spec_mag * (10 ** 5), 0, keepdims=True) / (10 ** 5)
            spec_mag = (spec_mag - mu) / (std + 1e-3)
            spec_mag = np.expand_dims(spec_mag, (0, -1))
            v = network_eval.predict(spec_mag) * 10 + 5
            v = round(v[0][0], 2).astype('float')
            clip_v.append(v)

        if len(id_audioclip_name_list) != 0:
            if sum(clip_v) / len(clip_v) < 3:
                print('{} is selected, its predicted score is {}'.format(
                    id, sum(clip_v) / len(clip_v)))
                new_meta_data.append((id, score, sum(clip_v) / len(clip_v)))
        # if abs(v - score) < 1.2:
        #     new_meta_data.append((id_audioclip_name, score, v, id))
        #     print('{} is selected, its rule score is {}, and its predicted score is {}'.format(id_audioclip_name, score, v))

    with open('meta_data_low.json', 'w') as f:
        json.dump(new_meta_data, f)
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5,
         exportFile=None, expectedSpeakers=2):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {'dim': (257, None, 1), 'nfft': 512, 'spec_len': 250,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)  # one speaker embedding every ? ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    n_speakers = len(speakerSlice)
    print('N-Speakers:', n_speakers)
    global speaker_final
    speaker_final = [pdb.empty()] * n_speakers

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            diarization_try(wav_path, s / 1000, e / 1000, spk)
            s = fmtTime(s)  # the change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    # Find the top-n speakers by total speaking time
    speaker_final.sort(key=lambda speaker: speaker.duration_seconds, reverse=True)
    speaker_final = speaker_final[0:expectedSpeakers]

    # Export the files
    iso_wav_path = wav_path.split(".")[0]
    itr = 0
    while itr < len(speaker_final):
        write_path = exportFile + "_speaker" + str(itr) + ".wav"
        speaker_final[itr].export(write_path, format="wav")
        itr += 1
    del speaker_final
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model

    # ==================================
    #       Get Train/Val.
    # ==================================
    total_list = [os.path.join(args.data_path, file)
                  for file in os.listdir(args.data_path)]
    unique_list = np.unique(total_list)

    # ==================================
    #       Get Model
    # ==================================
    # construct the data generator.
    params = {'dim': (257, None, 1), 'nfft': 512, 'min_slice': 720,
              'win_length': 400, 'hop_length': 160, 'n_classes': 5994,
              'sampling_rate': 16000, 'normalize': True}

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load the pre-trained model.
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    print('==> start testing.')

    # Feature extraction has to be done sample-by-sample,
    # because each sample has a different length.
    train_cluster_id = []
    train_sequence = []
    SRC_PATH = r'/data/dataset/SpkWav120'
    wavDir = os.listdir(SRC_PATH)
    wavDir.sort()

    for i, spkDir in enumerate(wavDir):  # each speaker's directory
        spk = spkDir  # speaker name
        wavPath = os.path.join(SRC_PATH, spkDir, 'audio')
        print('Processing speaker({}) : {}'.format(i, spk))
        for wav in os.listdir(wavPath):  # wav file
            utter_path = os.path.join(wavPath, wav)
            feats = []
            specs = load_data(utter_path, split=True,
                              win_length=params['win_length'],
                              sr=params['sampling_rate'],
                              hop_length=params['hop_length'],
                              n_fft=params['nfft'],
                              min_slice=params['min_slice'])
            if len(specs) < 1:
                continue
            for spec in specs:
                spec = np.expand_dims(np.expand_dims(spec, 0), -1)
                v = network_eval.predict(spec)
                feats += [v]
            feats = np.array(feats)[:, 0, :]  # [splits, embedding dim]
            train_cluster_id.append([spk] * feats.shape[0])
            train_sequence.append(feats)

    np.savez('training_data', train_sequence=train_sequence,
             train_cluster_id=train_cluster_id)
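# A sketch (not in the original) of reading the saved archive back for
# uisrnn training; allow_pickle is needed because the per-utterance arrays
# have different lengths and are stored as object arrays.
import numpy as np

data = np.load('training_data.npz', allow_pickle=True)
train_sequence = data['train_sequence']
train_cluster_id = data['train_cluster_id']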
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)
    import model
    import generator
    import keras

    # ==================================
    #       Get Train/Val.
    # ==================================
    trnlist, trnlb = toolkits.get_voxceleb2_datalist(
        args, path='../meta/vox2_train_wav.txt')
    vallist, vallb = toolkits.get_voxceleb2_datalist(
        args, path='../meta/vox2_val_wav.txt')

    # construct the data generator.
    params = {'dim': (257, 250, 1),
              'mp_pooler': toolkits.set_mp(processes=args.multiprocess),
              'nfft': 512, 'spec_len': 250, 'win_length': 400,
              'hop_length': 160, 'n_classes': 5994, 'sampling_rate': 16000,
              'batch_size': args.batch_size, 'shuffle': True, 'normalize': True}

    # Datasets
    partition = {'train': trnlist.flatten(), 'val': vallist.flatten()}
    labels = {'train': trnlb.flatten(), 'val': vallb.flatten()}

    # Generators
    trn_gen = generator.DataGenerator(partition['train'], labels['train'], **params)

    network = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                           num_class=params['n_classes'],
                                           mode='train', args=args)

    # ==> load the pre-trained model.
    mgpu = len(keras.backend.tensorflow_backend._get_available_gpus())
    if args.resume:
        if os.path.isfile(args.resume):
            if mgpu == 1:
                network.load_weights(os.path.join(args.resume))
            else:
                network.layers[mgpu + 1].load_weights(os.path.join(args.resume))
            print('==> successfully loading model {}.'.format(args.resume))
        else:
            print("==> no checkpoint found at '{}'".format(args.resume))

    print(network.summary())
    print('==> gpu {} is, training {} images, classes: 0-{} '
          'loss: {}, aggregation: {}, ohemlevel: {}'.format(
              args.gpu, len(partition['train']), np.max(labels['train']),
              args.loss, args.aggregation_mode, args.ohem_level))

    model_path, log_path = set_path(args)
    normal_lr = keras.callbacks.LearningRateScheduler(step_decay)
    tbcallbacks = keras.callbacks.TensorBoard(log_dir=log_path, histogram_freq=0,
                                              write_graph=True, write_images=False,
                                              update_freq=args.batch_size * 16)
    callbacks = [keras.callbacks.ModelCheckpoint(
                     os.path.join(model_path, 'weights-{epoch:02d}-{acc:.3f}.h5'),
                     monitor='loss', mode='min', save_best_only=True),
                 normal_lr, tbcallbacks]

    if args.ohem_level > 1:  # online hard negative mining will be used
        candidate_steps = int(len(partition['train']) // args.batch_size)
        iters_per_epoch = int(len(partition['train']) //
                              (args.ohem_level * args.batch_size))

        ohem_generator = generator.OHEM_generator(
            network, trn_gen, candidate_steps, args.ohem_level,
            args.batch_size, params['dim'], params['n_classes'])
        A = ohem_generator.next()  # for some reason, the generator needs a warm-up

        network.fit_generator(
            generator.OHEM_generator(network, trn_gen, iters_per_epoch,
                                     args.ohem_level, args.batch_size,
                                     params['dim'], params['n_classes']),
            steps_per_epoch=iters_per_epoch,
            epochs=args.epochs,
            max_queue_size=10,
            callbacks=callbacks,
            use_multiprocessing=False,
            workers=1,
            verbose=1)
    else:
        network.fit_generator(trn_gen,
                              steps_per_epoch=int(len(partition['train']) // args.batch_size),
                              epochs=args.epochs,
                              max_queue_size=10,
                              callbacks=callbacks,
                              use_multiprocessing=False,
                              workers=1,
                              verbose=1)
def main(wav_path, check, embedding_per_second=1.0, overlap_rate=0.5):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    if check != '':
        specs1, interval1 = load_data(check, embedding_per_second=1.2, overlap_rate=0.4)
        mapTable1, keys1 = genMap(interval1)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    featss = np.array(feats)[:, 0, :].astype(float)
    predicted_label = uisrnnModel.predict(featss, inference_args)
    total_speaker = len(set(predicted_label))

    global no_speakers
    print("predicted_label: %s" % predicted_label)
    no_speakers = len(set(predicted_label))
    print('total no. of speakers:', no_speakers)

    # one speaker embedding every time_spec_rate ms
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)

    if check != '':
        # append the check-utterance embeddings and re-cluster
        for spec1 in specs1:
            spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
            v = network_eval.predict(spec1)
            feats += [v]
        featss = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
        # debug output
        print("=====================")
        print(feats)
        print(featss)
        print("=====================")
        predicted_label2 = uisrnnModel.predict(featss, inference_args)
        check_speaker = len(set(predicted_label2))
        print("predicted_label2: %s" % predicted_label2)
        print('same speaker' if total_speaker == check_speaker else 'not the same speaker')
        if total_speaker == check_speaker:
            print('speaker detected as ' + str(predicted_label2[-1]))

        speakerSlice2 = arrangeResult(predicted_label2, time_spec_rate)
        print("=============speakerSlice2===============")
        # map times back to the original wav (which contains silence)
        for spk, timeDicts in speakerSlice2.items():
            for tid, timeDict in enumerate(timeDicts):
                s = 0
                e = 0
                for i, key in enumerate(keys):
                    if s != 0 and e != 0:
                        break
                    if s == 0 and key > timeDict['start']:
                        offset = timeDict['start'] - keys[i - 1]
                        s = mapTable[keys[i - 1]] + offset
                    if e == 0 and key > timeDict['stop']:
                        offset = timeDict['stop'] - keys[i - 1]
                        e = mapTable[keys[i - 1]] + offset
                speakerSlice2[spk][tid]['start'] = s
                speakerSlice2[spk][tid]['stop'] = e

        for spk, timeDicts in speakerSlice2.items():
            print('========= ' + str(spk) + ' =========')
            for timeDict in timeDicts:
                s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
                e = fmtTime(timeDict['stop'])
                print(s + ' ==> ' + e)
        print("=============speakerSlice2===============")

    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # map times back to the original wav (which contains silence)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            e = fmtTime(timeDict['stop'])
            print(s + ' ==> ' + e)
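# Two helpers used throughout the diarization functions, `arrangeResult` and
# `fmtTime`, are not defined in this document. The sketches below are
# reconstructions consistent with how they are called (grouping consecutive
# identical labels into {speaker: [{'start': ms, 'stop': ms}, ...]} slices, and
# formatting a millisecond offset); treat them as assumptions about the
# helpers' behavior, not their actual source.
def arrangeResult_sketch(labels, time_spec_rate):
    # assumes a non-empty label sequence
    speakerSlice = {}
    last_label = labels[0]
    start = 0
    for i, label in enumerate(labels):
        if label == last_label:
            continue
        # close the previous run and open a new one
        speakerSlice.setdefault(last_label, []).append(
            {'start': int(start * time_spec_rate), 'stop': int(i * time_spec_rate)})
        start = i
        last_label = label
    speakerSlice.setdefault(last_label, []).append(
        {'start': int(start * time_spec_rate), 'stop': int(len(labels) * time_spec_rate)})
    return speakerSlice

def fmtTime_sketch(ms):
    # format a millisecond offset as mm:ss.mmm (assumed output format)
    minute = int(ms) // 60000
    second = (int(ms) % 60000) // 1000
    millisecond = int(ms) % 1000
    return '{:02d}:{:02d}.{:03d}'.format(minute, second, millisecond)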
def main():
    # gpu configuration
    toolkits.initialize_GPU(args)

    import model

    # ==================================
    # Get Train/Val.
    # ==================================
    total_list = [os.path.join(args.data_path, file)
                  for file in os.listdir(args.data_path)]
    unique_list = np.unique(total_list)

    # ==================================
    # Get Model
    # ==================================
    # construct the data generator.
    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'min_slice': 720,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = model.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                num_class=params['n_classes'],
                                                mode='eval', args=args)

    # ==> load the checkpoint given on the command line.
    if args.resume:
        if os.path.isfile(args.resume):
            network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print('==> successfully loaded model {}.'.format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError('==> please type in the model to load')

    # The feature extraction has to be done sample-by-sample,
    # because each sample has a different length.
    SRC_PATH = r'/data/dataset/SpkWav120'
    SRC_PATH = r'./ghostvlad/SRC_PATH'  # bencq path (overrides the line above)
    print(SRC_PATH)

    path_spk_tuples = prepare_data(SRC_PATH)
    train_sequence = []
    train_cluster_id = []
    CNT = 7000

    for epoch in range(CNT):
        # Randomly choose utterances from the whole set of wav files;
        # a merged utterance contains [10, 20) utterances.
        splits_count = np.random.randint(10, 20, 1)  # low, high, [shape]
        path_spks = random.sample(path_spk_tuples, splits_count[0])
        utterance_specs, utterance_speakers = load_data(path_spks,
                                                        min_win_time=500,
                                                        max_win_time=1600)
        feats = []
        for spec in utterance_specs:
            spec = np.expand_dims(np.expand_dims(spec, 0), -1)
            v = network_eval.predict(spec)
            feats += [v]

        feats = np.array(feats)[:, 0, :]  # [splits, embedding dim]
        train_sequence.append(feats)
        train_cluster_id.append(utterance_speakers)
        print("epoch: {}, utterance length: {}, speakers: {}".format(
            epoch, len(utterance_speakers), len(path_spks)))

    np.savez('training_data',
             train_sequence=train_sequence,
             train_cluster_id=train_cluster_id)
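# The 'training_data.npz' file written above is the input to UIS-RNN training.
# Below is a minimal sketch of the consuming side, assuming the standard
# uisrnn API (parse_arguments / UISRNN.fit / UISRNN.save); the output model
# name is a placeholder.
def train_uisrnn_sketch():
    import sys
    import uisrnn
    data = np.load('training_data.npz', allow_pickle=True)
    # convert the object arrays back into lists of per-utterance sequences
    train_sequence = [np.asarray(seq, dtype=float) for seq in data['train_sequence']]
    train_cluster_id = [list(cid) for cid in data['train_cluster_id']]
    sys.argv = sys.argv[:1]  # keep uisrnn's parser away from our own flags
    model_args, training_args, _ = uisrnn.parse_arguments()
    model_args.observation_dim = 512  # must match the ghostvlad embedding dim
    uisrnn_model = uisrnn.UISRNN(model_args)
    uisrnn_model.fit(train_sequence, train_cluster_id, training_args)
    uisrnn_model.save('saved_model.uisrnn')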
def __init__(self):
    self.filename2embedding = {}
    arguments = "--net resnet34s --gpu 0 --ghost_cluster 2 --vlad_cluster 8 --loss softmax " \
                "--resume " \
                "/media/ben/datadrive/Software/VGG-Speaker-Recognition/model/gvlad_softmax" \
                "/2020-11-15_resnet34s_bs16_adam_lr0.001_vlad8_ghost2_bdim512_ohemlevel0" \
                "/weights-42-0.931.h5 --data_path " \
                "/media/ben/datadrive/Zalo/voice-verification/Train-Test-Data/dataset/".split()
    ZALO_TEST = "/media/ben/datadrive/Zalo/voice-verification/vgg_db_files/val_trials.txt"  # (currently unused)

    parser = argparse.ArgumentParser()
    # set up training configuration.
    parser.add_argument("--gpu", default="", type=str)
    parser.add_argument("--resume", default="", type=str)
    parser.add_argument("--batch_size", default=16, type=int)
    parser.add_argument("--data_path", default="/media/weidi/2TB-2/datasets/voxceleb1/wav", type=str)
    # set up network configuration.
    parser.add_argument("--net", default="resnet34s", choices=["resnet34s", "resnet34l"], type=str)
    parser.add_argument("--ghost_cluster", default=2, type=int)
    parser.add_argument("--vlad_cluster", default=8, type=int)
    parser.add_argument("--bottleneck_dim", default=512, type=int)
    parser.add_argument("--aggregation_mode", default="gvlad", choices=["avg", "vlad", "gvlad"], type=str)
    # set up learning rate, training loss and optimizer.
    parser.add_argument("--loss", default="softmax", choices=["softmax", "amsoftmax"], type=str)
    parser.add_argument("--test_type", default="normal", choices=["normal", "hard", "extend"], type=str)

    global args
    args = parser.parse_args(arguments)

    # gpu configuration
    toolkits.initialize_GPU(args)

    print("==> initialising inference engine (test type: {})...".format(args.test_type))

    # ==================================
    # Get Model
    # ==================================
    # construct the data generator.
    self.params = {
        "dim": (257, None, 1),
        "nfft": 512,
        "spec_len": 250,
        "win_length": 400,
        "hop_length": 160,
        "n_classes": 5994,
        "sampling_rate": 16000,
        "normalize": True,
    }

    self.network_eval = model.vggvox_resnet2d_icassp(input_dim=self.params["dim"],
                                                     num_class=self.params["n_classes"],
                                                     mode="eval", args=args)

    # ==> load the pre-trained checkpoint.
    if args.resume:
        if os.path.isfile(args.resume):
            self.network_eval.load_weights(os.path.join(args.resume), by_name=True)
            print("==> successfully loaded model {}.".format(args.resume))
        else:
            raise IOError("==> no checkpoint found at '{}'".format(args.resume))
    else:
        raise IOError("==> please type in the model to load")

    print("==> start testing.")
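# Plausible companion methods for the engine above: cache one embedding per
# file in self.filename2embedding and compare pairs by cosine similarity.
# The method names and the `ut.load_data` feature helper (with its keyword
# arguments) are hypothetical additions for illustration, not part of the
# original class.
def embedding(self, path):
    if path not in self.filename2embedding:
        specs = ut.load_data(path, win_length=self.params["win_length"],
                             sr=self.params["sampling_rate"],
                             hop_length=self.params["hop_length"],
                             n_fft=self.params["nfft"],
                             spec_len=self.params["spec_len"], mode="eval")
        specs = np.expand_dims(np.expand_dims(specs, 0), -1)
        self.filename2embedding[path] = self.network_eval.predict(specs)[0]
    return self.filename2embedding[path]

def score(self, path_a, path_b):
    # cosine similarity between two cached utterance embeddings
    a, b = self.embedding(path_a), self.embedding(path_b)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))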
def dia_audio(wav_path, embedding_per_second=0.3, overlap_rate=0.33):
    # gpu configuration
    # toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    # model_args and inference_args are expected to be defined at module level
    # (e.g. via uisrnn.parse_arguments()).
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    # one speaker embedding every time_spec_rate ms
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    # map times back to the original wav (which contains silence)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        # print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            e = fmtTime(timeDict['stop'])
            # print(s + ' ==> ' + e)

    # p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    # p.draw()
    # p.plot.show()
    return speakerSlice
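# Hypothetical call site for dia_audio, assuming `args`, `model_args`,
# `inference_args`, and SAVED_MODEL_NAME are configured at module level;
# 'meeting.wav' is a placeholder path.
if __name__ == '__main__':
    slices = dia_audio('meeting.wav', embedding_per_second=0.3, overlap_rate=0.33)
    for spk, segs in slices.items():
        print(spk, [(seg['start'], seg['stop']) for seg in segs])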
def main(wav_path, embedding_per_second=1.0, n_classes=5994, overlap_rate=0.5, plot_results=True):
    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': n_classes,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    # model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    # one speaker embedding every time_spec_rate ms
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
    print('time_spec_rate', time_spec_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)

    # map times back to the original wav (which contains silence)
    for spk, timeDicts in speakerSlice.items():
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if s != 0 and e != 0:
                    break
                if s == 0 and key > timeDict['start']:
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if e == 0 and key > timeDict['stop']:
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset
            print('>>>>>', i, s, e, tid, spk)
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assignments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + speaker + ' =========')
        for timeDict in timeDicts:
            start = fmtTime(timeDict['start'])  # change point moves to the center of the slice
            end = fmtTime(timeDict['stop'])
            print(start + ' ==> ' + end)
            speaker_assignments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assignments, time_spec_rate
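# Worked example of the timing arithmetic above, using the defaults
# embedding_per_second=1.0 and overlap_rate=0.5:
#   time_spec_rate  = 1000 * (1.0 / 1.0) * (1.0 - 0.5) = 500.0  -> one embedding every 500 ms
#   center_duration = int(1000 * (1.0 / 1.0) // 2)     = 500    -> half of a 1 s window, in ms
# so index i in predicted_label covers roughly [i * 500, (i + 1) * 500) ms of
# the voiced (silence-removed) signal before genMap re-inserts the gaps.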