def get_id_result():
    """Identify every test utterance against the enrolled speakers and save a CSV.

    Loads the VGGVox weights from ``c.WEIGHTS_FILE``, embeds the utterances
    listed in ``c.ENROLL_LIST_FILE`` and ``c.TEST_LIST_FILE``, computes the
    ``c.COST_METRIC`` distance between each test/enroll pair, labels each test
    row with the enrolled speaker at the smallest distance and writes the
    resulting table to ``c.RESULT_FILE``.
    """
    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()
    model.load_weights(c.WEIGHTS_FILE)
    model.summary()

    print("Processing enroll samples....")
    enroll_result = get_embeddings_from_list_file(
        model, c.ENROLL_LIST_FILE, c.MAX_SEC)
    enroll_embs = np.array(
        [emb.tolist() for emb in enroll_result['embedding']])
    speakers = enroll_result['speaker']

    print("Processing test samples....")
    test_result = get_embeddings_from_list_file(
        model, c.TEST_LIST_FILE, c.MAX_SEC)
    test_embs = np.array([emb.tolist() for emb in test_result['embedding']])

    print("Comparing test samples against enroll samples....")
    # One row per test utterance, one column per enrolled utterance.
    distances = pd.DataFrame(
        cdist(test_embs, enroll_embs, metric=c.COST_METRIC),
        columns=speakers)

    scores = pd.read_csv(c.TEST_LIST_FILE, delimiter=",", header=0,
                         names=['test_file', 'test_speaker'])
    scores = pd.concat([scores, distances], axis=1)
    scores['result'] = scores[speakers].idxmin(axis=1)
    scores['correct'] = (
        scores['result'] == scores['test_speaker']) * 1.  # bool to int

    print("Writing outputs to [{}]....".format(c.RESULT_FILE))
    result_dir = os.path.dirname(c.RESULT_FILE)
    # dirname() is '' for a bare file name and os.makedirs('') raises, so only
    # create a directory when there is one; exist_ok avoids the
    # check-then-create race of the previous exists()/makedirs() pair.
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    with open(c.RESULT_FILE, 'w') as f:
        scores.to_csv(f, index=False)
def RT_CNN():
    """Record audio and print the closest enrolled speaker.

    Loads the VGGVox weights from ``c.WEIGHTS_FILE`` and the enrolled
    embeddings (a mapping of speaker -> embeddings) from
    ``data/model/RTSP_CNN.out``, records three chunks through ``AudioBuffer``,
    turns the recording into a normalized FFT magnitude spectrogram, embeds it
    and prints the enrolled speaker with the smallest Euclidean distance.
    """
    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()  # Creates a VGGVox model
    model.load_weights(c.WEIGHTS_FILE)  # Load the trained weights
    model.summary()  # Print a summary of the loaded model

    print("Loading embeddings from enroll")
    toLoad = load("data/model/RTSP_CNN.out")
    # Flatten the mapping into two parallel lists.
    enroll_embs = []
    speakers = []
    for spk, embs in toLoad.items():
        for e in embs:
            enroll_embs.append(e)
            speakers.append(spk)
        print(spk)

    # Does not depend on the recording, so build it once up front.
    buckets = build_buckets(c.MAX_SEC, c.BUCKET_STEP, c.FRAME_STEP)

    # NOTE(review): the original indentation was lost; this assumes only the
    # record() call belongs to the loop and the recording is processed once
    # afterwards - confirm against the intended behavior.
    buffer = AudioBuffer()
    start_time = time.time()
    for _ in range(3):
        buffer.record(chunk_size=c.SAMPLE_RATE)
    raw = buffer.get_data()

    # np.frombuffer returns a read-only array for immutable input, and an
    # int16 array cannot hold sample * 2**15 anyway, so scale a float copy
    # instead of multiplying in place.
    # NOTE(review): the samples are read as int16 already; the 2**15 factor
    # may be a leftover from a float [-1, 1] pipeline - confirm.
    data = np.frombuffer(raw, 'int16').astype(np.float64) * 2**15

    # Zero-pad in one step until the signal spans at least 101 frame steps.
    samples_per_step = c.FRAME_STEP * c.SAMPLE_RATE
    shortfall = int(np.ceil(101 * samples_per_step)) - len(data)
    if shortfall > 0:
        data = np.append(data, np.zeros(shortfall))

    # get FFT spectrum
    data = remove_dc_and_dither(data, c.SAMPLE_RATE)
    data = sigproc.preemphasis(data, coeff=c.PREEMPHASIS_ALPHA)
    frames = sigproc.framesig(data,
                              frame_len=c.FRAME_LEN * c.SAMPLE_RATE,
                              frame_step=c.FRAME_STEP * c.SAMPLE_RATE,
                              winfunc=np.hamming)
    fft = abs(np.fft.fft(frames, n=c.NUM_FFT))
    fft_norm = normalize_frames(fft.T)

    # truncate to max bucket sizes, keeping the centered window
    num_frames = len(fft_norm.T)
    rsize = max(k for k in buckets if k <= num_frames)
    rstart = int((num_frames - rsize) / 2)
    x = fft_norm[:, rstart:rstart + rsize]

    test_embs = np.squeeze(model.predict(x.reshape(1, *x.shape, 1)))
    distances = [euclidean(test_embs, embs) for embs in enroll_embs]
    print(len(speakers))
    idx = np.argmin(distances)
    print(speakers[idx])

    # The fixed 3 is subtracted as in the original - presumably the recording
    # time of the three chunks; confirm.
    print("Ok, ", time.time() - start_time - 3, " seconds")
def train_vggvox_model(train_list_file):
    """Train a freshly built VGGVox model and save its weights.

    The training table comes from ``get_train_list(train_list_file)``; its
    ``voice`` and ``lable`` columns are reshaped per row before being fed to
    ``gene``. Weights are written to ``c.PERSONAL_WEIGHT`` and the lowest
    recorded loss is printed.

    Args:
        train_list_file: passed unchanged to ``get_train_list``.
    """

    def _wrap_voice(spec):
        # Add a leading and a trailing singleton axis around the array.
        return spec.reshape(1, *spec.shape, 1)

    def _wrap_label(onehot):
        # Targets are fed to the network as (1, 1, 1, 1251) arrays.
        return onehot.reshape((1, 1, 1, 1251))

    net = vggvox_model()
    samples = get_train_list(train_list_file)

    # Compile the model: categorical cross-entropy loss, accuracy metric.
    rmsprop = optimizers.RMSprop(lr=0.1)
    net.compile(optimizer=rmsprop,
                loss="categorical_crossentropy",
                metrics=['acc'])

    # Input format matters most here: both columns must be reshaped before
    # the generator sees them.
    samples["voice"] = samples["voice"].apply(_wrap_voice)
    samples["lable"] = samples["lable"].apply(_wrap_label)

    print("Start training...")
    batches = gene(samples)
    history = net.fit_generator(batches,
                                epochs=100,
                                steps_per_epoch=c.TRAIN_NUM)
    net.save_weights(filepath=c.PERSONAL_WEIGHT)

    lowest_loss = min(history.history["loss"])
    print("loss: ", lowest_loss)
    print("Done!")
def batch_offline_test():
    """Run the offline test and report top-1%/5%/10% identification hits.

    Embeds the enrollment and test utterances with the VGGVox model loaded
    from ``c.WEIGHTS_FILE``, ranks the enrolled entries by ``c.COST_METRIC``
    distance for every test row, and writes the distances, the ranked
    candidates (``result_1`` ... ``result_k``) and the ``correct*`` flags to
    ``c.OFFLINE_RESULT_FILE`` (opened with ``c.OFFLINE_RESULT_WRITE_OPTION``).
    """
    print("Loading model for batch offline test from [{}]....".format(
        c.WEIGHTS_FILE))
    model = vggvox_model()
    model.load_weights(c.WEIGHTS_FILE)
    model.summary()

    print("Processing enroll samples in [{}]....".format(c.ENROLL_WAV_DIR))
    enroll_result = forward_offline(model, c.ENROLL_WAV_DIR,
                                    c.ENROLL_LIST_FILE, c.MAX_SEC_ENROLL)
    enroll_embs = np.array(
        [emb.tolist() for emb in enroll_result['embedding']])
    speakers = enroll_result['speaker']

    print("Processing test samples in [{}]....".format(c.TEST_WAV_DIR))
    test_result = forward_offline(model, c.TEST_WAV_DIR, c.TEST_LIST_FILE,
                                  c.MAX_SEC_TEST)
    test_embs = np.array([emb.tolist() for emb in test_result['embedding']])

    print("Comparing test samples against enroll samples....")
    distances = pd.DataFrame(
        cdist(test_embs, enroll_embs, metric=c.COST_METRIC),
        columns=speakers)

    # get all speakers in top 10%; each cut-off keeps at least one candidate
    num_speakers_top_1 = max(int(len(speakers) / 100), 1)
    num_speakers_top_5 = max(int(len(speakers) * 5 / 100), 1)
    num_speakers_top_10 = max(int(len(speakers) * 10 / 100), 1)

    # Column positions of the closest enrolled entries for every test row.
    ranked = distances.values.argsort(1)[:, :num_speakers_top_10]
    # Index the plain label array: 2-D fancy indexing on a pandas Index is
    # not supported by recent pandas releases.
    results = pd.DataFrame(distances.columns.values[ranked],
                           index=distances.index)
    results = results.rename(columns=lambda x: 'result_{}'.format(x + 1))

    scores = pd.read_csv(c.TEST_LIST_FILE, delimiter=",", header=0,
                         names=['test_file', 'test_speaker'])
    scores = pd.concat([scores, distances, results], axis=1)
    scores['correct'] = (
        scores['result_1'] == scores['test_speaker']) * 1.  # bool to int

    # Running "true speaker is among the first i candidates" flag.
    correct = scores['correct']
    for i in range(1, num_speakers_top_10 + 1):
        correct = np.logical_or(
            correct,
            scores['result_{}'.format(i)] == scores['test_speaker']) * 1.
        # Independent checks (not elif): with few speakers the cut-offs
        # coincide, and every percentage column must still be written.
        if i == num_speakers_top_1:
            scores['correct_top_1%'] = correct
        if i == num_speakers_top_5:
            scores['correct_top_5%'] = correct
        if i == num_speakers_top_10:
            scores['correct_top_10%'] = correct

    # output
    print("Writing outputs to [{}]....".format(c.OFFLINE_RESULT_FILE))
    with open(c.OFFLINE_RESULT_FILE, c.OFFLINE_RESULT_WRITE_OPTION) as f:
        scores.to_csv(f, index=False)
def train_vggvox_model(model_load_path, model_save_path, continue_training,
                       save_model):
    """Train (or resume training) the VGGVox model on the list in ``c``.

    Args:
        model_load_path: model file loaded when ``continue_training == 1``.
        model_save_path: destination of the full model when
            ``save_model == 1``.
        continue_training: ``1`` to resume from ``model_load_path``; any
            other value builds and compiles a new model.
        save_model: ``1`` to also save the full model after training.

    Weights are always saved to ``c.PERSONAL_WEIGHT``; loss and accuracy
    plots are written to ``c.LOSS_PNG`` and ``c.ACC_PNG``.
    """
    audiolist, labellist = tools.get_voxceleb1_datalist(
        c.FA_DIR, c.VERI_TRAIN_LIST_FILE)
    batch_source = tools.DataGenerator(audiolist, labellist, c.DIM,
                                       c.MAX_SEC, c.BUCKET_STEP,
                                       c.FRAME_STEP, c.BATCH_SIZE, c.N_CLASS)

    # NOTE(review): the original indentation was lost; compile() is assumed
    # to apply only to a newly built model - confirm.
    if continue_training == 1:
        print("load model from {}...".format(model_load_path))
        net = load_model(model_load_path)
    else:
        net = vggvox_model()
        # Compile the model: categorical cross-entropy loss, accuracy metric.
        adam = optimizers.Adam(lr=c.LR, beta_1=0.9, beta_2=0.999,
                               epsilon=1e-08, decay=0.0)
        net.compile(optimizer=adam,
                    loss="categorical_crossentropy",
                    metrics=['acc'])

    tensorboard_cb = keras.callbacks.TensorBoard(
        log_dir=c.TENSORBOARD_LOG_PATH,
        histogram_freq=0,
        write_graph=True,
        write_images=False,
        update_freq=c.BATCH_SIZE * 10000)
    checkpoint_file = os.path.join(
        c.VERI_MODEL_FA_PATH,
        'veri_model_128_{epoch:02d}_{loss:.3f}_{acc:.3f}.h5')
    checkpoint_cb = keras.callbacks.ModelCheckpoint(checkpoint_file,
                                                    monitor='loss',
                                                    mode='min',
                                                    save_best_only=True,
                                                    save_weights_only=False,
                                                    period=5)

    print("Start training...")
    steps = int(len(labellist) // c.BATCH_SIZE)
    history = net.fit_generator(batch_source,
                                epochs=c.EPOCHS,
                                steps_per_epoch=steps,
                                callbacks=[checkpoint_cb, tensorboard_cb])

    print("save weights to {}...".format(c.PERSONAL_WEIGHT))
    net.save_weights(filepath=c.PERSONAL_WEIGHT, overwrite=True)
    if save_model == 1:
        print("save model to {}...".format(model_save_path))
        net.save(model_save_path, overwrite=True)

    curves = history.history
    tools.draw_loss_img(curves, c.LOSS_PNG)
    tools.draw_acc_img(curves, c.ACC_PNG)
    print("Done!")
def test():
    """Print the L2 distance between the embeddings of two fixed WAV files.

    Uses the weights in ``data/model_weights/model_0.h5`` and the buckets
    built from ``c.MAX_SEC_TEST`` and ``c.BUCKET_STEP_SEC``.
    """
    wav_paths = ("data/wav/file1.wav", "data/wav/file2.wav")

    net = vggvox_model()
    net.load_weights("data/model_weights/model_0.h5")
    buckets = build_buckets(c.MAX_SEC_TEST, c.BUCKET_STEP_SEC)

    embeddings = []
    for wav_path in wav_paths:
        spec = read_and_process_audio(wav_path, buckets)
        # Wrap the array with a leading and a trailing singleton axis.
        batch = spec.reshape(1, *spec.shape, 1)
        embeddings.append(net.predict(batch))

    first, second = embeddings
    print(np.linalg.norm(first - second))
def get_id_result(threshold=0.16):
    """Identify test utterances, with and without an "Unknown" rejection.

    Embeds the enrollment list, stores the enrolled embeddings grouped by
    speaker in ``data/model/RTSP_CNN.out``, embeds the test list and builds a
    score table with the closest enrolled speaker per test row (``result``)
    and a thresholded variant (``result_threshold``) plus the matching
    ``correct`` / ``correct_threshold`` flags.

    Args:
        threshold: rows whose smallest distance is greater than this value
            are labelled ``"Unknown"`` in ``result_threshold``. Defaults to
            the previously hard-coded 0.16.

    NOTE(review): the score table is neither returned nor written in the
    visible code - confirm whether that is intended.
    """
    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()  # Creates a VGGVox model
    model.load_weights(c.WEIGHTS_FILE)  # Load the trained weights
    model.summary()  # Print a summary of the loaded model

    print("Processing enroll samples....")
    enroll_result = get_embeddings_from_list_file(
        model, c.ENROLL_LIST_FILE, c.MAX_SEC)
    enroll_embs = np.array(
        [emb.tolist() for emb in enroll_result['embedding']])
    speakers = enroll_result['speaker']

    # Group the enrolled embeddings per speaker and persist them.
    toSave = defaultdict(list)
    for spk, emb in zip(speakers, enroll_embs):
        toSave[spk].append(emb)
    dump(toSave, "data/model/RTSP_CNN.out")

    start_time = time.time()
    print("Processing test samples....")
    test_result = get_embeddings_from_list_file(model, c.TEST_LIST_FILE,
                                                c.MAX_SEC)
    test_embs = np.array([emb.tolist() for emb in test_result['embedding']])

    print("Comparing test samples against enroll samples....")
    # Compute the distance between each test and enroll data
    distances = pd.DataFrame(
        cdist(test_embs, enroll_embs, metric=c.COST_METRIC),
        columns=speakers)
    scores = pd.read_csv(c.TEST_LIST_FILE, delimiter=",", header=0,
                         names=['test_file', 'test_speaker'])
    scores = pd.concat([scores, distances], axis=1)

    # Select the distance columns once instead of once per row.
    speaker_dists = scores[speakers]
    closest = speaker_dists.idxmin(axis=1)
    scores['result'] = closest
    print(time.time() - start_time, " seconds")

    # Reject rows whose best match is still farther than the threshold.
    result = closest.copy()
    result[speaker_dists.min(axis=1) > threshold] = "Unknown"
    scores['result_threshold'] = result

    scores['correct'] = (
        scores['result'] == scores['test_speaker']) * 1.  # bool to int
    scores['correct_threshold'] = (
        scores['result_threshold'] == scores['test_speaker']
    ) * 1.  # bool to int
def retrain(x_train, y_train):
    """Build the modified VGGVox network on pretrained weights and train it.

    Args:
        x_train: training inputs forwarded to ``train_for_classification``.
        y_train: training targets forwarded to ``train_for_classification``.

    Returns:
        The model produced by ``vggvox_mod_model`` after
        ``train_for_classification`` has been called on it.
    """
    weights_path = c.WEIGHTS_FILE
    print("Loading model weights from [{}]....".format(weights_path))
    pretrained = vggvox_model()
    pretrained.load_weights(weights_path)

    print("Creating base network ...")
    classifier = vggvox_mod_model(pretrained)
    classifier.summary()

    train_for_classification(classifier, x_train, y_train)
    return classifier
def retrain(x_train, y_train):
    """Train the modified VGGVox classifier starting from ``c.WEIGHTS_FILE``.

    The baseline network is built and loaded first, then wrapped by
    ``vggvox_mod_model`` and passed to ``train_for_classification`` together
    with ``x_train`` and ``y_train``. The siamese-network path is disabled.

    Returns:
        The wrapped model after the training call.
    """
    pretrained_weights = c.WEIGHTS_FILE
    print("Loading model weights from [{}]....".format(pretrained_weights))
    backbone = vggvox_model()
    backbone.load_weights(pretrained_weights)

    print("Creating base network ...")
    tuned = vggvox_mod_model(backbone)
    tuned.summary()
    train_for_classification(tuned, x_train, y_train)

    return tuned
def verify(opt):
    """Compare one enrollment WAV with one test WAV and save the verdict.

    Args:
        opt: object exposing ``input`` and ``test`` (WAV paths), ``metric``
            (forwarded to ``cdist`` as its ``metric``) and ``threshold``.

    The one-row score table (file names, metric, distance, threshold and the
    boolean ``distance < threshold`` result) is printed and written to
    ``c.RESULT_FILE``.
    """
    input_wav_path = opt.input
    test_wav_path = opt.test
    metric_fn = opt.metric
    threshold = opt.threshold

    print("Loading model weights from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()
    model.load_weights(c.WEIGHTS_FILE)
    model.summary()

    print("Processing enroll samples....")
    enroll_result = get_embeddings_from_file(model, input_wav_path, c.MAX_SEC)
    enroll_embs = np.array(
        [emb.tolist() for emb in enroll_result['embedding']])

    print("Processing test samples....")
    test_result = get_embeddings_from_file(model, test_wav_path, c.MAX_SEC)
    test_embs = np.array([emb.tolist() for emb in test_result['embedding']])

    print("Comparing test samples against enroll samples....")
    distances = pd.DataFrame(cdist(test_embs, enroll_embs, metric=metric_fn),
                             columns=['distance'])

    scores = pd.DataFrame({
        'input': [input_wav_path.split('/')[-1]],
        'test': [test_wav_path.split('/')[-1]]
    })
    scores['metric'] = metric_fn
    scores = pd.concat([scores, distances], axis=1)
    scores['threshold'] = threshold
    scores['result'] = scores['distance'] < threshold
    print(scores)

    print("Writing outputs to [{}]....".format(c.RESULT_FILE))
    result_dir = os.path.dirname(c.RESULT_FILE)
    # dirname() is '' for a bare file name and os.makedirs('') raises, so only
    # create a directory when there is one; exist_ok avoids the
    # check-then-create race of the previous exists()/makedirs() pair.
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)
    with open(c.RESULT_FILE, 'w') as f:
        scores.to_csv(f, index=False)
def get_id_result():
    """Evaluate the classification model on the test list and print scores.

    FFT features are read from ``c.ENROLL_LIST_FILE`` (training) and
    ``c.TEST_LIST_FILE`` (testing) and the labels are one-hot encoded with
    ``c.NUM_CLASSES`` classes. The model is either retrained
    (``c.RETRAIN``) or rebuilt and loaded from ``c.VGGM_WEIGHTS_FILE``.
    """
    # Training and testing pairs.
    x_train, y_train = get_fft_features_from_list_file(c.ENROLL_LIST_FILE,
                                                       c.MAX_SEC)
    x_test, y_test = get_fft_features_from_list_file(c.TEST_LIST_FILE,
                                                     c.MAX_SEC)

    y_train = to_categorical(y_train, num_classes=c.NUM_CLASSES)
    y_test = to_categorical(y_test, num_classes=c.NUM_CLASSES)
    print("Y_train.shape: ", y_train.shape)

    if not c.RETRAIN:
        backbone = vggvox_model()
        classifier = vggvox_mod_model(backbone)
        classifier.load_weights(c.VGGM_WEIGHTS_FILE)
    else:
        classifier = retrain(x_train, y_train)

    # NOTE(review): the original indentation was lost; compile_model() is
    # assumed to run for both branches - confirm.
    classifier = compile_model(classifier)

    score = classifier.evaluate(x_test, y_test, verbose=1)
    loss, top1, top5 = score[0], score[1], score[2]
    print("loss: {}, top-1 accuracy (/%): {}, top-5 accuracy (/%): {}".format(
        loss, top1, top5))
def offline_test():
    """Identify every offline test utterance and write the score table.

    Enrollment and test utterances are embedded with ``forward_offline``;
    each test row is labelled with the enrolled speaker at the smallest
    ``c.COST_METRIC`` distance and the table is written to
    ``c.OFFLINE_RESULT_FILE`` using ``c.OFFLINE_RESULT_WRITE_OPTION``.
    """

    def _stack(result):
        # Turn the per-utterance embeddings into one array.
        return np.array([vec.tolist() for vec in result['embedding']])

    print("Loading model for offline test from [{}]....".format(
        c.WEIGHTS_FILE))
    net = vggvox_model()
    net.load_weights(c.WEIGHTS_FILE)
    net.summary()

    print("Processing enroll samples in [{}]....".format(c.ENROLL_WAV_DIR))
    enrolled = forward_offline(net, c.ENROLL_WAV_DIR, c.ENROLL_LIST_FILE,
                               c.MAX_SEC_ENROLL)
    enrolled_vecs = _stack(enrolled)
    speakers = enrolled['speaker']

    print("Processing test samples in [{}]....".format(c.TEST_WAV_DIR))
    probes = forward_offline(net, c.TEST_WAV_DIR, c.TEST_LIST_FILE,
                             c.MAX_SEC_TEST)
    probe_vecs = _stack(probes)

    print("Comparing test samples against enroll samples....")
    pairwise = cdist(probe_vecs, enrolled_vecs, metric=c.COST_METRIC)
    distances = pd.DataFrame(pairwise, columns=speakers)

    listing = pd.read_csv(c.TEST_LIST_FILE,
                          delimiter=",",
                          header=0,
                          names=['test_file', 'test_speaker'])
    scores = pd.concat([listing, distances], axis=1)
    scores['result'] = scores[speakers].idxmin(axis=1)
    is_hit = scores['result'] == scores['test_speaker']
    scores['correct'] = is_hit * 1.  # bool to int

    print("Writing outputs to [{}]....".format(c.OFFLINE_RESULT_FILE))
    with open(c.OFFLINE_RESULT_FILE, c.OFFLINE_RESULT_WRITE_OPTION) as out:
        scores.to_csv(out, index=False)
def _record_online_wav(audio, wav_path):
    """Record ``c.ONLINE_RECORD_SEC`` seconds of input and save it as a WAV."""
    stream = audio.open(format=c.FORMAT,
                        channels=c.NUM_CHANNEL,
                        rate=c.SAMPLE_RATE,
                        input=True,
                        frames_per_buffer=c.CHUNK)
    try:
        print("\nStart speaking")
        num_chunks = int(c.SAMPLE_RATE / c.CHUNK * c.ONLINE_RECORD_SEC)
        frames = [stream.read(c.CHUNK) for _ in range(num_chunks)]
        print("Done recording")
    finally:
        # Release the input stream even when a read fails.
        stream.stop_stream()
        stream.close()

    wf = wave.open(wav_path, 'wb')
    try:
        wf.setnchannels(c.NUM_CHANNEL)
        wf.setsampwidth(audio.get_sample_size(c.FORMAT))
        wf.setframerate(c.SAMPLE_RATE)
        wf.writeframes(b''.join(frames))
    finally:
        wf.close()


def online_test():
    """Repeatedly record a sample and identify it against enrolled speakers.

    Enrolled utterances are embedded once with ``forward_offline``. A CSV
    header is written to ``c.ONLINE_RESULT_FILE`` (opened with
    ``c.ONLINE_RESULT_WRITE_OPTION``), then every loop iteration records
    audio to ``c.ONLINE_WAV_FILE``, embeds it with ``forward_online``, prints
    the distance to each enrolled entry and appends one CSV row. The loop
    has no exit condition; it ends only when an exception propagates.

    Raises:
        ValueError: if ``c.COST_METRIC`` is neither ``"euclidean"`` nor
            ``"cosine"``.
    """
    metric_fns = {"euclidean": euclidean, "cosine": cosine}
    # Fail fast: previously an unknown metric was only printed and the loop
    # then used an unbound (or stale) distance.
    if c.COST_METRIC not in metric_fns:
        raise ValueError("Invalid cost metric [{}]".format(c.COST_METRIC))
    distance = metric_fns[c.COST_METRIC]

    print("Loading model for online test from [{}]....".format(c.WEIGHTS_FILE))
    model = vggvox_model()
    model.load_weights(c.WEIGHTS_FILE)
    model.summary()

    print("Processing enroll samples in [{}]....".format(c.ENROLL_WAV_DIR))
    enroll_result = forward_offline(model, c.ENROLL_WAV_DIR,
                                    c.ENROLL_LIST_FILE, c.MAX_SEC_ENROLL)
    speakers = enroll_result['speaker']

    with open(c.ONLINE_RESULT_FILE, c.ONLINE_RESULT_WRITE_OPTION) as f:
        f.write("condition,test_speaker,{},result,correct\n".format(
            ','.join(speakers)))
    CSV_PREFIX = c.ONLINE_CONDITION + "," + c.ONLINE_SPEAKER + ","

    p = pyaudio.PyAudio()
    try:
        while True:
            # Record and save audio
            _record_online_wav(p, c.ONLINE_WAV_FILE)

            # Test against enrolled samples
            print("Comparing test sample against enroll samples....")
            emb = forward_online(model, c.ONLINE_WAV_FILE, c.MAX_SEC_TEST)

            # Start from +inf so the closest speaker is always picked; the
            # previous 1.0 start left the result unset (and crashed on the
            # string concatenation) when every distance was >= 1.
            min_dist, min_spk = float('inf'), None
            fields = []
            for spk, enrolled in zip(enroll_result['speaker'],
                                     enroll_result['embedding']):
                dist = distance(emb, enrolled)
                if dist < min_dist:
                    min_dist, min_spk = dist, spk
                fields.append(str(dist))
                print("Distance with speaker [{}]:\t{}".format(spk, dist))
            print("-----> {}".format(min_spk))

            correct = int(min_spk == c.ONLINE_SPEAKER)
            # str() keeps the row writable when nothing is enrolled.
            fields.extend([str(min_spk), str(correct)])
            with open(c.ONLINE_RESULT_FILE, 'a') as f:
                f.write(CSV_PREFIX + ",".join(fields) + "\n")
    finally:
        # Unreachable after the infinite loop before; now always executed.
        p.terminate()
def load_model(self):
    """Build a VGGVox model and load ``data/model/weights.h5`` into it.

    Returns:
        The model returned by ``vggvox_model()`` after ``load_weights``.
    """
    weights_file = "data/model/weights.h5"
    net = vggvox_model()
    net.load_weights(weights_file)
    return net