def enumerate_align_hash(self, video_list):
    """Map each video id to its parsed alignment.

    The video id is the file name of the video path without its
    extension; the alignment is parsed from
    ``<self.align_path>/<video_id>.align`` using ``text_to_labels``
    as the label-conversion function.
    """
    def _load_one(video_path):
        # id = basename without extension
        video_id = os.path.splitext(video_path)[0].split('/')[-1]
        align_file = os.path.join(self.align_path, video_id) + ".align"
        parsed = Align(self.absolute_max_string_len, text_to_labels).from_file(align_file)
        return video_id, parsed

    return dict(_load_one(p) for p in video_list)
def prepare_align(self, video_list):
    """Load the alignment file for every video in ``video_list``.

    Returns a dict mapping video id (file name without extension) to
    the ``Align`` object parsed from
    ``<self.align_path>/<video_id>.align``.
    """
    align_dict = {}
    for video_path in video_list:
        video_id = os.path.splitext(video_path)[0].split('/')[-1]
        align_path = os.path.join(self.align_path, video_id) + ".align"
        # Fix: pass the label-conversion function. Every other
        # Align(...) construction in this file (enumerate_align_hash,
        # split_words, merge, the data generators) supplies it;
        # omitting it here was inconsistent and leaves from_file
        # without a way to convert transcript text to integer labels.
        align_dict[video_id] = Align(self.absolute_max_string_len, text_to_labels).from_file(align_path)
    return align_dict
def split_words(video, align):
    """Cut a (video, align) pair into one sample per aligned word.

    For every alignment interval ``(start, stop, word)`` a new Video
    holding only frames ``[start:stop]`` is built, together with a
    single-entry Align re-based to frame 0.
    Returns a list of ``(video, align)`` tuples.
    """
    samples = []
    for interval in align.align:
        start, stop = interval[0], interval[1]
        # New clip restricted to this word's frame range.
        clip = Video(video.vtype, video.face_predictor_path)
        clip.face = video.face[start:stop]
        clip.mouth = video.mouth[start:stop]
        clip.set_data(clip.mouth)
        # Matching alignment, shifted so the word starts at frame 0.
        word_align = Align(align.absolute_max_string_len, align.label_func).from_array(
            [(0, stop - start, interval[2])])
        samples.append((clip, word_align))
    return samples
def merge(video_aligns):
    """Concatenate a list of (video, align) samples into a single pair.

    Face and mouth frames are stacked along the time axis; each
    alignment interval is shifted by the running frame offset so the
    merged Align indexes correctly into the merged video.
    Returns a ``(video, align)`` tuple.
    """
    first_video, first_align = video_aligns[0]

    # Start from empty (0-length) frame stacks shaped like the first sample.
    merged = Video(first_video.vtype, first_video.face_predictor_path)
    f_shape, m_shape = first_video.face.shape, first_video.mouth.shape
    merged.face = np.ones((0, f_shape[1], f_shape[2], f_shape[3]), dtype=np.uint8)
    merged.mouth = np.ones((0, m_shape[1], m_shape[2], m_shape[3]), dtype=np.uint8)

    shifted_subs = []
    offset = 0
    for clip, clip_align in video_aligns:
        merged.face = np.concatenate((merged.face, clip.face), 0)
        merged.mouth = np.concatenate((merged.mouth, clip.mouth), 0)
        for sub in clip_align.align:
            shifted_subs.append((sub[0] + offset, sub[1] + offset, sub[2]))
        # Next clip starts where the last interval of this clip ended.
        # NOTE(review): assumes every clip_align has at least one entry.
        offset = shifted_subs[-1][1]

    merged.set_data(merged.mouth)
    merged_align = Align(first_align.absolute_max_string_len,
                         first_align.label_func).from_array(shifted_subs)
    return (merged, merged_align)
#folders_list_train = random.sample(folders_list_train, 180) #folders_list_val = random.sample(folders_list_val, 100) #print('Training data:', len(folders_list_train)*2) #print('Validation data:', len(folders_list_val)*2) video_file = args.video_file transcript_file = video_file[:-9] + '.txt' lips = get_video_frames(video_file, fmt='rgb') lips = crop_pad_frames(frames=lips, fps=25, seconds=5) lips = lips.reshape(1, 125, 50, 100, 3) print('lips shape:', lips.shape) # Read text trans = (Align(128, text_to_labels).from_file(transcript_file)) y_data = (trans.padded_label) y_data = y_data.reshape(1, 128) print('y_data shape:', y_data.shape) label_length = (trans.label_length) input_length = 125 #lip = lipreading(mode='backendGRU', inputDim=256, hiddenDim=512, nClasses=29, frameLen=125, AbsoluteMaxStringLen=128, every_frame=True) #model = lip model = LipNet(input_shape=(125, 50, 100, 3), pretrained='pretrain', output_size=29, absolute_max_string_len=128) #model.load_weights('/data/models/combResnetLSTM_CTCloss_236k-train_1to3ratio_valWER_epochs20_lr1e-4_0.1decay9epochs/weights-07-117.3701.hdf5') from io import StringIO
def DataGenerator_sampling_softmask(folderlist_all, folders_per_epoch, batch_size):
    """Infinite Keras data generator yielding (inputs, dummy_ctc_targets).

    Every 3rd epoch a fresh random subset of ``folders_per_epoch``
    folders is drawn from ``folderlist_all``; the same subset is reused
    for the two epochs in between. Each folder contributes two lip
    videos and two transcripts (two speakers per folder).
    """
    epoch_number = 0
    L = folders_per_epoch
    #this line is just to make the generator infinite, keras needs that
    while True:
        batch_start = 0
        batch_end = batch_size
        while batch_start < L:
            if batch_start == 0:
                epoch_number += 1
                # Re-sample the folder subset only every 3rd epoch;
                # otherwise the previous epoch's folderlist is reused.
                if epoch_number % 3 == 1:
                    indices = []
                    for ind in range(len(folderlist_all)):
                        indices.append(ind)
                    pick_indices = random.sample(indices, L)
                    # NOTE(review): `indices` is rebuilt each time, so this
                    # removal has no effect outside this block.
                    for item in pick_indices:
                        indices.remove(item)
                    folderlist = []
                    for index in pick_indices:
                        folderlist.append(folderlist_all[index])
            limit = min(batch_end, L)
            folders_batch = folderlist[batch_start:limit]
            # print(folders_batch)
            lips = []
            mask = []
            spect = []
            phase = []
            samples = []
            transcripts = []
            for folder in folders_batch:
                # Two speakers per folder: take both lip crops and both transcripts.
                lips_ = sorted(glob.glob(folder + '/*_lips.mp4'), key=numericalSort)
                #masks_ = sorted(glob.glob(folder + '/*_softmask.npy'), key=numericalSort)
                #samples_ = sorted(glob.glob(folder + '/*_samples.npy'), key=numericalSort)
                transcripts_ = sorted(glob.glob(folder + '/*.txt'), key=numericalSort)
                #spect_ = folder + '/mixed_spectrogram.npy'
                #phase_ = folder + '/phase_spectrogram.npy'
                lips.append(lips_[0])
                lips.append(lips_[1])
                # samples.append(samples_[0])
                # samples.append(samples_[1])
                # mask.append(masks_[0])
                # mask.append(masks_[1])
                # spect.append(spect_)
                # spect.append(spect_)
                # phase.append(phase_)
                # phase.append(phase_)
                transcripts.append(transcripts_[0])
                transcripts.append(transcripts_[1])
            # Shuffle lips and transcripts together so pairs stay aligned.
            zipped = list(zip(lips, transcripts))
            random.shuffle(zipped)
            lips, transcripts = zip(*zipped)
            # Disabled soft-mask / spectrogram inputs from an earlier
            # audio-visual separation experiment; kept for reference.
            # #X_mask = np.asarray([to_onehot(cv2.imread(fname, cv2.IMREAD_UNCHANGED)) for fname in mask])
            # X_mask = np.asarray([np.load(fname).reshape(257, 500, 1) for fname in mask])
            # X_spect = [np.load(fname) for fname in spect]
            # X_phase = [np.load(fname) for fname in phase]
            # X_samples = np.asarray([np.pad(np.load(fname), (0, 128500), mode='constant')[:128500] for fname in samples])
            # X_spect_phase = []
            # for i in range(len(X_spect)):
            #     x_spect_phase = np.stack([X_spect[i], X_phase[i]], axis=-1)
            #     X_spect_phase.append(x_spect_phase)
            # X_spect_phase = np.asarray(X_spect_phase)
            # print("X_spect_phase", X_spect_phase.shape)
            X_lips = []
            for i in range(len(lips)):
                x_lips = get_video_frames(lips[i], fmt='grey')
                # Random augmentation: with probability 1/4 (outer draw == 0)
                # one of the seq* augmenters is applied, otherwise the frames
                # pass through unchanged.
                # NOTE(review): nesting below reconstructed from a
                # whitespace-mangled source — verify against the original.
                choices = [0, 1, 2, 3]
                choose = random.choice(choices)
                if choose == 0:
                    choices = [1, 2, 3]
                    choose = random.choice(choices)
                    if choose == 1:
                        choices = [1, 2]
                        choose = random.choice(choices)
                        if choose == 1:
                            x_lips = seq1_1.augment_images(x_lips)
                        elif choose == 2:
                            x_lips = seq1_2.augment_images(x_lips)
                    elif choose == 2:
                        x_lips = seq2.augment_images(x_lips)
                    elif choose == 3:
                        choices = [0, 1, 2, 3]
                        choose = random.choice(choices)
                        if choose == 0:
                            x_lips = seq3.augment_images(x_lips)
                        elif choose == 1:
                            x_lips = seq4.augment_images(x_lips)
                        elif choose == 2:
                            x_lips = seq5.augment_images(x_lips)
                        elif choose == 3:
                            x_lips = seq6.augment_images(x_lips)
                else:
                    x_lips = x_lips
                #x_lips = seq.augment_images(x_lips)
                x_lips = crop_pad_frames(frames=x_lips, fps=25, seconds=5)
                X_lips.append(x_lips)
            align = []
            Y_data = []
            label_length = []
            input_length = []
            source_str = []
            #X_lips = np.asarray(X_lips)
            # Earlier variant that filtered out transcripts longer than the
            # CTC input length; kept for reference.
            # for i in range(len(transcripts)):
            #     a=(Align(256, text_to_labels).from_file(transcripts[i]))
            #     if(a.label_length<=125):
            #         align.append(a)
            #         X_lip.append(X_lips[i])
            # for i in range(len(X_lip)):
            #     Y_data.append(align[i].padded_label)
            #     label_length.append(align[i].label_length)
            #     input_length.append(125)
            #     #source_str.append(align[i].sentence)
            # Y_data = np.array(Y_data)
            # print(X_lips.shape)
            #X = seq.augment_images(X)
            #yield [X_spect_phase, X_lips, X_samples], X_mask
            #yield [np.array(X_lip),Y_data,np.array(input_length),np.array(label_length)],np.zeros([len(X_lip)])
            X_lips = np.asarray(X_lips)
            for i in range(len(transcripts)):
                align.append( Align(128, text_to_labels).from_file(transcripts[i]))
            for i in range(X_lips.shape[0]):
                Y_data.append(align[i].padded_label)
                label_length.append(align[i].label_length)
                # CTC input length = number of frames per clip
                input_length.append(X_lips.shape[1])
                source_str.append(align[i].sentence)
            Y_data = np.array(Y_data)
            # print(X_lips.shape)
            #X = seq.augment_images(X)
            #yield [X_spect_phase, X_lips, X_samples], X_mask
            # Dummy zero targets: the CTC loss is computed inside the model.
            yield [ X_lips, Y_data, np.array(input_length), np.array(label_length) ], np.zeros([X_lips.shape[0]])
            # inputs = {'the_input': X_lips,
            #           'the_labels': Y_data,
            #           'input_length': input_length,
            #           'label_length': label_length,
            #           'source_str': source_str
            #           }
            # outputs = {'ctc': np.zeros([X_lips.shape[0]])}  # dummy data for dummy loss function
            # yield (inputs,outputs)
            batch_start += batch_size
            batch_end += batch_size
def on_epoch_end(self, epoch, logs={}):
    # Audio-visual validation callback: runs the two-input
    # (lips + mixed audio) model over the validation folders and
    # prints three WER variants.
    # NOTE(review): mutable default `logs={}` is a Python anti-pattern,
    # though `logs` is never used here; left untouched.
    num = len(self.val_folders)
    div_num = 12  # folders are processed in chunks of 12
    num_100s = int(num/div_num)  # full chunks only; remainder folders are skipped
    total_list=[]       # per-chunk mean raw WER
    total_norm_list=[]  # per-chunk WER normalised by mean word-count
    total_wer=[]        # per-chunk WER normalised by reference char length
    for n in range(num_100s):
        val_folders_100 = self.val_folders[n*div_num:(n+1)*div_num]
        lips=[]
        transcripts=[]
        samples = []
        samples_mix = []
        for folder in val_folders_100:
            # Every speaker in the folder contributes a lip video, a
            # target-audio sample file and a transcript; the single
            # mixed-audio wav is shared by all speakers of the folder.
            lips_ = sorted(glob.glob(folder + '/*_lips.mp4'), key=numericalSort)
            samples_ = sorted(glob.glob(folder + '/*_samples.npy'), key=numericalSort)
            samples_mix_ = '/data/mixed_audio_files/' +folder.split('/')[-1]+'.wav'
            transcripts_ = sorted(glob.glob(folder + '/*.txt'), key=numericalSort)
            '''lips.append(lips_[0]) lips.append(lips_[1]) transcripts.append(transcripts_[0]) transcripts.append(transcripts_[1])'''
            for i in range(len(lips_)):
                lips.append(lips_[i])
            for i in range(len(samples_)):
                samples.append(samples_[i])
            for i in range(len(lips_)):
                samples_mix.append(samples_mix_)
            for i in range(len(lips_)):
                transcripts.append(transcripts_[i])
        # Shuffle all four streams together so items stay paired.
        zipped = list(zip(lips, samples, samples_mix, transcripts))
        random.shuffle(zipped)
        lips, samples, samples_mix, transcripts = zip(*zipped)
        # 32000 samples — presumably 2 s of 16 kHz audio, zero-padded
        # then truncated to a fixed length; TODO confirm sample rate.
        X_samples = np.asarray([np.pad(np.load(fname), (0, 32000), mode='constant')[:32000] for fname in samples])
        X_samples_mix = np.asarray([np.pad(wavfile.read(fname)[1], (0, 32000), mode='constant')[:32000] for fname in samples_mix])
        X_lips = []
        for i in range(len(lips)):
            x_lips = get_video_frames(lips[i], fmt='grey')
            x_lips = crop_pad_frames(frames = x_lips, fps = 25, seconds = 2)
            X_lips.append(x_lips)
        X_lips = np.asarray(X_lips)
        align=[]
        Y_data = []
        label_length = []
        input_length = []
        source_str = []
        for i in range(len(transcripts)):align.append(Align(128, text_to_labels).from_file(transcripts[i]))
        for i in range(X_lips.shape[0]):
            Y_data.append(align[i].padded_label)
            label_length.append(align[i].label_length)
            input_length.append(X_lips.shape[1])  # CTC input length = frame count
            #source_str.append(align[i].sentence)
        Y_data = np.array(Y_data)
        X_samples_targ = X_samples.reshape(X_samples.shape[0], 32000, 1).astype('float32')
        X_samples_mix = X_samples_mix.reshape(X_samples_mix.shape[0], 32000, 1).astype('float32')
        # presumably 1350.0 is an empirical waveform scaling constant — TODO confirm
        X_samples_targ = X_samples_targ/1350.0
        X_samples_mix = X_samples_mix/1350.0
        val_predict=self.model_container.predict([X_lips, X_samples_mix])
        val_predict = val_predict[1]  # second model output holds the CTC posteriors
        # Earlier batched-prediction variant, kept for reference:
        # for n in range(num_100s):
        #     val_folders_100 = self.val_folders[n*100:(n+1)*100]
        #     d0,d1,d2,d3=split(DataGenerator_test(val_folders_100, self.batch_size))
        #     val_predict = (self.model.predict(d0))
        decode_res=decoder.decode(val_predict, input_length)
        ground_truth=[]
        for i in range(Y_data.shape[0]):
            ground_truth.append(labels_to_text(Y_data[i]))
        data=[]  # (hypothesis, reference) pairs
        for j in range(0, X_lips.shape[0]):
            data.append((decode_res[j], ground_truth[j]))
        mean_individual_length = np.mean([len(pair[1].split()) for pair in data])
        total = 0.0
        total_norm = 0.0
        w=0.0
        length = len(data)
        for i in range(0, length):
            val = float(wer_sentence(data[i][0], data[i][1]))
            total += val
            total_norm += val / mean_individual_length
            # normalised by the *character* length of the reference string
            w+=val/len(data[i][1])
        total_wer.append(w/length)
        total_list.append(total/length)
        total_norm_list.append(total_norm/length)
    total_wer=np.array(total_wer)
    total_list=np.array(total_list)
    total_norm_list=np.array(total_norm_list)
    print('Validation WER_original:',np.mean(total_wer),'Validation WER: ', np.mean(total_list),'Validation WER_NORM:',np.mean(total_norm_list))
    return
def on_epoch_end(self, epoch, logs={}):
    # Video-only validation callback: here each entry of
    # ``self.val_folders`` is itself a video file path; the transcript
    # path is derived by stripping the last 9 characters (presumably a
    # suffix like '_lips.mp4' — TODO confirm). Prints three WER
    # variants and appends them to ``self.save_path``.
    # NOTE(review): mutable default `logs={}` is a Python anti-pattern,
    # though `logs` is never used here; left untouched.
    num = len(self.val_folders)
    div_num = self.batch_size
    num_100s = int(num/div_num)  # full batches only; remainder entries are skipped
    total_list=[]       # per-batch mean raw WER
    total_norm_list=[]  # per-batch WER normalised by mean word-count
    total_wer=[]        # per-batch WER normalised by reference char length
    for n in range(num_100s):
        val_folders_100 = self.val_folders[n*div_num:(n+1)*div_num]
        lips=[]
        transcripts=[]
        for folder in val_folders_100:
            #lips_ = sorted(glob.glob(folder + '/*_lips.mp4'), key=numericalSort)
            #transcripts_ = sorted(glob.glob(folder + '/*.txt'), key=numericalSort)
            lips.append(folder)
            #lips.append(lips_[1])
            transcripts.append(folder[:-9]+'.txt')
            #transcripts.append(transcripts_[1])
        # Shuffle videos and transcripts together so pairs stay aligned.
        zipped = list(zip(lips, transcripts))
        random.shuffle(zipped)
        lips, transcripts = zip(*zipped)
        X_lips = []
        for i in range(len(lips)):
            x_lips = get_video_frames(lips[i], fmt='grey')
            x_lips = crop_pad_frames(frames = x_lips, fps = 25, seconds = 5)
            X_lips.append(x_lips)
        align=[]
        Y_data = []
        label_length = []
        input_length = []
        source_str = []
        X_lips = np.asarray(X_lips)
        for i in range(len(transcripts)):align.append(Align(128, text_to_labels).from_file(transcripts[i]))
        for i in range(X_lips.shape[0]):
            Y_data.append(align[i].padded_label)
            label_length.append(align[i].label_length)
            input_length.append(X_lips.shape[1])  # CTC input length = frame count
            #source_str.append(align[i].sentence)
        Y_data = np.array(Y_data)
        val_predict=self.model_container.predict(X_lips)
        # Earlier batched-prediction variant, kept for reference:
        # for n in range(num_100s):
        #     val_folders_100 = self.val_folders[n*100:(n+1)*100]
        #     d0,d1,d2,d3=split(DataGenerator_test(val_folders_100, self.batch_size))
        #     val_predict = (self.model.predict(d0))
        decode_res=decoder.decode(val_predict, input_length)
        ground_truth=[]
        for i in range(Y_data.shape[0]):
            ground_truth.append(labels_to_text(Y_data[i]))
        data=[]  # (hypothesis, reference) pairs
        for j in range(0, X_lips.shape[0]):
            data.append((decode_res[j], ground_truth[j]))
        mean_individual_length = np.mean([len(pair[1].split()) for pair in data])
        total = 0.0
        total_norm = 0.0
        w=0.0
        length = len(data)
        for i in range(0, length):
            val = float(wer_sentence(data[i][0], data[i][1]))
            total += val
            total_norm += val / mean_individual_length
            # normalised by the *character* length of the reference string
            w+=val/len(data[i][1])
        total_wer.append(w/length)
        total_list.append(total/length)
        total_norm_list.append(total_norm/length)
    total_wer=np.array(total_wer)
    total_list=np.array(total_list)
    total_norm_list=np.array(total_norm_list)
    print('Validation WER_original:',np.mean(total_wer),'Validation WER: ', np.mean(total_list),'Validation WER_NORM:',np.mean(total_norm_list))
    # Append the epoch's metrics to the log file.
    with open(self.save_path, "a") as myfile:
        myfile.write(', Validation WER_original: ' + str(np.mean(total_wer)) + ', Validation WER: ' + str(np.mean(total_list)) + ', Validation WER_NORM: ' + str(np.mean(total_norm_list)) + '\n')
def on_epoch_end(self, epoch, logs={}):
    # Paired-speaker validation callback: each validation folder holds
    # two '*_lips.mp4' videos and two transcripts (two speakers); both
    # are evaluated. Prints three WER variants and appends them to
    # ``self.save_path``.
    # NOTE(review): mutable default `logs={}` is a Python anti-pattern,
    # though `logs` is never used here; left untouched.
    num = len(self.val_folders)
    div_num = 12  # folders are processed in chunks of 12
    num_100s = int(num/div_num)  # full chunks only; remainder folders are skipped
    total_list=[]       # per-chunk mean raw WER
    total_norm_list=[]  # per-chunk WER normalised by mean word-count
    total_wer=[]        # per-chunk WER normalised by reference char length
    for n in range(num_100s):
        val_folders_100 = self.val_folders[n*div_num:(n+1)*div_num]
        lips=[]
        transcripts=[]
        for folder in val_folders_100:
            # Two speakers per folder: take both lip crops and both transcripts.
            lips_ = sorted(glob.glob(folder + '/*_lips.mp4'), key=numericalSort)
            transcripts_ = sorted(glob.glob(folder + '/*.txt'), key=numericalSort)
            lips.append(lips_[0])
            lips.append(lips_[1])
            transcripts.append(transcripts_[0])
            transcripts.append(transcripts_[1])
        # Shuffle videos and transcripts together so pairs stay aligned.
        zipped = list(zip(lips, transcripts))
        random.shuffle(zipped)
        lips, transcripts = zip(*zipped)
        X_lips = []
        for i in range(len(lips)):
            x_lips = get_video_frames(lips[i], fmt='grey')
            x_lips = crop_pad_frames(frames = x_lips, fps = 25, seconds = 5)
            X_lips.append(x_lips)
        align=[]
        Y_data = []
        label_length = []
        input_length = []
        source_str = []
        X_lips = np.asarray(X_lips)
        for i in range(len(transcripts)):align.append(Align(128, text_to_labels).from_file(transcripts[i]))
        for i in range(X_lips.shape[0]):
            Y_data.append(align[i].padded_label)
            label_length.append(align[i].label_length)
            input_length.append(X_lips.shape[1])  # CTC input length = frame count
            #source_str.append(align[i].sentence)
        Y_data = np.array(Y_data)
        val_predict=self.model_container.predict(X_lips)
        # Earlier batched-prediction variant, kept for reference:
        # for n in range(num_100s):
        #     val_folders_100 = self.val_folders[n*100:(n+1)*100]
        #     d0,d1,d2,d3=split(DataGenerator_test(val_folders_100, self.batch_size))
        #     val_predict = (self.model.predict(d0))
        decode_res=decoder.decode(val_predict, input_length)
        ground_truth=[]
        for i in range(Y_data.shape[0]):
            ground_truth.append(labels_to_text(Y_data[i]))
        data=[]  # (hypothesis, reference) pairs
        for j in range(0, X_lips.shape[0]):
            data.append((decode_res[j], ground_truth[j]))
        mean_individual_length = np.mean([len(pair[1].split()) for pair in data])
        total = 0.0
        total_norm = 0.0
        w=0.0
        length = len(data)
        for i in range(0, length):
            val = float(wer_sentence(data[i][0], data[i][1]))
            total += val
            total_norm += val / mean_individual_length
            # normalised by the *character* length of the reference string
            w+=val/len(data[i][1])
        total_wer.append(w/length)
        total_list.append(total/length)
        total_norm_list.append(total_norm/length)
    total_wer=np.array(total_wer)
    total_list=np.array(total_list)
    total_norm_list=np.array(total_norm_list)
    print('Validation WER_original:',np.mean(total_wer),'Validation WER: ', np.mean(total_list),'Validation WER_NORM:',np.mean(total_norm_list))
    # Append the epoch's metrics to the log file.
    with open(self.save_path, "a") as myfile:
        myfile.write(', Validation WER_original: ' + str(np.mean(total_wer)) + ', Validation WER: ' + str(np.mean(total_list)) + ', Validation WER_NORM: ' + str(np.mean(total_norm_list)) + '\n')
    # Disabled code from the WER-tuple helper and the earlier SDR
    # (audio separation) evaluation path; kept for reference.
    # return self.get_mean_tuples(data, mean_individual_length, wer_sentence)
    #
    # def get_mean_tuples(self, data, individual_length, func):
    #     total = 0.0
    #     total_norm = 0.0
    #     length = len(data)
    #     for i in range(0, length):
    #         val = float(func(data[i][0], data[i][1]))
    #         total += val
    #         total_norm += val / individual_length
    #     return (total/length, total_norm/length)
    #
    # mixed_spect = val_predict[:,:,:,1]
    # mixed_phase = val_predict[:,:,:,2]
    # val_targ = val_predict[:,:,:,3]
    # batch = val_targ.shape[0]
    # val_targ = val_targ.reshape(batch, -1)
    # val_targ = val_targ[:, :80000]
    # masks = val_predict[:,:,:,0]
    # samples_pred = []
    # for i in range(masks.shape[0]):
    #     mask = masks[i]
    #     mixed_spect_ = mixed_spect[i]
    #     mixed_phase_ = mixed_phase[i]
    #     samples = retrieve_samples(spec_signal = mixed_spect_,phase_spect = mixed_phase_,mask = mask,sample_rate=16e3, n_fft=512, window_size=25, step_size=10)
    #     samples_pred.append(samples[256:])
    # val_targ1 = []
    # for i in range(batch):
    #     length_pred = len(samples_pred[i])
    #     val_targ_ = val_targ[i, :length_pred]
    #     val_targ1.append(val_targ_)
    # val_targ = val_targ1
    # samples_pred = np.asarray(samples_pred)
    # val_targ = np.asarray(val_targ)
    # _val_sdr1, _ = metric_eval(target_samples = val_targ, predicted_samples = samples_pred)
    # sdr_list.append(_val_sdr1)
    # sdr_list = np.asarray(sdr_list)
    # _val_sdr = np.mean(sdr_list)
    # self.val_sdr.append(_val_sdr)
    # print('Validation SDR: ', _val_sdr)
    #print('Weighted validation f1: ', _val_f1_weigh)  #, '_val_precision: ', _val_precision, '_val_recall', _val_recall
    return