Example #1
0
 def enumerate_align_hash(self, video_list):
     """Build a mapping from video id to its parsed Align.

     The id is the video file's basename without extension; the align file
     is expected at ``<self.align_path>/<id>.align``.
     """
     video_ids = (os.path.splitext(p)[0].split('/')[-1] for p in video_list)
     return {
         vid: Align(self.absolute_max_string_len,
                    text_to_labels).from_file(
                        os.path.join(self.align_path, vid) + ".align")
         for vid in video_ids
     }
Example #2
0
 def prepare_align(self, video_list):
     """Map each video id (basename, no extension) to its parsed Align.

     NOTE(review): sibling helpers pass a label function as Align's second
     constructor argument; here Align receives only the max string length —
     confirm Align provides a suitable default.
     """
     result = {}
     for path in video_list:
         stem = os.path.splitext(path)[0]
         vid = stem.split('/')[-1]
         align_file = os.path.join(self.align_path, vid) + ".align"
         result[vid] = Align(self.absolute_max_string_len).from_file(align_file)
     return result
Example #3
0
 def split_words(video, align):
     """Cut *video* into per-word (Video, Align) pairs.

     Each entry of ``align.align`` is a (start, end, label) triple; frames
     are sliced with [start:end] and the new align is rebased to start at 0.
     """
     pairs = []
     for sub in align.align:
         start, end, label = sub[0], sub[1], sub[2]
         # New video holding just this word's frames.
         clip = Video(video.vtype, video.face_predictor_path)
         clip.face = video.face[start:end]
         clip.mouth = video.mouth[start:end]
         clip.set_data(clip.mouth)
         # New align rebased so the word starts at frame 0.
         word_align = Align(align.absolute_max_string_len,
                            align.label_func).from_array([(0, end - start, label)])
         pairs.append((clip, word_align))
     return pairs
Example #4
0
 def merge(video_aligns):
     """Concatenate a list of (Video, Align) pairs into a single pair.

     Frames are stacked in order; every align segment is shifted by the
     running frame offset so the merged align stays contiguous.
     """
     first_video, first_align = video_aligns[0]
     merged = Video(first_video.vtype, first_video.face_predictor_path)
     fshape = first_video.face.shape
     mshape = first_video.mouth.shape
     # Zero-length seed arrays with the per-frame shape of the first clip.
     merged.face = np.zeros((0, fshape[1], fshape[2], fshape[3]), dtype=np.uint8)
     merged.mouth = np.zeros((0, mshape[1], mshape[2], mshape[3]), dtype=np.uint8)
     subs = []
     offset = 0
     for clip, clip_align in video_aligns:
         merged.face = np.concatenate((merged.face, clip.face), 0)
         merged.mouth = np.concatenate((merged.mouth, clip.mouth), 0)
         subs.extend((s[0] + offset, s[1] + offset, s[2])
                     for s in clip_align.align)
         # Next clip starts where the last appended segment ends.
         offset = subs[-1][1]
     merged.set_data(merged.mouth)
     out_align = Align(first_align.absolute_max_string_len,
                       first_align.label_func).from_array(subs)
     return (merged, out_align)
Example #5
0
#folders_list_train = random.sample(folders_list_train, 180)
#folders_list_val = random.sample(folders_list_val, 100)

#print('Training data:', len(folders_list_train)*2)
#print('Validation data:', len(folders_list_val)*2)

# Load the lip-crop video given on the command line and shape it into a
# single-sample batch: 125 frames (25 fps * 5 s) of 50x100 RGB crops.
video_file = args.video_file
# Transcript shares the video's stem: the last 9 chars of the video filename
# are replaced by '.txt' (presumably stripping '_lips.mp4' — TODO confirm).
transcript_file = video_file[:-9] + '.txt'
lips = get_video_frames(video_file, fmt='rgb')
lips = crop_pad_frames(frames=lips, fps=25, seconds=5)
lips = lips.reshape(1, 125, 50, 100, 3)
print('lips shape:', lips.shape)

# Read text

# Parse the transcript into a label sequence padded to 128 entries.
trans = (Align(128, text_to_labels).from_file(transcript_file))
y_data = (trans.padded_label)
y_data = y_data.reshape(1, 128)
print('y_data shape:', y_data.shape)
label_length = (trans.label_length)
# CTC input length equals the clip's frame count.
input_length = 125

#lip = lipreading(mode='backendGRU', inputDim=256, hiddenDim=512, nClasses=29, frameLen=125, AbsoluteMaxStringLen=128, every_frame=True)
#model = lip
# output_size=29: presumably 26 letters plus space/apostrophe/CTC-blank —
# verify against text_to_labels.
model = LipNet(input_shape=(125, 50, 100, 3),
               pretrained='pretrain',
               output_size=29,
               absolute_max_string_len=128)
#model.load_weights('/data/models/combResnetLSTM_CTCloss_236k-train_1to3ratio_valWER_epochs20_lr1e-4_0.1decay9epochs/weights-07-117.3701.hdf5')

from io import StringIO
Example #6
0
def _augment_lips(x_lips):
    """Randomly augment a clip of lip frames; 3-in-4 clips pass unchanged.

    Reproduces the original nested sampling tree over the module-level
    imgaug pipelines ``seq1_1`` .. ``seq6``.
    """
    # First draw: only a 1-in-4 chance of augmenting at all.
    if random.choice([0, 1, 2, 3]) != 0:
        return x_lips
    branch = random.choice([1, 2, 3])
    if branch == 1:
        aug = seq1_1 if random.choice([1, 2]) == 1 else seq1_2
    elif branch == 2:
        aug = seq2
    else:
        aug = (seq3, seq4, seq5, seq6)[random.choice([0, 1, 2, 3])]
    return aug.augment_images(x_lips)


def DataGenerator_sampling_softmask(folderlist_all, folders_per_epoch,
                                    batch_size):
    """Infinite Keras generator yielding CTC training batches of lip videos.

    Each epoch samples ``folders_per_epoch`` folders without replacement
    from ``folderlist_all``; the candidate pool is rebuilt every third
    epoch, so folders are not repeated until the pool is refreshed.  Each
    folder contributes its two '*_lips.mp4' clips and two '*.txt'
    transcripts.  Yields ``[X_lips, Y_data, input_length, label_length]``
    plus a dummy zero target for the Keras CTC loss.
    """
    epoch_number = 0
    L = folders_per_epoch
    indices = []

    # Keras requires data generators to loop forever.
    while True:
        batch_start = 0
        batch_end = batch_size
        while batch_start < L:
            if batch_start == 0:
                epoch_number += 1
                # Rebuild the candidate pool every third epoch; in between,
                # draws come from whatever indices remain.
                if epoch_number % 3 == 1:
                    indices = list(range(len(folderlist_all)))
                pick_indices = random.sample(indices, L)
                picked = set(pick_indices)
                # Drop the picked indices (order preserved for sampling).
                indices = [i for i in indices if i not in picked]
                folderlist = [folderlist_all[i] for i in pick_indices]

            limit = min(batch_end, L)

            lips = []
            transcripts = []
            for folder in folderlist[batch_start:limit]:
                lips_ = sorted(glob.glob(folder + '/*_lips.mp4'),
                               key=numericalSort)
                transcripts_ = sorted(glob.glob(folder + '/*.txt'),
                                      key=numericalSort)
                # Two speakers per folder; explicit indexing fails fast on
                # folders with missing files.
                lips.append(lips_[0])
                lips.append(lips_[1])
                transcripts.append(transcripts_[0])
                transcripts.append(transcripts_[1])

            # Shuffle clips and transcripts together so pairs stay aligned.
            paired = list(zip(lips, transcripts))
            random.shuffle(paired)
            lips, transcripts = zip(*paired)

            X_lips = np.asarray([
                crop_pad_frames(frames=_augment_lips(
                    get_video_frames(path, fmt='grey')),
                    fps=25, seconds=5)
                for path in lips])

            align = [Align(128, text_to_labels).from_file(t)
                     for t in transcripts]
            Y_data = np.array([a.padded_label for a in align])
            label_length = [a.label_length for a in align]
            # CTC input length is the padded frame count, same for every
            # sample in the batch.
            input_length = [X_lips.shape[1]] * X_lips.shape[0]

            # Dummy zero target: the CTC loss is computed inside the model.
            yield [
                X_lips, Y_data,
                np.array(input_length),
                np.array(label_length)
            ], np.zeros([X_lips.shape[0]])

            batch_start += batch_size
            batch_end += batch_size
Example #7
0
    def on_epoch_end(self, epoch, logs=None):
        """Compute and print validation WER after each epoch.

        Runs the container model over ``self.val_folders`` in chunks of 12
        folders, decodes the CTC output, and reports three WER variants:
        char-length normalised per sentence ('original'), raw edit
        distance, and mean-word-count normalised.
        """
        num = len(self.val_folders)
        div_num = 12
        num_chunks = int(num / div_num)

        total_list = []
        total_norm_list = []
        total_wer = []

        for n in range(num_chunks):
            chunk = self.val_folders[n * div_num:(n + 1) * div_num]
            lips = []
            transcripts = []
            samples = []
            samples_mix = []
            for folder in chunk:
                lips_ = sorted(glob.glob(folder + '/*_lips.mp4'), key=numericalSort)
                samples_ = sorted(glob.glob(folder + '/*_samples.npy'), key=numericalSort)
                mix_path = '/data/mixed_audio_files/' + folder.split('/')[-1] + '.wav'
                transcripts_ = sorted(glob.glob(folder + '/*.txt'), key=numericalSort)

                lips.extend(lips_)
                samples.extend(samples_)
                # Same mixed-audio file is shared by every clip in the folder.
                samples_mix.extend([mix_path] * len(lips_))
                # Index-based so a missing transcript fails fast.
                transcripts.extend(transcripts_[i] for i in range(len(lips_)))

            # Shuffle all four lists together to keep entries aligned.
            zipped = list(zip(lips, samples, samples_mix, transcripts))
            random.shuffle(zipped)
            lips, samples, samples_mix, transcripts = zip(*zipped)

            # NOTE(review): the target waveforms ('samples') were previously
            # loaded and normalised here but never fed to the model; that
            # dead work is removed.  'samples' stays in the shuffle so the
            # pairing/RNG structure is unchanged.
            X_samples_mix = np.asarray(
                [np.pad(wavfile.read(fname)[1], (0, 32000), mode='constant')[:32000]
                 for fname in samples_mix])

            X_lips = np.asarray([
                crop_pad_frames(frames=get_video_frames(path, fmt='grey'),
                                fps=25, seconds=2)
                for path in lips])

            align = [Align(128, text_to_labels).from_file(t) for t in transcripts]
            Y_data = np.array([a.padded_label for a in align])
            # CTC input length is the padded frame count for every sample.
            input_length = [X_lips.shape[1]] * X_lips.shape[0]

            X_samples_mix = X_samples_mix.reshape(
                X_samples_mix.shape[0], 32000, 1).astype('float32')
            # Empirical amplitude normalisation constant from training.
            X_samples_mix = X_samples_mix / 1350.0

            val_predict = self.model_container.predict([X_lips, X_samples_mix])
            # The model's second output carries the CTC posteriors.
            val_predict = val_predict[1]

            decode_res = decoder.decode(val_predict, input_length)

            ground_truth = [labels_to_text(Y_data[i]) for i in range(Y_data.shape[0])]
            data = [(decode_res[j], ground_truth[j]) for j in range(X_lips.shape[0])]

            mean_individual_length = np.mean([len(pair[1].split()) for pair in data])
            total = 0.0
            total_norm = 0.0
            w = 0.0
            length = len(data)
            for pred, truth in data:
                val = float(wer_sentence(pred, truth))
                total += val
                total_norm += val / mean_individual_length
                # Normalised by ground-truth character count.
                w += val / len(truth)

            total_wer.append(w / length)
            total_list.append(total / length)
            total_norm_list.append(total_norm / length)

        print('Validation WER_original:', np.mean(total_wer), 'Validation WER: ', np.mean(total_list), 'Validation WER_NORM:', np.mean(total_norm_list))

        return
Example #8
0
    def on_epoch_end(self, epoch, logs=None):
        """Evaluate validation WER each epoch and append it to the log file.

        Here ``self.val_folders`` holds lip-video file paths directly; each
        path pairs with a transcript obtained by replacing its last 9
        characters with '.txt' (presumably stripping '_lips.mp4' — confirm
        against the data layout).  Batches of ``self.batch_size`` are
        decoded and three WER variants are printed and written to
        ``self.save_path``.
        """
        num = len(self.val_folders)
        div_num = self.batch_size
        num_chunks = int(num / div_num)

        total_list = []
        total_norm_list = []
        total_wer = []

        for n in range(num_chunks):
            batch = self.val_folders[n * div_num:(n + 1) * div_num]
            # Pair each clip with its transcript, then shuffle the pairs.
            pairs = [(path, path[:-9] + '.txt') for path in batch]
            random.shuffle(pairs)
            lips, transcripts = zip(*pairs)

            X_lips = np.asarray([
                crop_pad_frames(frames=get_video_frames(p, fmt='grey'),
                                fps=25, seconds=5)
                for p in lips])

            align = [Align(128, text_to_labels).from_file(t) for t in transcripts]
            Y_data = np.array([a.padded_label for a in align])
            # CTC input length is the padded frame count for every sample.
            input_length = [X_lips.shape[1]] * X_lips.shape[0]

            val_predict = self.model_container.predict(X_lips)
            decode_res = decoder.decode(val_predict, input_length)

            ground_truth = [labels_to_text(Y_data[i]) for i in range(Y_data.shape[0])]
            data = [(decode_res[j], ground_truth[j]) for j in range(X_lips.shape[0])]

            mean_individual_length = np.mean([len(pair[1].split()) for pair in data])
            total = 0.0
            total_norm = 0.0
            w = 0.0
            length = len(data)
            for pred, truth in data:
                val = float(wer_sentence(pred, truth))
                total += val
                total_norm += val / mean_individual_length
                # Normalised by ground-truth character count.
                w += val / len(truth)

            total_wer.append(w / length)
            total_list.append(total / length)
            total_norm_list.append(total_norm / length)

        print('Validation WER_original:', np.mean(total_wer), 'Validation WER: ', np.mean(total_list), 'Validation WER_NORM:', np.mean(total_norm_list))

        with open(self.save_path, "a") as myfile:
            myfile.write(', Validation WER_original: ' + str(np.mean(total_wer)) + ', Validation WER: ' + str(np.mean(total_list)) + ', Validation WER_NORM: ' + str(np.mean(total_norm_list)) + '\n')
Example #9
0
    def on_epoch_end(self, epoch, logs=None):
        """Compute validation WER each epoch; print it and append to the log.

        Processes ``self.val_folders`` in chunks of 12 folders; each folder
        contributes two '*_lips.mp4' clips and two '*.txt' transcripts.
        Reports three WER variants: char-length normalised per sentence
        ('original'), raw edit distance, and mean-word-count normalised.
        """
        num = len(self.val_folders)
        div_num = 12
        num_chunks = int(num / div_num)

        total_list = []
        total_norm_list = []
        total_wer = []

        for n in range(num_chunks):
            chunk = self.val_folders[n * div_num:(n + 1) * div_num]
            lips = []
            transcripts = []
            for folder in chunk:
                lips_ = sorted(glob.glob(folder + '/*_lips.mp4'), key=numericalSort)
                transcripts_ = sorted(glob.glob(folder + '/*.txt'), key=numericalSort)
                # Two speakers per folder; explicit indexing fails fast on
                # folders with missing files.
                lips.append(lips_[0])
                lips.append(lips_[1])
                transcripts.append(transcripts_[0])
                transcripts.append(transcripts_[1])

            # Shuffle clips and transcripts together so pairs stay aligned.
            paired = list(zip(lips, transcripts))
            random.shuffle(paired)
            lips, transcripts = zip(*paired)

            X_lips = np.asarray([
                crop_pad_frames(frames=get_video_frames(p, fmt='grey'),
                                fps=25, seconds=5)
                for p in lips])

            align = [Align(128, text_to_labels).from_file(t) for t in transcripts]
            Y_data = np.array([a.padded_label for a in align])
            # CTC input length is the padded frame count for every sample.
            input_length = [X_lips.shape[1]] * X_lips.shape[0]

            val_predict = self.model_container.predict(X_lips)
            decode_res = decoder.decode(val_predict, input_length)

            ground_truth = [labels_to_text(Y_data[i]) for i in range(Y_data.shape[0])]
            data = [(decode_res[j], ground_truth[j]) for j in range(X_lips.shape[0])]

            mean_individual_length = np.mean([len(pair[1].split()) for pair in data])
            total = 0.0
            total_norm = 0.0
            w = 0.0
            length = len(data)
            for pred, truth in data:
                val = float(wer_sentence(pred, truth))
                total += val
                total_norm += val / mean_individual_length
                # Normalised by ground-truth character count.
                w += val / len(truth)

            total_wer.append(w / length)
            total_list.append(total / length)
            total_norm_list.append(total_norm / length)

        print('Validation WER_original:', np.mean(total_wer), 'Validation WER: ', np.mean(total_list), 'Validation WER_NORM:', np.mean(total_norm_list))

        with open(self.save_path, "a") as myfile:
            myfile.write(', Validation WER_original: ' + str(np.mean(total_wer)) + ', Validation WER: ' + str(np.mean(total_list)) + ', Validation WER_NORM: ' + str(np.mean(total_norm_list)) + '\n')

        return