def enumerate_videos(self, path):
    """Return the paths matched by ``path`` whose video data has the expected shape.

    Every path produced by ``glob.glob(path)`` is loaded with ``Video``:
    regular files through ``from_video``, anything else through
    ``from_frames``. Paths that fail to load, or whose ``video.data.shape``
    differs from the shape configured on ``self``, are reported on stdout
    and left out of the result.

    Args:
        path: glob pattern passed to ``glob.glob``.

    Returns:
        List of the matching paths (strings) that loaded with the expected
        shape.

    Raises:
        AttributeError: re-raised unchanged when loading a video raises it.
    """
    video_list = []
    for video_path in glob.glob(path):
        try:
            loader = Video(self.vtype, self.face_predictor_path)
            if os.path.isfile(video_path):
                video = loader.from_video(video_path)
            else:
                video = loader.from_frames(video_path)
        except AttributeError:
            # Deliberately not swallowed; bare ``raise`` keeps the traceback.
            raise
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still stop the enumeration.
            print("Error loading video: " + video_path)
            continue

        # The expected axis order depends on the backend image data format.
        if K.image_data_format() == 'channels_first':
            expected_shape = (self.img_c, self.frames_n,
                              self.img_w, self.img_h)
        else:
            expected_shape = (self.frames_n, self.img_w,
                              self.img_h, self.img_c)

        if video.data.shape != expected_shape:
            print("Video " + video_path + " has incorrect shape " +
                  str(video.data.shape) + ", must be " + str(expected_shape))
            continue

        video_list.append(video_path)
    return video_list
def process_video(weight_path, video_path):
    """Load a video, split it at the detected cut points and predict each part.

    The cut points come from ``video.split_commands()``. The segments
    processed are: start of the video to the first cut, each cut to the next
    one, and the last cut to the end (``-1`` with ``last=True``).

    Args:
        weight_path: forwarded to ``predict_videos``.
        video_path: a video file, or otherwise a location handed to
            ``Video.from_frames``.

    Returns:
        Tuple ``(ans_v, ans_r)``: two lists holding, per segment, the two
        values returned by ``predict_videos``. Both are empty when
        ``split_commands()`` returns ``[]``.
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)
    print("Data loaded.\n")

    cuts = video.split_commands()
    show_square(video.sq[20:], video.avg_sq)

    ans_v = []
    ans_r = []

    def run_segment(*bounds, **options):
        # Reload the requested span into ``video`` and record its prediction.
        video.from_video_test(video_path, *bounds, **options)
        v, r = predict_videos(video, weight_path)
        ans_v.append(v)
        ans_r.append(r)

    if cuts != []:
        final = len(cuts) - 1
        for idx in range(len(cuts)):
            if idx == 0:
                # Leading segment: from the start up to the first cut.
                run_segment(0, cuts[idx])
            if idx == final:
                # Trailing segment: from the last cut to the end.
                run_segment(cuts[idx], -1, last=True)
                break
            run_segment(cuts[idx], cuts[idx + 1])

    return ans_v, ans_r
def predict(weight_path, video_path, absolute_max_string_len=32, output_size=28):
    """Load a video, build a LipNet model from saved weights and decode it.

    Args:
        weight_path: weights file passed to ``lipnet.model.load_weights``.
        video_path: a video file, or otherwise a location handed to
            ``Video.from_frames``.
        absolute_max_string_len: forwarded to ``LipNet``.
        output_size: forwarded to ``LipNet``.

    Returns:
        Tuple ``(video, result)`` where ``result`` is the first entry
        returned by ``decoder.decode``.
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)
    print("Data loaded.\n")

    # Axis order of ``video.data`` follows the backend image data format.
    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video.data.shape
    else:
        frames_n, img_w, img_h, img_c = video.data.shape

    lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h, frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len,
                    output_size=output_size)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    # The 'ctc' loss just passes y_pred through.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                         optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY, beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    X_data = np.array([video.data]).astype(np.float32) / 255
    # Use the frame count, not len(video.data): with 'channels_first' the
    # leading axis is the channel axis, so len() would report img_c instead.
    input_length = np.array([frames_n])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    return (video, result)
def load(video_path):
    """Load ``video_path`` into a face-type ``Video`` and return it.

    A regular file is read with ``from_video``; anything else is handed to
    ``from_frames``. Progress messages are printed before and after loading.
    """
    print("\n[{}]\nLoading data from disk...".format(video_path))
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    read_source = (video.from_video if os.path.isfile(video_path)
                   else video.from_frames)
    read_source(video_path)
    print("Data loaded.\n")
    return video
def predict(video_path):
    """Load a video and print summary statistics of its ``sq`` / ``avg_sq`` series.

    Prints the mean of ``video.avg_sq``, a dispersion figure computed as
    ``sqrt((sum(x**2) - sum(x)**2 / n) / n)`` over ``video.sq``, and the mean
    of ``video.sq``, then calls ``show_square``. Nothing is returned.

    Args:
        video_path: a video file, or otherwise a location handed to
            ``Video.from_frames``.
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    read_source = (video.from_video if os.path.isfile(video_path)
                   else video.from_frames)
    read_source(video_path)
    print("Data loaded.\n")

    # Return value is not used here; the call itself is kept.
    # NOTE(review): presumably this is what fills video.sq / video.avg_sq - confirm.
    video.split_commands()

    # Mean of the sliding series.
    window_total = 0
    for window_value in video.avg_sq:
        window_total += window_value
    d = window_total / len(video.avg_sq)
    print('Avarage dispertion(slide) = ', d)

    # Dispersion over the whole series: one pass collects sum and sum of squares.
    total = 0
    total_of_squares = 0
    for value in video.sq:
        total += value
        total_of_squares += value ** 2
    count = len(video.sq)
    squared_total_over_n = (total * total) / count
    disp = np.sqrt((total_of_squares - squared_total_over_n) / count)
    print('disp = ', disp)

    # Mean of the whole series (same running sum as above).
    avg = total / count
    print('Avarage square = ', avg)

    show_square(video.sq[20:], video.avg_sq)
def predict(weight_path, video_path, absolute_max_string_len=32, output_size=28):
    """Predict the text for a video with a cached model, then post-process it.

    The model wrapper is built once and stored on ``MODEL.model``; later
    calls reuse it. The decoded text is passed through
    ``cognitive().text_to_speech`` and then ``speech_to_text``, and that
    final string is returned.

    Args:
        weight_path: forwarded to ``Prebuilt_model`` on the first call.
        video_path: a video file, or otherwise a location handed to
            ``Video.from_frames``.
        absolute_max_string_len: forwarded to ``LipNet`` / ``Prebuilt_model``.
        output_size: forwarded to ``LipNet`` / ``Prebuilt_model``.

    Returns:
        Tuple ``(video, cog_result)``.
    """
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)

    # Axis order of ``video.data`` follows the backend image data format.
    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video.data.shape
    else:
        frames_n, img_w, img_h, img_c = video.data.shape

    if not MODEL.model:
        # Build the network only on a cache miss; it was previously
        # constructed on every call and discarded when the cache was warm.
        lipnet = LipNet(img_c=img_c, img_w=img_w, img_h=img_h,
                        frames_n=frames_n,
                        absolute_max_string_len=absolute_max_string_len,
                        output_size=output_size)
        MODEL.model = Prebuilt_model(weight_path, video_path, lipnet,
                                     absolute_max_string_len, output_size)

    X_data = np.array([video.data]).astype(np.float32) / 255
    # Use the frame count, not len(video.data): with 'channels_first' the
    # leading axis is the channel axis, so len() would report img_c instead.
    input_length = np.array([frames_n])

    y_pred = MODEL.model.lipnet.predict(X_data)
    results = MODEL.model.decoder.decode(y_pred, input_length)
    print("Before cognitive services: " + results[0])

    cog = cognitive()
    cog_result = cog.speech_to_text(cog.text_to_speech(results[0]))
    print("after cognitive services: " + cog_result)

    return (video, cog_result)
def get_batch(self, index, size, train):
    """Assemble one batch of model inputs and dummy outputs.

    Args:
        index: forwarded to ``get_list_safe`` together with ``size``.
        size: forwarded to ``get_list_safe``; also the length of the dummy
            ``'ctc'`` output vector.
        train: when truthy the batch is drawn from ``self.train_list``,
            otherwise from ``self.val_list``.

    Returns:
        Tuple ``(inputs, outputs)`` of dicts. ``inputs`` holds the keys
        ``'the_input'``, ``'the_labels'``, ``'input_length'``,
        ``'label_length'`` and ``'source_str'``; ``outputs`` holds ``'ctc'``.
    """
    video_list = self.train_list if train else self.val_list
    batch_paths = get_list_safe(video_list, index, size)

    frames = []
    labels = []
    label_lengths = []
    input_lengths = []
    sentences = []

    for sample_path in batch_paths:
        video = Video().from_frames(sample_path)
        # The alignment is looked up by the last '/'-separated path component.
        align = self.get_align(sample_path.split('/')[-1])
        if self.curriculum is not None:
            # The third value returned by apply() is not needed here.
            video, align, _ = self.curriculum.apply(video, align)

        frames.append(video.data)
        labels.append(align.padded_label)
        label_lengths.append(align.label_length)  # CHANGED [A] -> A, CHECK!
        # Just use the video padded length to avoid CTC "No path found"
        # error (v_len < a_len).
        input_lengths.append(video.length)
        sentences.append(align.sentence)  # CHANGED [A] -> A, CHECK!

    inputs = {
        # Normalize image data to [0,1].
        # TODO: mean normalization over training data
        'the_input': np.array(frames).astype(np.float32) / 255,
        'the_labels': np.array(labels),
        'input_length': np.array(input_lengths),
        'label_length': np.array(label_lengths),
        'source_str': np.array(sentences),  # used for visualization only
    }
    # dummy data for dummy loss function
    outputs = {'ctc': np.zeros([size])}
    return (inputs, outputs)
# Fifth command-line token (sys.argv[4]) is the face predictor file.
FACE_PREDICTOR_PATH = sys.argv[4]


def mkdir_p(path):
    """Create ``path`` with any missing parents; an existing directory is fine.

    Any other ``OSError`` (including ``path`` existing but not being a
    directory) is re-raised.
    """
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5
        already_a_directory = (exc.errno == errno.EEXIST
                               and os.path.isdir(path))
        if not already_a_directory:
            raise


def find_files(directory, pattern):
    """Yield the path of every file under ``directory`` whose name matches ``pattern``.

    ``pattern`` is an ``fnmatch``-style pattern applied to the base name only.
    """
    for root, dirs, files in os.walk(directory):
        for basename in fnmatch.filter(files, pattern):
            yield os.path.join(root, basename)


# For every matching source video, save each mouth frame as a numbered PNG
# in a directory derived from the source path (extension removed).
for filepath in find_files(SOURCE_PATH, SOURCE_EXTS):
    print("Processing: {}".format(filepath))
    video = Video(vtype='face',
                  face_predictor_path=FACE_PREDICTOR_PATH).from_video(filepath)

    filepath_wo_ext = os.path.splitext(filepath)[0]
    target_dir = os.path.join(TARGET_PATH, filepath_wo_ext)
    mkdir_p(target_dir)

    for i, frame in enumerate(video.mouth):
        frame_name = "mouth_{0:03d}.png".format(i)
        io.imsave(os.path.join(target_dir, frame_name), frame)
print np.array_equiv(_video.mouth, video.mouth), print np.array_equiv(_video.data, video.data), print np.array_equiv(_video.face, video.face) print "Align: " print labels_to_text(_align.padded_label.astype(np.int)) print _align.padded_label print _align.label_length print np.array_equiv(_align.sentence, align.sentence), print np.array_equiv(_align.label, align.label), print np.array_equiv(_align.padded_label, align.padded_label) curriculum = Curriculum(rules) video = Video(vtype='face', face_predictor_path= 'evaluation/models/shape_predictor_68_face_landmarks.dat') video.from_video('evaluation/samples/id2_vcd_swwp2s.mpg') align = Align( absolute_max_string_len=32, label_func=text_to_labels).from_file('evaluation/samples/swwp2s.align') print "=== TRAINING ===" for i in range(6): curriculum.update(i, train=True) print curriculum _video, _align, _ = curriculum.apply(video, align) show_results(_video, _align, video, align) print "=== VALIDATION/TEST ==="