Esempio n. 1
0
 def enumerate_videos(self, path):
     """Return the paths matching ``path`` whose video data has the expected shape.

     Every path produced by ``glob.glob(path)`` is loaded through ``Video``:
     regular files with ``from_video``, anything else with ``from_frames``.
     Paths that fail to load, or whose ``video.data.shape`` differs from the
     shape configured on ``self``, are reported on stdout and skipped.

     Args:
         path: Glob pattern selecting the candidate videos.

     Returns:
         List of the accepted paths, in the order ``glob.glob`` yielded them.

     Raises:
         AttributeError: Propagated unchanged when loading raises it.
     """
     video_list = []
     for video_path in glob.glob(path):
         try:
             is_file = os.path.isfile(video_path)
             loader = Video(self.vtype, self.face_predictor_path)
             if is_file:
                 video = loader.from_video(video_path)
             else:
                 video = loader.from_frames(video_path)
         except AttributeError:
             # Bare ``raise`` keeps the original traceback intact.
             raise
         except Exception:
             # Deliberately best-effort: one unreadable video must not abort
             # the scan.  ``Exception`` (not a bare ``except``) so that
             # KeyboardInterrupt / SystemExit still stop the program.
             print("Error loading video: " + video_path)
             continue

         # The expected axis order depends on the backend's image data format.
         if K.image_data_format() == 'channels_first':
             expected_shape = (self.img_c, self.frames_n, self.img_w,
                               self.img_h)
         else:
             expected_shape = (self.frames_n, self.img_w, self.img_h,
                               self.img_c)

         if video.data.shape != expected_shape:
             print("Video " + video_path + " has incorrect shape " +
                   str(video.data.shape) + ", must be " +
                   str(expected_shape))
             continue
         video_list.append(video_path)
     return video_list
Esempio n. 2
0
def predict(weight_path,
            video_path,
            absolute_max_string_len=32,
            output_size=28):
    """Load a video, walk its command segments and decode ``video.data``.

    The video is loaded from ``video_path`` (a file via ``from_video``,
    anything else via ``from_frames``), ``split_commands()`` supplies the
    split points, and ``from_video_test`` is called once per split point.
    A LipNet model is then built from the shape of ``video.data``, the
    weights at ``weight_path`` are loaded and the prediction is decoded.

    Args:
        weight_path: Path of the weights file given to ``load_weights``.
        video_path: Video file, or another path handled by ``from_frames``.
        absolute_max_string_len: Forwarded to ``LipNet``.
        output_size: Forwarded to ``LipNet``.

    Returns:
        Tuple ``(video, result)``.  ``result`` is ``None`` when
        ``split_commands()`` returns no split points (the original raised
        ``UnboundLocalError`` at ``return`` in that case).
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)
    print("Data loaded.\n")

    a = video.split_commands()
    show_square(video.sq[20:], video.avg_sq)

    # Nothing to decode without split points; return a defined value.
    if not a:
        return (video, None)

    last = len(a) - 1
    for i, start in enumerate(a):
        # The original did ``a[i + 1] = len(a)`` for the last split point,
        # which writes past the end of the list and raises IndexError.  The
        # same sentinel value is used here without touching the list.
        # NOTE(review): ``len(a)`` is the number of split points; the intended
        # end of the final segment may be the frame count -- confirm.
        end = len(a) if i == last else a[i + 1]
        video.from_video_test(video_path, start, end)

    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video.data.shape
    else:
        frames_n, img_w, img_h, img_c = video.data.shape

    lipnet = LipNet(img_c=img_c,
                    img_w=img_w,
                    img_h=img_h,
                    frames_n=frames_n,
                    absolute_max_string_len=absolute_max_string_len,
                    output_size=output_size)

    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

    # The 'ctc' loss simply passes y_pred through.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                         optimizer=adam)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(greedy=PREDICT_GREEDY,
                      beam_width=PREDICT_BEAM_WIDTH,
                      postprocessors=[labels_to_text, spell.sentence])

    # Batch of one video, pixel values scaled by 1/255.
    X_data = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    return (video, result)
Esempio n. 3
0
def load(video_path):
    """Build a face-type ``Video`` from ``video_path`` and return it.

    A regular file is read with ``from_video``; any other path is handed to
    ``from_frames``.  Progress messages are printed before and after loading.
    """
    print("\n[{}]\nLoading data from disk...".format(video_path))
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    read_source = (video.from_video
                   if os.path.isfile(video_path) else video.from_frames)
    read_source(video_path)
    print("Data loaded.\n")
    return video
Esempio n. 4
0
    def get_batch(self, index, size, train):
        """Assemble one batch of model inputs and dummy outputs.

        Args:
            index: Start position passed to ``get_list_safe``.
            size: Batch size passed to ``get_list_safe``; also the length of
                the dummy ``'ctc'`` output vector.
            train: Truthy to draw from ``self.train_list``, otherwise from
                ``self.val_list``.

        Returns:
            Tuple ``(inputs, outputs)`` of dicts keyed the way the model
            expects (``'the_input'``, ``'the_labels'``, ``'input_length'``,
            ``'label_length'``, ``'source_str'`` and ``'ctc'``).
        """
        pool = self.train_list if train else self.val_list
        batch_paths = get_list_safe(pool, index, size)

        frames, labels, sentences = [], [], []
        label_lengths, input_lengths = [], []
        for sample_path in batch_paths:
            video = Video().from_frames(sample_path)
            # The alignment is looked up by the last path component.
            align = self.get_align(sample_path.rsplit('/', 1)[-1])
            # Not consumed below; only the disabled "unpadded length - 2"
            # variant of input_length needed it.
            unpadded_length = video.length
            if self.curriculum is not None:
                video, align, unpadded_length = self.curriculum.apply(
                    video, align)
            frames.append(video.data)
            labels.append(align.padded_label)
            label_lengths.append(align.label_length)
            # Use the padded video length to avoid the CTC "No path found"
            # error (video length < alignment length).
            input_lengths.append(video.length)
            sentences.append(align.sentence)

        source_str = np.array(sentences)
        label_length = np.array(label_lengths)
        input_length = np.array(input_lengths)
        Y_data = np.array(labels)
        # Normalize image data to [0, 1].
        # TODO: mean normalization over training data.
        X_data = np.array(frames).astype(np.float32) / 255

        inputs = dict([
            ('the_input', X_data),
            ('the_labels', Y_data),
            ('input_length', input_length),
            ('label_length', label_length),
            ('source_str', source_str),  # used for visualization only
        ])
        # Dummy data for the dummy loss function.
        outputs = {'ctc': np.zeros([size])}

        return (inputs, outputs)
Esempio n. 5
0
def predict(video_path):
    """Load a video, print statistics of its ``sq`` / ``avg_sq`` series and plot them.

    Prints the mean of ``video.avg_sq``, the population standard deviation
    of ``video.sq`` and the mean of ``video.sq``, then calls ``show_square``.
    Each message is built with ``str.format`` so the output is the same plain
    text under Python 2 and 3 (a two-argument ``print(...)`` prints a tuple
    repr under Python 2), and sums are converted to ``float`` so integer
    inputs are not truncated by ``/`` under Python 2.

    Args:
        video_path: Video file, or another path handled by ``from_frames``.

    Raises:
        ZeroDivisionError: If ``video.avg_sq`` or ``video.sq`` is empty.
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)
    print("Data loaded.\n")

    # The return value is not needed here.
    # NOTE(review): presumably this call populates video.sq / video.avg_sq,
    # so it is kept -- confirm against Video.split_commands.
    video.split_commands()

    # Mean of the avg_sq series.
    d = float(sum(video.avg_sq)) / len(video.avg_sq)
    print('Avarage dispertion(slide) = {}'.format(d))

    # Population standard deviation of sq: sqrt((sum(x^2) - sum(x)^2 / n) / n)
    n = len(video.sq)
    total = float(sum(video.sq))
    total_of_squares = float(sum(value ** 2 for value in video.sq))
    variance = (total_of_squares - (total * total) / n) / n
    # Rounding can push a near-zero variance slightly negative; clamp it so
    # the square root does not become NaN.
    disp = np.sqrt(max(variance, 0.0))
    print('disp = {}'.format(disp))

    # Mean of the sq series.
    avg = total / n
    print('Avarage square = {}'.format(avg))

    show_square(video.sq[20:], video.avg_sq)
Esempio n. 6
0
def predict(weight_path,
            video_path,
            absolute_max_string_len=32,
            output_size=43):
    """Decode a video with LipNet and compute a saliency heatmap for it.

    Args:
        weight_path: Path of the weights file given to ``load_weights``.
        video_path: Video file, or another path handled by ``from_frames``.
        absolute_max_string_len: Forwarded to ``LipNet``.
        output_size: Forwarded to ``LipNet``.

    Returns:
        Tuple ``(heatmap, result)``: the value returned by
        ``visualize_saliency`` and the first decoded prediction.

    Raises:
        IndexError: If the model has no layer named ``'dense1'``.
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    read_source = (video.from_video
                   if os.path.isfile(video_path) else video.from_frames)
    read_source(video_path)
    print("Data loaded.\n")

    # Axis order of video.data depends on the backend image data format.
    channels_first = K.image_data_format() == 'channels_first'
    dims = video.data.shape
    if channels_first:
        img_c, frames_n, img_w, img_h = dims
    else:
        frames_n, img_w, img_h, img_c = dims

    lipnet = LipNet(
        img_c=img_c,
        img_w=img_w,
        img_h=img_h,
        frames_n=frames_n,
        absolute_max_string_len=absolute_max_string_len,
        output_size=output_size,
    )

    optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    # The 'ctc' loss simply passes y_pred through.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                         optimizer=optimizer)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(
        greedy=PREDICT_GREEDY,
        beam_width=PREDICT_BEAM_WIDTH,
        postprocessors=[labels_to_text, spell.sentence],
    )

    # Batch of one video, pixel values scaled by 1/255.
    X_data = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])

    # Position of the layer the saliency map is computed for.
    layer_name = 'dense1'
    matches = []
    for position, layer in enumerate(lipnet.model.layers):
        if layer.name == layer_name:
            matches.append(position)
    layer_idx = matches[0]

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    heatmap = visualize_saliency(lipnet.model, layer_idx, range(28),
                                 video.data)

    return (heatmap, result)
Esempio n. 7
0
def predict(weight_path,
            video_path,
            absolute_max_string_len=32,
            output_size=28):
    """Decode a video with the cached model and post-process the text.

    The video is loaded from ``video_path``; the first decoded sentence is
    passed through ``cognitive().text_to_speech`` and then
    ``speech_to_text``.  The model wrapper is built once and stored in
    ``MODEL.model``.  The ``LipNet`` network is only constructed when that
    cache is empty, instead of on every call as before, where the new
    network was discarded whenever the cache was already filled.

    NOTE(review): the cached model is built from the first video's shape;
    later calls reuse it whatever their video's shape is -- confirm that
    all inputs share one shape.

    Args:
        weight_path: Forwarded to ``Prebuilt_model``.
        video_path: Video file, or another path handled by ``from_frames``.
        absolute_max_string_len: Forwarded to ``LipNet`` and
            ``Prebuilt_model``.
        output_size: Forwarded to ``LipNet`` and ``Prebuilt_model``.

    Returns:
        Tuple ``(video, cog_result)``.
    """
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    if os.path.isfile(video_path):
        video.from_video(video_path)
    else:
        video.from_frames(video_path)

    # Unpacked on every call so a video of the wrong rank still fails here.
    if K.image_data_format() == 'channels_first':
        img_c, frames_n, img_w, img_h = video.data.shape
    else:
        frames_n, img_w, img_h, img_c = video.data.shape

    if not MODEL.model:
        lipnet = LipNet(img_c=img_c,
                        img_w=img_w,
                        img_h=img_h,
                        frames_n=frames_n,
                        absolute_max_string_len=absolute_max_string_len,
                        output_size=output_size)
        MODEL.model = Prebuilt_model(weight_path, video_path, lipnet,
                                     absolute_max_string_len, output_size)

    # Batch of one video, pixel values scaled by 1/255.
    X_data = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])

    y_pred = MODEL.model.lipnet.predict(X_data)
    results = MODEL.model.decoder.decode(y_pred, input_length)
    print("Before cognitive services: " + results[0])
    cog = cognitive()
    cog_result = cog.speech_to_text(cog.text_to_speech(results[0]))
    print("after cognitive services: " + cog_result)

    return (video, cog_result)
Esempio n. 8
0
# Fourth command-line argument; passed to Video as ``face_predictor_path``.
FACE_PREDICTOR_PATH = sys.argv[4]

def mkdir_p(path):
    """Create ``path`` and any missing parents; an existing directory is fine.

    Raises:
        OSError: For any failure other than ``path`` already existing as a
            directory (including ``path`` existing as a non-directory).
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        already_a_directory = (exc.errno == errno.EEXIST
                               and os.path.isdir(path))
        if not already_a_directory:
            raise

def find_files(directory, pattern):
    """Yield the path of every file under ``directory`` whose name matches ``pattern``.

    The tree is walked with ``os.walk``; ``pattern`` is an ``fnmatch``-style
    pattern applied to the bare file name, not to the full path.
    """
    for root, _subdirs, names in os.walk(directory):
        for matched_name in fnmatch.filter(names, pattern):
            yield os.path.join(root, matched_name)

# For every matching source video, save each mouth frame as a PNG in a
# directory under TARGET_PATH named after the video path without extension.
for filepath in find_files(SOURCE_PATH, SOURCE_EXTS):
    print("Processing: {}".format(filepath))
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH).from_video(filepath)

    filepath_wo_ext = os.path.splitext(filepath)[0]
    # os.path.join drops every earlier component when a later one is
    # absolute, so an absolute source path used to bypass TARGET_PATH and the
    # frames landed in the source tree.  Strip the drive and leading
    # separators first; relative source paths are unaffected.
    relative_part = os.path.splitdrive(filepath_wo_ext)[1].lstrip(os.sep)
    target_dir = os.path.join(TARGET_PATH, relative_part)
    mkdir_p(target_dir)

    for i, frame in enumerate(video.mouth):
        io.imsave(os.path.join(target_dir, "mouth_{0:03d}.png".format(i)), frame)
Esempio n. 9
0
def process_video(weight_path, video_path):
    """Split a video at its command boundaries and predict every segment.

    With split points ``a`` from ``split_commands()``, the segments handed
    to ``from_video_test`` are ``(0, a[0])``, each consecutive pair
    ``(a[i], a[i + 1])`` and finally ``(a[-1], -1)`` with ``last=True``.
    ``predict_videos`` runs after each of them.

    Args:
        weight_path: Forwarded to ``predict_videos``.
        video_path: Video file, or another path handled by ``from_frames``.

    Returns:
        Tuple ``(ans_v, ans_r)``: the first and second values returned by
        ``predict_videos`` for each segment, in segment order.  Both lists
        are empty when there are no split points.
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    read_source = (video.from_video
                   if os.path.isfile(video_path) else video.from_frames)
    read_source(video_path)
    print("Data loaded.\n")

    cut_points = video.split_commands()
    show_square(video.sq[20:], video.avg_sq)

    segment_videos = []
    segment_results = []

    def run_segment(start, end, **extra):
        # Re-read the [start, end] slice, predict it and record both outputs.
        video.from_video_test(video_path, start, end, **extra)
        predicted_video, predicted_text = predict_videos(video, weight_path)
        segment_videos.append(predicted_video)
        segment_results.append(predicted_text)

    if cut_points != []:
        final_idx = len(cut_points) - 1
        for idx in range(len(cut_points)):
            if idx == 0:
                # Leading segment: from the start up to the first cut.
                run_segment(0, cut_points[idx])

            if idx == final_idx:
                # Trailing segment: from the last cut to the end.
                run_segment(cut_points[idx], -1, last=True)
                break

            run_segment(cut_points[idx], cut_points[idx + 1])
    return segment_videos, segment_results
Esempio n. 10
0
    print np.array_equiv(_video.mouth, video.mouth),
    print np.array_equiv(_video.data, video.data),
    print np.array_equiv(_video.face, video.face)
    print "Align: "
    print labels_to_text(_align.padded_label.astype(np.int))
    print _align.padded_label
    print _align.label_length
    print np.array_equiv(_align.sentence, align.sentence),
    print np.array_equiv(_align.label, align.label),
    print np.array_equiv(_align.padded_label, align.padded_label)


# Curriculum driven by the ``rules`` callable defined earlier in the file.
curriculum = Curriculum(rules)

# Reference video sample the curriculum is applied to.
video = Video(
    vtype='face',
    face_predictor_path='evaluation/models/shape_predictor_68_face_landmarks.dat',
)
video.from_video('evaluation/samples/id2_vcd_swwp2s.mpg')

# Matching alignment for the sample above.
align = Align(
    absolute_max_string_len=32,
    label_func=text_to_labels,
).from_file('evaluation/samples/swwp2s.align')

print("=== TRAINING ===")
# Step the curriculum through six updates in training mode and show how each
# one transforms the reference sample.
for i in range(6):
    curriculum.update(i, train=True)
    print(curriculum)
    _video, _align, _ = curriculum.apply(video, align)
    show_results(_video, _align, video, align)

print("=== VALIDATION/TEST ===")
Esempio n. 11
0
def predict(weight_path,
            video_path,
            absolute_max_string_len=32,
            output_size=11174):
    """Decode a video with LipNet and return it together with the text.

    Args:
        weight_path: Path of the weights file given to ``load_weights``.
        video_path: Video file, or another path handled by ``from_frames``.
        absolute_max_string_len: Forwarded to ``LipNet``.
        output_size: Forwarded to ``LipNet``.

    Returns:
        Tuple ``(video, result)`` where ``result`` is the first decoded
        prediction.
    """
    print("\nLoading data from disk...")
    video = Video(vtype='face', face_predictor_path=FACE_PREDICTOR_PATH)
    read_source = (video.from_video
                   if os.path.isfile(video_path) else video.from_frames)
    read_source(video_path)
    print("Data loaded.\n")

    # Axis order of video.data depends on the backend image data format.
    channels_first = K.image_data_format() == 'channels_first'
    dims = video.data.shape
    if channels_first:
        img_c, frames_n, img_w, img_h = dims
    else:
        frames_n, img_w, img_h, img_c = dims

    lipnet = LipNet(
        img_c=img_c,
        img_w=img_w,
        img_h=img_h,
        frames_n=frames_n,
        absolute_max_string_len=absolute_max_string_len,
        output_size=output_size,
    )
    lipnet.summary()

    optimizer = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    # The 'ctc' loss simply passes y_pred through.
    lipnet.model.compile(loss={'ctc': lambda y_true, y_pred: y_pred},
                         optimizer=optimizer)
    lipnet.model.load_weights(weight_path)

    spell = Spell(path=PREDICT_DICTIONARY)
    decoder = Decoder(
        greedy=PREDICT_GREEDY,
        beam_width=PREDICT_BEAM_WIDTH,
        postprocessors=[labels_to_text, spell.sentence],
    )

    # Batch of one video, pixel values scaled by 1/255.
    X_data = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])
    y_pred = lipnet.predict(X_data)

    result = decoder.decode(y_pred, input_length)[0]
    print("========= result ============")

    return (video, result)