Code Example #1
def predict(weight_path, video):
    global lipnet
    global adam
    global spell
    global decoder

    # Build the model lazily on the first call and cache it in module globals.
    if lipnet is None:
        lipnet = LipNet(img_c=3,
                        img_w=100,
                        img_h=50,
                        frames_n=75,
                        absolute_max_string_len=32,
                        output_size=28)

        adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

        # The network already emits the CTC loss as its 'ctc' output, so the
        # loss function simply passes y_pred through.
        lipnet.model.compile(loss={
            'ctc': lambda y_true, y_pred: y_pred
        },
                             optimizer=adam)
        lipnet.model.load_weights(weight_path)

        spell = Spell(path=PREDICT_DICTIONARY)
        decoder = Decoder(greedy=PREDICT_GREEDY,
                          beam_width=PREDICT_BEAM_WIDTH,
                          postprocessors=[labels_to_text, spell.sentence])

    # Normalize pixels to [0, 1] and record the sequence length for CTC decoding.
    X_data = np.array([video.data]).astype(np.float32) / 255
    input_length = np.array([len(video.data)])

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    show_video_subtitle(video.face, result)
    print(result)
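
A usage sketch follows. In the upstream LipNet project a video is loaded through a Video class before being handed to predict; the import path, constructor arguments, and file paths below are assumptions for illustration and may differ in this fork.

# Hypothetical usage sketch -- Video, its constructor arguments, and the
# file paths are assumptions, not part of the example above.
from lipnet.lipreading.videos import Video

video = Video(vtype='face',
              face_predictor_path='shape_predictor_68_face_landmarks.dat')
video.from_video('samples/sample.mpg')
predict('models/weights.h5', video)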
Code Example #2
File: curriculum_test.py Project: LiZhenghua0311/lip
def show_results(_video, _align, video, align):
    show_video_subtitle(frames=_video.face, subtitle=_align.sentence)
    print("Video: ")
    print(_video.length)
    # Check that the curriculum-processed arrays still match the originals.
    print(np.array_equiv(_video.mouth, video.mouth), end=' ')
    print(np.array_equiv(_video.data, video.data), end=' ')
    print(np.array_equiv(_video.face, video.face))
    print("Align: ")
    print(labels_to_text(_align.padded_label.astype(int)))
    print(_align.padded_label)
    print(_align.label_length)
    print(np.array_equiv(_align.sentence, align.sentence), end=' ')
    print(np.array_equiv(_align.label, align.label), end=' ')
    print(np.array_equiv(_align.padded_label, align.padded_label))
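
A note on the checks above: np.array_equiv returns True when the two arrays are broadcast-consistent and all elements compare equal, which is looser than np.array_equal's exact shape match. A minimal illustration:

import numpy as np

# True: the 1-D array broadcasts against the 2-D one and the values match.
print(np.array_equiv(np.array([1, 2]), np.array([[1, 2], [1, 2]])))
# False: np.array_equal additionally requires identical shapes.
print(np.array_equal(np.array([1, 2]), np.array([[1, 2], [1, 2]])))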
Code Example #3
    input_length = np.array([len(video.data)])

    # Locate the 'dense1' layer; the saliency map is computed with respect to it.
    layer_name = 'dense1'
    layer_idx = [
        idx for idx, layer in enumerate(lipnet.model.layers)
        if layer.name == layer_name
    ][0]

    y_pred = lipnet.predict(X_data)
    result = decoder.decode(y_pred, input_length)[0]

    # Saliency heatmap over all 28 output classes for the input video.
    heatmap = visualize_saliency(lipnet.model, layer_idx, range(0, 28),
                                 video.data)

    return (heatmap, result)


if __name__ == '__main__':
    if len(sys.argv) == 3:
        video, result = predict(sys.argv[1], sys.argv[2])
    elif len(sys.argv) == 4:
        video, result = predict(sys.argv[1], sys.argv[2], sys.argv[3])
    elif len(sys.argv) == 5:
        video, result = predict(sys.argv[1], sys.argv[2], sys.argv[3],
                                sys.argv[4])
    else:
        video, result = None, ""

    if video is not None:
        show_video_subtitle(video.face, result)
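
The indexed list comprehension above scans every layer and keeps the first name match. The same lookup can be written with next(); a minimal sketch (the helper name find_layer_index is ours, not the project's):

def find_layer_index(model, layer_name):
    # Index of the first layer whose name matches layer_name;
    # raises StopIteration if no layer carries that name.
    return next(idx for idx, layer in enumerate(model.layers)
                if layer.name == layer_name)

layer_idx = find_layer_index(lipnet.model, 'dense1')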
Code Example #4
        y_pred = lipnet.predict(X_data)
        result = decoder.decode(y_pred, input_length)[0]

    return (video, result)


if __name__ == '__main__':
    video, result = process_video(sys.argv[1], sys.argv[2])

    # Pair each processed video with its decoded sentence.
    list_v_r = zip(video, result)

    for item in list_v_r:
        if item[0] is not None:
            show_video_subtitle(item[0].face, item[1])

            # Draw a box sized to the decoded sentence.
            stripe = "-" * len(item[1])

            print("")
            print("             --{}- ".format(stripe))
            print("[ DECODED ] |> {} |".format(item[1]))
            print("             --{}- ".format(stripe))