Example 1
def main(video_source_directory_path, count, fps, width, height):
    path_list = Path(video_source_directory_path).glob("**/*.mp4")
    current_count = 0
    print("Generating pickle files according to the given config...")
    for path in path_list:
        if current_count >= count:
            break
        else:
            print("\n\n\nCOUNT: " + str(current_count), end="\n\n\n")
            video_source_file_name = str(path.stem) + ".mp4"
            print("Processing video file: " + video_source_file_name)
            print("Preprocessing...")
            start = time.time()
            frames = pp.get_frames(video_file_source_path=str(path),
                                   req_fps=fps,
                                   width=width,
                                   height=height)
            mid = time.time()
            print("Time required for preprocessing is: " + str(mid - start))
            print("Generating pickle dump...")
            ph.generate_pickle_list(video_name=str(path.stem), frames=frames)
            end = time.time()
            print("Time required for generating pickle file is: " +
                  str(end - mid))
            print("Total time required for saving a video is: " +
                  str(end - start))
        current_count += 1
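
A hypothetical invocation of this helper (the directory path and parameter values below are placeholders, not taken from the original project) might look like:

if __name__ == "__main__":
    # Placeholder arguments; the real values would come from the project's config.
    main(video_source_directory_path="../dataset/videos",
         count=10,
         fps=2,
         width=480,
         height=360)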
Example 2
def test_sift_extract():
    video_source_file_path = "../dataset/videos/200_512kb.mp4"
    test_frames_file_path = "frames/"
    fps = 2
    width = 480
    height = 360
    frames = pp.get_frames(video_file_source_path=video_source_file_path,
                           req_fps=fps,
                           width=width,
                           height=height)
    with open("d.txt", "a") as file:
        for f in frames:
            kp, d = lf.extract_sift_keypoints_and_descriptors(image=f, limit=100)
            if d is not None:
                file.write(str(d))
Example 3
def main():
    video_source_file_path = "../dataset/videos/test.mp4"
    test_frames_file_path = "frames/"
    fps = 30
    width = 480
    height = 360
    frames = preprocessing.get_frames(video_file_source_path=video_source_file_path,
                                      fps=fps,
                                      width=width,
                                      height=height)
    c = 0
    for frame in frames:
        cv2.imwrite(test_frames_file_path + str(c) + ".png", frame)
        c += 1
Example 4
def test_pickling():
    video_source_file_path = "../dataset/videos/200_512kb.mp4"
    test_frames_file_path = "frames/"
    fps = 30
    width = 480
    height = 360
    frames = pp.get_frames(video_file_source_path=video_source_file_path,
                           fps=fps,
                           width=width,
                           height=height)
    c = 0
    key_points = lf.extract_sift_key_points(image=frames[20])
    descriptors = lf.extract_sift_descriptors(image=frames[20])

    p = pickle.dumps(key_points)
    print(len(p))

    d_new = pickle.loads(p)
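
A minimal extension of this round-trip check (a sketch, assuming the descriptors are returned as a NumPy array) could assert that the restored object matches the original:

import numpy as np

# Hypothetical check: the unpickled descriptors should equal the originals
# (assumes lf.extract_sift_descriptors returns a NumPy array).
restored = pickle.loads(pickle.dumps(descriptors))
assert np.array_equal(descriptors, restored)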
Example 5
def get_audio_image(tr_data):
    # we keep track of the number of videos we cannot process
    num_skipped_videos = 0
    # NumPy array to hold all W matrices
    W_all = np.zeros((len(tr_data), 2401, 25))

    for count in tqdm(range(len(tr_data))):
        sample = tr_data[count]
        url = 'https://www.youtube.com/watch?v=' + sample[0]
        video_start_time = sample[1]

        # Download a 10-second clip from YouTube and extract its audio track
        if (url):
            os.system("ffmpeg -ss " + str(video_start_time) +
                      " -i $(youtube-dl -i -f 37/22/18 -g \'" + url +
                      "\') -t " + str(10) +
                      " -c copy video.mp4 >/dev/null 2>&1")
            os.system("ffmpeg -i video.mp4 audio.wav >/dev/null 2>&1")

            # obtain a cv2.VideoCapture object from the downloaded clip
            cap = cv2.VideoCapture("video.mp4")
        else:
            print("Error in downloading YouTube video")
        if not os.path.exists("./video.mp4"):
            num_skipped_videos += 1
            continue

        # load audio from file
        ts, sr = li.core.load("./audio.wav", sr=48000)

        # skip if audio is shorter than 10 seconds
        if (len(ts) < 10 * sr):
            os.remove("./audio.wav")
            os.remove("./video.mp4")
            print("\n\n\n Sample {} is too short to be processed.".format(
                sample[0]))
            print("Namely, the sample is {} seconds long.\n\n\n".format(
                len(ts) / sr))
            num_skipped_videos += 1
            continue
        # crop the audio to exactly 10 seconds if it is longer than that
        ts = ts[0:10 * sr]

        all_image_tensors, skip = get_frames(
            cap)  # get all the transformed frames

        # skip the current video if an error occurred during frame extraction
        if skip:
            num_skipped_videos += 1
            print("\n\n\nUnable to extract all frames from sample {}\n\n\n".
                  format(sample[0]))
            if os.path.exists('./audio.wav'):
                os.remove('./audio.wav')
            if os.path.exists('./video.mp4'):
                os.remove('./video.mp4')
            for k in range(skip):
                if os.path.exists('frame{}.jpg'.format(k)):
                    os.remove('frame{}.jpg'.format(k))
            continue

        max_pool_labels = get_frame_labels(
            all_image_tensors)  # get predicted labels for captured frames

        # create the set of basis vectors and object labels for each audio sample
        if count == 0:
            # call the NMF algorithm
            W_all = np.expand_dims(extract_bases(ts),
                                   0)  # extract audio into audio bases
            labels_all = max_pool_labels.detach().unsqueeze(
                0)  # use predicted maxpool labels

        else:
            W = extract_bases(ts)  # extract audio into audio bases
            W_all = np.concatenate(
                (W_all, np.expand_dims(W, 0)))  # append audio bases into list
            labels_all = torch.cat(
                (labels_all, max_pool_labels.detach().unsqueeze(0)), 0)

        # remove all the captured images, downloaded video and audio
        for i in range(10):
            os.remove('./frame{}.jpg'.format(i))
        os.remove('./video.mp4')
        os.remove('./audio.wav')

        # every 500 samples, write the audio frequency bases and ResNet maxpool
        # labels to the h5 file, in case the connection is lost midway
        if (count % 500 == 0):
            with h5py.File('./test_data.h5', 'w') as hdf5:
                hdf5.create_dataset('bases', data=W_all)
                hdf5.create_dataset('labels', data=labels_all)

    # dump all audio frequency bases and ResNet maxpool labels into the h5 file
    with h5py.File('./test_data.h5', 'w') as hdf5:
        hdf5.create_dataset('bases', data=W_all)
        hdf5.create_dataset('labels', data=labels_all)

    print("{} samples were skipped.".format(num_skipped_videos))
Example 6
#from tensorflow.keras import Sequential
#from tensorflow.keras.layers import Dense, Dropout
#from tensorflow.keras.layers import Conv2D, MaxPool2D
#from tensorflow.keras.optimizers import Adam
#print(tf.__version__)
import pandas as pd
import numpy as np
from preprocessing import get_frames
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

Fs = 10
frame_size = Fs*2 # 20
hop_size = Fs*1 # 10

X_train, y_train = get_frames(frame_size, hop_size)

X_ = np.concatenate((X_train, y_train), axis=1)
#np.random.shuffle(X)

X = X_[:,:-1]
y = X_[:,-1]
#X = dataset.iloc[:, :-1].values
#y = dataset.iloc[:, -1:].values

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
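
A minimal sketch of how this feature-scaling step is usually completed (an assumption; the original snippet does not show it): fit the scaler on the training split only, then apply it to both splits.

# Sketch of the standard StandardScaler usage (assumed, not from the original).
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)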
Example 7
                                                            return_elements=['new_all_output:0'])

        new_output = tf.identity(output, name='new_output')
        print(new_output)

        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.per_process_gpu_memory_fraction = 0.6
        config.gpu_options.allow_growth = True

        results = []
        with tf.Session(config=config) as sess:

            fid = 0
            for vcount, vid in enumerate(annotations):
                print('Processing sequence {}...'.format(vid['name']))
                frames, _ = get_frames(vid['full_path'], skip=skip)
                for frame in frames:
                    # do sw inference
                    image = normalize(frame)[np.newaxis, ...]

                    quantized_image = quantize_input_image(image, 5)
                    pred = sess.run(new_output, feed_dict={input_: quantized_image})
                    print(pred.shape)

                    boxes, scores = get_bboxes(np.squeeze(pred), anchors, nms_params)
                    for j in range(0, len(boxes)):
                        box = boxes[j].tolist()
                        results.append(
                            dict(image_id=fid, category_id=1, bbox=box, score=scores[j].tolist(), height=box[3]))
                    if not len(boxes):  # if there are no detections
                        results.append(dict(image_id=fid, category_id=0, bbox=[0, 0, 0, 0], score=0))
Example 8
def main():
    # retrieve the audio basis vectors for each object
    object_dict = disentangle()

    if os.path.exists('./audio.wav'):
        os.remove('./audio.wav')
    if os.path.exists('./video.mp4'):
        os.remove('./video.mp4')

    # test video
    video_url = 'https://www.youtube.com/watch?v=DOn33Ugbefw'
    if (video_url):
        os.system("ffmpeg -ss " + str(105) +
                  " -i $(youtube-dl -i -f 37/22/18 -g \'" + video_url +
                  "\') -t " + str(10) + " -c copy video.mp4 >/dev/null 2>&1")
        os.system("ffmpeg -i video.mp4 audio.wav >/dev/null 2>&1")

        # obtain cv2.VideoCapture obj from downloaded video if success
        cap = cv2.VideoCapture("video.mp4")
    else:
        print("Error in downloading youtube video")

    # load audio file
    ts, sr = librosa.core.load("./audio.wav", sr=48000)

    # bail out if the audio is shorter than 10 seconds
    if (len(ts) < 10 * sr):
        os.remove("./audio.wav")
        os.remove("./video.mp4")
        print("\n\n\nSample {} is too short to be processed.".format(1))
        print("Namely, the sample is {} seconds long.\n\n\n".format(
            len(ts) / sr))
        exit(1)

    # crop to 10 seconds if audio is longer
    ts = ts[0:sr * 10]

    all_image_tensors, skip = get_frames(cap)  # get all the transformed frames

    # bail out if an error occurred during the frame extraction process
    if skip:
        print("\n\n\nUnable to extract all frames from sample {}\n\n\n".format(
            1))
        if os.path.exists('./audio.wav'):
            os.remove('./audio.wav')
        if os.path.exists('./video.mp4'):
            os.remove('./video.mp4')
        for k in range(skip):
            if os.path.exists('frame{}.jpg'.format(k)):
                os.remove('frame{}.jpg'.format(k))
        exit(1)

    # get predicted labels for captured frames
    max_pool_labels = get_frame_labels(all_image_tensors)

    # reshape the labels into (1000,) and perform softmax on labels
    labels = max_pool_labels.detach().unsqueeze(0).numpy().astype(
        float).reshape(1000, )
    softmax_labels = np.exp(labels) / np.sum(np.exp(labels), axis=0)

    # we take the top 4 objects in the scene and intersect with the piano/violin/guitar/drum labels
    labels = set(softmax_labels.argsort()[-4:][::-1]).intersection(
        set([889, 579, 881, 402, 541]))

    # reindex the labels of drum/guitar/piano/violin for convenience
    labels_new = []
    start_index = 0
    # sep holds the start index for each concatenated W matrix
    sep = [start_index]

    # append audio basis vectors of each object in columns
    for i in labels:
        if i == 541:
            labels_new.append('drum')
            start_index += object_dict['drum'].shape[1]
            sep.append(start_index)
        elif i == 402:
            labels_new.append('guitar')
            start_index += object_dict['guitar'].shape[1]
            sep.append(start_index)
        elif (i == 579 or i == 881) and 'piano' not in labels_new:
            labels_new.append('piano')
            start_index += object_dict['piano'].shape[1]
            sep.append(start_index)
        elif (i == 889):
            labels_new.append('violin')
            start_index += object_dict['violin'].shape[1]
            sep.append(start_index)

    print("Objects in test video: ", labels_new)

    # the last index is the number of basis vectors in the concatenated W matrix
    num_basis_vectors = sep[-1]

    # W shape (num_of_frequency_bins, num_of_basis_vectors)
    W = np.zeros((2401, num_basis_vectors))

    # concatenate audio bases of each object into W in columns
    for index, object in enumerate(labels_new):
        W[:, sep[index]:sep[index + 1]] = object_dict[object]

    # get spectrograms of audio
    spec, magnitude_spec, phase_spec = get_spectrograms(ts)

    V = magnitude_spec

    # W_transpose is used as the fixed "H" in the NMF procedure
    W_transpose = W.T
    assert (W_transpose.shape == (num_basis_vectors, 2401))

    # Since sklearn can only solve V = WH while keeping H fixed, we solve the
    # transposed factorization V^T = H^T W^T and take the transpose of the
    # resulting matrix H_t to retrieve H.
    H_t, _, _ = non_negative_factorization(X=V.T,
                                           H=W_transpose,
                                           n_components=num_basis_vectors,
                                           init='random',
                                           update_H=False,
                                           max_iter=1500,
                                           verbose=1)
    H = H_t.T

    V_dict = {}

    # append to the dictionary of object spectrograms
    for i, object in enumerate(labels_new):
        V_dict[object] = np.matmul(object_dict[object], H[sep[i]:sep[i + 1]])
        assert (V_dict[object].shape == (2401, 201))

    # calculate the sum of all object magnitude spectrograms
    V_sum = np.zeros((2401, 201))
    for V_obj in V_dict.values():
        V_sum = V_sum + V_obj

    # mask the spectrogram, compute the ISTFT and write each source to a wav file
    sample_rate = 48000
    for i, object in enumerate(labels_new):

        # softmask the mixture spectrogram
        double_V_j = (V_dict[object] / (V_sum)) * spec

        # use the ISTFT to reconstruct the time-domain signal from the spectrogram
        source_j = librosa.core.istft(double_V_j, hop_length=2400)

        # write reconstructed signal into wav file for testing
        print("Writing to ./{}.wav...".format(object))
        librosa.output.write_wav('./{}.wav'.format(object), source_j,
                                 sample_rate)

    # remove generated audio, video and frame images
    if os.path.exists('./audio.wav'):
        os.remove('./audio.wav')
    if os.path.exists('./video.mp4'):
        os.remove('./video.mp4')
    for i in range(10):
        if os.path.exists('./frame{}.jpg'.format(i)):
            os.remove('./frame{}.jpg'.format(i))
Example 9
from sqlalchemy import Column, Integer, LargeBinary, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker
import pickle
import preprocessing as pp
import local_features as lf

Base = declarative_base()

video_source_file_path = "../dataset/videos/200_512kb.mp4"
test_frames_file_path = "frames/"
fps = 30
width = 480
height = 360
frames = pp.get_frames(video_file_source_path=video_source_file_path,
                       fps=fps,
                       width=width,
                       height=height)
c = 0
key_points = lf.extract_sift_key_points(image=frames[20])
descriptors = lf.extract_sift_descriptors(image=frames[20])

p = pickle.dumps(descriptors)


class Video(Base):
    __tablename__ = 'local_features_video'
    # Here we define the columns for the local_features_video table.
    # Notice that each column is also a normal Python instance attribute.
    id = Column(Integer, primary_key=True)
    video_name = Column(LargeBinary(length=(2**32) - 1), nullable=False)