def _generate_data_for_late_fusion(self, total_files):
     my_csv = {}
     total = len(total_files)
     for file in tqdm(total_files):
         clip_id = file.split(".")[0]
         audio_path = clip_id.replace("AFEW/aligned",
                                      "late_feature/" + self.feature_name)
         # print("audio_path", audio_path)
         if len(glob.glob(audio_path + "*")) == 0:
             # print("audio_path", audio_path)
             # print("glob", len(glob.glob(audio_path + "*")), "\n", glob.glob(audio_path + "*"))
             continue
         if "full" in self.feature_name:
             signal, sound_sr = librosa.load(audio_path + ".wav", sr=48000)
             if len(signal) < 298368:  # pad short clips up to the fixed max_length
                 mul = np.tile(signal, 298368 // len(signal))
                 add = signal[:298368 % len(signal)]
                 signal = np.concatenate([mul, add])
             mel = preprocess_input(signal, sound_sr)
             pred = self.ac.model.predict(mel.reshape(1, 620, 64))
             label_from_audio = self.lb.inverse_transform(pred)[0]
         else:
             label_from_audio = self.ac.clip_classification(audio_path)
         ground_truth, label_from_frame = self.fc.predict(file)
         clip_id = basename(clip_id)
         my_csv[clip_id] = [
             ground_truth, label_from_frame, label_from_audio
         ]
         # print(len(my_csv), "/", total)
     with open('lables_late_fusion' + self.feature_name + '.csv', 'w') as f:
         f.write("clip_id, ground_truth, frame_label, audio_label\n")
         for k in my_csv:
             f.write(
                 str(k) + "," + str(my_csv[k][0]) + "," +
                 str(my_csv[k][1]) + "," + str(my_csv[k][2]) + "\n")
     return my_csv
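# A minimal read-back sketch for the CSV written above: score each modality
# against the ground truth. pandas is assumed to be available; "full_wav" is a
# hypothetical feature_name suffix (the filename keeps the "lables" spelling
# used by the writer above), so substitute the one actually generated.
import pandas as pd

df = pd.read_csv("lables_late_fusionfull_wav.csv", skipinitialspace=True)
print("frame accuracy:", (df["frame_label"] == df["ground_truth"]).mean())
print("audio accuracy:", (df["audio_label"] == df["ground_truth"]).mean())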
    def data_gen(self,
                 list_feature_vectors,
                 batch_size,
                 mode="train",
                 aug=None):
        c = 0
        if mode == "train":
            random.shuffle(list_feature_vectors)
        while True:
            labels = []
            features = np.zeros(
                (batch_size, self.feature_number, 64)).astype('float')
            for i in range(c, c + batch_size):
                signal, sound_sr = librosa.load(list_feature_vectors[i], sr=48000)
                if "full" in self.feature_name and len(
                        signal) < 298368:  # max_length
                    mul = np.tile(signal, 298368 // len(signal))
                    add = signal[:298368 % len(signal)]
                    signal = np.concatenate([mul, add])

                mel = preprocess_input(signal, sound_sr)
                mel = mel if aug is None else aug(mel)
                features[i - c] = np.array(mel)
                labels.append(list_feature_vectors[i].split("/")[-2])
            c += batch_size
            if c + batch_size > len(list_feature_vectors):
                c = 0
                if mode == "train":
                    random.shuffle(list_feature_vectors)
                if mode == "eval":
                    break
            labels = self.lb.transform(np.array(labels))
            yield features, labels
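# A minimal training sketch for data_gen above. The dataset layout, the owning
# instance (audio_classifier) and the compiled Keras model are hypothetical;
# labels are taken from the parent folder of each wav, as the generator expects.
import glob

train_files = glob.glob("/path/to/Train/*/*.wav")  # hypothetical layout
val_files = glob.glob("/path/to/Val/*/*.wav")
bs = 16
model.fit(audio_classifier.data_gen(train_files, bs, mode="train"),
          steps_per_epoch=len(train_files) // bs,
          validation_data=audio_classifier.data_gen(val_files, bs, mode="val"),
          validation_steps=len(val_files) // bs,
          epochs=50)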
    def joint_gen_test_clip(self, list_files, clip_id, stride=1):
        """ stride su singolo file, quindi va richiamato per ogni file """
        ground_truth = list_files[0][0]
        csv_path = '/user/vlongobardi/AFEW/aligned/Val/GroundTruth/ID.csv'
        csv_path = csv_path.replace("GroundTruth",
                                    ground_truth).replace("ID", clip_id)
        first_frame_num = int(list_files[0][1].split("_")[-1].split(".")[0])
        start = 0
        end = len(list_files) - self.time_step
        while True:
            labels = []
            # one independent buffer per time step for the audio branch (a "*"
            # repeat would alias the same array), plus one input for the frames
            features = [
                np.zeros((1, self.audio_feature, 64)).astype('float')
                for _ in range(self.time_step)
            ]
            features += [
                np.zeros((1, self.time_step, 224, 224, 3)).astype('float')
            ]
            images = DataGen(csv_path,
                             '',
                             1,
                             31,
                             NoAug(),
                             16,
                             1,
                             12,
                             test=True)[0][0][0]
            for index, elem in enumerate(list_files[start:start +
                                                    self.time_step]):

                path1 = self.feature_name.replace("wav", "yam")
                path2 = "temp_clips_" + self.feature_name.split("_")[1]
                # derive the wav path from the element's feature path, as in joint_val_gen
                audio_path = elem[2].replace(path1, path2)
                audio_path = audio_path.replace("npy", "wav")

                signal, sound_sr = librosa.load(audio_path, sr=48000)
                if "full" in self.feature_name and len(
                        signal) < 298368:  # max_length
                    mul = np.tile(signal, 298368 // len(signal))
                    add = signal[:298368 % len(signal)]
                    signal = np.concatenate([mul, add])
                # store this step's mel in its own slot (cf. joint_val_gen below)
                features[index][0] = preprocess_input(signal, sound_sr).reshape(
                    1, self.audio_feature, 64)
                features[-1][0][index] = images[first_frame_num + start +
                                                index]
            labels.append(ground_truth)
            start += self.time_step // stride
            if start >= end:
                break
            labels = self.lb.transform(np.array(labels)).reshape((1, 7))
            yield features, labels
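# A minimal inference sketch for joint_gen_test_clip above: average the joint
# model's output over every window of one clip and decode the label. The model,
# the owning instance (clf) and the per-clip file list are hypothetical.
import numpy as np

window_preds = []
for features, labels in clf.joint_gen_test_clip(list_files, clip_id, stride=1):
    window_preds.append(model.predict(features))
clip_pred = clf.lb.inverse_transform(np.mean(window_preds, axis=0))[0]
print(clip_id, "->", clip_pred)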
def generate_feature_yamnet(model, files, win_dim, feature_number, mode="Train"):
    exceptions = []
    flag_except = 0
    for wav in tqdm(files):
        try:
            c = wav.split("/")[-2]
            clip_id = wav.split("/")[-1].split(".")[0]
            signal, sound_sr = librosa.load(wav, sr=48000)
            if win_dim == "full" and len(signal) < 298368:  # max_length
                mul = np.tile(signal, 298368 // len(signal))
                add = signal[:298368 % len(signal)]
                signal = np.concatenate([mul, add])
            mel = preprocess_input(signal, sound_sr)
            mel = mel.reshape(1, feature_number, 64)
            feature = model.predict(mel)
            pth = root_path + "early_feature/emobase2010_" + win_dim + "_yam/" + mode + "/" + c + "/" + clip_id + ".npy"
            np.save(pth, feature)
        except ValueError:
            flag_except = 1
            exceptions.append(wav)
            print("wav", wav)
    if flag_except:
        print("\n\n\nexceptions", len(exceptions), exceptions)
Example 5
 def __call__(self, signal, sound_sr):
     return preprocess_input(signal, sound_sr)
    def joint_val_gen(self, list_files, batch_size, mode="val", stride=1):
        """ stride 50% sul su tutti i file """
        c = 0
        clip_ids = list(list_files.keys())
        while True:
            try:
                for clip_id in clip_ids:
                    video_info = list_files[clip_id]
                    ground_truth = video_info[0][0]
                    csv_path = '/user/vlongobardi/AFEW/aligned/Val/GroundTruth/ID.csv'
                    csv_path = csv_path.replace("GroundTruth",
                                                ground_truth).replace(
                                                    "ID", clip_id)
                    images = DataGen(csv_path,
                                     '',
                                     1,
                                     31,
                                     NoAug(),
                                     16,
                                     1,
                                     12,
                                     test=True)[0][0][0]
                    first_frame_num = int(
                        video_info[0][1].split("_")[-1].split(".")[0])

                    for start in range(0,
                                       len(video_info) - self.time_step,
                                       self.time_step // stride):
                        if c == 0:
                            labels = []
                            # independent buffer per time step (a "*" repeat would
                            # alias the same array across steps), plus the frame input
                            features = [
                                np.zeros((batch_size, self.audio_feature,
                                          64)).astype('float')
                                for _ in range(self.time_step)
                            ]
                            features += [
                                np.zeros((batch_size, self.time_step, 224, 224,
                                          3)).astype('float')
                            ]

                        for index, elem in enumerate(
                                video_info[start:self.time_step + start]):

                            path1 = self.feature_name.replace("wav", "yam")
                            path2 = "temp_clips_" + self.feature_name.split(
                                "_")[1]
                            audio_path = elem[2].replace(path1, path2)
                            audio_path = audio_path.replace("npy", "wav")

                            signal, sound_sr = librosa.load(audio_path, sr=48000)
                            if "full" in self.feature_name and len(
                                    signal) < 298368:  # max_length
                                mul = np.tile(signal, 298368 // len(signal))
                                add = signal[:298368 % len(signal)]
                                signal = np.concatenate([mul, add])
                            # features[index][c] = mel
                            features[index][c] = preprocess_input(
                                signal,
                                sound_sr).reshape(1, self.audio_feature, 64)
                            features[-1][c][index] = images[first_frame_num +
                                                            start + index]
                        labels.append(ground_truth)
                        c += 1
                        if c == batch_size:
                            c = 0
                            labels = self.lb.transform(
                                np.array(labels)).reshape((batch_size, 7))
                            yield features, labels
            except Exception as ex:
                print("\n\nEXCEPTION")
                traceback.print_exception(type(ex), ex, ex.__traceback__)
                print("\nclip_index:", clip_id, "\nlen(clip_ids)",
                      len(clip_ids))
                print("\ncsv_path", csv_path, "\nstart", start, "\nc", c)
            if mode == "eval":
                break
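# A minimal one-pass evaluation sketch for joint_val_gen above (mode="eval"
# stops after a single sweep over the clips). The joint model, the owning
# instance (clf) and the clip dictionary are hypothetical.
import numpy as np

correct = total = 0
for features, labels in clf.joint_val_gen(val_clips, batch_size=16, mode="eval"):
    preds = model.predict(features)
    correct += int(np.sum(np.argmax(preds, axis=1) == np.argmax(labels, axis=1)))
    total += labels.shape[0]
print("window accuracy:", correct / total)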
    def joint_gen_train(self, list_files, batch_size):
        c = 0
        clip_ids = list(list_files.keys())
        random.shuffle(clip_ids)
        while True:
            labels = []
            # independent buffer per time step (a "*" repeat would alias the same
            # array across steps), plus the frame input
            features = [
                np.zeros((batch_size, self.audio_feature, 64)).astype('float')
                for _ in range(self.time_step)
            ]
            features += [
                np.zeros(
                    (batch_size, self.time_step, 224, 224, 3)).astype('float')
            ]
            for i in range(c, c + batch_size):
                try:
                    clip_id = clip_ids[i]
                    video_info = list_files[clip_id]
                    ground_truth = video_info[0][0]
                    csv_path = '/user/vlongobardi/AFEW/aligned/Train/GroundTruth/ID.csv'
                    csv_path = csv_path.replace("GroundTruth",
                                                ground_truth).replace(
                                                    "ID", clip_id)
                    images = DataGen(csv_path,
                                     '',
                                     1,
                                     31,
                                     NoAug(),
                                     16,
                                     1,
                                     12,
                                     test=True)[0][0][0]
                    first_frame_num = int(
                        video_info[0][1].split("_")[-1].split(".")[0])
                    start = random.randint(0, len(video_info) - self.time_step)
                    for index, elem in enumerate(
                            video_info[start:self.time_step + start]):
                        ground_truth, _, audio_path = elem

                        path1 = self.feature_name.replace("wav", "yam")
                        path2 = "temp_clips_" + self.feature_name.split("_")[1]
                        audio_path = audio_path.replace(path1, path2)
                        audio_path = audio_path.replace("npy", "wav")

                        signal, sound_sr = librosa.load(audio_path, sr=48000)
                        if "full" in self.feature_name and len(
                                signal) < 298368:  # max_length
                            mul = np.tile(signal, 298368 // len(signal))
                            add = signal[:298368 % len(signal)]
                            signal = np.concatenate([mul, add])
                        # features[index][i - c]
                        features[index][i - c] = preprocess_input(
                            signal, sound_sr).reshape(1, self.audio_feature,
                                                      64)
                        features[-1][i - c][index] = images[first_frame_num +
                                                            start + index]
                    labels.append(ground_truth)
                except Exception:
                    print("\n\nEXCEPTION!")
                    traceback.print_exc()
                    print("\nlen(video_info)", len(video_info), "\n",
                          video_info)
                    print("self.time_step", self.time_step)
                    print("csv_path", csv_path)
                    print("start", start)
            c += batch_size
            if c + batch_size > len(clip_ids):
                c = 0
                random.shuffle(clip_ids)
            labels = self.lb.transform(np.array(labels)).reshape(
                (batch_size, 7))
            yield features, labels
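# A minimal shape check for joint_gen_train above: pull one batch and confirm
# the multi-input layout (one mel array per time step followed by the frame
# tensor). The owning instance (clf) and clip dictionary are hypothetical.
features, labels = next(clf.joint_gen_train(train_clips, batch_size=4))
print(len(features))       # time_step + 1 model inputs
print(features[0].shape)   # (4, audio_feature, 64) mel for the first step
print(features[-1].shape)  # (4, time_step, 224, 224, 3) frame sequence
print(labels.shape)        # (4, 7) one-hot emotion labels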