def _extract_audio_features(self, audio_path):
    if self.feature_type == "MFCC":
        return tools.calculate_mfcc(audio_path)

    if self.feature_type == "Pros":
        return tools.extract_prosodic_features(audio_path)

    if self.feature_type == "MFCC+Pros":
        mfcc_vectors = tools.calculate_mfcc(audio_path)
        pros_vectors = tools.extract_prosodic_features(audio_path)
        mfcc_vectors, pros_vectors = tools.shorten(mfcc_vectors, pros_vectors)
        return np.concatenate((mfcc_vectors, pros_vectors), axis=1)

    if self.feature_type == "Spectro":
        return tools.calculate_spectrogram(audio_path)

    if self.feature_type == "Spectro+Pros":
        spectr_vectors = tools.calculate_spectrogram(audio_path)
        pros_vectors = tools.extract_prosodic_features(audio_path)
        spectr_vectors, pros_vectors = tools.shorten(spectr_vectors, pros_vectors)
        return np.concatenate((spectr_vectors, pros_vectors), axis=1)

    # Unknown feature type
    print(f"ERROR: unknown feature type '{self.feature_type}' in the 'extract_audio_features' call!")
    print(f"Possible values: {self.supported_features}.")
    exit(-1)
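# A minimal, self-contained sketch of the length-alignment + concatenation pattern used
# by the combined feature types above ("MFCC+Pros", "Spectro+Pros"). It is not called
# anywhere in the pipeline; the helper name '_demo_feature_concatenation' and the dummy
# shapes are illustrative only, and it assumes that tools.shorten simply truncates both
# arrays to their common minimum length (emulated here with plain NumPy slicing).
def _demo_feature_concatenation():
    mfcc_vectors = np.random.rand(503, 26)  # dummy MFCC frames     [T1, D1]
    pros_vectors = np.random.rand(500, 4)   # dummy prosodic frames [T2, D2]

    # Align the time dimension before concatenating along the feature axis
    min_len = min(len(mfcc_vectors), len(pros_vectors))
    mfcc_vectors = mfcc_vectors[:min_len]
    pros_vectors = pros_vectors[:min_len]

    combined = np.concatenate((mfcc_vectors, pros_vectors), axis=1)
    assert combined.shape == (500, 30)  # [T, D1 + D2]
    return combined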
def _encode_vectors(audio_filename, gesture_filename, text_filename,
                    embedding_model, mode, args, augment_with_context):
    """
    Extract features from a given pair of audio and motion files.
    To be used by the "_save_data_as_sequences" and "_save_dataset" functions.

    Args:
        audio_filename:        file name of an audio file (.wav)
        gesture_filename:      file name of a motion file (.bvh)
        text_filename:         file name of the text transcript (.json)
        embedding_model:       the embedding model to encode the text with
        mode:                  dataset type ('train', 'dev' or 'test')
        args:                  see the 'create_dataset' function for details
        augment_with_context:  if True, the data sequences will be augmented with
                               future/past context; intended use: True if the data
                               will be used for training, False if it will be used
                               for validation/testing

    Returns:
        input_vectors  [N, T, D]: speech features
        text_vectors:             text features
        output_vectors [N, T, D]: motion features
    """
    debug = False

    if mode == 'test':
        seq_length = 0
    elif mode == 'train' or mode == 'train_mirrored':
        seq_length = args.seq_len
    elif mode == 'dev':
        seq_length = 5 * args.seq_len
    else:
        print(f"ERROR: Unknown dataset type '{mode}'! "
              "Possible values: 'train', 'train_mirrored', 'dev' and 'test'.")
        exit(-1)

    # Step 1: Vectorize speech, with features of 'n_inputs' dimensions, a time step of 0.01 s
    # and a window length of 0.025 s => results in an array of 100 x 'n_inputs' per second
    if args.feature_type == "MFCC":
        input_vectors = tools.calculate_mfcc(audio_filename)
    elif args.feature_type == "Pros":
        input_vectors = tools.extract_prosodic_features(audio_filename)
    elif args.feature_type == "GeMAPS":
        input_vectors = tools.extract_gemaps_features(audio_filename)
    elif args.feature_type == "MFCC+Pros":
        mfcc_vectors = tools.calculate_mfcc(audio_filename)
        pros_vectors = tools.extract_prosodic_features(audio_filename)
        mfcc_vectors, pros_vectors = tools.shorten(mfcc_vectors, pros_vectors)
        input_vectors = np.concatenate((mfcc_vectors, pros_vectors), axis=1)
    elif args.feature_type == "Spectro":
        input_vectors = tools.calculate_spectrogram(audio_filename)
    elif args.feature_type == "Spectro+Pros":
        spectr_vectors = tools.calculate_spectrogram(audio_filename)
        pros_vectors = tools.extract_prosodic_features(audio_filename)
        spectr_vectors, pros_vectors = tools.shorten(spectr_vectors, pros_vectors)
        input_vectors = np.concatenate((spectr_vectors, pros_vectors), axis=1)
    else:
        print(f"ERROR: unknown feature type '{args.feature_type}' in the '_encode_vectors' call!")
        exit(-1)

    # Step 2: Read the motion data
    ges_str = np.load(gesture_filename)
    output_vectors = ges_str['clips']

    # Subsample motion (from 60 fps to 20 fps)
    output_vectors = output_vectors[0::3]

    # Step 3: Obtain the text transcription
    if isinstance(embedding_model, BertEmbedding):
        text_encoding = encode_json_transcript_with_bert(text_filename, embedding_model)
    elif isinstance(embedding_model, FastText):
        text_encoding = encode_json_transcript_with_fasttext(text_filename, embedding_model)
    else:
        print(f"ERROR: unsupported embedding model '{type(embedding_model).__name__}' in the '_encode_vectors' call!")
        exit(-1)

    if debug:
        print(input_vectors.shape)
        print(output_vectors.shape)
        print(text_encoding.shape)

    # Step 4: Align the vector lengths
    min_len = min(len(input_vectors), len(output_vectors), 2 * len(text_encoding))

    # Make sure the length is even
    if min_len % 2 == 1:
        min_len -= 1

    input_vectors, output_vectors = tools.shorten(input_vectors, output_vectors, min_len)
    text_encoding = text_encoding[:int(min_len / 2)]

    if debug:
        print(input_vectors.shape)
        print(output_vectors.shape)
        print(text_encoding.shape)

    if not augment_with_context:
        return input_vectors, text_encoding, output_vectors

    # Create a list of sequences with a fixed past and future context length
    # (overlap them to use the data more efficiently)
    # ToDo: make sure the alignment holds
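    # Each resulting sequence covers past_context + seq_length + future_context frames
    # of audio/motion, and half as many frames of text (the text is sampled at half the
    # audio frame rate, as noted below).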
    start_ind = args.past_context
    seq_step = 10  # overlap of sequences: 0.5 s

    # Test if the context lengths are appropriate
    assert args.past_context % 2 == 0
    assert args.future_context % 2 == 0
    assert seq_step % 2 == 0

    n_reserved_inds = seq_length + args.future_context

    stop_ind = input_vectors.shape[0] - n_reserved_inds
    input_vectors_final = np.array([
        input_vectors[i - args.past_context:i + n_reserved_inds]
        for i in range(start_ind, stop_ind, seq_step)
    ])

    stop_ind = output_vectors.shape[0] - n_reserved_inds
    output_vectors_final = np.array([
        output_vectors[i - args.past_context:i + n_reserved_inds]
        for i in range(start_ind, stop_ind, seq_step)
    ])

    # The text was sampled at half the sampling rate of the audio,
    # so 1 frame of text corresponds to 2 frames of audio
    stop_ind = text_encoding.shape[0] - n_reserved_inds // 2
    text_vectors_final = np.array([
        text_encoding[i - args.past_context // 2:i + n_reserved_inds // 2]
        for i in range(start_ind // 2, stop_ind, seq_step // 2)
    ])

    if debug:
        print(input_vectors_final.shape)
        print(output_vectors_final.shape)
        print(text_vectors_final.shape)

    return input_vectors_final, text_vectors_final, output_vectors_final
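# A minimal, self-contained sketch of the overlapping-window construction performed in
# '_encode_vectors' when augment_with_context is True, to make the shape arithmetic
# explicit. It is not called anywhere in the pipeline; the helper name
# '_demo_context_windows' and the parameter values below are illustrative placeholders,
# not the values carried by the real 'args'.
def _demo_context_windows():
    past_context, future_context, seq_len, seq_step = 10, 20, 40, 10
    frames = np.random.rand(400, 26)  # dummy audio features [T, D]

    n_reserved_inds = seq_len + future_context
    start_ind = past_context
    stop_ind = frames.shape[0] - n_reserved_inds

    windows = np.array([
        frames[i - past_context:i + n_reserved_inds]
        for i in range(start_ind, stop_ind, seq_step)
    ])

    # Each window spans past_context + seq_len + future_context frames,
    # and consecutive windows overlap by all but seq_step frames
    assert windows.shape == (33, past_context + seq_len + future_context, 26)
    return windows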