Ejemplo n.º 1
0
    def process_directory(self, input_directory: str, output_directory: str):
        """
        Process every audio file in a directory and save the results as WAV files

        Each file is passed to self.process_audio_file together with
        self.audio_length and self.sampling_rate, and the result is written by
        self.save_audio_file under the same base name with a ".wav" extension.
        A file whose processing or saving raises is reported on stdout and
        skipped, so one bad file does not abort the whole run.
        :param input_directory: Path to the directory containing input audio files
        :param output_directory: Output directory; created (including missing
                                 parent directories) if it does not exist
        """
        # exist_ok avoids the check-then-create race and also builds missing parents
        os.makedirs(output_directory, exist_ok=True)

        # Extract file names in the input directory
        file_names = FileUtil.get_file_names(input_directory)

        # Apply process to audio files in the input directory
        for audio_file in file_names:
            try:
                # Process one audio file
                processed_audio = self.process_audio_file(
                    os.path.join(input_directory, audio_file),
                    self.audio_length, self.sampling_rate)

                # Swap the original extension for ".wav"
                base_name = os.path.splitext(audio_file)[0]

                # Save it
                self.save_audio_file(
                    processed_audio,
                    os.path.join(output_directory, base_name + ".wav"),
                    self.sampling_rate, self.normalize)
            except Exception as err:
                # NOTE(review): this catches every failure (I/O, decoding, ...),
                # not only short audio, although the message suggests otherwise
                print(f"Skipping too short audio: {err}")
    def extract_dataset(self, dataset_path: str):
        """
        Feature extraction to dataset
        Run self.extract_directory on every folder under the dataset path and build the label vector
        :param  dataset_path: path to dataset
        :return directory_files_feature_dict: dictionary of extracted features from all audio files in dataset folder
        {key: name of directory, value: whatever self.extract_directory returns for that directory}
        :return label_list: list of numerical label vector, as built by self.make_label_from_directory
        """
        # Make label
        label_list = self.make_label_from_directory(dataset_path)

        # Materialize the folder names so the progress bar can show a total
        directories = list(FileUtil.get_folder_names(dataset_path, sort=True))

        # Extract all features, one directory at a time.
        # The per-directory file list is not needed here: extract_directory
        # receives only the directory path.
        directory_files_feature_dict = {}
        for directory in tqdm(directories):
            directory_files_feature_dict[directory] = self.extract_directory(
                os.path.join(dataset_path, directory))

        return directory_files_feature_dict, label_list
Ejemplo n.º 3
0
 def folder2label(input_data_directory: str) -> list:
     """
     List the entries of the input data directory via FileUtil.get_file_names
     :param  input_data_directory: input data directory where data folder with the label name exist
     :return list of names returned by FileUtil.get_file_names
     """
     # NOTE(review): the function name and original documentation speak of
     # folder names, yet get_file_names (not get_folder_names) is called --
     # confirm which helper is intended
     entry_names = FileUtil.get_file_names(input_data_directory)
     return entry_names
Ejemplo n.º 4
0
    def make_label_from_directory(dataset_path: str):
        """
        Build a numerical label vector from the folder layout of a dataset
        Every file found in a folder receives that folder's position (0, 1, 2, ...)
        in the sorted folder listing as its label.
        :param  dataset_path: path to dataset whose sub-folders hold the files
        :return list of integer labels, one entry per file
        """
        # NOTE(review): no self parameter -- presumably decorated as a static
        # method outside this view; confirm against the class definition
        labels = []
        folders = FileUtil.get_folder_names(dataset_path, sort=True)
        for folder_index, folder in enumerate(folders):
            # One label per file in this folder
            file_count = len(FileUtil.get_file_names(os.path.join(dataset_path, folder)))
            labels += [folder_index] * file_count
        return labels
    def extract_dataset(self, dataset_path: str, stats_type: str):
        """
        Feature extraction to dataset
        Extract time series feature as 2D pandas dataframe and 3D numpy array, as well as label vector as list
        :param  dataset_path: path to dataset
        :param  stats_type: type of statistics for 2D feature
        :return all_2d_dataframe: 2D feature pandas dataframe; per-directory frames (labelled with the
                directory name) concatenated in sorted directory order
        :return all_3d_array: per-directory arrays from self.extract_directory stacked with np.dstack,
                then transposed
        :return label_list: list of numerical label vector (directory index repeated once per file)
        :raises ValueError: if no folders are found under dataset_path
        """
        # Get folder names under data set path
        directory_names = FileUtil.get_folder_names(dataset_path, sort=True)

        # Get file names and store them into a dictionary
        directory_files_dict = {
            directory: FileUtil.get_file_names(os.path.join(dataset_path, directory))
            for directory in directory_names
        }
        if not directory_files_dict:
            # Without this guard the stacking step below has nothing to work on
            raise ValueError("No class directories found in {}".format(dataset_path))

        # Extract all features; collect per-directory results and combine once
        # at the end instead of re-copying the accumulated data every iteration
        class_dataframes = []
        class_3d_arrays = []
        label_list = []
        for dir_num, (directory, audio_files) in enumerate(directory_files_dict.items()):
            # Apply feature extraction to a directory
            file_feature_stat_dict, class_3d_feature = self.extract_directory(
                os.path.join(dataset_path, directory), stats_type)

            # Convert dictionary to data frame
            class_2d_dataframe = DataProcess.dict2dataframe(file_feature_stat_dict, segment_feature=True)

            # Add label to 2D feature data frame
            class_dataframes.append(DataProcess.add_label(class_2d_dataframe, directory))

            class_3d_arrays.append(class_3d_feature)

            # Make label as list
            label_list.extend([dir_num] * len(audio_files))

        # DataFrame.append was removed in pandas 2.0; concat keeps the original indexes
        all_2d_dataframe = pd.concat(class_dataframes)

        # A single directory is passed through untouched (dstack would force it
        # to three dimensions); several directories are stacked in one call
        if len(class_3d_arrays) == 1:
            all_3d_array = class_3d_arrays[0]
        else:
            all_3d_array = np.dstack(class_3d_arrays)

        # Transpose 3D array
        all_3d_array = all_3d_array.T

        return all_2d_dataframe, all_3d_array, label_list
Ejemplo n.º 6
0
    def process_directory(self, input_directory: str, output_directory: str):
        """
        Run process to the input directory where audio files exist and save processed audio files

        Each file is passed to self.process_audio_file together with
        self.audio_length and self.sampling_rate, and the result is written by
        self.save_audio_file under the same file name in the output directory.
        :param input_directory: Path to the directory containing input audio files
        :param output_directory: Output directory; created (including missing
                                 parent directories) if it does not exist
        """
        # exist_ok avoids the check-then-create race and also builds missing parents
        os.makedirs(output_directory, exist_ok=True)

        # Extract file names in the input directory
        file_names = FileUtil.get_file_names(input_directory)

        # Apply process to audio files in the input directory
        for audio_file in file_names:
            # Process one audio file
            processed_audio = self.process_audio_file(
                os.path.join(input_directory, audio_file),
                self.audio_length, self.sampling_rate)
            # Save it under the same file name
            self.save_audio_file(
                processed_audio,
                os.path.join(output_directory, audio_file),
                self.sampling_rate, self.normalize)
Ejemplo n.º 7
0
    def extract_directory(self, input_directory: str):
        """
        Feature extraction to a folder which contains audio files
        :param  input_directory: folder name which has audio files
        :return dictionary of extracted features from audio file
                {key: name of files, value: result of self.extract_file for that file}
        """
        # Extract file names in the input directory
        file_names = FileUtil.get_file_names(input_directory)

        file_feature_dict = {}
        start = time.time()

        # Wrap the file list itself (not enumerate) so the progress bar
        # can read its length and show a total
        for audio_file in tqdm(file_names):
            # Extract features from one audio file
            file_feature_dict[audio_file] = self.extract_file(os.path.join(input_directory, audio_file))

        print("Extracted {0} with {1} \n".format(input_directory, time.time() - start))
        return file_feature_dict
    def extract_directory(self, input_directory: str, stats_type: str):
        """
        Feature extraction to a folder which contains audio files
        :param  input_directory: folder name which has audio files
        :param  stats_type: type of statistics
        :return file_feature_stat_dict: dictionary of per-file statistics
                {key: name of file, value: result of self.get_feature_stats for that file}
        :return directory_3d_feature: transposed short-term features of every file,
                stacked file by file with np.dstack
        :raises ValueError: if the directory contains no files
        """
        # Extract file names in the input directory
        file_names = FileUtil.get_file_names(input_directory)

        # Extract features from audio files in a directory
        file_feature_stat_dict = {}
        directory_3d_feature = None
        start = time.time()

        # Extract each audio file
        for audio_file in file_names:
            # Extract features from one audio file
            frame_extracted_feature, file_short_feature_list = self.extract_file(
                os.path.join(input_directory, audio_file))

            # Append short-term feature to 3D array; the first file seeds it
            file_feature_array = np.array(file_short_feature_list).T
            if directory_3d_feature is None:
                directory_3d_feature = file_feature_array
            else:
                directory_3d_feature = np.dstack((directory_3d_feature, file_feature_array))
            print("Data structure is {}".format(directory_3d_feature.shape))

            # Take stats across frames
            file_feature_stat_dict[audio_file] = self.get_feature_stats(frame_extracted_feature, stats_type)

        if directory_3d_feature is None:
            # No file was processed, so there is no feature array to return
            raise ValueError("No audio files found in {}".format(input_directory))

        print("Extracted {0} with {1} \n".format(input_directory, time.time() - start))

        return file_feature_stat_dict, directory_3d_feature