def remove_unknown_words(sentence: str) -> str:
    """Removes from a sentence all words that are not in the words list.

    :param sentence: Sentence to be processed.
    :type sentence: str
    :return: Sentence with only the known words kept.
    :rtype: str
    """
    words_list = load_pickle_file(Path("words_list.p"))
    return " ".join(word for word in sentence.split()
                    if word in words_list)
def get_clotho_loader(split: str,
                      is_training: bool,
                      settings_data: MutableMapping[
                          str, Union[str, bool, MutableMapping[str, str]]],
                      settings_io: MutableMapping[
                          str, Union[str, bool, MutableMapping[
                              str, Union[str, MutableMapping[str, str]]]]]) \
        -> DataLoader:
    """Gets the data loader.

    :param split: Split to be used.
    :type split: str
    :param is_training: Is training data?
    :type is_training: bool
    :param settings_data: Data loading and dataset settings.
    :type settings_data: dict
    :param settings_io: Files I/O settings.
    :type settings_io: dict
    :return: Data loader.
    :rtype: torch.utils.data.DataLoader
    """
    data_dir = Path(settings_io['root_dirs']['data'],
                    settings_io['dataset']['features_dirs']['output'])

    if settings_data['use_validation_split']:
        validation_files_path = Path(
            settings_io['root_dirs']['data'],
            settings_io['dataset']['pickle_files_dir'],
            settings_io['dataset']['files']['validation_files_file_name'])
        validation_file_names = load_pickle_file(validation_files_path)
        validation_files = [Path(i) for i in validation_file_names]
    else:
        validation_files = None

    dataset = ClothoDataset(
        data_dir=data_dir,
        split=split,
        input_field_name=settings_data['input_field_name'],
        output_field_name=settings_data['output_field_name'],
        load_into_memory=settings_data['load_into_memory'],
        multiple_captions_mode=settings_data['use_multiple_mode'],
        validation_files=validation_files)

    # Shuffling and dropping of the last incomplete batch apply only when
    # training on the development split; other splits are served in order.
    use_training_mode = is_training and split == 'development'
    shuffle = settings_data['shuffle'] if use_training_mode else False
    drop_last = settings_data['drop_last'] if use_training_mode else False

    return DataLoader(dataset=dataset,
                      batch_size=settings_data['batch_size'],
                      shuffle=shuffle,
                      num_workers=settings_data['num_workers'],
                      drop_last=drop_last,
                      collate_fn=_clotho_collate_fn)
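
# A usage sketch for the loader above. The keys mirror those read in the
# function body; every value is an illustrative assumption, not the
# project's actual configuration.
_demo_settings_data = {
    'use_validation_split': False,
    'input_field_name': 'features',
    'output_field_name': 'words_ind',
    'load_into_memory': False,
    'use_multiple_mode': False,
    'shuffle': True,
    'drop_last': True,
    'batch_size': 16,
    'num_workers': 4,
}
_demo_settings_io = {
    'root_dirs': {'data': 'data'},
    'dataset': {
        'features_dirs': {'output': 'clotho_dataset'},
        'pickle_files_dir': 'pickles',
        'files': {'validation_files_file_name': 'validation_files.p'},
    },
}
# loader = get_clotho_loader(split='development', is_training=True,
#                            settings_data=_demo_settings_data,
#                            settings_io=_demo_settings_io)
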
    def __init__(self, root_dir, data_dir, data_fold, scene,
                 input_features_file_name, target_values_input_name, seq_len):
        """Base class for real life datasets.

        :param root_dir: The root directory for the dataset.
        :type root_dir: str
        :param data_dir: The data directory for the dataset.
        :type data_dir: str
        :param data_fold: The data fold.
        :type data_fold: int
        :param scene: The acoustic scene (if applicable, else '').
        :type scene: str
        :param input_features_file_name: Input features file name.
        :type input_features_file_name: str
        :param target_values_input_name: Target values file name.
        :type target_values_input_name: str
        :param seq_len: Amount of feature vectors in one sequence.
        :type seq_len: int
        """
        super(SEDRealLife, self).__init__()

        data_path = Path(root_dir, data_dir, scene,
                         'fold_{}'.format(data_fold))

        x_path = data_path.joinpath(input_features_file_name)
        y_path = data_path.joinpath(target_values_input_name)

        self.x = file_io.load_pickle_file(x_path)
        self.y = file_io.load_pickle_file(y_path)

        # Zero-pad so that the amount of feature vectors is a multiple of
        # seq_len (no padding when it already divides evenly), then split
        # into non-overlapping sequences of seq_len vectors each.
        red = self.x.shape[0] % seq_len
        padding = (seq_len - red) % seq_len

        self.x = np.concatenate(
            [self.x, np.zeros((padding, self.x.shape[-1]))]).reshape(
                (-1, seq_len, self.x.shape[-1]))
        self.y = np.concatenate(
            [self.y, np.zeros((padding, self.y.shape[-1]))]).reshape(
                (-1, seq_len, self.y.shape[-1]))
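
# A standalone sketch of the padding/reshape arithmetic in the constructor
# above: zero-pad to a multiple of seq_len, then split into non-overlapping
# (seq_len, features) sequences. Shapes are illustrative.
def _demo_sequence_padding():
    import numpy as np
    seq_len = 4
    x = np.ones((10, 3))                  # 10 feature vectors of 3 dims each
    red = x.shape[0] % seq_len            # 2 vectors left over
    padding = (seq_len - red) % seq_len   # pad with 2 zero vectors
    x = np.concatenate([x, np.zeros((padding, x.shape[-1]))])
    x = x.reshape((-1, seq_len, x.shape[-1]))
    assert x.shape == (3, 4, 3)
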
def _load_indices_file(settings_files: MutableMapping[str, Any],
                       settings_data: MutableMapping[str, Any]) \
        -> MutableSequence[str]:
    """Loads and returns the indices file.
    :param settings_files: Settings of file i/o to be used.
    :type settings_files: dict
    :param settings_data: Settings of data to be used. .
    :type settings_data: dict
    :return: The indices file.
    :rtype: list[str]
    """
    path = Path(settings_files['root_dirs']['data'],
                settings_files['dataset']['pickle_files_dir'])
    p_field = 'words_list_file_name' \
        if settings_data['output_field_name'].startswith('words') \
        else 'characters_list_file_name'
    return file_io.load_pickle_file(
        path.joinpath(settings_files['dataset']['files'][p_field]))
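
# A sketch of the field selection above: an output field whose name starts
# with 'words' selects the word list, anything else the character list.
# The field name is illustrative.
def _demo_indices_field_selection():
    output_field_name = 'words_ind'
    p_field = ('words_list_file_name'
               if output_field_name.startswith('words')
               else 'characters_list_file_name')
    assert p_field == 'words_list_file_name'
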
def extract_features_test(root_dir: str,
                          settings_data: MutableMapping[str, Any],
                          settings_features: MutableMapping[str, Any],
                          settings_audio: MutableMapping[str, Any]) \
        -> None:
    """Extracts test features from the audio data of Clotho.
    :param root_dir: Root dir for the data.
    :type root_dir: str
    :param settings_data: Settings for creating data files.
    :type settings_data: dict[str, T]
    :param settings_features: Settings for feature extraction.
    :type settings_features: dict[str, T]
    :param settings_audio: Settings for the audio.
    :type settings_audio: dict
    """
    # Get the root directory.
    dir_root = Path(root_dir)

    # Get the directories of files.
    dir_test = dir_root.joinpath(settings_data['audio_dirs']['downloaded'],
                                 settings_data['audio_dirs']['test'])

    # Make sure the test set audio is available.
    if not dir_test.exists() or not any(dir_test.iterdir()):
        raise FileNotFoundError(
            'Testing workflow selected, but could not find the test set '
            'audio files. Please download the test set audio before making '
            'test predictions.')

    # Get the directories for output.
    dir_output_test = dir_root.joinpath(
        settings_data['features_dirs']['output'],
        settings_data['features_dirs']['test'])

    words_list = load_pickle_file(
        dir_root.joinpath(settings_data['pickle_files_dir'],
                          settings_data['files']['words_list_file_name']))

    # Create the directories.
    dir_output_test.mkdir(parents=True, exist_ok=True)

    # Apply the function to each file and save the result.
    for data_file_name in filter(lambda _x: _x.is_file(), dir_test.iterdir()):
        # Load the audio
        audio = load_audio_file(audio_file=str(data_file_name),
                                sr=int(settings_audio['sr']),
                                mono=settings_audio['to_mono'])

        # Extract the features.
        features = feature_extraction(audio, **settings_features['process'])

        # Populate the recarray data and dtypes.
        array_data = (data_file_name.name, )
        dtypes = [('file_name', f'U{len(data_file_name.name)}')]

        # Check if we are keeping the raw audio data.
        if settings_features['keep_raw_audio_data']:
            # And add them to the recarray data and dtypes.
            array_data += (audio, )
            # Variable-length audio needs the generic object dtype.
            dtypes.append(('audio_data', np.dtype(object)))

        # Add the rest to the recarray.
        # Word indices are required for the dataloader to work
        array_data += (features,
                       np.array([
                           words_list.index('<sos>'),
                           words_list.index('<eos>')
                       ]))
        dtypes.extend([('features', np.dtype(object)),
                       ('words_ind', np.dtype(object))])

        # Make the recarray
        np_rec_array = np.rec.array([array_data], dtype=dtypes)

        # Make the path for serializing the recarray.
        parent_path = dir_output_test

        file_template = settings_data['files'][
            'np_file_name_template'].replace('_{caption_index}', '')
        file_path = parent_path.joinpath(
            file_template.format(audio_file_name=data_file_name.name))

        # Dump it.
        dump_numpy_object(np_rec_array, file_path)
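
# A minimal sketch of the record-array layout built above. Fields that hold
# variable-length arrays (features, word indices) use the generic object
# dtype; the file name and shapes are hypothetical.
def _demo_feature_rec_array():
    import numpy as np
    file_name = 'example.wav'
    features = np.zeros((44, 64))         # e.g. 44 frames of 64 log-mel bands
    words_ind = np.array([0, 1])          # indices of <sos> and <eos>
    rec = np.rec.array(
        [(file_name, features, words_ind)],
        dtype=[('file_name', f'U{len(file_name)}'),
               ('features', np.dtype(object)),
               ('words_ind', np.dtype(object))])
    assert rec['file_name'].item() == 'example.wav'
    assert rec['features'].item().shape == (44, 64)
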
def check_data_for_split(dir_audio: Path, dir_data: Path, dir_root: Path,
                         csv_split: MutableSequence[MutableMapping[str, str]],
                         settings_ann: MutableMapping[str, Any],
                         settings_audio: MutableMapping[str, Any],
                         settings_cntr: MutableMapping[str, Any]) -> None:
    """Goes through all audio files and checks the created data.

    Gets each audio file and checks if there are associated data. If there are,\
    checks the validity of the raw audio data and the validity of the captions,\
    words, and characters.

    :param dir_audio: Directory with the audio files.
    :type dir_audio: pathlib.Path
    :param dir_data: Directory with the data to be checked.
    :type dir_data: pathlib.Path
    :param dir_root: Root directory.
    :type dir_root: pathlib.Path
    :param csv_split: CSV entries for the data.
    :type csv_split: list[collections.OrderedDict]
    :param settings_ann: Settings for annotations.
    :type settings_ann: dict
    :param settings_audio: Settings for audio.
    :type settings_audio: dict
    :param settings_cntr: Settings for counters.
    :type settings_cntr: dict
    """
    # Load the words and characters lists
    words_list = load_pickle_file(
        dir_root.joinpath(settings_cntr['words_list_file_name']))
    chars_list = load_pickle_file(
        dir_root.joinpath(settings_cntr['characters_list_file_name']))

    for csv_entry in csv_split:
        # Get audio file name
        file_name_audio = Path(csv_entry[settings_ann['audio_file_column']])

        # Check if the audio file existed originally
        if not dir_audio.joinpath(file_name_audio).exists():
            raise FileNotFoundError(
                'Audio file {f_name_audio} does not exist in {d_audio}'.format(
                    f_name_audio=file_name_audio, d_audio=dir_audio))

        # Flag for checking if there are data files for the audio file
        audio_has_data_files = False

        # Get the original audio data
        data_audio_original = load_audio_file(audio_file=str(
            dir_audio.joinpath(file_name_audio)),
                                              sr=int(settings_audio['sr']),
                                              mono=settings_audio['to_mono'])

        for data_file in dir_root.joinpath(dir_data).iterdir():
            # Get the stem of the audio file name
            f_stem = str(data_file).split('file_')[-1].split('.wav_')[0]

            if f_stem == file_name_audio.stem:
                audio_has_data_files = True
                # Get the numpy record array
                data_array = load_numpy_object(data_file)

                # Get the audio data from the numpy record array
                data_audio_rec_array = data_array['audio_data'].item()

                # Compare the lengths
                if len(data_audio_rec_array) != len(data_audio_original):
                    raise ValueError(
                        'File {f_audio} was not saved successfully to the numpy '
                        'object {f_np}.'.format(f_audio=file_name_audio,
                                                f_np=data_file))

                # Check all elements, one to one
                if not all(data_audio_original[i] == data_audio_rec_array[i]
                           for i in range(len(data_audio_original))):
                    raise ValueError(
                        'Numpy object {} has wrong audio data.'.format(
                            data_file))

                # Get the original caption
                caption_index = data_array['caption_ind'].item()

                # Clean it to remove any spaces before punctuation.
                original_caption = clean_sentence(
                    sentence=csv_entry[settings_ann['captions_fields_prefix'].
                                       format(caption_index + 1)],
                    keep_case=True,
                    remove_punctuation=False,
                    remove_specials=not settings_ann['use_special_tokens'])

                # Check with the file caption
                caption_data_array = clean_sentence(
                    sentence=data_array['caption'].item(),
                    keep_case=True,
                    remove_punctuation=False,
                    remove_specials=not settings_ann['use_special_tokens'])

                if not original_caption == caption_data_array:
                    raise ValueError(
                        'Numpy object {} has wrong caption.'.format(data_file))

                # Since caption in the file is OK, we can use it instead of
                # the original, because it already has the special tokens.
                caption_data_array = clean_sentence(
                    sentence=data_array['caption'].item(),
                    keep_case=settings_ann['keep_case'],
                    remove_punctuation=settings_ann[
                        'remove_punctuation_words'],
                    remove_specials=not settings_ann['use_special_tokens'])

                # Check with the indices of words
                words_indices = data_array['words_ind'].item()
                caption_from_words = ' '.join(
                    [words_list[i] for i in words_indices])

                if not caption_data_array == caption_from_words:
                    raise ValueError(
                        'Numpy object {} has wrong words indices.'.format(
                            data_file))

                # Check with the indices of characters
                caption_from_chars = ''.join(
                    [chars_list[i] for i in data_array['chars_ind'].item()])

                caption_data_array = clean_sentence(
                    sentence=data_array['caption'].item(),
                    keep_case=settings_ann['keep_case'],
                    remove_punctuation=settings_ann[
                        'remove_punctuation_chars'],
                    remove_specials=not settings_ann['use_special_tokens'])

                if not caption_data_array == caption_from_chars:
                    raise ValueError('Numpy object {} has wrong characters '
                                     'indices.'.format(data_file))

        if not audio_has_data_files:
            raise FileNotFoundError(
                'Audio file {} has no associated data.'.format(
                    file_name_audio))
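
# A sketch of the stem matching used in the checking loop above: data file
# names are assumed to embed the audio file name between 'file_' and
# '.wav_' (as hinted by the np_file_name_template setting); the name below
# is purely illustrative.
def _demo_stem_matching():
    data_file_name = 'clotho_file_birds chirping.wav_2.npy'
    f_stem = data_file_name.split('file_')[-1].split('.wav_')[0]
    assert f_stem == 'birds chirping'
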