def train_dataloader(self, batch_size):
    return DataLoader(AudioDataset(HPText.dataset,
                                   start_idx=0,
                                   end_idx=HPText.num_train,
                                   durations=False),
                      batch_size=batch_size,
                      collate_fn=self.collate,
                      shuffle=True)

def val_dataloader(self, batch_size):
    dataset = AudioDataset(HPText.dataset,
                           start_idx=HPText.num_train,
                           end_idx=HPText.num_valid,
                           durations=False)
    return DataLoader(dataset,
                      batch_size=batch_size,
                      collate_fn=self.collate,
                      shuffle=False,
                      sampler=SequentialSampler(dataset))
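
A minimal usage sketch for the two methods above, assuming they belong to a model or datamodule class that also defines self.collate; the class name SpeedySpeech here is only a placeholder:

model = SpeedySpeech()                          # hypothetical owner of the two methods
train_dl = model.train_dataloader(batch_size=32)
val_dl = model.val_dataloader(batch_size=32)
batch = next(iter(val_dl))                      # one batch, built by self.collate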
Example no. 3
def _parse_function(example_proto, y_label=None):
    features = {
        LABEL: tf.FixedLenFeature([], dtype=tf.string),
        NAME:  tf.FixedLenFeature([], dtype=tf.string),
        AUDIO: tf.VarLenFeature(dtype=tf.float32),  # variable length, max 396900 samples
    }
    parsed_features = tf.parse_single_example(example_proto, features)

    # Expand dims for channels: (num_samples,) -> (num_samples, 1)
    audio = tf.expand_dims(tf.concat(parsed_features[AUDIO].values, axis=0), axis=-1)

    if y_label:
        # Look up the label only when one is requested; doing this
        # unconditionally would raise a KeyError when y_label is None.
        label = AudioDataset.str_label_to_int(parsed_features[y_label], LABEL_TO_INT_DICT)
        return (audio, label)
    else:
        return (audio,)
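
A hedged sketch of wiring _parse_function into a tf.data input pipeline; the TFRecord filename and batch size are placeholders, and the TF1-style API (tf.parse_single_example, make_one_shot_iterator) matches the snippet above:

dataset = tf.data.TFRecordDataset('train.tfrecord')   # placeholder path
dataset = dataset.map(lambda proto: _parse_function(proto, y_label=LABEL))
dataset = dataset.padded_batch(32, padded_shapes=([None, 1], []))  # pad variable-length audio
audio_batch, label_batch = dataset.make_one_shot_iterator().get_next()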
Example no. 4
import argparse

import torch
from torch.utils.data import DataLoader
from barbar import Bar

# Repo-local imports; module paths are assumed to match the other examples on
# this page (the HPText and mask locations in particular are best guesses).
from datasets.AudioDataset import AudioDataset
from duration_extractor_custom import Collate
from hparam import HPText
from utils.masked import mask

parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=64, type=int, help="Batch size")
parser.add_argument("--num_samples",
                    default=4096,
                    type=int,
                    help="On how many samples to compute statistics")
parser.add_argument("--dataset",
                    default=HPText.dataset,
                    type=str,
                    help="Path to dataset")
args = parser.parse_args()

collate = Collate('cuda' if torch.cuda.is_available() else 'cpu',
                  standardize=False)
dl = DataLoader(AudioDataset(args.dataset,  # honor the --dataset CLI flag
                             alignments=True,
                             end_idx=args.num_samples),
                collate_fn=collate,
                batch_size=args.batch_size,
                shuffle=False)

maxi = float('-inf')
mini = float('inf')

mean = 0
std = 0
w = 0

def is_monotonous(x):
    # Helper name is an assumption; checks column-wise (along dim 0) that the
    # values of x never decrease, e.g. for validating alignments.
    dx = x[1:] - x[:-1]
    return torch.all(dx >= 0, dim=0)


for i, b in enumerate(Bar(dl), 1):
    s, slen, _, plen, _, _ = b
    m = mask(s, slen, dim=1)     # True at valid (non-padded) positions
    # Assumed reconstruction of the truncated loop body, consistent with the
    # maxi/mini/mean/std/w accumulators initialized above.
    vals = s[m]                  # keep only unpadded spectrogram values
    maxi = max(maxi, vals.max().item())
    mini = min(mini, vals.min().item())
    n = vals.numel()
    mean = (w * mean + vals.sum().item()) / (w + n)       # running mean
    std = (w * std + (vals ** 2).sum().item()) / (w + n)  # running E[x^2]
    w += n
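
The loop above only maintains running sums; a short sketch of the final reduction and report, following the E[x^2] form used in the loop:

std = (std - mean ** 2) ** 0.5    # Var[x] = E[x^2] - (E[x])^2
print(f'min: {mini}, max: {maxi}')
print(f'mean: {mean}, std: {std}')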


if __name__ == '__main__':
    import argparse
    import sys
    sys.path.append('code')

    from torch.utils.data import DataLoader
    from torch.utils.data.sampler import SequentialSampler

    from datasets.AudioDataset import AudioDataset
    from duration_extractor_custom import DurationExtractor, Collate

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", type=str, help="Path to checkpoint of convolutional_cacotron model")
    parser.add_argument("data_folder", type=str, help="Where the data live and where to save durations.")
    parser.add_argument("--durations_filename", default='durations.txt', type=str, help="Name of the final durations file.")
    parser.add_argument("--batch_size", default=64, type=int, help="Batch size")
    args = parser.parse_args()

    # Load pretrained checkpoint and extract alignments to data_folder
    m = DurationExtractor().load(args.checkpoint)
    dataset = AudioDataset(root=args.data_folder, durations=False)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, collate_fn=Collate(m.device),
                            shuffle=False, sampler=SequentialSampler(dataset))

    # save_alignments_as_fertilities is presumably defined earlier in this
    # script; it is not among the imports above.
    save_alignments_as_fertilities(m, dataloader, args.data_folder, args.durations_filename)

#python code/extract_durations.py checkpoints/aligner_ind_10500.pth code/datasets/data/TITML-IDN-LJ --batch_size 16 --durations_filename durations_aligner_ind_10500.txt
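
For orientation only, a hypothetical sketch of what a fertility-saving helper could look like; this is not the script's actual save_alignments_as_fertilities, whose interface and output format may differ:

import os
import torch

def save_fertilities_sketch(model, dataloader, folder, filename):
    # Hypothetical interface: assume the model exposes an align() method that
    # returns a hard alignment of shape (batch, time, phonemes) per batch.
    with open(os.path.join(folder, filename), 'w') as f, torch.no_grad():
        for batch in dataloader:
            alignment = model.align(batch)             # hypothetical call
            fertilities = alignment.sum(dim=1).long()  # spectrogram frames per phoneme
            for row in fertilities:
                f.write(','.join(str(int(d)) for d in row) + '\n')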