def train_dataloader(self, batch_size):
    return DataLoader(
        AudioDataset(HPText.dataset, start_idx=0, end_idx=HPText.num_train, durations=False),
        batch_size=batch_size, collate_fn=self.collate, shuffle=True)
def val_dataloader(self, batch_size):
    # Note: end_idx is an absolute index into the dataset, so HPText.num_valid
    # marks the end of the validation slice rather than its size.
    dataset = AudioDataset(HPText.dataset, start_idx=HPText.num_train, end_idx=HPText.num_valid, durations=False)
    return DataLoader(dataset, batch_size=batch_size, collate_fn=self.collate,
                      shuffle=False, sampler=SequentialSampler(dataset))
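# Hypothetical usage (the enclosing model class is not shown in this excerpt;
# `model` stands for an instance providing these methods and self.collate):
#
#   train_dl = model.train_dataloader(batch_size=32)
#   val_dl = model.val_dataloader(batch_size=32)
#
# SequentialSampler keeps validation batches in dataset order, so per-utterance
# outputs can be matched back to the underlying files.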
def _parse_function(example_proto, y_label=None):
    features = {
        LABEL: tf.FixedLenFeature([], dtype=tf.string),
        NAME: tf.FixedLenFeature([], dtype=tf.string),
        AUDIO: tf.VarLenFeature(dtype=tf.float32),  # max 396900 samples
    }
    parsed_features = tf.parse_single_example(example_proto, features)
    # Densify the sparse audio feature and expand dims for channels
    audio = tf.expand_dims(tf.concat(parsed_features[AUDIO].values, axis=0), axis=-1)
    if y_label:
        # Only look the label up when a label key is requested; indexing
        # parsed_features with y_label=None would raise a KeyError
        label = AudioDataset.str_label_to_int(parsed_features[y_label], LABEL_TO_INT_DICT)
        return (audio, label)
    return (audio,)
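# Hypothetical usage of _parse_function with the TF 1.x tf.data API; the
# TFRecord path and batch size are placeholders, not taken from the repo:
ds = tf.data.TFRecordDataset('train.tfrecord')
ds = ds.map(lambda proto: _parse_function(proto, y_label=LABEL))
ds = ds.padded_batch(32, padded_shapes=([None, 1], []))  # pad variable-length audio, scalar labels
audio_batch, label_batch = ds.make_one_shot_iterator().get_next()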
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", default=64, type=int, help="Batch size")
parser.add_argument("--num_samples", default=4096, type=int, help="Number of samples to compute statistics on")
parser.add_argument("--dataset", default=HPText.dataset, type=str, help="Path to dataset")
args = parser.parse_args()

collate = Collate('cuda' if torch.cuda.is_available() else 'cpu', standardize=False)
# Use the parsed --dataset argument; the original passed HPText.dataset
# directly, leaving the flag without effect
dl = DataLoader(AudioDataset(args.dataset, alignments=True, end_idx=args.num_samples),
                collate_fn=collate, batch_size=args.batch_size, shuffle=False)

# Running statistics over the masked (non-padded) spectrogram values
maxi, mini = float('-inf'), float('inf')
mean, std, w = 0, 0, 0
for i, b in enumerate(Bar(dl), 1):
    s, slen, _, plen, _, _ = b
    m = mask(s, slen, dim=1)  # boolean mask of valid frames
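# `mask` is defined elsewhere in the repo; a minimal sketch of what it is
# assumed to do for the call above (s: (batch, time, channels) padded
# spectrograms, slen: (batch,) valid frame counts, dim=1 the time axis):
# a boolean tensor shaped like s, True only at non-padding positions,
# so s[m] selects exactly the real values.
import torch

def mask(t, lengths, dim=1):
    idx = torch.arange(t.shape[dim], device=t.device)   # frame indices (time,)
    valid = idx.unsqueeze(0) < lengths.unsqueeze(1)     # (batch, time)
    while valid.dim() < t.dim():                        # add trailing dims to match t
        valid = valid.unsqueeze(-1)
    return valid.expand_as(t)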
import torch

def is_nondecreasing(x):  # function name assumed; the original def line is missing from this excerpt
    dx = x[1:] - x[:-1]
    return torch.all(dx >= 0, dim=0)


if __name__ == '__main__':
    import argparse
    import sys
    sys.path.append('code')

    from torch.utils.data import DataLoader
    from torch.utils.data.sampler import SequentialSampler
    from datasets.AudioDataset import AudioDataset
    from duration_extractor_custom import DurationExtractor, Collate

    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint", type=str, help="Path to checkpoint of the convolutional tacotron model")
    parser.add_argument("data_folder", type=str, help="Where the data live and where to save durations.")
    parser.add_argument("--durations_filename", default='durations.txt', type=str, help="Name of the final durations file.")
    parser.add_argument("--batch_size", default=64, type=int, help="Batch size")
    args = parser.parse_args()

    # Load the pretrained checkpoint and extract alignments into data_folder
    m = DurationExtractor().load(args.checkpoint)
    dataset = AudioDataset(root=args.data_folder, durations=False)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, collate_fn=Collate(m.device),
                            shuffle=False, sampler=SequentialSampler(dataset))
    save_alignments_as_fertilities(m, dataloader, args.data_folder, args.durations_filename)

# Example invocation:
# python code/extract_durations.py checkpoints/aligner_ind_10500.pth code/datasets/data/TITML-IDN-LJ --batch_size 16 --durations_filename durations_aligner_ind_10500.txt
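# save_alignments_as_fertilities is defined elsewhere in this file; an
# illustrative sketch (not the repo's implementation) of the core idea:
# per output frame, take the argmax input index of the attention matrix,
# then count how many frames land on each input token. The non-decreasing
# check above can validate that this argmax sequence is monotonic.
def fertilities_from_alignment(att, plen, slen):
    # att: (time_out, time_in) attention weights for one utterance
    argmax = att[:slen, :plen].argmax(dim=-1)        # winning input index per frame
    return torch.bincount(argmax, minlength=plen)    # frames attending to each token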