def build_transforms(cfg):
    # Standard image preprocessing: resize, convert to tensor, then normalize
    # with the statistics supplied by the config.
    transforms = Compose([
        Resize(cfg.TRANSFORMS.RESIZE_SIZE),
        ToTensor(),
        Normalize(mean=cfg.TRANSFORMS.MEAN, std=cfg.TRANSFORMS.STD),
    ])
    return transforms
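# Usage sketch for build_transforms, assuming a config object with a TRANSFORMS
# node. The attribute names mirror what the function reads; the values here are
# the common ImageNet statistics, not the project's actual settings.
from types import SimpleNamespace

cfg = SimpleNamespace(TRANSFORMS=SimpleNamespace(
    RESIZE_SIZE=(224, 224),
    MEAN=[0.485, 0.456, 0.406],
    STD=[0.229, 0.224, 0.225],
))
pipeline = build_transforms(cfg)  # pipeline(pil_image) -> normalized tensor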
def get_dataset(rhythm, split, features):
    if features == "raw":
        # Raw waveforms: nothing to precompute, so no caching.
        transforms = None
        cache = False
    elif features == "ms":
        # Mel-spectrogram features at 8 kHz; cache them since they are reused.
        transforms = Compose([MelSpectogram(8000)])
        cache = True
    else:
        # Fail loudly instead of hitting a NameError on `transforms` below.
        raise ValueError("Unknown features type: {}".format(features))
    log.info("Transforms: {}".format(transforms))
    return LibrivoxDataset(split, rhythm, transforms=transforms, cache=cache)
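# Usage sketch for get_dataset. The meaning of `rhythm` is defined by
# LibrivoxDataset elsewhere in the project; the value below is a placeholder.
train_ds = get_dataset(rhythm="rhythm", split="train", features="ms")  # cached mel features
val_ds = get_dataset(rhythm="rhythm", split="val", features="raw")     # raw audio, no cache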
def __init__(self, data_dir, split='train', transform=None, img_size=416,
             rtn_path=False, keep_difficult=False):
    """Dataset for VOC data.

    Args:
        data_dir: the root of the VOC2007 or VOC2012 dataset; the directory
            contains the following sub-directories: Annotations, ImageSets,
            JPEGImages, SegmentationClass, SegmentationObject.
        split: which image set to load ('train', 'val', 'trainval' or 'test').
        transform: ignored; the pipeline is rebuilt below from `split`.
        img_size: target square image size for Resize.
        rtn_path: whether to additionally return the image path.
        keep_difficult: whether to keep objects annotated as difficult.
    """
    self.rtn_path = rtn_path
    if split == 'train':
        # Training pipeline: photometric and geometric augmentation.
        transform = [
            ConvertFromInts(),
            PhotometricDistort(),
            Expand([123, 117, 104]),
            RandomSampleCrop(),
            RandomMirror(),
            ToPercentCoords(),
            Resize(img_size),
            SubtractMeans([123, 117, 104]),
            ToTensor(),
        ]
    else:
        # Evaluation pipeline: deterministic resize and normalization only.
        transform = [
            Resize(img_size),
            # ToPercentCoords(),
            SubtractMeans([123, 117, 104]),
            ToTensor(),
        ]
    self.transform = Compose(transform)
    self.data_dir = data_dir
    self.split = split
    if split != 'test':
        # Train/val image ids come from the union of VOC2007 and VOC2012.
        years = [2007, 2012]
    else:
        # Test image ids are only available for VOC2007.
        years = [2007]
    image_sets_file = [
        os.path.join(self.data_dir, f'VOC{year}', "ImageSets", "Main",
                     "%s.txt" % self.split)
        for year in years
    ]
    self.ids = VOCDataset._read_image_ids(image_sets_file)
    self.keep_difficult = keep_difficult
    self.batch_count = 0
    self.img_size = img_size  # was hard-coded to 416, silently ignoring the argument
    self.min_size = self.img_size - 3 * 32
    self.max_size = self.img_size + 3 * 32
    self.class_dict = {
        class_name: i
        for i, class_name in enumerate(self.class_names)
    }
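# Usage sketch for VOCDataset, assuming the standard VOCdevkit layout with
# VOC2007/ and VOC2012/ side by side under `data_dir` (placeholder path below).
voc_train = VOCDataset(data_dir="/data/VOCdevkit", split="train")
voc_test = VOCDataset(data_dir="/data/VOCdevkit", split="test")
sample = voc_train[0]  # image plus targets, per the class's __getitem__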
def val_dataloader(self):
    # Validation runs with batch_size=1 and no padding, so variable-length
    # audio samples pass through untouched.
    transforms = Compose([self.text_transform, ToNumpy(), AudioSqueeze()])
    dataset_val = get_dataset(self.config, part='val', transforms=transforms)
    return torch.utils.data.DataLoader(
        dataset_val,
        batch_size=1,
        collate_fn=no_pad_collate,
        num_workers=1,
    )
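# `no_pad_collate` comes from the surrounding project. A minimal sketch of what
# a no-padding collate typically looks like (an assumption, not the project's
# actual implementation): keep each field as a Python list rather than stacking
# into a padded tensor.
def no_pad_collate_sketch(batch):
    # batch: list of per-sample dicts; returns a dict of lists, no padding
    return {key: [sample[key] for sample in batch] for key in batch[0]}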
def get_transforms(stage: str = None,
                   mode: str = None,
                   image_size: int = 256,
                   min_multiple: int = 64,
                   detection_pixel_threshold: float = 0.5,
                   detection_area_threshold: int = 10):
    """
    :param stage: experiment stage name (not used in this function body)
    :param mode: one of the _Modes values (TRAIN, VALID or INFER); selects the
        augmentation pipeline
    :param image_size: maximal image side for transformed image
    :param min_multiple: minimal multiple for each image side of transformed
        image (it is recommended to use your neural network scale here)
    :param detection_pixel_threshold: threshold to binarize "detection" output
        channels
    :param detection_area_threshold: threshold to filter out too small objects
        (with area < threshold)
    :return: a DictTransformer wrapping the assembled transform pipeline
    """
    pre_transform_fn = pre_transforms(image_size=image_size,
                                      min_multiple=min_multiple)
    if mode == _Modes.TRAIN:
        # Heavy augmentation for training.
        post_transform_fn = Compose([
            hard_transform(image_size=image_size),
            post_transforms(),
        ])
    elif mode == _Modes.VALID:
        # Light augmentation for validation.
        post_transform_fn = Compose(
            [soft_transform(image_size=image_size), post_transforms()])
    elif mode == _Modes.INFER:
        post_transform_fn = post_transforms()
    else:
        raise NotImplementedError("Unknown mode: {}".format(mode))
    transform_fn = Compose([pre_transform_fn, post_transform_fn])
    converter = Converter(TargetMapInfo())
    process = DictTransformer(converter=converter,
                              transform_fn=transform_fn,
                              build_before=False)
    return process
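# Usage sketch for get_transforms. _Modes is the module's own mode enum; the
# stage argument is accepted for API symmetry and ignored here.
train_process = get_transforms(stage="train", mode=_Modes.TRAIN, image_size=256)
infer_process = get_transforms(mode=_Modes.INFER)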
def train_dataloader(self):
    # Training shuffles and batches; samples are padded later in the pipeline
    # (see the Compose([AddLengths(), Pad()]) preprocessing in the trainer).
    transforms = Compose([self.text_transform, ToNumpy(), AudioSqueeze()])
    dataset_train = get_dataset(self.config, part='train', transforms=transforms)
    return torch.utils.data.DataLoader(
        dataset_train,
        batch_size=self.batch_size,
        collate_fn=no_pad_collate,
        shuffle=True,
        num_workers=self.num_workers,
    )
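# train_dataloader/val_dataloader follow the PyTorch Lightning hook convention,
# so (assuming Tacotron2Trainer is a LightningModule) a standard Trainer can
# drive them directly:
import pytorch_lightning as pl

trainer = pl.Trainer(max_epochs=10)
trainer.fit(Tacotron2Trainer(config))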
def __init__(self, config, Vocoder=None):
    super(Tacotron2Trainer, self).__init__()
    fix_seeds(seed=config.train.seed)
    self.model = Tacotron2(config)
    # Optimizer / scheduler hyperparameters, with defaults for missing keys.
    self.lr = config.train.lr
    self.batch_size = config.train.batch_size
    self.weight_decay = config.train.get('weight_decay', 0.)
    self.num_workers = config.train.get('num_workers', 4)
    self.step_size = config.train.get('step_size', 15)
    self.gamma = config.train.get('gamma', 0.2)
    # Data pipeline: text preprocessing, mel-spectrogram extraction, device
    # transfer, then length bookkeeping and padding.
    self.text_transform = TextPreprocess(config.alphabet)
    self.mel = MelSpectrogram()
    self.gpu = ToGpu('cuda' if torch.cuda.is_available() else 'cpu')
    self.preprocess = Compose([AddLengths(), Pad()])
    # Losses: MSE on mel frames, BCE with logits on the stop-token gate.
    self.mseloss = nn.MSELoss()
    self.gate_bce = nn.BCEWithLogitsLoss()
    self.g = config.train.get('guiding_window_width', 0.2)
    if Vocoder is not None:
        self.vocoder = Vocoder().eval()
    else:
        self.vocoder = None
    self.config = config
    self.sample_rate = config.dataset.get('sample_rate', 16000)
    self.epoch_idx = 0
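# A minimal config sketch matching the keys read above (illustrative values;
# the real project config is richer, since Tacotron2(config) reads more keys).
# An attribute-dict style is used so both dot access and .get(...) work, as the
# constructor requires.
class AttrDict(dict):
    __getattr__ = dict.__getitem__

config = AttrDict(
    train=AttrDict(seed=42, lr=1e-3, batch_size=32),
    dataset=AttrDict(sample_rate=16000),
    alphabet="abcdefghijklmnopqrstuvwxyz '",
)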