Example 1
# Imports assumed from torchvision; the source module may provide its own transforms.
from torchvision.transforms import Compose, Normalize, Resize, ToTensor

def build_transforms(cfg):
    # Resize, convert to tensor, then normalize with config-supplied statistics.
    transforms = Compose([
        Resize(cfg.TRANSFORMS.RESIZE_SIZE),
        ToTensor(),
        Normalize(mean=cfg.TRANSFORMS.MEAN, std=cfg.TRANSFORMS.STD)
    ])
    return transforms
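A minimal usage sketch for build_transforms, assuming a yacs/Namespace-style config; every value below is illustrative, not from the source:

from types import SimpleNamespace

# Hypothetical config mirroring the cfg.TRANSFORMS.* keys read above.
cfg = SimpleNamespace(TRANSFORMS=SimpleNamespace(
    RESIZE_SIZE=(224, 224),
    MEAN=[0.485, 0.456, 0.406],  # ImageNet statistics, a common default
    STD=[0.229, 0.224, 0.225],
))

transforms = build_transforms(cfg)  # pass to e.g. ImageFolder(..., transform=transforms)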
Example 2
def get_dataset(rhythm, split, features):
    if features == "raw":
        transforms = None
        cache = False
    elif features == "ms":
        # Mel-spectrogram transform; 8000 is presumably the sample rate.
        transforms = Compose([MelSpectrogram(8000)])
        cache = True
    else:
        # Without this guard, an unknown `features` value hits a NameError below.
        raise ValueError("Unknown features type: {}".format(features))
    log.info("Transforms: {}".format(transforms))
    return LibrivoxDataset(split, rhythm, transforms=transforms, cache=cache)
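A hedged call sketch; LibrivoxDataset, log and the transform classes belong to the snippet's repository, and the argument values are purely illustrative:

# Mel-spectrogram features: transforms applied, results cached.
train_ds = get_dataset(rhythm=True, split="train", features="ms")
# Raw waveform features: no transforms, no caching.
raw_ds = get_dataset(rhythm=True, split="train", features="raw")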
Example 3
    def __init__(self,
                 data_dir,
                 split='train',
                 transform=None,
                 img_size=416,
                 rtn_path=False,
                 keep_difficult=False):
        """Dataset for VOC data.
		Args:
			data_dir: the root of the VOC2007 or VOC2012 dataset, the directory contains the following sub-directories:
				Annotations, ImageSets, JPEGImages, SegmentationClass, SegmentationObject.
		"""
        self.rtn_path = rtn_path
        if split == 'train':
            transform = [
                ConvertFromInts(),
                PhotometricDistort(),
                Expand([123, 117, 104]),
                RandomSampleCrop(),
                RandomMirror(),
                ToPercentCoords(),
                Resize(img_size),
                SubtractMeans([123, 117, 104]),
                ToTensor(),
            ]
        else:
            transform = [
                Resize(img_size),
                #ToPercentCoords(),
                SubtractMeans([123, 117, 104]),
                ToTensor()
            ]
        self.transform = Compose(transform)
        self.data_dir = data_dir
        self.split = split
        # VOC07+12 image sets for train/val; VOC07 only for the test split.
        years = [2007, 2012] if split != 'test' else [2007]
        image_sets_file = [
            os.path.join(self.data_dir, f'VOC{year}', "ImageSets", "Main",
                         f"{self.split}.txt") for year in years
        ]
        self.ids = VOCDataset._read_image_ids(image_sets_file)
        self.keep_difficult = keep_difficult
        self.batch_count = 0
        self.img_size = img_size
        self.min_size = self.img_size - 3 * 32
        self.max_size = self.img_size + 3 * 32
        # `class_names` is expected to be defined as a class attribute on VOCDataset.
        self.class_dict = {
            class_name: i
            for i, class_name in enumerate(self.class_names)
        }
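A usage sketch for the constructor; the class name VOCDataset comes from the _read_image_ids call above, while the paths are illustrative:

# Expects data_dir/VOC2007 and data_dir/VOC2012 for the non-test splits.
dataset = VOCDataset(data_dir="datasets/VOCdevkit",
                     split="train",
                     img_size=416,
                     keep_difficult=False)
print(len(dataset.ids), "images,", len(dataset.class_dict), "classes")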
Example 4
    def val_dataloader(self):
        transforms = Compose([self.text_transform, ToNumpy(), AudioSqueeze()])
        dataset_val = get_dataset(self.config,
                                  part='val',
                                  transforms=transforms)
        # batch_size=1 lets no_pad_collate batch variable-length audio without padding.
        return torch.utils.data.DataLoader(dataset_val,
                                           batch_size=1,
                                           collate_fn=no_pad_collate,
                                           num_workers=1)
Example 5
    def get_transforms(stage: str = None,
                       mode: str = None,
                       image_size: int = 256,
                       min_multiple: int = 64,
                       detection_pixel_threshold: float = 0.5,
                       detection_area_threshold: int = 10):
        """

        :param stage:
        :param mode:
        :param image_size: maximal image side for transformed image
        :param min_multiple: minimal multiple for each image side of transformed image
            (it is recommended to use your neural network scale here)
        :param detection_pixel_threshold: threshold to binarize "detection" output channels
        :param detection_area_threshold: threshold to filter-out too small objects (with area < threshold)
        :return:
        """
        pre_transform_fn = pre_transforms(image_size=image_size,
                                          min_multiple=min_multiple)

        if mode == _Modes.TRAIN:
            post_transform_fn = Compose([
                hard_transform(image_size=image_size),
                post_transforms(),
            ])
        elif mode == _Modes.VALID:
            post_transform_fn = Compose([
                soft_transform(image_size=image_size),
                post_transforms(),
            ])
        elif mode == _Modes.INFER:
            post_transform_fn = post_transforms()
        else:
            raise NotImplementedError()

        transform_fn = Compose([pre_transform_fn, post_transform_fn])
        converter = Converter(TargetMapInfo())

        process = DictTransformer(converter=converter,
                                  transform_fn=transform_fn,
                                  build_before=False)

        return process
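A sketch of how the returned DictTransformer might be applied; the sample-dict keys and the callable interface are assumptions, not confirmed by the snippet:

import numpy as np

process = get_transforms(stage="train", mode=_Modes.TRAIN, image_size=256)
sample = {"image": np.zeros((512, 512, 3), dtype=np.uint8)}  # hypothetical sample format
transformed = process(sample)  # assumes DictTransformer is callable on a dict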
Example 6
    def train_dataloader(self):
        transforms = Compose([self.text_transform, ToNumpy(), AudioSqueeze()])
        dataset_train = get_dataset(self.config,
                                    part='train',
                                    transforms=transforms)
        # Shuffle for training; batch size and worker count come from the config (Example 7).
        return torch.utils.data.DataLoader(
            dataset_train,
            batch_size=self.batch_size,
            collate_fn=no_pad_collate,
            shuffle=True,
            num_workers=self.num_workers)
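train_dataloader and val_dataloader follow the PyTorch Lightning hook names and read attributes set in Example 7, so they are presumably methods of the same Tacotron2Trainer LightningModule; under that assumption, training would be wired up as:

import pytorch_lightning as pl

model = Tacotron2Trainer(config)  # config as sketched after Example 7
trainer = pl.Trainer(max_epochs=100)
trainer.fit(model)  # Lightning calls train_dataloader()/val_dataloader() itself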
Example 7
    def __init__(self, config, Vocoder=None):
        super(Tacotron2Trainer, self).__init__()
        fix_seeds(seed=config.train.seed)
        self.model = Tacotron2(config)
        self.lr = config.train.lr
        self.batch_size = config.train.batch_size
        # Optional hyperparameters fall back to defaults when absent from the config.
        self.weight_decay = config.train.get('weight_decay', 0.)
        self.num_workers = config.train.get('num_workers', 4)
        self.step_size = config.train.get('step_size', 15)
        self.gamma = config.train.get('gamma', 0.2)
        self.text_transform = TextPreprocess(config.alphabet)
        self.mel = MelSpectrogram()
        self.gpu = ToGpu('cuda' if torch.cuda.is_available() else 'cpu')
        self.preprocess = Compose([AddLengths(), Pad()])
        self.mseloss = nn.MSELoss()
        # BCE-with-logits loss for the stop-token ("gate") prediction.
        self.gate_bce = nn.BCEWithLogitsLoss()
        self.g = config.train.get('guiding_window_width', 0.2)
        if Vocoder is not None:
            self.vocoder = Vocoder().eval()
        else:
            self.vocoder = None
        self.config = config
        self.sample_rate = config.dataset.get('sample_rate', 16000)
        self.epoch_idx = 0
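The config.train.get(...) calls imply a dict-like config with both attribute access and .get defaults; a minimal sketch with omegaconf (the repository may use a different config helper):

from omegaconf import OmegaConf

# Only the fields read by the lines above; Tacotron2(config) itself will need more.
config = OmegaConf.create({
    "alphabet": "abcdefghijklmnopqrstuvwxyz '",
    "train": {"seed": 42, "lr": 1e-3, "batch_size": 32},
    "dataset": {},
})
trainer = Tacotron2Trainer(config)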