def __init__(
            self,
            file_name,
            sequence_len: int,
            hop: int,
            sr: int = 44100,
            fft_size: int = 4096,
            fft_hop: int = 441,
            n_freq_bins: int = 256,
            freq_compression: str = "linear",
            f_min: int = 200,
            f_max: int = 18000,
            cache_dir=None
    ):
        """Load an audio file and build a cached spectrogram transform chain.

        Args:
            file_name: path of the audio file to load (loaded mono at ``sr``).
            sequence_len: length of one sequence, in spectrogram frames.
            hop: hop between consecutive sequences, in frames.
            sr: target sample rate when loading the audio.
            fft_size: FFT window size for the spectrogram.
            fft_hop: hop length (samples) between FFT frames.
            n_freq_bins: number of frequency bins after compression.
            freq_compression: one of ``"linear"``, ``"mel"`` or ``"mfcc"``.
            f_min: lower frequency bound (Hz) for the compression stage.
            f_max: upper frequency bound (Hz) for the compression stage.
            cache_dir: directory handed to ``T.CachedSpectrogram`` for on-disk
                caching. NOTE(review): the caching wrapper is built even when
                this is None -- confirm that is intended.

        Raises:
            ValueError: if ``freq_compression`` is not a known mode.
        """
        self.sequence_len = sequence_len
        self.hop = hop

        self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
        # Total number of samples in the loaded (mono) signal.
        self.n_frames = self.audio.shape[1]

        # Base chain: pre-emphasis followed by the STFT spectrogram.
        self.t = [
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(fft_size, fft_hop, center=False),
        ]

        if freq_compression == "linear":
            self.t.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
        elif freq_compression == "mel":
            self.t.append(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
        elif freq_compression == "mfcc":
            t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
            self.t.append(T.Compose(t_mel, T.M2MFCC()))
        else:
            # BUG FIX: raising a plain string is a TypeError in Python 3;
            # raise a proper exception instead.
            raise ValueError("Undefined frequency compression")
        self.t.append(
            T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"]))
        self.t.append(
            T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
            ))

        # Wrap the whole chain so repeated reads of the same file can reuse
        # the computed spectrogram from cache_dir.
        self.t = T.CachedSpectrogram(
            cache_dir=cache_dir,
            spec_transform=T.Compose(self.t),
            n_fft=fft_size,
            hop_length=fft_hop,
        )
# Example #2
 def _create_transform(self):
     """Install the random training-time augmentations on ``self.transforms``.

     Crop size comes from ``self.opt.crop_height`` / ``self.opt.crop_width``.
     """
     print('Set Augmentation...')
     augment_ops = [
         transforms.RandomCrop([self.opt.crop_height, self.opt.crop_width]),
         transforms.RandomVerticalFlip(),
         transforms.RandomHorizontalFlip(),
     ]
     self.transforms = transforms.Compose(augment_ops)
    def __init__(
        self,
        file_name,
        sequence_len: int,
        hop: int,
        sr: int = 44100,
        fft_size: int = 4096,
        fft_hop: int = 441,
        n_freq_bins: int = 256,
        freq_compression: str = "linear",
        f_min: int = 200,
        f_max: int = 18000,
        center=True
    ):
        """Load an audio file and prepare the spectrogram transform stages.

        Unlike the composed variant, the stages are kept separate
        (``spec_transforms``, ``t_compr_f``, ``t_compr_a``, ``t_norm``) so the
        caller can apply them individually.

        Args:
            file_name: path of the audio file to load (loaded mono at ``sr``).
            sequence_len: length of one sequence, in spectrogram frames.
            hop: hop between consecutive sequences, in frames.
            sr: target sample rate when loading the audio.
            fft_size: FFT window size for the spectrogram.
            fft_hop: hop length (samples) between FFT frames.
            n_freq_bins: number of frequency bins after compression.
            freq_compression: one of ``"linear"``, ``"mel"`` or ``"mfcc"``.
            f_min: lower frequency bound (Hz) for the compression stage.
            f_max: upper frequency bound (Hz) for the compression stage.
            center: forwarded to ``T.Spectrogram`` (frame centering).

        Raises:
            ValueError: if ``freq_compression`` is not a known mode.
        """
        self.sp = signal.signal_proc()

        self.hop = hop
        self.center = center
        self.filename = file_name
        self.sequence_len = sequence_len
        self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
        # Total number of samples in the loaded (mono) signal.
        self.n_frames = self.audio.shape[1]

        spec_t = [
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(fft_size, fft_hop, center=self.center),
        ]

        self.spec_transforms = T.Compose(spec_t)

        if freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        elif freq_compression == "mfcc":
            t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
            self.t_compr_f = T.Compose(t_mel, T.M2MFCC())
        else:
            # BUG FIX: raising a plain string is a TypeError in Python 3.
            raise ValueError("Undefined frequency compression")

        self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])

        self.t_norm = T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"]
        )
def evaluate(args):
    """Detect facial landmarks on one image with a pretrained model.

    Loads the snapshot given by ``args.model``, detects a face box on
    ``args.image`` with ``args.face_detector``, runs the network, prints the
    landmark coordinates, shows the result, and saves it next to the input
    image as ``<name>_result.jpg``. Requires a CUDA device.
    """
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    print('The image is {:}'.format(args.image))
    print('The model is {:}'.format(args.model))
    snapshot = Path(args.model)
    # BUG FIX: the failure message was never formatted with the actual path.
    assert snapshot.exists(), 'The model path {:} does not exist'.format(snapshot)
    facebox = face_detect(args.image, args.face_detector)

    print('The face bounding box is {:}'.format(facebox))
    assert len(facebox) == 4, 'Invalid face input : {:}'.format(facebox)
    snapshot = torch.load(str(snapshot))

    # Evaluation-time preprocessing (same normalization statistics as used
    # during training).
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    param = snapshot['args']
    eval_transform = transforms.Compose(
        [transforms.PreCrop(param.pre_crop_expand), transforms.TrainScale2WH((param.crop_width, param.crop_height)),
         transforms.ToTensor(), normalize])
    model_config = load_configure(param.model_config, None)
    dataset = GeneralDataset(eval_transform, param.sigma, model_config.downsample, param.heatmap_type, param.data_indicator)
    dataset.reset(param.num_pts)

    net = obtain_model(model_config, param.num_pts + 1)
    net = net.cuda()
    weights = remove_module_dict(snapshot['state_dict'])
    net.load_state_dict(weights)
    print('Prepare input data')
    [image, _, _, _, _, _, cropped_size], meta = dataset.prepare_input(args.image, facebox)
    inputs = image.unsqueeze(0).cuda()
    # Network forward pass (inference only, no gradients).
    with torch.no_grad():
        batch_heatmaps, batch_locs, batch_scos = net(inputs)
    # Obtain the locations on the image in the original size.
    cpu = torch.device('cpu')
    np_batch_locs, np_batch_scos, cropped_size = batch_locs.to(cpu).numpy(), batch_scos.to(
        cpu).numpy(), cropped_size.numpy()
    # Drop the last point channel (the model outputs num_pts + 1 points).
    locations, scores = np_batch_locs[0, :-1, :], np.expand_dims(np_batch_scos[0, :-1], -1)

    # Map network-space coordinates back into original-image coordinates.
    scale_h, scale_w = cropped_size[0] * 1. / inputs.size(-2), cropped_size[1] * 1. / inputs.size(-1)

    locations[:, 0], locations[:, 1] = locations[:, 0] * scale_w + cropped_size[2], locations[:, 1] * scale_h + \
                                       cropped_size[3]
    prediction = np.concatenate((locations, scores), axis=1).transpose(1, 0)

    print('the coordinates for {:} facial landmarks:'.format(param.num_pts))
    for i in range(param.num_pts):
        point = prediction[:, i]
        print('the {:02d}/{:02d}-th point : ({:.1f}, {:.1f}), score = {:.2f}'.format(i+1, param.num_pts, float(point[0]),
                                                                                     float(point[1]), float(point[2])))
    image = draw_image_by_points(args.image, prediction, 2, (255, 0, 0), facebox, None, None)
    image.show()
    image.save(args.image.split('.')[0] + '_result.jpg')
# Example #5
def build_transforms(config):
    """Create the (train, test) preprocessing pipelines for person re-id.

    Training adds random cropping, horizontal flipping and random erasing;
    testing only resizes. Both normalize with ImageNet statistics.
    """
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]

    train_ops = [
        T.RandomCroping(config.DATA.HEIGHT,
                        config.DATA.WIDTH,
                        p=config.AUG.RC_PROB),
        T.RandomHorizontalFlip(),
        T.ToTensor(),
        T.Normalize(mean=imagenet_mean, std=imagenet_std),
        T.RandomErasing(probability=config.AUG.RE_PROB),
    ]

    test_ops = [
        T.Resize((config.DATA.HEIGHT, config.DATA.WIDTH)),
        T.ToTensor(),
        T.Normalize(mean=imagenet_mean, std=imagenet_std),
    ]

    return T.Compose(train_ops), T.Compose(test_ops)
# Example #6
def Generate_transform_Dict(origin_width=256,
                            width=227,
                            ratio=0.16,
                            rot=0,
                            args=None):
    """Build a dict of image transform pipelines keyed by crop strategy.

    Args:
        origin_width: resize target applied before cropping.
        width: output crop size.
        ratio: lower bound of the random-resized-crop area scale.
        rot: unused here; kept for interface compatibility with callers.
        args: if given and ``args.net`` contains "ResNet", use RGB input with
            ImageNet statistics; otherwise use Caffe-style BGR preprocessing.

    Returns:
        dict with keys ``'rand-crop'``, ``'center-crop'`` and ``'resize'``.
    """
    # (removed unused local std_value = 1.0 / 255.0)
    if (args is not None) and ("ResNet" in args.net):
        # ResNet backbones: RGB input normalized with ImageNet statistics.
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        cc = []
    else:
        # Caffe-style backbones: convert to BGR, subtract per-channel means
        # given in 0-255 scale, and rescale std accordingly.
        normalize = transforms.Normalize(
            mean=[104 / 255.0, 117 / 255.0, 128 / 255.0],
            std=[1.0 / 255, 1.0 / 255, 1.0 / 255])
        print("bgr init")
        cc = [transforms.CovertBGR()]

    transform_dict = {
        'rand-crop': transforms.Compose(cc + [
            transforms.Resize(origin_width),
            transforms.RandomResizedCrop(scale=(ratio, 1), size=width),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]),
        'center-crop': transforms.Compose(cc + [
            transforms.Resize(origin_width),
            transforms.CenterCrop(width),
            transforms.ToTensor(),
            normalize,
        ]),
        'resize': transforms.Compose(cc + [
            transforms.Resize(width),
            transforms.ToTensor(),
            normalize,
        ]),
    }
    return transform_dict
    def __init__(self,
                 file_name,
                 sequence_len: int,
                 hop: int,
                 sr: int = 44100,
                 fft_size: int = 4096,
                 fft_hop: int = 441,
                 n_freq_bins: int = 256,
                 freq_compression: str = "linear",
                 f_min: int = 200,
                 f_max: int = 18000):
        """Load an audio file and compose the full spectrogram transform.

        Args:
            file_name: path of the audio file to load (loaded mono at ``sr``).
            sequence_len: length of one sequence, in spectrogram frames.
            hop: hop between consecutive sequences, in frames.
            sr: target sample rate when loading the audio.
            fft_size: FFT window size for the spectrogram.
            fft_hop: hop length (samples) between FFT frames.
            n_freq_bins: number of frequency bins after compression.
            freq_compression: one of ``"linear"``, ``"mel"`` or ``"mfcc"``.
            f_min: lower frequency bound (Hz) for the compression stage.
            f_max: upper frequency bound (Hz) for the compression stage.

        Raises:
            ValueError: if ``freq_compression`` is not a known mode.
        """
        self.sequence_len = sequence_len
        self.hop = hop

        self.audio = T.load_audio_file(file_name, sr=sr, mono=True)
        self.n_frames = self.audio.shape[
            1]  # total num of samples in the audio (transposed mono)

        self.t = [
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(fft_size, fft_hop, center=False),
        ]

        if freq_compression == "linear":
            self.t.append(T.Interpolate(n_freq_bins, sr, f_min, f_max))
        elif freq_compression == "mel":
            self.t.append(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
        elif freq_compression == "mfcc":
            t_mel = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
            self.t.append(T.Compose(t_mel, T.M2MFCC()))
        else:
            # BUG FIX: raising a plain string is a TypeError in Python 3.
            raise ValueError("Undefined frequency compression")
        self.t.append(
            T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"]))
        self.t.append(
            T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
            ))

        # Collapse the stage list into one callable transform.
        self.t = T.Compose(self.t)
def get_transform(train, dataset_name):
    """Assemble the segmentation preprocessing pipeline for one dataset split.

    Chooses the input size by whether *dataset_name* is the source or target
    dataset, applies either random-resize-and-crop or plain resize (per
    config), remaps labels, converts to tensor and normalizes.
    """
    base_size = cfg.DATA_TRANSFORM.LOADSIZE
    crop_size = cfg.DATA_TRANSFORM.CROPSIZE
    ignore_label = cfg.DATASET.IGNORE_LABEL

    if dataset_name == cfg.DATASET.SOURCE:
        input_size = cfg.DATA_TRANSFORM.INPUT_SIZE_S
    else:
        input_size = cfg.DATA_TRANSFORM.INPUT_SIZE_T

    # NOTE(review): both branches of the min_size factor are 1.0 -- looks
    # like a tunable that was never changed; behavior kept as-is.
    min_size = int((1.0 if train else 1.0) * base_size)
    max_size = int((1.3 if train else 1.0) * base_size)

    ops = []
    if cfg.DATA_TRANSFORM.RANDOM_RESIZE_AND_CROP:
        if train:
            ops += [
                T.RandomResize(min_size, max_size),
                T.RandomHorizontalFlip(0.5),
                T.RandomCrop(crop_size, ignore_label=ignore_label),
            ]
        else:
            ops.append(T.Resize(cfg.DATA_TRANSFORM.INPUT_SIZE_T, True))
    else:
        if train:
            ops += [T.Resize(input_size), T.RandomHorizontalFlip(0.5)]
        else:
            ops.append(T.Resize(input_size, True))

    mapping = get_label_map(cfg.DATASET.SOURCE, cfg.DATASET.TARGET)
    ops.append(T.LabelRemap(mapping[dataset_name]))
    ops.append(T.ToTensor(cfg.DATASET.IMG_MODE))

    if cfg.DATASET.IMG_MODE == "BGR":
        # Caffe-style BGR means in 0-255 range, no std scaling.
        mean = (104.00698793, 116.66876762, 122.67891434)
        std = (1.0, 1.0, 1.0)
    else:
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
    ops.append(T.Normalize(mean, std))

    return T.Compose(ops)
def get_transform(dataset_name):
    """Build the evaluation-time preprocessing pipeline for *dataset_name*.

    Resizes to the target input size, remaps labels, converts to tensor and
    normalizes according to ``cfg.DATASET.IMG_MODE``.
    """
    # (removed dead locals base_size/min_size/max_size/ignore_label -- they
    # were computed but never used in this variant)
    transforms = []
    transforms.append(T.Resize(cfg.DATA_TRANSFORM.INPUT_SIZE_T, True))

    mapping = get_label_map(cfg.DATASET.SOURCE, cfg.DATASET.TARGET)
    transforms.append(T.LabelRemap(mapping[dataset_name]))
    transforms.append(T.ToTensor(cfg.DATASET.IMG_MODE))
    if cfg.DATASET.IMG_MODE == "BGR":
        # Caffe-style BGR means in 0-255 range, no std scaling.
        mean = (104.00698793, 116.66876762, 122.67891434)
        std = (1.0, 1.0, 1.0)
    else:
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
    transforms.append(T.Normalize(mean, std))

    return T.Compose(transforms)
# Example #10
    def __init__(
            self,
            # exp params
            exp_name="u50_block",
            # arch params
            backbone="resnet50",
            backbone_kwargs=None,
            dim_embedding=256,
            feature_spatial_scale=0.25,
            max_junctions=512,
            junction_pooling_threshold=0.2,
            junc_pooling_size=15,
            attention_sigma=1.,
            junction_heatmap_criterion="binary_cross_entropy",
            block_inference_size=64,
            adjacency_matrix_criterion="binary_cross_entropy",
            # data params
            data_root=r"/home/ziheng/indoorDist_new2",
            img_size=416,
            junc_sigma=3.,
            batch_size=2,
            # train params
            gpus=None,
            num_workers=5,
            resume_epoch="latest",
            is_train_junc=True,
            is_train_adj=True,
            # vis params
            vis_junc_th=0.3,
            vis_line_th=0.3):
        """Build the line-segment-detection training harness.

        Constructs the LSD model, sets up TensorBoard logging and checkpoint
        directories, resumes from a checkpoint if one exists, and creates the
        train/eval datasets and loaders.
        """
        # BUG FIX: ``backbone_kwargs={}`` and ``gpus=[0,]`` were mutable
        # default arguments, shared across calls; use None sentinels and
        # materialize fresh objects here (backward compatible: omitting the
        # arguments behaves exactly as before).
        backbone_kwargs = {} if backbone_kwargs is None else backbone_kwargs
        gpus = [0] if gpus is None else gpus

        os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(c) for c in gpus)

        # An empty gpu list means CPU-only operation.
        self.is_cuda = bool(gpus)

        self.model = LSDModule(
            backbone=backbone,
            dim_embedding=dim_embedding,
            backbone_kwargs=backbone_kwargs,
            junction_pooling_threshold=junction_pooling_threshold,
            max_junctions=max_junctions,
            feature_spatial_scale=feature_spatial_scale,
            junction_heatmap_criterion=junction_heatmap_criterion,
            junction_pooling_size=junc_pooling_size,
            attention_sigma=attention_sigma,
            block_inference_size=block_inference_size,
            adjacency_matrix_criterion=adjacency_matrix_criterion,
            # NOTE(review): ``weight_fn`` is not defined in this scope;
            # presumably a module-level callable -- confirm.
            weight_fn=weight_fn,
            is_train_adj=is_train_adj,
            is_train_junc=is_train_junc)

        self.exp_name = exp_name
        os.makedirs(os.path.join("log", exp_name), exist_ok=True)
        os.makedirs(os.path.join("ckpt", exp_name), exist_ok=True)
        self.writer = SummaryWriter(log_dir=os.path.join("log", exp_name))

        # Checkpoint state; overwritten below when a resume file is found.
        self.states = dict(last_epoch=-1, elapsed_time=0, state_dict=None)

        if resume_epoch and os.path.isfile(
                os.path.join("ckpt", exp_name,
                             f"train_states_{resume_epoch}.pth")):
            states = torch.load(
                os.path.join("ckpt", exp_name,
                             f"train_states_{resume_epoch}.pth"))
            print(f"resume traning from epoch {states['last_epoch']}")
            self.model.load_state_dict(states["state_dict"])
            self.states.update(states)

        # Training data: resized, randomly flipped, color-augmented.
        self.train_data = SISTLine(data_root=data_root,
                                   transforms=tf.Compose(
                                       tf.Resize((img_size, img_size)),
                                       tf.RandomHorizontalFlip(),
                                       tf.RandomColorAug()),
                                   phase="train",
                                   sigma_junction=junc_sigma,
                                   max_junctions=max_junctions)

        self.train_loader = DataLoader(self.train_data,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       num_workers=num_workers,
                                       pin_memory=True)

        # Evaluation data: resize only, fixed order, no augmentation.
        self.eval_data = SISTLine(data_root=data_root,
                                  transforms=tf.Compose(
                                      tf.Resize((img_size, img_size)), ),
                                  phase="val",
                                  sigma_junction=junc_sigma,
                                  max_junctions=max_junctions)

        self.eval_loader = DataLoader(self.eval_data,
                                      batch_size=batch_size,
                                      shuffle=False,
                                      num_workers=num_workers,
                                      pin_memory=True)

        self.vis_junc_th = vis_junc_th
        self.vis_line_th = vis_line_th
        self.block_size = block_inference_size
        self.max_junctions = max_junctions
        self.is_train_junc = is_train_junc
        self.is_train_adj = is_train_adj
# Example #11
def main():
    """Run a pretrained lesion-detection Mask R-CNN over the test split and
    write per-image overlays (ground truth in green, predictions in red) to
    the ``simple_test`` directory."""
    # One anchor configuration per FPN level (5 levels, shared sizes/ratios).
    anchor_generator = AnchorGenerator(sizes=tuple([(16, 24, 32, 48, 96)
                                                    for _ in range(5)]),
                                       aspect_ratios=tuple([
                                           (0.5, 1.0, 2.0) for _ in range(5)
                                       ]))
    rpnhead = RPNHead(256, anchor_generator.num_anchors_per_location()[0])
    model = maskrcnn_resnet50_fpn(num_classes=2,
                                  pretrained_backbone=True,
                                  max_size=MAX_SIZE,
                                  rpn_head=rpnhead,
                                  rpn_anchor_generator=anchor_generator,
                                  rpn_pre_nms_top_n_train=12000,
                                  rpn_pre_nms_top_n_test=6000,
                                  rpn_post_nms_top_n_train=2000,
                                  rpn_post_nms_top_n_test=300,
                                  rpn_fg_iou_thresh=0.5,
                                  rpn_bg_iou_thresh=0.3,
                                  rpn_positive_fraction=0.7,
                                  bbox_reg_weights=(1.0, 1.0, 1.0, 1.0),
                                  box_batch_size_per_image=32)
    # Load the fine-tuned DeepLesion weights onto CPU.
    model.load_state_dict(
        torch.load('saved_models' + os.sep + '0_deeplesion.pth',
                   map_location='cpu'))
    # Identical preprocessing for all three splits: HU restoration, intensity
    # windowing, spacing-aware resize, tensor conversion.
    data_transforms = {
        'train':
        T.Compose([
            T.ToOriginalHU(INTENSITY_OFFSET),
            T.IntensityWindowing(WINDOWING),
            T.SpacingResize(NORM_SPACING, MAX_SIZE),
            T.ToTensor()
        ]),
        'val':
        T.Compose([
            T.ToOriginalHU(INTENSITY_OFFSET),
            T.IntensityWindowing(WINDOWING),
            T.SpacingResize(NORM_SPACING, MAX_SIZE),
            T.ToTensor()
        ]),
        'test':
        T.Compose([
            T.ToOriginalHU(INTENSITY_OFFSET),
            T.IntensityWindowing(WINDOWING),
            T.SpacingResize(NORM_SPACING, MAX_SIZE),
            T.ToTensor()
        ])
    }
    image_datasets = {
        x: DeepLesion(DIR_IN + os.sep + x, GT_FN_DICT[x], data_transforms[x])
        for x in ['train', 'val', 'test']
    }

    dataloaders = {
        x: DataLoader(image_datasets[x],
                      batch_size=3,
                      shuffle=True,
                      num_workers=0,
                      collate_fn=BatchCollator)
        for x in ['train', 'val', 'test']
    }
    for batch_id, (inputs, targets) in enumerate(dataloaders['test']):
        outputs = test_model(model, inputs)
        # Suppress overlapping detections (threshold 0.655).
        outputs = remove_overlapping(outputs, 0.655)
        for image, target, output in zip(inputs, targets, outputs):
            # Replicate the single-channel slice into a 3-channel float image
            # so colored boxes/masks can be drawn on it.
            img_copy = image.squeeze().numpy()
            images = [img_copy] * 3
            images = [im.astype(float) for im in images]
            img_copy = cv2.merge(images)
            # Ground truth: green boxes; masks painted into channel 0.
            for bbox, pseudo_mask in zip(target["boxes"], target["masks"]):
                bbox = bbox.squeeze().numpy()
                bbox = np.int16(bbox)
                mask = pseudo_mask.squeeze().numpy()
                cv2.rectangle(img_copy, (bbox[0], bbox[1]), (bbox[2], bbox[3]),
                              (0, 255, 0), 1)
                msk_idx = np.where(mask == 1)
                img_copy[msk_idx[0], msk_idx[1], 0] = 255
            # Predictions: red boxes with score labels; masks into channel 2.
            # NOTE(review): the ``break`` assumes scores arrive sorted in
            # descending order -- confirm against the model's output contract.
            for predbox, predmask, score in zip(output['boxes'],
                                                output['masks'],
                                                output['scores']):
                if score < 0.655:
                    break
                predbox = predbox.numpy()
                predmask = predmask.squeeze().numpy()
                score = score.numpy()
                # Binarize the soft mask at 0.5.
                predmask = np.where(predmask > 0.5, 1, 0)
                cv2.rectangle(img_copy, (predbox[0], predbox[1]),
                              (predbox[2], predbox[3]), (0, 0, 255), 1)
                pmsk_idx = np.where(predmask == 1)
                img_copy[pmsk_idx[0], pmsk_idx[1], 2] = 255
                cv2.putText(img_copy, str(score),
                            (int(predbox[0]), int(predbox[1] - 5)),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1,
                            cv2.LINE_AA)
            # cv2.imshow(str(target['image_id']), img_copy)
            cv2.imwrite(
                'simple_test' + os.sep +
                str(target['image_id']).replace(os.sep, '_') + '_pred.jpg',
                img_copy * 255)
# Example #12
    def __init__(self,
                 file_names: Iterable[str],
                 working_dir=None,
                 cache_dir=None,
                 sr=44100,
                 n_fft=4096,
                 hop_length=441,
                 freq_compression="linear",
                 n_freq_bins=256,
                 f_min=0,
                 f_max=18000,
                 seq_len=128,
                 augmentation=False,
                 noise_files=None,
                 min_max_normalize=False,
                 *args,
                 **kwargs):
        """Dataset turning audio files into (optionally augmented) spectrograms.

        Args:
            file_names: audio files belonging to this dataset.
            working_dir: base directory that file names are relative to.
            cache_dir: if given, computed spectrograms are cached there.
            sr: sample rate used when loading audio.
            n_fft: FFT window size for the spectrogram.
            hop_length: hop length (samples) between FFT frames.
            freq_compression: one of ``"linear"``, ``"mel"`` or ``"mfcc"``.
            n_freq_bins: number of frequency bins after compression.
            f_min: lower frequency bound (Hz) for the compression stage.
            f_max: upper frequency bound (Hz) for the compression stage.
            seq_len: fixed subsequence length sampled/padded per item.
            augmentation: enable random amplitude/time/pitch augmentation.
            noise_files: noise recordings for additive-noise augmentation.
            min_max_normalize: use min-max normalization instead of dB norm.

        Raises:
            ValueError: if ``freq_compression`` is not a valid mode.
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        # BUG FIX: ``noise_files=[]`` was a mutable default argument, shared
        # across instances; use a None sentinel instead.
        noise_files = [] if noise_files is None else noise_files
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max

        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if freq_compression not in valid_freq_compressions:
            # BUG FIX: the original passed the template string and a spurious
            # ``format(...)`` call as two separate ValueError arguments, so the
            # message was never interpolated.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    freq_compression, valid_freq_compressions))
        self.freq_compression = freq_compression

        # Regexes used to classify a file name as call vs. noise.
        self.possible_call_labels = re.compile("|".join(["call"]))
        self.possible_nocall_labels = re.compile("|".join(["noise"]))

        self._logger.debug("Number of files : {}".format(len(self.file_names)))

        _n_calls = 0
        for f in self.file_names:
            if self.is_call(f):
                _n_calls += 1

        self._logger.debug("Number of calls: {}".format(_n_calls))
        self._logger.debug(
            "Number of noise: {}".format(len(self.file_names) - _n_calls))

        self.augmentation = augmentation

        # Base pipeline: load file -> pre-emphasis -> STFT spectrogram.
        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False),
        ]
        self.file_reader = AsyncFileReader()
        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            # Cache computed spectrograms on disk keyed by the STFT params.
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader(),
            )
        if augmentation:
            self._logger.debug(
                "Init augmentation transforms for time and pitch shift")
            self.t_amplitude = T.RandomAmplitude(3, -6)
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
        else:
            self._logger.debug("Running without augmentation")
        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr,
                                   n_mels=n_freq_bins,
                                   f_min=f_min,
                                   f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
            self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
        else:
            # Unreachable after the membership check above, but kept as a
            # proper exception (was a string raise -- TypeError in Python 3).
            raise ValueError("Undefined frequency compression")
        if augmentation:
            if noise_files:
                self._logger.debug(
                    "Init augmentation transform for random noise addition")
                self.t_addnoise = T.RandomAddNoise(
                    noise_files,
                    self.t_spectrogram,
                    T.Compose(self.t_timestretch, self.t_pitchshift,
                              self.t_compr_f),
                    min_length=seq_len,
                    return_original=True)
            else:
                self.t_addnoise = None
        self.t_compr_a = T.Amp2Db(
            min_level_db=DefaultSpecDatasetOps["min_level_db"])

        if min_max_normalize:
            self.t_norm = T.MinMaxNormalize()
            self._logger.debug("Init min-max-normalization activated")
        else:
            self.t_norm = T.Normalize(
                min_level_db=DefaultSpecDatasetOps["min_level_db"],
                ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
            )
            self._logger.debug("Init 0/1-dB-normalization activated")

        self.t_subseq = T.PaddedSubsequenceSampler(seq_len,
                                                   dim=1,
                                                   random=augmentation)
# Example #13
def main():
    """Train a DeepLesion lesion-detection model and checkpoint every epoch.

    Sets up file logging, builds identical preprocessing pipelines for the
    train/val/test splits, trains with SGD for 10 epochs, evaluates the
    validation split after each epoch, and writes the model weights to
    ``saved_models/<epoch>_deeplesion.pth``.
    """
    logging.basicConfig(filename='logs' + os.sep + 'example.log',
                        level=logging.DEBUG)
    # All three splits share the same preprocessing chain: restore original
    # HU values, apply intensity windowing, normalize spacing/size, then
    # convert to a tensor.
    data_transforms = {
        'train':
        T.Compose([
            T.ToOriginalHU(INTENSITY_OFFSET),
            T.IntensityWindowing(WINDOWING),
            T.SpacingResize(NORM_SPACING, MAX_SIZE),
            T.ToTensor()
        ]),
        'val':
        T.Compose([
            T.ToOriginalHU(INTENSITY_OFFSET),
            T.IntensityWindowing(WINDOWING),
            T.SpacingResize(NORM_SPACING, MAX_SIZE),
            T.ToTensor()
        ]),
        'test':
        T.Compose([
            T.ToOriginalHU(INTENSITY_OFFSET),
            T.IntensityWindowing(WINDOWING),
            T.SpacingResize(NORM_SPACING, MAX_SIZE),
            T.ToTensor()
        ])
    }

    logging.info('Loading data sets')
    # One DeepLesion dataset per split, rooted at DIR_IN/<split> with the
    # split-specific ground-truth file from GT_FN_DICT.
    image_datasets = {
        x: DeepLesion(DIR_IN + os.sep + x, GT_FN_DICT[x], data_transforms[x])
        for x in ['train', 'val', 'test']
    }
    logging.info('data sets loaded')
    logging.info('Loading data loaders')
    # num_workers=0: all data loading happens in the main process.
    dl_dataloaders = {
        x: DataLoader(image_datasets[x],
                      batch_size=3,
                      shuffle=True,
                      num_workers=0,
                      collate_fn=BatchCollator)
        for x in ['train', 'val', 'test']
    }

    logging.info('data loaders loaded\n')
    dl_dataset_sizes = {
        x: len(image_datasets[x])
        for x in ['train', 'val', 'test']
    }

    # Commented-out block: visual sanity check that draws the boxes and
    # pseudo-masks of each training batch with OpenCV.
    # for batch_id, (inputs, targets) in enumerate(dl_dataloaders['train']):
    #     i = 0
    #     for i, (image, target) in enumerate(zip(inputs, targets)):
    #         img_copy = image.squeeze().numpy()
    #         images = [img_copy] * 3
    #         images = [im.astype(float) for im in images]
    #         img_copy = cv2.merge(images)
    #         for j, (bbox, pseudo_mask) in enumerate(zip(target["boxes"], target["masks"])):
    #             bbox = target["boxes"][j].squeeze().numpy()
    #             bbox = np.int16(bbox)
    #             mask = target["masks"][j].squeeze().numpy()
    #             cv2.rectangle(img_copy, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 255, 0), 1)
    #             msk_idx = np.where(mask == 1)
    #             img_copy[msk_idx[0], msk_idx[1], 0] = 255
    #         cv2.imshow(str(batch_id) + " " + str(i), img_copy)
    #
    # cv2.waitKey(0)
    # cv2.destroyAllWindows()
    # NOTE(review): get_model's positional flags are defined elsewhere —
    # presumably (pretrained, pretrained_backbone, num_classes); confirm.
    dl_model = get_model(False, True, 2)

    # Only optimize parameters that require gradients (frozen layers skipped).
    params = [p for p in dl_model.parameters() if p.requires_grad]

    # Observe that not all parameters are being optimized
    optimizer_ft = SGD(params, lr=0.001, momentum=0.9, weight_decay=0.0001)
    # optimizer_ft = Adam(params, lr=0.001)

    # Decay LR by a factor of 0.1 every 7 epochs
    # exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=4, gamma=0.1)
    # exp_lr_scheduler = lr_scheduler.CosineAnnealingLR(optimizer_ft, T_max=100)

    num_epochs = 10
    since = time.time()
    # best_model_wts = copy.deepcopy(dl_model.state_dict())
    # best_llf = 0
    # best_nlf = 999

    # Record the effective optimizer hyper-parameters for the experiment log.
    logging.info('momentum:' +
                 str(optimizer_ft.state_dict()['param_groups'][0]['momentum']))
    logging.info(
        'weight_decay:' +
        str(optimizer_ft.state_dict()['param_groups'][0]['weight_decay']))
    # logging.info('LR decay gamma:' + str(exp_lr_scheduler.state_dict()['gamma']))
    # logging.info('LR decay step size:' + str(exp_lr_scheduler.state_dict()['step_size']) + '\n')

    for epoch in range(num_epochs):
        # deep_copy_flag = False
        logging.info('Epoch {}/{}'.format(epoch, num_epochs - 1))
        logging.info('-' * 20)
        train_one_epoc(dl_model, optimizer_ft, dl_dataloaders['train'],
                       dl_dataset_sizes['train'])

        # LLF/NLF: presumably lesion-localization fraction vs. non-lesion
        # false positives — confirm against evaluate()'s definition.
        llf, nlf = evaluate(dl_model, dl_dataloaders['val'])

        logging.info('LLF: {}'.format(llf))
        logging.info('NLF: {}'.format(nlf) + '\n')

        # exp_lr_scheduler.step()

        # if llf > best_llf:
        #     deep_copy_flag = True
        #     best_nlf = nlf
        #     best_llf = llf
        # elif (llf == best_llf) & (nlf < best_nlf):
        #     deep_copy_flag = True
        #     best_nlf = nlf
        # if deep_copy_flag:
        # NOTE(review): the best-model selection above is commented out, so
        # the weights are deep-copied and saved unconditionally every epoch.
        best_model_wts = copy.deepcopy(dl_model.state_dict())
        torch.save(best_model_wts,
                   'saved_models' + os.sep + str(epoch) + '_deeplesion.pth')
    time_elapsed = time.time() - since
    logging.info('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    def __init__(
            self,
            file_names: Iterable[str],
            working_dir=None,
            cache_dir=None,
            sr=44100,
            n_fft=2048,  #4096
            hop_length=220,  #441
            freq_compression="linear",
            n_freq_bins=256,  # determines the width of the image
            f_min=0,
            f_max=18000,
            seq_len=128,  # shd be adjusted together with sequence_len in class StridedAudioDataset (called by predict.py)
            augmentation=False,
            noise_files=None,  # was a mutable default ([]); None avoids sharing one list across calls
            *args,
            **kwargs):
        """Initialize the call/noise classification dataset.

        Builds the (optionally cached) spectrogram pipeline, the selected
        frequency-compression transform, optional amplitude/time/pitch/noise
        augmentation, dB conversion, normalization, and fixed-length
        subsequence sampling.

        Args:
            file_names: audio file names; class balance is derived from them
                via ``self.is_call``.
            working_dir: passed to the base class (root of the audio files).
            cache_dir: directory for cached ``.spec`` files; ``None`` disables
                caching.
            sr: target sample rate used when loading audio.
            n_fft: STFT window size.
            hop_length: STFT hop size.
            freq_compression: one of ``"linear"``, ``"mel"``, ``"mfcc"``.
            n_freq_bins: number of frequency bins (spectrogram image width).
            f_min: lower frequency bound for the compression transform.
            f_max: upper frequency bound for the compression transform.
            seq_len: fixed subsequence length (time frames) fed to the model.
            augmentation: enable random amplitude/time/pitch/noise transforms.
            noise_files: real-world noise files for additive-noise
                augmentation; only used when ``augmentation`` is on.

        Raises:
            ValueError: if ``freq_compression`` is not a supported mode.
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        # Normalize the None sentinel back to a list so downstream
        # truthiness checks behave exactly as with the old [] default.
        if noise_files is None:
            noise_files = []

        self.n_fft = n_fft
        self.hop_length = hop_length
        self.f_min = f_min
        self.f_max = f_max

        # mel: log transformation of freq (Hz scale to Mel scale)
        # attention: Mel-spectrograms as a network input led to an excessive loss of resolution in higher frequency bands, which was
        # a big problem considering the high-frequency pulsed calls and whistles.
        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if freq_compression not in valid_freq_compressions:
            # BUG FIX: the message was never formatted — the original passed
            # the template and a stray `format(...)` element as two separate
            # ValueError arguments instead of calling str.format.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    freq_compression, valid_freq_compressions))
        self.freq_compression = freq_compression

        # combine a RegExp pattern into pattern objects for pattern matching
        self.possible_call_labels = re.compile("|".join(["call"]))
        self.possible_nocall_labels = re.compile("|".join(["noise"]))

        self._logger.debug("Number of files : {}".format(len(self.file_names)))

        # Count labelled calls to log the class balance of the dataset.
        _n_calls = 0
        for f in self.file_names:
            if self.is_call(f):
                _n_calls += 1

        self._logger.debug("Number of calls: {}".format(_n_calls))
        self._logger.debug(
            "Number of noise: {}".format(len(self.file_names) - _n_calls))

        self.augmentation = augmentation

        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),  # return: a vector tensor
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False),
        ]
        self.file_reader = AsyncFileReader()
        # if user chooses to not cache .spec by omitting the directory
        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            # where .spec is created and stored
            # n_fft, hop_length: meta in spec_dict
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader(),
            )
        if augmentation:
            self._logger.debug(
                "Init augmentation transforms for time and pitch shift")
            self.t_amplitude = T.RandomAmplitude(3, -6)
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
        else:
            self._logger.debug("Running without augmentation")
        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr,
                                   n_mels=n_freq_bins,
                                   f_min=f_min,
                                   f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max),
                T.M2MFCC())
        else:
            # BUG FIX: `raise "..."` is itself a TypeError in Python 3
            # (exceptions must derive from BaseException). Unreachable after
            # the validation above, but kept as a defensive guard.
            raise ValueError("Undefined frequency compression")
        if augmentation:
            if noise_files:
                self._logger.debug(
                    "Init augmentation transform for random noise addition")
                self.t_addnoise = T.RandomAddNoise(
                    noise_files,
                    self.t_spectrogram,
                    T.Compose(self.t_timestretch, self.t_pitchshift,
                              self.t_compr_f),
                    min_length=seq_len,
                    return_original=True
                )  # if return_original = True, both augmented and original specs are returned
            else:
                self.t_addnoise = None
        self.t_compr_a = T.Amp2Db(
            min_level_db=DefaultSpecDatasetOps["min_level_db"])
        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )
        self.t_subseq = T.PaddedSubsequenceSampler(seq_len,
                                                   dim=1,
                                                   random=augmentation)
Exemple #15
0
    def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=4096,
        hop_length=441,
        freq_compression="linear",
        n_freq_bins=256,
        f_min=0,
        f_max=18000,
        seq_len=128,
        augmentation=False,
        noise_files_train=None,  # was a mutable default ([]); None avoids shared state
        noise_files_val=None,
        noise_files_test=None,
        random=False,
        *args,
        **kwargs
    ):
        """Initialize the noise2noise denoising dataset.

        Builds the (optionally cached) spectrogram pipeline, the selected
        frequency compression, optional intensity/time/pitch augmentation,
        and the per-split real-world noise-addition transform used for
        noise2noise training/validation/testing.

        Args:
            file_names: audio files to denoise.
            working_dir: passed to the base class (root of the audio files).
            cache_dir: directory for cached ``.spec`` files; ``None`` disables
                caching.
            sr: target sample rate used when loading audio.
            n_fft: STFT window size.
            hop_length: STFT hop size.
            freq_compression: one of ``"linear"``, ``"mel"``, ``"mfcc"``.
            n_freq_bins: number of frequency bins.
            f_min: lower frequency bound for the compression transform.
            f_max: upper frequency bound for the compression transform.
            seq_len: fixed subsequence length (time frames).
            augmentation: enable intensity/time/pitch augmentation (train).
            noise_files_train: real-world noise files for the train split.
            noise_files_val: real-world noise files for the val split.
            noise_files_test: real-world noise files for the test split.
            random: forwarded flag stored on the instance (consumer elsewhere).

        Raises:
            ValueError: if ``freq_compression`` is invalid, or if no noise
                files match the dataset split (noise adding misconfigured).
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        # Normalize the None sentinels back to lists so the stored
        # attributes keep their historical list type for downstream users.
        noise_files_train = [] if noise_files_train is None else noise_files_train
        noise_files_val = [] if noise_files_val is None else noise_files_val
        noise_files_test = [] if noise_files_test is None else noise_files_test

        self.sp = signal.signal_proc()

        # Parameters for synthetic noise/distortion generation; consumed
        # elsewhere in the class (semantics assumed from the names —
        # confirm against the signal_proc usage).
        self.df = 15.0
        self.exp_e = 0.1
        self.bin_pow = 2.0
        self.gaus_mean = 0.0
        self.gaus_stdv = 12.5
        self.poisson_lambda = 15.0
        self.orig_noise_value = -5

        self.f_min = f_min
        self.f_max = f_max
        self.n_fft = n_fft
        self.random = random
        self.hop_length = hop_length
        self.augmentation = augmentation
        self.file_reader = AsyncFileReader()
        self.noise_files_val = noise_files_val
        self.noise_files_test = noise_files_test
        self.freq_compression = freq_compression
        self.noise_files_train = noise_files_train


        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if self.freq_compression not in valid_freq_compressions:
            # BUG FIX: the original passed an unformatted template plus a
            # stray tuple element to ValueError AND referenced a misspelled
            # attribute (`freq_compressio`), which raised AttributeError
            # whenever this error path was hit.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    self.freq_compression, valid_freq_compressions))

        self._logger.debug(
            "Number of files to denoise : {}".format(len(self.file_names))
        )

        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False),
        ]

        # Cache computed spectrograms on disk only when a cache dir is given.
        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader())

        if self.augmentation:
            self._logger.debug("Init augmentation transforms for intensity, time, and pitch shift")
            self.t_amplitude = T.RandomAmplitude(3, -6)
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
        else:
            #only for noise augmentation during validation phase - intensity, time and pitch augmentation is not used during validation/test
            self.t_timestretch = T.RandomTimeStretch()
            self.t_pitchshift = T.RandomPitchSift()
            self._logger.debug("Running without intensity, time, and pitch augmentation")

        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        elif self.freq_compression == "mfcc":
            self.t_compr_f = T.Compose(T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max))
            self.t_compr_mfcc = T.M2MFCC(n_mfcc=32)
        else:
            # BUG FIX: `raise "..."` is a TypeError in Python 3; unreachable
            # after the validation above, kept as a defensive guard.
            raise ValueError("Undefined frequency compression")

        # NOTE(review): min_snr=-2 > max_snr=-8 below looks inverted —
        # confirm the argument ordering RandomAddNoise expects.
        if self.augmentation and self.noise_files_train and self.dataset_name == "train":
            self._logger.debug("Init training real-world noise files for noise2noise adding")
            self.t_addnoise = T.RandomAddNoise(
                self.noise_files_train,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                min_snr=-2,
                max_snr=-8,
                return_original=True
            )
        elif not self.augmentation and self.noise_files_val and self.dataset_name == "val":
            self._logger.debug("Init validation real-world noise files for noise2noise adding")
            self.t_addnoise = T.RandomAddNoise(
                self.noise_files_val,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                min_snr=-2,
                max_snr=-8,
                return_original=True
            )
        elif not self.augmentation and self.noise_files_test and self.dataset_name == "test":
            self._logger.debug("Init test real-world noise files for noise2noise adding")
            self.t_addnoise = T.RandomAddNoise(
                self.noise_files_test,
                self.t_spectrogram,
                T.Compose(self.t_timestretch, self.t_pitchshift, self.t_compr_f),
                min_length=seq_len,
                min_snr=-2,
                max_snr=-8,
                return_original=True
            )
        else:
            self.t_addnoise = None  # kept for attribute completeness; the raise below aborts init
            # BUG FIX: raising a plain string is a TypeError in Python 3;
            # raise a proper exception describing the misconfiguration.
            raise ValueError(
                "ERROR: Init noise files for noise adding does not have a proper setup per split!")

        self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])

        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )

        self.t_subseq = T.PaddedSubsequenceSampler(seq_len, dim=1, random=augmentation)
Exemple #16
0
def main():
    """Train the denoising CNN on simulated-noise knee MRI data.

    Simulates k-space noise via the transform chain, trains ``DenoiseCnn``
    with Adam + MSE loss, logs train/validation losses and example images to
    TensorBoard, and checkpoints the model with the lowest validation loss
    seen so far to ``log_dir``.
    """
    print('starting denoising')

    noise_sigma = 4e-5  # sigma for the noise simulation
    batch_size = 8  # number of images to run for each minibach
    num_epochs = 200  # number of epochs to train
    validation_seed = 15  # rng seed for validation loop
    log_dir = 'logs/denoise/'  # log dir for models and tensorboard
    device = torch.device('cpu')  # model will run on this device
    dtype = torch.float  # dtype for data and model

    # set up tensorboard
    writer = SummaryWriter(log_dir=log_dir)

    # checkpoint file name
    # BUG FIX: was os.path.join(log_dir + 'best_model.pt') — string
    # concatenation inside a single-argument join; pass the parts separately.
    checkpoint_file = os.path.join(log_dir, 'best_model.pt')

    # -------------------------------------------------------------------------
    # NOISE SIMULATION SETUP
    # Add complex noise in k-space, inverse FFT, magnitude, normalize, tensor.
    transform_list = [
        transforms.AddNoise(target_op=False, sigma=noise_sigma),
        transforms.Ifft(norm='ortho'),
        transforms.SquareRootSumSquare(),
        transforms.Normalize(),
        transforms.ToTensor(dat_complex=False, target_complex=False)
    ]

    # -------------------------------------------------------------------------
    # DATALOADER SETUP
    train_dataset = KneeDataSet('pytorch_tutorial_data/',
                                'train',
                                transform=transforms.Compose(transform_list))
    print('data set information:')
    print(train_dataset)
    val_dataset = KneeDataSet('pytorch_tutorial_data/',
                              'val',
                              transform=transforms.Compose(transform_list))
    # convert to a PyTorch dataloader
    # this handles batching, random shuffling, parallelization
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=True,
    )
    # Fixed validation example (index 15) used for TensorBoard image panels.
    display_dat = val_dataset[15]['dat'].unsqueeze(0).to(device=device,
                                                         dtype=dtype)
    display_target = val_dataset[15]['target'].unsqueeze(0).to(device=device,
                                                               dtype=dtype)
    display_vmax = np.max(np.squeeze(display_dat.cpu().numpy()))

    # -------------------------------------------------------------------------
    # MODEL SETUP
    model = DenoiseCnn(num_chans=64,
                       num_layers=4,
                       magnitude_input=True,
                       magnitude_output=True)
    model = model.to(device)
    model = model.train()
    print('CNN model information:')
    print(model)

    # -------------------------------------------------------------------------
    # OPTIMIZER SETUP
    optimizer = torch.optim.Adam(model.parameters())
    loss_fn = torch.nn.MSELoss()

    # -------------------------------------------------------------------------
    # LOAD PREVIOUS STATE
    start_epoch, model, optimizer, min_val_loss = load_checkpoint(
        checkpoint_file, model, optimizer)
    current_seed = 20

    # -------------------------------------------------------------------------
    # NETWORK TRAINING
    for epoch_index in range(start_epoch, num_epochs):
        print('epoch {} of {}'.format(epoch_index + 1, num_epochs))

        # ---------------------------------------------------------------------
        # TRAINING LOOP
        model = model.train()

        # rng seed for noise generation (cuda seeding is a no-op on CPU)
        torch.manual_seed(current_seed)
        np.random.seed(current_seed)
        torch.cuda.manual_seed(current_seed)

        # batch loop
        losses = []
        for batch in train_loader:
            target = batch['target'].to(device=device, dtype=dtype)
            dat = batch['dat'].to(device=device, dtype=dtype)

            est = model(dat)  # forward propagation
            loss = loss_fn(est, target)  # calculate the loss
            optimizer.zero_grad()  # clear out old gradients
            loss.backward()  # back propagation
            optimizer.step()  # update the CNN weights

            # keep last 10 minibatches to compute training loss
            losses.append(loss.item())
            losses = losses[-10:]

        print('trailing training loss: {}'.format(np.mean(losses)))

        # ---------------------------------------------------------------------
        # EVALUATION LOOP
        model = model.eval()

        # rng seed for noise generation: remember a fresh training seed for
        # the next epoch, then fix the validation seed for reproducible eval.
        current_seed = np.random.get_state()[1][0]
        torch.manual_seed(validation_seed)
        np.random.seed(validation_seed)
        torch.cuda.manual_seed(validation_seed)

        # batch loop
        val_losses = []
        with torch.no_grad():
            for batch in val_loader:
                target = batch['target'].to(device=device, dtype=dtype)
                dat = batch['dat'].to(device=device, dtype=dtype)

                est = model(dat)
                loss = loss_fn(est, target)

                val_losses.append(loss.item())

        val_loss = np.mean(val_losses)
        print('validation loss: {}'.format(val_loss))

        # ---------------------------------------------------------------------
        # VISUALIZATIONS AND CHECKPOINTS
        if val_loss < min_val_loss:
            save_checkpoint(epoch_index, model, optimizer, val_loss,
                            checkpoint_file)
            # BUG FIX: track the running best. Previously min_val_loss was
            # never updated inside the loop, so any epoch beating the
            # *initial* loss overwrote the "best" checkpoint.
            min_val_loss = val_loss

        # write the losses
        writer.add_scalar('loss/train', np.mean(losses), epoch_index + 1)
        writer.add_scalar('loss/validation', val_loss,
                          epoch_index + 1)

        # show an example image from the validation data
        model = model.eval()
        with torch.no_grad():
            display_est = model(display_dat)

        writer.add_image('validation/dat',
                         display_dat[0] / display_vmax,
                         global_step=epoch_index + 1)
        writer.add_image('validation/cnn',
                         display_est[0] / display_vmax,
                         global_step=epoch_index + 1)
        writer.add_image('validation/target',
                         display_target[0] / display_vmax,
                         global_step=epoch_index + 1)

    writer.close()
# Use the architecture name as the checkpoint name (presumably "arc" is the
# architecture argument — confirm against the argument parser).
args.checkname = args.arc

# Define Saver
saver = Saver(args)
saver.save_experiment_config()

# Define Tensorboard Summary
summary = TensorboardSummary(saver.experiment_dir)
writer = summary.create_summary()

# Data
# ImageNet channel statistics for input normalization.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
# Training: resize then random crop + horizontal flip for augmentation.
train_trans = transforms.Compose([transforms.Resize(321),
                                  transforms.RandomCrop(224),
                                  transforms.RandomHorizontalFlip(),
                                  transforms.ToTensor(),
                                  normalize,
                                  ])
# Validation: deterministic resize + center crop.
val_trans = transforms.Compose([transforms.Resize(321),
                                transforms.CenterCrop(224),
                                transforms.ToTensor(),
                                normalize,
                                ])
# NOTE(review): dataset roots are placeholders — point them at the real
# VOC / SBD locations before running.
train_ds = VOCSBDClassification('/path/to/VOC',
                                '/path/to/SBD/benchmark_RELEASE/dataset',
                                transform=train_trans, image_set='train')
train_dl = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True, num_workers=4, drop_last=True)
val_ds = VOCClassification('/path/to/VOC', transform=val_trans, image_set='val')
val_dl = DataLoader(val_ds, batch_size=8, shuffle=True, num_workers=2, drop_last=True)

Exemple #18
0
def initialize_data_loader(DatasetClass,
                           config,
                           phase,
                           threads,
                           shuffle,
                           repeat,
                           augment_data,
                           batch_size,
                           limit_numpoints,
                           input_transform=None,
                           target_transform=None):
  """Construct a DataLoader for ``DatasetClass`` with optional augmentation.

  Assembles the pre-voxelization and input transform chains (augmentation
  steps are appended when ``augment_data`` is set), instantiates the dataset,
  and wraps it in a DataLoader that either repeats forever via ``InfSampler``
  or shuffles per epoch.
  """
  if isinstance(phase, str):
    phase = str2datasetphase_type(phase)

  # Collate factory depends on whether voxel transformations are returned.
  collate_fn = (t.cflt_collate_fn_factory(limit_numpoints)
                if config["return_transformation"] else
                t.cfl_collate_fn_factory(limit_numpoints))

  # Pre-voxelization chain: elastic distortion only when augmenting.
  prevoxel_steps = ([t.ElasticDistortion(DatasetClass.ELASTIC_DISTORT_PARAMS)]
                    if augment_data else [])
  prevoxel_transforms = t.Compose(prevoxel_steps) if prevoxel_steps else None

  # Input-feature chain: caller-supplied steps first, then the colour /
  # geometry augmentations when requested.
  input_steps = []
  if input_transform is not None:
    input_steps += input_transform
  if augment_data:
    input_steps += [
        t.RandomHorizontalFlip(DatasetClass.ROTATION_AXIS,
                               DatasetClass.IS_TEMPORAL),
        t.ChromaticAutoContrast(),
        t.ChromaticTranslation(config["data_aug_color_trans_ratio"]),
        t.ChromaticJitter(config["data_aug_color_jitter_std"]),
        t.HueSaturationTranslation(config["data_aug_hue_max"],
                                   config["data_aug_saturation_max"]),
    ]
  input_transforms = t.Compose(input_steps) if input_steps else None

  dataset = DatasetClass(
      config,
      prevoxel_transform=prevoxel_transforms,
      input_transform=input_transforms,
      target_transform=target_transform,
      cache=config["cache_data"],
      augment_data=augment_data,
      phase=phase)

  loader_kwargs = {
      'dataset': dataset,
      'num_workers': threads,
      'batch_size': batch_size,
      'collate_fn': collate_fn,
  }
  # An infinite sampler and per-epoch shuffling are mutually exclusive.
  if repeat:
    loader_kwargs['sampler'] = InfSampler(dataset, shuffle)
  else:
    loader_kwargs['shuffle'] = shuffle

  return DataLoader(**loader_kwargs)
Exemple #19
0
    # Define Saver
    saver = Saver(args)
    saver.save_experiment_config()

    # Define Tensorboard Summary
    summary = TensorboardSummary(saver.experiment_dir)
    args.exp = saver.experiment_dir.split('_')[-1]

    if args.train_dataset == 'cityscapes':
        # Data
        train_trans = transforms.Compose([
            transforms.ToPILImage(),
            # transforms.RandomResizedCrop((args.image_size, args.image_size), scale=(0.2, 2)),
            transforms.Resize((args.image_size, args.image_size)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomAffine(22, scale=(0.75, 1.25)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[123.675, 116.28, 103.53],
                                 std=[58.395, 57.12, 57.375])
            # transforms.NormalizeInstance()
        ])
        val_trans = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((args.image_size, args.image_size),
                              do_mask=False),
            transforms.ToTensor(),
            transforms.Normalize(mean=[123.675, 116.28, 103.53],
                                 std=[58.395, 57.12, 57.375])
            # transforms.NormalizeInstance()
        ])
Exemple #20
0
def main(args):
    """Train a landmark/heatmap regression network and evaluate it.

    Sets up logging, augmentation pipelines, datasets and loaders, the
    model, optimizer and scheduler; optionally resumes from the last
    checkpoint; then runs the epoch loop (train -> checkpoint -> eval).

    Args:
        args: parsed command-line namespace (paths, augmentation ranges,
            model/optimizer configuration file locations, etc.).
    """
    assert torch.cuda.is_available(), 'CUDA is not available.'
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    prepare_seed(args.rand_seed)
    logstr = 'seed-{:}-time-{:}'.format(args.rand_seed, time_for_file())
    logger = Logger(args.save_path, logstr)
    logger.log('Main Function with logger : {:}'.format(logger))
    logger.log('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.log('{:16} : {:}'.format(name, value))
    logger.log("Python  version : {}".format(sys.version.replace('\n', ' ')))
    logger.log("Pillow  version : {}".format(PIL.__version__))
    logger.log("PyTorch version : {}".format(torch.__version__))
    logger.log("cuDNN   version : {}".format(torch.backends.cudnn.version()))

    # General Data Argumentation
    # Fill color for crop padding: ImageNet channel means scaled to 0-255.
    mean_fill = tuple([int(x * 255) for x in [0.485, 0.456, 0.406]])
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    assert args.arg_flip == False, 'The flip is : {}, rotate is {}'.format(args.arg_flip, args.rotate_max)
    # Training pipeline: expand-crop, scale to target W/H, random scale
    # jitter, optional rotation, perturbed crop, tensor conversion, normalize.
    train_transform = [transforms.PreCrop(args.pre_crop_expand)]
    train_transform += [transforms.TrainScale2WH((args.crop_width, args.crop_height))]
    train_transform += [transforms.AugScale(args.scale_prob, args.scale_min, args.scale_max)]
    # if args.arg_flip:
    #  train_transform += [transforms.AugHorizontalFlip()]
    if args.rotate_max:
        train_transform += [transforms.AugRotate(args.rotate_max)]
    train_transform += [transforms.AugCrop(args.crop_width, args.crop_height, args.crop_perturb_max, mean_fill)]
    train_transform += [transforms.ToTensor(), normalize]
    train_transform = transforms.Compose(train_transform)

    # Evaluation pipeline: deterministic, no random augmentation.
    eval_transform = transforms.Compose(
        [transforms.PreCrop(args.pre_crop_expand), transforms.TrainScale2WH((args.crop_width, args.crop_height)),
         transforms.ToTensor(), normalize])
    # Eval scale must be the midpoint of the training scale-jitter range.
    assert (args.scale_min + args.scale_max) / 2 == args.scale_eval, 'The scale is not ok : {},{} vs {}'.format(
        args.scale_min, args.scale_max, args.scale_eval)

    # Model Configure Load
    model_config = load_configure(args.model_config, logger)
    # Scale sigma by the eval scale -- presumably so the Gaussian target
    # keeps the same relative footprint after resizing; confirm upstream.
    args.sigma = args.sigma * args.scale_eval
    logger.log('Real Sigma : {:}'.format(args.sigma))

    # Training Dataset
    train_data = GeneralDataset(train_transform, args.sigma, model_config.downsample, args.heatmap_type, args.data_indicator)
    train_data.load_list(args.train_lists, args.num_pts, True)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size,
                                               shuffle=True,num_workers=args.workers,
                                               pin_memory=True)
    # Evaluation Dataloader(s): one per image list; the False flag marks each
    # as an image (not video) dataset for the logging below.
    eval_loaders = []

    if args.eval_ilists is not None:
        for eval_ilist in args.eval_ilists:
            eval_idata = GeneralDataset(eval_transform, args.sigma, model_config.downsample, args.heatmap_type,
                                 args.data_indicator)
            eval_idata.load_list(eval_ilist, args.num_pts, True)
            eval_iloader = torch.utils.data.DataLoader(eval_idata, batch_size=args.batch_size, shuffle=False,
                                                       num_workers=args.workers, pin_memory=True)
            eval_loaders.append((eval_iloader, False))

    # Define network
    logger.log('configure : {:}'.format(model_config))
    # num_pts + 1 output channels -- presumably an extra background/auxiliary
    # map on top of the landmark heatmaps; confirm against obtain_model.
    net = obtain_model(model_config, args.num_pts + 1)

    assert model_config.downsample == net.downsample, 'downsample is not correct : {} vs {}'.format(
        model_config.downsample, net.downsample)
    logger.log("=> network :\n {}".format(net))

    logger.log('Training-data : {:}'.format(train_data))
    for i, eval_loader in enumerate(eval_loaders):
        eval_loader, is_video = eval_loader
        logger.log('The [{:2d}/{:2d}]-th testing-data [{:}] = {:}'.format(i, len(eval_loaders),
                                                                          'video' if is_video else 'image',
                                                                          eval_loader.dataset))
    logger.log('arguments : {:}'.format(args))
    opt_config = load_configure(args.opt_config, logger)

    # Let the network group its own parameters (e.g. per-layer LR/decay)
    # when it knows how; otherwise optimize everything uniformly.
    if hasattr(net, 'specify_parameter'):
        net_param_dict = net.specify_parameter(opt_config.LR, opt_config.Decay)
    else:
        net_param_dict = net.parameters()

    optimizer, scheduler, criterion = obtain_optimizer(net_param_dict, opt_config, logger)
    logger.log('criterion : {:}'.format(criterion))
    net, criterion = net.cuda(), criterion.cuda()
    net = torch.nn.DataParallel(net)

    # Resume: the last-info file points at the most recent checkpoint.
    last_info = logger.last_info()
    if last_info.exists():
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(last_info))
        last_info = torch.load(str(last_info))
        start_epoch = last_info['epoch'] + 1
        checkpoint = torch.load(last_info['last_checkpoint'])
        assert last_info['epoch'] == checkpoint['epoch'], 'Last-Info is not right {:} vs {:}'.format(last_info,
                                                                                                     checkpoint[
                                                                                                         'epoch'])
        net.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        logger.log("=> load-ok checkpoint '{:}' (epoch {:}) done".format(logger.last_info(), checkpoint['epoch']))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch = 0

    if args.eval_once:
        logger.log("=> only evaluate the model once")
        eval_results = eval_all(args, eval_loaders, net, criterion, 'eval-once', logger, opt_config)
        logger.close()
        return

    # Main Training and Evaluation Loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, opt_config.epochs):
        # NOTE(review): scheduler.step() before the train pass is the
        # pre-PyTorch-1.1 calling order; moving it after train() would shift
        # the LR schedule by one epoch, so it is kept as-is.
        scheduler.step()
        need_time = convert_secs2time(epoch_time.avg * (opt_config.epochs - epoch), True)
        epoch_str = 'epoch-{:03d}-{:03d}'.format(epoch, opt_config.epochs)
        LRs = scheduler.get_lr()
        logger.log('\n==>>{:s} [{:s}], [{:s}], LR : [{:.5f} ~ {:.5f}], Config : {:}'.format(time_string(), epoch_str,
                                                                                            need_time, min(LRs),
                                                                                            max(LRs), opt_config))

        # train for one epoch
        train_loss, train_nme = train(args, train_loader, net, criterion,
                                      optimizer, epoch_str, logger, opt_config)
        # log the results
        logger.log(
            '==>>{:s} Train [{:}] Average Loss = {:.6f}, NME = {:.2f}'.format(time_string(), epoch_str, train_loss,
                                                                              train_nme * 100))

        # remember best prec@1 and save checkpoint
        save_path = save_checkpoint({
            'epoch': epoch,
            'args': deepcopy(args),
            'arch': model_config.arch,
            'state_dict': net.state_dict(),
            'scheduler': scheduler.state_dict(),
            'optimizer': optimizer.state_dict(),
        }, str(logger.path('model') / '{:}-{:}.pth'.format(model_config.arch, epoch_str)), logger)

        # The last-info file records which checkpoint to resume from next run.
        last_info = save_checkpoint({
            'epoch': epoch,
            'last_checkpoint': save_path,
        }, str(logger.last_info()), logger)

        eval_results = eval_all(args, eval_loaders, net, criterion, epoch_str, logger, opt_config)

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.close()
Exemple #21
0
    def __init__(
        self,
        file_names: Iterable[str],
        working_dir=None,
        cache_dir=None,
        sr=44100,
        n_fft=1024,
        hop_length=512,
        freq_compression="linear",
        n_freq_bins=256,
        f_min=None,
        f_max=18000,
        *args,
        **kwargs
    ):
        """Build the spectrogram transform pipeline for an evaluation dataset.

        Args:
            file_names: audio files to process.
            working_dir: base directory for the files (forwarded to super).
            cache_dir: if given, computed spectrograms are cached on disk.
            sr: target sample rate audio is loaded at.
            n_fft: FFT window size of the spectrogram.
            hop_length: hop between successive FFT windows.
            freq_compression: one of "linear", "mel", or "mfcc".
            n_freq_bins: number of frequency bins after compression.
            f_min: lower frequency bound (None lets the transform decide).
            f_max: upper frequency bound.

        Raises:
            ValueError: if ``freq_compression`` is not a supported mode.
        """
        super().__init__(file_names, working_dir, sr, *args, **kwargs)
        if self.dataset_name is not None:
            self._logger.info("Init dataset {}...".format(self.dataset_name))

        self.sr = sr
        self.f_min = f_min
        self.f_max = f_max
        self.n_fft = n_fft
        self.hop_length = hop_length
        # Fixed: the original assigned self.sp twice; once is sufficient.
        self.sp = signal.signal_proc()
        self.freq_compression = freq_compression

        valid_freq_compressions = ["linear", "mel", "mfcc"]

        if self.freq_compression not in valid_freq_compressions:
            # Fixed: the original passed the template string and a bogus
            # builtin format(a, b) call as two separate ValueError arguments,
            # so the message was never interpolated.
            raise ValueError(
                "{} is not a valid freq_compression. Must be one of {}".format(
                    self.freq_compression, valid_freq_compressions
                )
            )

        self._logger.debug(
            "Number of test files: {}".format(len(self.file_names))
        )

        # Raw pipeline: load audio -> pre-emphasis -> STFT spectrogram.
        spec_transforms = [
            lambda fn: T.load_audio_file(fn, sr=sr),
            T.PreEmphasize(DefaultSpecDatasetOps["preemphases"]),
            T.Spectrogram(n_fft, hop_length, center=False)
        ]

        self.file_reader = AsyncFileReader()

        if cache_dir is None:
            self.t_spectrogram = T.Compose(spec_transforms)
        else:
            # Persist computed spectrograms so later runs can reuse them.
            self.t_spectrogram = T.CachedSpectrogram(
                cache_dir=cache_dir,
                spec_transform=T.Compose(spec_transforms),
                n_fft=n_fft,
                hop_length=hop_length,
                file_reader=AsyncFileReader(),
            )

        # Frequency-axis compression. Validity was checked above, so the
        # original's unreachable `raise "..."` (a TypeError: exceptions must
        # derive from BaseException) is not needed here.
        if self.freq_compression == "linear":
            self.t_compr_f = T.Interpolate(n_freq_bins, sr, f_min, f_max)
        elif self.freq_compression == "mel":
            self.t_compr_f = T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max)
        else:  # "mfcc"
            self.t_compr_f = T.Compose(
                T.F2M(sr=sr, n_mels=n_freq_bins, f_min=f_min, f_max=f_max), T.M2MFCC()
            )

        # Amplitude-to-dB conversion, then min/ref-level normalization.
        self.t_compr_a = T.Amp2Db(min_level_db=DefaultSpecDatasetOps["min_level_db"])

        self.t_norm = T.Normalize(
            min_level_db=DefaultSpecDatasetOps["min_level_db"],
            ref_level_db=DefaultSpecDatasetOps["ref_level_db"],
        )