def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
             n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
             mel_fmax=8000.0, dynamic_range_compression='nvidia'):
    """Set up the STFT front end and register the mel filterbank buffer.

    Args:
        filter_length: FFT size; passed to ``STFT`` and used as ``n_fft``
            for the mel filterbank.
        hop_length: Hop length passed to ``STFT``.
        win_length: Window length passed to ``STFT``.
        n_mel_channels: Number of mel bands in the filterbank.
        sampling_rate: Sampling rate used to build the filterbank.
        mel_fmin: Lowest frequency of the filterbank.
        mel_fmax: Highest frequency of the filterbank.
        dynamic_range_compression: Stored unchanged on the instance; it is
            not interpreted in this constructor.
    """
    super(MelTransformer, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    self.dynamic_range_compression = dynamic_range_compression
    self.stft_fn = STFT(filter_length, hop_length, win_length)
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=filter_length,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    # Registered as a buffer so it is stored on the module under 'mel_basis'.
    self.register_buffer('mel_basis', mel_basis)
def __init__(
    self,
    filter_length=1024,
    hop_length=256,
    win_length=1024,
    n_mel_channels=80,
    sampling_rate=22050,
    mel_fmin=0.0,
    mel_fmax=None,
):
    """Create the STFT module and register the mel filterbank buffer.

    Args:
        filter_length: FFT size; passed to ``STFT`` and used as ``n_fft``
            for the mel filterbank.
        hop_length: Hop length passed to ``STFT``.
        win_length: Window length passed to ``STFT``.
        n_mel_channels: Number of mel bands in the filterbank.
        sampling_rate: Sampling rate used to build the filterbank.
        mel_fmin: Lowest frequency of the filterbank.
        mel_fmax: Highest frequency of the filterbank; ``None`` is
            forwarded to the filterbank builder unchanged.
    """
    super(TacotronSTFT, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    self.stft_fn = STFT(filter_length, hop_length, win_length)
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=filter_length,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer("mel_basis", mel_basis)
def get_mel(self, y, center=False):
    """Compute a compressed mel spectrogram for waveform tensor ``y``.

    The mel filterbank and Hann window are cached on the instance, keyed by
    ``fmax`` and the device of ``y``, so they are only built once per device.

    Args:
        y: Waveform tensor. A warning is printed when values fall outside
            [-1, 1]. Assumes shape (batch, samples) -- TODO confirm.
        center: Forwarded to ``torch.stft``.

    Returns:
        The result of ``dynamic_range_compression_torch`` applied to the
        mel-projected magnitude spectrogram.
    """
    sampling_rate = self.target_sr
    n_mels = self.n_mels
    n_fft = self.n_fft
    win_size = self.win_size
    hop_length = self.hop_length
    fmin = self.fmin
    fmax = self.fmax
    clip_val = self.clip_val

    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))

    device_key = str(y.device)
    mel_key = str(fmax) + '_' + device_key
    # The original tested `fmax not in self.mel_basis`, which never matches
    # the string keys stored below, so the cache was rebuilt on every call.
    if mel_key not in self.mel_basis:
        # Keyword arguments: librosa.filters.mel is keyword-only since
        # librosa 0.10 (assumes librosa_mel_fn aliases it).
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=n_mels,
                             fmin=fmin, fmax=fmax)
        self.mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
    if device_key not in self.hann_window:
        self.hann_window[device_key] = torch.hann_window(
            self.win_size).to(y.device)

    # Reflect-pad by (n_fft - hop_length) / 2 on both sides before the STFT.
    pad = int((n_fft - hop_length) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect')
    y = y.squeeze(1)

    # NOTE(review): the magnitude below sums squares over the last axis, i.e.
    # it expects torch.stft to return a real (..., 2) tensor; newer PyTorch
    # releases require an explicit return_complex argument -- confirm the
    # targeted PyTorch version.
    spec = torch.stft(y, n_fft, hop_length=hop_length, win_length=win_size,
                      window=self.hann_window[device_key], center=center,
                      pad_mode='reflect', normalized=False, onesided=True)
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
    spec = torch.matmul(self.mel_basis[mel_key], spec)
    spec = dynamic_range_compression_torch(spec, clip_val=clip_val)
    return spec
def mel_spectrogram(y, n_fft, num_mels, sampling_rate, hop_size, win_size,
                    fmin, fmax, center=False):
    """Compute a normalized mel spectrogram for waveform tensor ``y``.

    Mel filterbanks and Hann windows are cached in the module-level
    ``mel_basis`` and ``hann_window`` dicts.

    Args:
        y: Waveform tensor. A warning is printed when values fall outside
            [-1, 1]. Assumes shape (batch, samples) -- TODO confirm.
        n_fft: FFT size for the STFT and the mel filterbank.
        num_mels: Number of mel bands.
        sampling_rate: Sampling rate used to build the filterbank.
        hop_size: STFT hop length.
        win_size: STFT window length.
        fmin: Lowest filterbank frequency.
        fmax: Highest filterbank frequency.
        center: Forwarded to ``torch.stft``.

    Returns:
        The result of ``spectral_normalize_torch`` applied to the
        mel-projected magnitude spectrogram.
    """
    if torch.min(y) < -1.0:
        print("min value is ", torch.min(y))
    if torch.max(y) > 1.0:
        print("max value is ", torch.max(y))

    global mel_basis, hann_window
    device_key = str(y.device)
    # The original tested `fmax not in mel_basis`, which never matched the
    # string keys it stored, so both caches were rebuilt on every call.
    # Now that the cache is effective, the key covers every parameter the
    # cached value depends on, so differing configurations cannot collide.
    # NOTE(review): the key format changed from "<fmax>_<device>"; confirm
    # that no other code reads these dicts with the old key format.
    mel_key = "_".join(
        str(part)
        for part in (sampling_rate, n_fft, num_mels, fmin, fmax, device_key)
    )
    window_key = str(win_size) + "_" + device_key
    if mel_key not in mel_basis:
        # Keyword arguments: librosa.filters.mel is keyword-only since
        # librosa 0.10 (assumes librosa_mel_fn aliases it).
        mel = librosa_mel_fn(sr=sampling_rate, n_fft=n_fft, n_mels=num_mels,
                             fmin=fmin, fmax=fmax)
        mel_basis[mel_key] = torch.from_numpy(mel).float().to(y.device)
    if window_key not in hann_window:
        hann_window[window_key] = torch.hann_window(win_size).to(y.device)

    # Reflect-pad by (n_fft - hop_size) / 2 on both sides before the STFT.
    pad = int((n_fft - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode="reflect")
    y = y.squeeze(1)

    # NOTE(review): the magnitude below sums squares over the last axis, i.e.
    # it expects torch.stft to return a real (..., 2) tensor; newer PyTorch
    # releases require an explicit return_complex argument -- confirm the
    # targeted PyTorch version.
    spec = torch.stft(
        y,
        n_fft,
        hop_length=hop_size,
        win_length=win_size,
        window=hann_window[window_key],
        center=center,
        pad_mode="reflect",
        normalized=False,
        onesided=True,
    )
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))
    spec = torch.matmul(mel_basis[mel_key], spec)
    spec = spectral_normalize_torch(spec)
    return spec
def __init__(self, n_fft, hop_length, win_length, sample_rate, n_mels,
             f_min, f_max, preemph):
    """Register the window, mel filterbank and pre-emphasis kernel buffers.

    Args:
        n_fft: FFT size used for the mel filterbank; stored on the instance.
        hop_length: Stored on the instance.
        win_length: Length of the Hann window buffer; stored on the instance.
        sample_rate: Sampling rate used to build the filterbank.
        n_mels: Number of mel bands.
        f_min: Lowest filterbank frequency.
        f_max: Highest filterbank frequency.
        preemph: Coefficient placed (negated) in the two-tap kernel
            ``[-preemph, 1]``.
    """
    super().__init__()
    window = torch.hann_window(win_length).float()
    self.register_buffer("window", window)

    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = torch.from_numpy(
        librosa_mel_fn(sr=sample_rate, n_fft=n_fft, n_mels=n_mels,
                       fmin=f_min, fmax=f_max)).float()
    self.register_buffer("mel_basis", mel_basis)

    # Kernel tensor of shape (1, 1, 2).
    preemph_kernel = torch.FloatTensor([[[-preemph, 1]]])
    self.register_buffer("preemph_kernel", preemph_kernel)

    self.n_fft = n_fft
    self.hop_length = hop_length
    self.win_length = win_length
    self.sample_rate = sample_rate
    self.n_mels = n_mels
def __init__(self):
    """Configure the STFT and mel filterbank from the module-level ``hps``.

    All settings (mel channel count, sampling rate, FFT size, hop and window
    lengths, mel frequency limits) are copied from ``hps`` onto the instance.
    """
    super(TacotronSTFT, self).__init__()
    self.n_mel_channels = hps.n_mel_channels
    self.sampling_rate = hps.sampling_rate
    self.filter_length = hps.filter_length
    self.hop_length = hps.hop_length
    self.win_length = hps.win_length
    self.mel_fmin = hps.mel_fmin
    self.mel_fmax = hps.mel_fmax
    self.stft_fn = STFT(filter_length=self.filter_length,
                        hop_length=self.hop_length,
                        win_length=self.win_length)
    # The filterbank is built as a NumPy array. Keyword arguments are used
    # because librosa.filters.mel is keyword-only since librosa 0.10
    # (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=self.sampling_rate,
        n_fft=self.filter_length,
        n_mels=self.n_mel_channels,
        fmin=self.mel_fmin,
        fmax=self.mel_fmax,
    )
    # NumPy -> float torch tensor, registered as a buffer.
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer('mel_basis', mel_basis)
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
             n_mel_channels=40, sampling_rate=16000, mel_fmin=0.0,
             mel_fmax=8000.0):
    """Mel feature extraction.

    :param filter_length: number of FFT sample points
    :param hop_length: hop (stride) between frames
    :param win_length: window length
    :param n_mel_channels: number of mel channels
    :param sampling_rate: sampling rate
    :param mel_fmin: lower cutoff frequency
    :param mel_fmax: upper cutoff frequency
    """
    super(MelSpec, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    self.stft_fn = STFT(filter_length=filter_length,
                        hop_length=hop_length,
                        win_length=win_length)
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_bias = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=filter_length,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_bias = torch.from_numpy(mel_bias).float()
    # The buffer name 'mel_bias' is kept as-is: it is part of the module's
    # stored state, so renaming it would change the attribute and state key.
    self.register_buffer('mel_bias', mel_bias)
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
             n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
             mel_fmax=8000.0):
    """Create the STFT module and register the mel filterbank buffer.

    Args:
        filter_length: Number of FFT components; passed to ``STFT`` and used
            as ``n_fft`` for the mel filterbank.
        hop_length: Hop length, in samples.
        win_length: Window length, in samples.
        n_mel_channels: Number of mel bands in the filterbank.
        sampling_rate: Sampling rate used to build the filterbank.
        mel_fmin: Lowest filterbank frequency.
        mel_fmax: Highest filterbank frequency.
    """
    super(TacotronSTFT, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    # Hop and window length are in samples.
    self.stft_fn = STFT(filter_length, hop_length, win_length)
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=filter_length,  # filter_length = number of FFT components
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer('mel_basis', mel_basis)
def __init__(
    self,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    sampling_rate=22050,
    n_mel_channels=80,
    mel_fmin=0.0,
    mel_fmax=None,
):
    """Register the mel filterbank buffer and build the STFT module.

    Args:
        n_fft: FFT size for the mel filterbank and ``STFT``.
        hop_length: Hop length passed to ``STFT``.
        win_length: Window length passed to ``STFT``.
        sampling_rate: Sampling rate for the filterbank; also passed to
            ``STFT`` as its fourth positional argument.
        n_mel_channels: Number of mel bands; stored on the instance.
        mel_fmin: Lowest filterbank frequency.
        mel_fmax: Highest filterbank frequency; ``None`` is forwarded
            unchanged.
    """
    super(Audio2Mel, self).__init__()
    ##############################################
    # FFT Parameters
    ##############################################
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=n_fft,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer("mel_basis", mel_basis)
    self.stft = STFT(n_fft, hop_length, win_length, sampling_rate)
    self.n_mel_channels = n_mel_channels
def __init__(self, hparams: TSTFTHParams, logger: Logger):
    """Build the STFT module and mel filterbank buffer from ``hparams``.

    Args:
        hparams: Supplies ``n_mel_channels``, ``sampling_rate``,
            ``filter_length``, ``hop_length``, ``win_length``, ``window``,
            ``mel_fmin`` and ``mel_fmax``.
        logger: Stored on the instance as ``self.logger``.
    """
    super().__init__()
    self.logger = logger
    self.n_mel_channels = hparams.n_mel_channels
    self.sampling_rate = hparams.sampling_rate

    # Collect the STFT settings first, then construct the module in one go.
    stft_settings = {
        "filter_length": hparams.filter_length,
        "hop_length": hparams.hop_length,
        "win_length": hparams.win_length,
        "window": hparams.window,
    }
    self.stft_fn = STFT(**stft_settings)

    # Mel filterbank as a NumPy array, converted to a float tensor buffer.
    filterbank = librosa_mel_fn(
        sr=hparams.sampling_rate,
        n_fft=hparams.filter_length,
        n_mels=hparams.n_mel_channels,
        fmin=hparams.mel_fmin,
        fmax=hparams.mel_fmax,
    )
    self.register_buffer('mel_basis', torch.from_numpy(filterbank).float())
def __init__(self, hparams):
    """Create the per-flow ``WN`` sub-modules, mel basis and RNG.

    Args:
        hparams: Supplies ``n_flows``, ``n_samples_per_group``,
            ``n_early_every``, ``n_early_size``, ``sr``, ``n_fft``,
            ``n_mels``, ``mel_fmin``, ``mel_fmax`` and ``seed``.
    """
    self.hparams = hparams

    # Module does not support ModuleList yet, so each WN is set as a
    # separately named attribute.
    for i in range(hparams.n_flows):
        setattr(self, f'WN_{i}', WN(hparams))

    # Every `n_early_every` flows (except the first) removes
    # `n_early_size` channels from what remains. The original also tracked
    # an `n_half` local that was never read; it has been removed.
    n_remaining_channels = hparams.n_samples_per_group
    for k in range(hparams.n_flows):
        if k % hparams.n_early_every == 0 and k > 0:
            n_remaining_channels = n_remaining_channels - hparams.n_early_size
    self.n_remaining_channels = n_remaining_channels

    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(sr=hparams.sr, n_fft=hparams.n_fft,
                               n_mels=hparams.n_mels,
                               fmin=hparams.mel_fmin,
                               fmax=hparams.mel_fmax)
    # A leading axis of size 1 is added before wrapping in a Variable.
    self.basis = nn.Variable.from_numpy_array(mel_basis[None, ...])
    self.rng = np.random.RandomState(hparams.seed)
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
             n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
             mel_fmax=8000.0, clamp_val=1e-5, stft_dtype=torch.float32):
    """Create the STFT module and register the mel filterbank buffer.

    Args:
        filter_length: FFT size; passed to ``STFT`` and used as ``n_fft``
            for the mel filterbank.
        hop_length: Hop length passed to ``STFT``.
        win_length: Window length passed to ``STFT``.
        n_mel_channels: Number of mel bands in the filterbank.
        sampling_rate: Sampling rate used to build the filterbank.
        mel_fmin: Lowest filterbank frequency.
        mel_fmax: Highest filterbank frequency.
        clamp_val: Stored on the instance as ``self.clip_val``.
        stft_dtype: Passed to ``STFT`` as ``dtype``.
    """
    super(TacotronSTFT, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    self.clip_val = clamp_val
    self.stft_fn = STFT(filter_length, hop_length, win_length,
                        dtype=stft_dtype)
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=filter_length,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer('mel_basis', mel_basis)
def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
             n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
             mel_fmax=None, ref_level_db=10., min_level_db=-100.):
    """Create the STFT module and register the mel filterbank buffer.

    Args:
        filter_length: FFT size; passed to ``STFT`` and used as ``n_fft``
            for the mel filterbank.
        hop_length: Hop length passed to ``STFT``.
        win_length: Window length passed to ``STFT``.
        n_mel_channels: Number of mel bands in the filterbank.
        sampling_rate: Sampling rate used to build the filterbank.
        mel_fmin: Lowest filterbank frequency.
        mel_fmax: Highest filterbank frequency; ``None`` is forwarded
            unchanged.
        ref_level_db: Stored on the instance.
        min_level_db: Stored on the instance.
    """
    super(TacotronSTFT, self).__init__()
    self.n_mel_channels = n_mel_channels
    self.sampling_rate = sampling_rate
    self.stft_fn = STFT(filter_length, hop_length, win_length)
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=filter_length,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer('mel_basis', mel_basis)
    # to be used for mel_spectrogram_dbver
    self.ref_level_db = ref_level_db
    self.min_level_db = min_level_db
def __init__(self, metadata, hparams, shuffle=False, rng=None):
    """Index the wave/text pairs listed in ``metadata`` and build the mel basis.

    Args:
        metadata: Name of the metadata file under ``hparams.data_dir``. Each
            line is split on ``|``; field 0 names the wav file and field 2
            is taken as the text.
        hparams: Supplies ``seed``, ``data_dir``, ``save_data_dir``,
            ``out_variables``, ``vocab``, ``sr``, ``n_fft``, ``n_mels``,
            ``mel_fmin``, ``mel_fmax`` and optionally ``comm``.
        shuffle: Passed to the base class; when true the sample order is a
            random permutation.
        rng: Random state; defaults to ``RandomState(hparams.seed)``.
    """
    if rng is None:
        rng = np.random.RandomState(hparams.seed)
    super().__init__(shuffle=shuffle, rng=rng)

    # read text and wave files
    texts, waves = list(), list()
    path = Path(hparams.data_dir)
    with open(path / metadata, encoding='utf-8') as f:
        for line in f:
            inputs = line.strip().split('|')
            waves.append(str(path / 'wavs' / f'{inputs[0]}.wav'))
            texts.append(inputs[2])

    # split data
    n = len(waves)
    index = self._rng.permutation(n) if shuffle else np.arange(n)
    if hasattr(hparams, 'comm'):
        # distributed learning: each process takes an equal contiguous
        # slice; any remainder of n // n_procs is dropped.
        num = n // hparams.comm.n_procs
        index = index[num * hparams.comm.rank:num * (hparams.comm.rank + 1)]

    self._waves = [waves[i] for i in index]
    self._texts = [texts[i] for i in index]
    self._path = Path(hparams.save_data_dir)
    self._size = len(self._waves)
    self._variables = hparams.out_variables
    self.hparams = hparams
    self._char2idx = {ch: i for i, ch in enumerate(hparams.vocab)}
    self._idx2char = {i: ch for i, ch in enumerate(hparams.vocab)}

    # compute the mel basis
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    self.mel_basis = librosa_mel_fn(sr=hparams.sr, n_fft=hparams.n_fft,
                                    n_mels=hparams.n_mels,
                                    fmin=hparams.mel_fmin,
                                    fmax=hparams.mel_fmax)
    self.reset()
def main():
    """Parse command-line options and run the selected preprocessing method.

    Options:
        --in-dir: Path to raw audio (default ``data/raw/``).
        --method: One of ``mel``, ``if``, ``taco`` (default ``taco``).
        --out-dir: Output base directory; the method name is appended and
            the directory is created if missing.

    Raises:
        ValueError: If the method is not one of the valid methods.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--in-dir", default="data/raw/",
                        help="Path to raw audio.")
    valid_methods = ["mel", "if", "taco"]
    parser.add_argument("--method", default=valid_methods[2],
                        help="Preprocessing method to use.",
                        choices=valid_methods)
    parser.add_argument("--out-dir", default="data/",
                        help="Output base directory")
    args = parser.parse_args()
    args.fs = 22050  # 48000

    if args.method not in valid_methods:
        raise ValueError(
            "Expected method to be one of {}".format(valid_methods))

    args.out_dir = join(args.out_dir, args.method)
    makedirs(args.out_dir, exist_ok=True)

    if args.method == valid_methods[0]:
        preprocess_mel(args.in_dir, args)
    elif args.method == valid_methods[1]:
        # Fixed: the original read `args._in_dir`, which argparse never
        # defines, so the "if" method always failed with AttributeError.
        preprocess_if(args.in_dir, args)
    elif args.method == valid_methods[2]:
        # Keyword arguments: librosa.filters.mel is keyword-only since
        # librosa 0.10 (assumes librosa_mel_fn aliases it).
        mel_basis = librosa_mel_fn(sr=22050, n_fft=1024, n_mels=80,
                                   fmin=0, fmax=8000)
        mel_basis = torch.from_numpy(mel_basis).float()
        stft = STFT(1024, 256, 1024)
        embed_taco = partial(taco_mel, mel_basis, stft)
        preprocess_taco = partial(preprocess_wrap, embed_taco, 'taco')
        preprocess_taco(args.in_dir, args)
def __init__(
    self,
    n_fft=1024,
    hop_length=256,
    win_length=1024,
    sampling_rate=22050,
    n_mel_channels=80,
    mel_fmin=0.0,
    mel_fmax=None,
):
    """Register the Hann window and mel filterbank buffers.

    Args:
        n_fft: FFT size used for the mel filterbank; stored on the instance.
        hop_length: Stored on the instance.
        win_length: Length of the Hann window buffer; stored on the instance.
        sampling_rate: Sampling rate used to build the filterbank.
        n_mel_channels: Number of mel bands.
        mel_fmin: Lowest filterbank frequency.
        mel_fmax: Highest filterbank frequency; ``None`` is forwarded
            unchanged.
    """
    super().__init__()
    window = torch.hann_window(win_length).float()
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=n_fft,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer("mel_basis", mel_basis)
    self.register_buffer("window", window)
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.win_length = win_length
    self.sampling_rate = sampling_rate
    self.n_mel_channels = n_mel_channels
    # Precomputed as (n_fft - hop_length) // 2.
    self.pad_size = (n_fft - hop_length) // 2
def __init__(
    self,
    n_fft: int = 1024,
    hop_length: int = 256,
    win_length: int = 1024,
    sampling_rate: int = 22050,
    n_mel_channels: int = 80,
    mel_fmin: float = 0.0,
    mel_fmax: float = None,
):
    """Register the Hann window and mel filterbank buffers.

    Args:
        n_fft: FFT size used for the mel filterbank; stored on the instance.
        hop_length: Stored on the instance.
        win_length: Length of the Hann window buffer; stored on the instance.
        sampling_rate: Sampling rate used to build the filterbank.
        n_mel_channels: Number of mel bands.
        mel_fmin: Lowest filterbank frequency.
        mel_fmax: Highest filterbank frequency; ``None`` is forwarded
            unchanged.
    """
    super().__init__()
    ##############################################
    # FFT Parameters
    ##############################################
    window = torch.hann_window(win_length).float()
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=n_fft,
        n_mels=n_mel_channels,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    self.register_buffer("mel_basis", mel_basis)
    self.register_buffer("window", window)
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.win_length = win_length
    self.sampling_rate = sampling_rate
    self.n_mel_channels = n_mel_channels
def __init__(self, gen, dis, gen_optim, dis_optim, dataloader, hp):
    """Store the models, optimizers and data, and prepare training state.

    Args:
        gen: Stored as ``self.gen``.
        dis: Stored as ``self.dis``.
        gen_optim: Stored as ``self.gen_optim``.
        dis_optim: Stored as ``self.dis_optim``.
        dataloader: Mapping with ``'train'`` and ``'valid'`` entries, each
            exposing a ``size`` attribute.
        hp: Supplies ``sr``, ``n_fft``, ``n_mels``, ``mel_fmin``,
            ``mel_fmax``, ``batch_size``, ``output_path``, ``comm.rank``
            and a ``save`` method.
    """
    self.gen = gen
    self.dis = dis
    self.gen_optim = gen_optim
    self.dis_optim = dis_optim
    self.dataloader = dataloader
    self.hp = hp

    # compute mel basis
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels,
                               fmin=hp.mel_fmin, fmax=hp.mel_fmax)
    # A leading axis of size 1 is added before wrapping in a Variable.
    self.mel_basis = nn.Variable.from_numpy_array(mel_basis[None, ...])

    # Number of full batches per epoch for each split.
    self.one_epoch_train = dataloader['train'].size // hp.batch_size
    self.one_epoch_valid = dataloader['valid'].size // hp.batch_size

    self.placeholder = dict()
    # Progress output is silenced on every process except rank 0.
    self.monitor = ProgressMeter(self.one_epoch_train, hp.output_path,
                                 quiet=hp.comm.rank > 0)
    hp.save(Path(hp.output_path) / 'settings.json')
def log_mel_spectrogram(wave, sr, window_size, n_mels=80):
    """Return log mel-spectrogram.

    Args:
        wave (nn.Variable): Input waveform of shape (B, 1, L).
        sr (int): Sampling rate.
        window_size (int): Window size; also used as the FFT size of the
            mel filterbank.
        n_mels (int): Number of mel banks. Defaults to 80.

    Returns:
        nn.Variable: Log mel-spectrogram, computed as
            ``log(mels * 1e4 + 1.0)``.
    """
    linear = spectrogram(wave, window_size)
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it). The band limits
    # (80.0 and 7600.0) are fixed here.
    mel_basis = librosa_mel_fn(sr=sr, n_fft=window_size, n_mels=n_mels,
                               fmin=80.0, fmax=7600.0)
    # A leading axis of size 1 is added for the batched matmul.
    basis = nn.Variable.from_numpy_array(mel_basis[None, ...])
    mels = F.batch_matmul(basis, linear)
    return F.log(mels * 1e4 + 1.0)
def __init__(self, type, sampling_rate, max_wave_value, sfft=None, mel=None):
    """Everything that we need to init.

    Args:
        type: Stored as ``self.type`` (the name shadows the builtin but is
            kept for interface compatibility).
        sampling_rate: Sampling rate; also used to build the mel filterbank.
        max_wave_value: Stored on the instance.
        sfft: Dict of STFT settings; ``'filter_length'`` is read here.
            Defaults to ``filter_length=1024, hop_length=256,
            win_length=1024``.
        mel: Required dict with ``'n_mel_channels'``, ``'mel_fmin'`` and
            ``'mel_fmax'``. The string ``'None'`` for ``'mel_fmax'`` is
            treated as ``None``.

    Raises:
        AssertionError: If ``mel`` is not provided.
    """
    # A fresh dict per instance: the original used a mutable default
    # argument, which is shared (and mutable) across all instances.
    if sfft is None:
        sfft = {'filter_length': 1024, 'hop_length': 256, 'win_length': 1024}

    self.type = type
    self.sampling_rate = sampling_rate
    self.max_wave_value = max_wave_value
    self.sfft = sfft

    assert mel is not None
    mel_fmax = None if mel['mel_fmax'] == 'None' else mel['mel_fmax']
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    self.mel_basis = librosa_mel_fn(
        sr=self.sampling_rate,
        n_fft=sfft['filter_length'],
        n_mels=mel['n_mel_channels'],
        fmin=mel['mel_fmin'],
        fmax=mel_fmax,
    )
    # Pseudo-inverse of the filterbank, precomputed once.
    self.inv_mel_basis = np.linalg.pinv(self.mel_basis)
    self.num_channels = mel['n_mel_channels']
def __init__(self,
             n_fft=hp.n_fft,  # filter length
             hop_length=hp.hop_size,
             win_length=hp.win_size,
             sampling_rate=hp.sample_rate,
             n_mel=hp.n_mel,
             mel_fmin=hp.mel_fmin,
             mel_fmax=hp.mel_fmax):
    """Build the STFT module and register the mel basis and its pseudo-inverse.

    The defaults are read from ``hp`` once, when the function is defined.

    Args:
        n_fft: FFT size (filter length) for ``STFT`` and the mel filterbank.
        hop_length: Hop length passed to ``STFT``.
        win_length: Window length passed to ``STFT``.
        sampling_rate: Sampling rate used to build the filterbank.
        n_mel: Number of mel bands; stored as ``self.n_mel_channels``.
        mel_fmin: Lowest filterbank frequency.
        mel_fmax: Highest filterbank frequency.
    """
    super(MySTFT, self).__init__()
    self.n_mel_channels = n_mel
    self.n_fft = n_fft
    self.hop_length = hop_length
    self.sampling_rate = sampling_rate
    # does reflection padding with n_fft // 2 on both sides of signal
    self.stft = STFT(n_fft, hop_length, win_length, window='hann')
    # Keyword arguments: librosa.filters.mel is keyword-only since
    # librosa 0.10 (assumes librosa_mel_fn aliases it).
    mel_basis = librosa_mel_fn(
        sr=sampling_rate,
        n_fft=n_fft,
        n_mels=n_mel,
        fmin=mel_fmin,
        fmax=mel_fmax,
    )
    mel_basis = torch.from_numpy(mel_basis).float()
    mel_inverse = torch.pinverse(mel_basis)
    self.register_buffer('mel_basis', mel_basis)
    self.register_buffer('mel_inverse', mel_inverse)
def __init__(self, config):
    """Build the three-stage network described by ``config``.

    Args:
        config: Mapping. Required keys read here: ``input_shape``,
            ``n_classes``, ``base_channels``, ``block_type`` (``'basic'``
            or ``'bottleneck'``), ``depth``, and ``stage1``/``stage2``/
            ``stage3`` (each with ``maxpool``, ``k1s``, ``k2s``). Optional
            keys: ``pooling_padding``, ``use_raw_spectograms``,
            ``apply_softmax``, ``n_blocks_per_stage``, ``n_channels``,
            ``grow_a_lot``, ``stop_before_global_avg_pooling``,
            ``weight_init``, ``use_check_point``.

    Raises:
        AssertionError: If ``block_type`` is invalid or ``depth`` does not
            fit the chosen block type.
    """
    super(Network, self).__init__()

    input_shape = config['input_shape']
    n_classes = config['n_classes']
    base_channels = config['base_channels']
    block_type = config['block_type']
    depth = config['depth']
    self.pooling_padding = config.get("pooling_padding", 0) or 0
    self.use_raw_spectograms = config.get("use_raw_spectograms") or False
    self.apply_softmax = config.get("apply_softmax") or False

    assert block_type in ['basic', 'bottleneck']
    if self.use_raw_spectograms:
        # Keyword arguments: librosa.filters.mel is keyword-only since
        # librosa 0.10 (assumes librosa_mel_fn aliases it).
        mel_basis = librosa_mel_fn(sr=22050, n_fft=2048, n_mels=256)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    if block_type == 'basic':
        block = BasicBlock
        n_blocks_per_stage = (depth - 2) // 6
        assert n_blocks_per_stage * 6 + 2 == depth
    else:
        block = BottleneckBlock
        n_blocks_per_stage = (depth - 2) // 9
        assert n_blocks_per_stage * 9 + 2 == depth
    n_blocks_per_stage = [n_blocks_per_stage, n_blocks_per_stage,
                          n_blocks_per_stage]

    # An explicit per-stage block count overrides the one derived from depth.
    if config.get("n_blocks_per_stage") is not None:
        shared_globals.console.warning(
            "n_blocks_per_stage is specified ignoring the depth param, nc="
            + str(config.get("n_channels")))
        n_blocks_per_stage = config.get("n_blocks_per_stage")

    n_channels = config.get("n_channels")
    if n_channels is None:
        n_channels = [
            base_channels,
            base_channels * 2 * block.expansion,
            base_channels * 4 * block.expansion
        ]
    # NOTE(review): nesting reconstructed from collapsed source; this check
    # is assumed to sit outside the `n_channels is None` branch -- confirm.
    if config.get("grow_a_lot"):
        n_channels[2] = base_channels * 8 * block.expansion

    self.in_c = nn.Sequential(
        nn.Conv2d(input_shape[1], n_channels[0], kernel_size=5, stride=2,
                  padding=1, bias=False),
        nn.BatchNorm2d(n_channels[0]),
        nn.ReLU(True)
    )
    self.stage1 = self._make_stage(
        n_channels[0], n_channels[0], n_blocks_per_stage[0], block,
        stride=1, maxpool=config['stage1']['maxpool'],
        k1s=config['stage1']['k1s'], k2s=config['stage1']['k2s'])

    # A stage with zero blocks becomes an empty Sequential and passes its
    # input channel count through.
    if n_blocks_per_stage[1] == 0:
        self.stage2 = nn.Sequential()
        n_channels[1] = n_channels[0]
        print("WARNING: stage2 removed")
    else:
        self.stage2 = self._make_stage(
            n_channels[0], n_channels[1], n_blocks_per_stage[1], block,
            stride=1, maxpool=config['stage2']['maxpool'],
            k1s=config['stage2']['k1s'], k2s=config['stage2']['k2s'])

    if n_blocks_per_stage[2] == 0:
        self.stage3 = nn.Sequential()
        n_channels[2] = n_channels[1]
        print("WARNING: stage3 removed")
    else:
        self.stage3 = self._make_stage(
            n_channels[1], n_channels[2], n_blocks_per_stage[2], block,
            stride=1, maxpool=config['stage3']['maxpool'],
            k1s=config['stage3']['k1s'], k2s=config['stage3']['k2s'])

    ff_list = []
    ff_list += [
        nn.Conv2d(n_channels[2], n_classes, kernel_size=1, stride=1,
                  padding=0, bias=False),
        nn.BatchNorm2d(n_classes),
    ]

    self.stop_before_global_avg_pooling = False
    if config.get("stop_before_global_avg_pooling"):
        self.stop_before_global_avg_pooling = True
    else:
        ff_list.append(nn.AdaptiveAvgPool2d((1, 1)))

    self.feed_forward = nn.Sequential(*ff_list)

    # initialize weights
    # NOTE(review): nesting reconstructed from collapsed source; the fixup
    # pass is assumed to run after the optional zeroing -- confirm.
    if config.get("weight_init") == "fixup":
        self.apply(initialize_weights)
        if isinstance(self.feed_forward[0], nn.Conv2d):
            self.feed_forward[0].weight.data.zero_()
        self.apply(initialize_weights_fixup)
    else:
        self.apply(initialize_weights)
    self.use_check_point = config.get("use_check_point") or False
# limitations under the License. import multiprocessing import random from concurrent.futures import ProcessPoolExecutor, as_completed from pathlib import Path import librosa as lr import numpy as np from librosa.filters import mel as librosa_mel_fn from tqdm import tqdm from hparams import hparams as hp mel_basis = librosa_mel_fn( hp.sr, hp.n_fft, n_mels=hp.n_mels, fmin=hp.mel_fmin, fmax=hp.mel_fmax ) def process(line): r"""Read audio waveform and preprocess it. Args: line (str): A line from metadata. """ path = Path(hp.corpus_path) / 'wavs' meta = line.strip().split('|') wave = lr.load(path / f'{meta[0]}.wav', sr=hp.sr)[0] np.savez( Path(hp.precomputed_path) / 'data' / (meta[0] + '.npz'), wave=wave
default="mag", required=False, help='data_type') args = parser.parse_args() os.environ["CUDA_VISIBLE_DEVICES"] = args.rank os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' ## hyperparamerter hp = create_hparams(f"hp_config/{args.hp_config}") ## create logger logger = prepare_directories_and_logger( Logger, output_directory=f'output/{args.output_directory}') if args.feat_type == 'mel': mel_basis = librosa_mel_fn(22050, 1024, 80, 0, None) inv_mel_basis = np.linalg.pinv(mel_basis) #################################################################### """ Data Loader Part """ def make_inf_iterator(data_iterator): while True: for data in data_iterator: yield data class AudioLoader(torch.utils.data.Dataset):
def __init__(self, config):
    """Build the three-stage network described by ``config``.

    Also sets the module-level ``shift_augment`` flag from
    ``config['features_shift']``.

    Args:
        config: Mapping. Required keys read here: ``input_shape``,
            ``n_classes``, ``base_channels``, ``block_type`` (``'basic'``
            or ``'bottleneck'``), ``depth``, and ``stage1``/``stage2``/
            ``stage3`` (each with ``maxpool``, ``k1s``, ``k2s``). Optional
            keys: ``pooling_padding``, ``use_raw_spectograms``,
            ``features_shift``, ``n_blocks_per_stage``, ``n_channels``,
            ``attention_avg``, ``stop_before_global_avg_pooling``,
            ``weight_init``.

    Raises:
        AssertionError: If ``block_type`` is invalid or ``depth`` does not
            fit the chosen block type.
    """
    super(Network, self).__init__()

    input_shape = config['input_shape']
    n_classes = config['n_classes']
    base_channels = config['base_channels']
    block_type = config['block_type']
    depth = config['depth']
    self.pooling_padding = config.get("pooling_padding", 0) or 0
    self.use_raw_spectograms = config.get("use_raw_spectograms") or False
    global shift_augment
    shift_augment = config.get("features_shift") or False

    assert block_type in ['basic', 'bottleneck']
    if self.use_raw_spectograms:
        # Keyword arguments: librosa.filters.mel is keyword-only since
        # librosa 0.10 (assumes librosa_mel_fn aliases it).
        mel_basis = librosa_mel_fn(sr=22050, n_fft=2048, n_mels=256)
        mel_basis = torch.from_numpy(mel_basis).float()
        self.register_buffer('mel_basis', mel_basis)

    if block_type == 'basic':
        block = BasicBlock
        n_blocks_per_stage = (depth - 2) // 6
        assert n_blocks_per_stage * 6 + 2 == depth
    else:
        block = BottleneckBlock
        n_blocks_per_stage = (depth - 2) // 9
        assert n_blocks_per_stage * 9 + 2 == depth
    n_blocks_per_stage = [
        n_blocks_per_stage, n_blocks_per_stage, n_blocks_per_stage
    ]

    # An explicit per-stage block count overrides the one derived from depth.
    if config.get("n_blocks_per_stage") is not None:
        shared_globals.console.warning(
            "n_blocks_per_stage is specified ignoring the depth param")
        n_blocks_per_stage = config.get("n_blocks_per_stage")

    n_channels = config.get("n_channels")
    if n_channels is None:
        n_channels = [
            base_channels, base_channels * 2 * block.expansion,
            base_channels * 4 * block.expansion
        ]

    self.in_c = nn.Sequential(
        nn.Conv2d(input_shape[1], n_channels[0], kernel_size=5, stride=2,
                  padding=1, bias=False),
        nn.BatchNorm2d(n_channels[0]),
        nn.ReLU(True))
    self.stage1 = self._make_stage(n_channels[0], n_channels[0],
                                   n_blocks_per_stage[0], block, stride=1,
                                   maxpool=config['stage1']['maxpool'],
                                   k1s=config['stage1']['k1s'],
                                   k2s=config['stage1']['k2s'])
    self.stage2 = self._make_stage(n_channels[0], n_channels[1],
                                   n_blocks_per_stage[1], block, stride=1,
                                   maxpool=config['stage2']['maxpool'],
                                   k1s=config['stage2']['k1s'],
                                   k2s=config['stage2']['k2s'])
    self.stage3 = self._make_stage(n_channels[1], n_channels[2],
                                   n_blocks_per_stage[2], block, stride=1,
                                   maxpool=config['stage3']['maxpool'],
                                   k1s=config['stage3']['k1s'],
                                   k2s=config['stage3']['k2s'])

    ff_list = []
    if config.get("attention_avg"):
        # Any truthy value selects AttentionAvg; only the exact string
        # "sum_all" enables its sum_all mode.
        if config.get("attention_avg") == "sum_all":
            ff_list.append(
                AttentionAvg(n_channels[2], n_classes, sum_all=True))
        else:
            ff_list.append(
                AttentionAvg(n_channels[2], n_classes, sum_all=False))
    else:
        ff_list += [
            nn.Conv2d(n_channels[2], n_classes, kernel_size=1, stride=1,
                      padding=0, bias=False),
            nn.BatchNorm2d(n_classes),
        ]

    self.stop_before_global_avg_pooling = False
    if config.get("stop_before_global_avg_pooling"):
        self.stop_before_global_avg_pooling = True
    else:
        ff_list.append(nn.AdaptiveAvgPool2d((1, 1)))

    self.feed_forward = nn.Sequential(*ff_list)
    # # compute conv feature size
    # with torch.no_grad():
    #     self.feature_size = self._forward_conv(
    #         torch.zeros(*input_shape)).view(-1).shape[0]
    #
    # self.fc = nn.Linear(self.feature_size, n_classes)

    # initialize weights
    # NOTE(review): nesting reconstructed from collapsed source; the fixup
    # pass is assumed to run after the optional zeroing -- confirm.
    if config.get("weight_init") == "fixup":
        self.apply(initialize_weights)
        if isinstance(self.feed_forward[0], nn.Conv2d):
            self.feed_forward[0].weight.data.zero_()
        self.apply(initialize_weights_fixup)
    else:
        self.apply(initialize_weights)