Beispiel #1
0
    def __init__(self, input_dim, output_dim, norm="none", activation="relu"):
        """Fully connected block: linear layer + optional norm + optional activation.

        norm: "bn" | "in" | "ln" | "sn" (spectral norm on the fc weights) | "none".
        activation: "relu" | "lrelu" | "prelu" | "selu" | "tanh" | "none".
        """
        super(LinearBlock, self).__init__()
        use_bias = True
        # The linear layer itself; "sn" wraps it in spectral normalization.
        fc = nn.Linear(input_dim, output_dim, bias=use_bias)
        self.fc = SpectralNorm(fc) if norm == "sn" else fc

        # Normalization applied after the linear layer ("sn" was handled above,
        # so it needs no extra module here).
        dim = output_dim
        norm_builders = {
            "bn": lambda: nn.BatchNorm1d(dim),
            "in": lambda: nn.InstanceNorm1d(dim),
            "ln": lambda: LayerNorm(dim),
            "none": lambda: None,
            "sn": lambda: None,
        }
        assert norm in norm_builders, "Unsupported normalization: {}".format(norm)
        self.norm = norm_builders[norm]()

        # Activation function (built lazily so only the chosen one is created).
        act_builders = {
            "relu": lambda: nn.ReLU(inplace=True),
            "lrelu": lambda: nn.LeakyReLU(0.2, inplace=True),
            "prelu": lambda: nn.PReLU(),
            "selu": lambda: nn.SELU(inplace=True),
            "tanh": lambda: nn.Tanh(),
            "none": lambda: None,
        }
        assert activation in act_builders, "Unsupported activation: {}".format(activation)
        self.activation = act_builders[activation]()
    def __init__(self, in_dim, out_dim, activation='none', norm='none', use_bias=True,
                 dropout='none'):
        """Linear layer with optional dropout, activation and normalization.

        Args:
            in_dim: input feature size.
            out_dim: output feature size.
            activation: 'relu' | 'prelu' | 'lrelu' | 'selu' | 'tanh' | 'sigmoid' | 'none'.
            norm: 'bn' | 'in' | 'none'.
            use_bias: whether the linear layer has a bias term.
            dropout: 'none' for no dropout, otherwise the drop probability.
        """
        super(LinearLayer, self).__init__()
        self.fc = nn.Linear(in_dim, out_dim, bias=use_bias)
        # `dropout` doubles as an off switch ('none') or a drop probability.
        self.dropout = None if dropout == 'none' else nn.Dropout(dropout)

        support_act = ['relu', 'prelu', 'lrelu', 'selu', 'tanh', 'sigmoid', 'none']
        support_norm = ['bn', 'in', 'none']
        # Idiomatic `in` test instead of calling __contains__ directly, and
        # messages so a failed assert says what went wrong.
        assert activation in support_act, "Unsupported activation: {}".format(activation)
        assert norm in support_norm, "Unsupported normalization: {}".format(norm)

        # set activation function (lazily built so only the chosen one exists)
        act_builders = {
            'relu': lambda: nn.ReLU(inplace=True),
            'prelu': lambda: nn.PReLU(),
            'lrelu': lambda: nn.LeakyReLU(0.2, inplace=True),
            'selu': lambda: nn.SELU(inplace=True),
            'tanh': lambda: nn.Tanh(),
            'sigmoid': lambda: nn.Sigmoid(),
            'none': lambda: None,
        }
        self.activation = act_builders[activation]()

        # set normalization function
        norm_builders = {
            'bn': lambda: nn.BatchNorm1d(out_dim),
            'in': lambda: nn.InstanceNorm1d(out_dim),
            'none': lambda: None,
        }
        self.norm = norm_builders[norm]()
Beispiel #3
0
    def __init__(self,
                 n_units,
                 dropout,
                 pretrained_emb,
                 vertex_feature,
                 use_vertex_feature,
                 fine_tune=False,
                 instance_normalization=False):
        """Batch GCN: node embedding (+ optional fixed vertex features)
        feeding a stack of BatchGraphConvolution layers.

        Args:
            n_units: per-layer widths; len(n_units) - 1 graph-conv layers are
                built.  NOTE(review): n_units[0] is increased in place below,
                so the caller's list is mutated — confirm this is intended.
            dropout: dropout probability (stored; presumably applied in
                forward — confirm).
            pretrained_emb: (num_nodes, emb_dim) tensor of pretrained node
                embeddings.
            vertex_feature: (num_nodes, feat_dim) tensor of fixed per-vertex
                features; only used when use_vertex_feature is True.
            use_vertex_feature: whether to concatenate the fixed features.
            fine_tune: if True, keep the pretrained embedding trainable.
            instance_normalization: if True, instance-normalize embeddings.
        """
        super(BatchGCN, self).__init__()
        self.num_layer = len(n_units) - 1
        self.dropout = dropout
        self.inst_norm = instance_normalization
        if self.inst_norm:
            # Affine instance norm over the embedding dimension.
            self.norm = nn.InstanceNorm1d(pretrained_emb.size(1),
                                          momentum=0.0,
                                          affine=True)

        # https://discuss.pytorch.org/t/can-we-use-pre-trained-word-embeddings-for-weight-initialization-in-nn-embedding/1222/2
        self.embedding = nn.Embedding(pretrained_emb.size(0),
                                      pretrained_emb.size(1))
        self.embedding.weight = nn.Parameter(pretrained_emb)
        # Freeze the embedding table unless fine-tuning was requested.
        self.embedding.weight.requires_grad = fine_tune
        n_units[0] += pretrained_emb.size(1)

        self.use_vertex_feature = use_vertex_feature
        if self.use_vertex_feature:
            # Fixed (never trained) per-vertex feature lookup table.
            self.vertex_feature = nn.Embedding(vertex_feature.size(0),
                                               vertex_feature.size(1))
            self.vertex_feature.weight = nn.Parameter(vertex_feature)
            self.vertex_feature.weight.requires_grad = False
            n_units[0] += vertex_feature.size(1)

        self.layer_stack = nn.ModuleList()

        # One BatchGraphConvolution per consecutive pair of widths.
        for i in range(self.num_layer):
            self.layer_stack.append(
                BatchGraphConvolution(n_units[i], n_units[i + 1]))
Beispiel #4
0
    def __init__(self, input_dim, output_dim, norm='none', activation='relu'):
        """Linear block: fc layer, then optional normalization and activation."""
        super(LinearBlock, self).__init__()
        use_bias = True

        # Fully connected layer; 'sn' asks for spectral normalization of fc.
        if norm == 'sn':
            self.fc = SpectralNorm(nn.Linear(input_dim, output_dim, bias=use_bias))
        else:
            self.fc = nn.Linear(input_dim, output_dim, bias=use_bias)

        # Normalization over the output features.
        norm_dim = output_dim
        assert norm in ('bn', 'in', 'ln', 'none', 'sn'), \
            "Unsupported normalization: {}".format(norm)
        if norm == 'bn':
            self.norm = nn.BatchNorm1d(norm_dim)
        elif norm == 'in':
            self.norm = nn.InstanceNorm1d(norm_dim)
        elif norm == 'ln':
            self.norm = LayerNorm(norm_dim)
        else:
            # 'none' and 'sn' need no separate normalization module.
            self.norm = None

        # Activation.
        assert activation in ('relu', 'lrelu', 'prelu', 'selu', 'tanh', 'none'), \
            "Unsupported activation: {}".format(activation)
        if activation == 'relu':
            self.activation = nn.ReLU(inplace=True)
        elif activation == 'lrelu':
            self.activation = nn.LeakyReLU(0.2, inplace=True)
        elif activation == 'prelu':
            self.activation = nn.PReLU()
        elif activation == 'selu':
            self.activation = nn.SELU(inplace=True)
        elif activation == 'tanh':
            self.activation = nn.Tanh()
        else:
            self.activation = None
Beispiel #5
0
    def __init__(self, dataset_file_name, max_frames, train_path, musan_path, augment_anchor, augment_type):
        """Training dataset: builds the training file list and the MUSAN
        noise lists used for augmentation.

        Args:
            dataset_file_name: text file with one "<relative_path> ..." entry
                per line; the first token of each line is the audio path.
            max_frames: number of frames per training sample.
            train_path: root directory prepended to each dataset entry.
            musan_path: root of the MUSAN corpus (noise/speech/music wavs).
            augment_anchor: augmentation switch for the anchor (stored).
            augment_type: augmentation type selector (stored).
        """
        self.dataset_file_name = dataset_file_name
        self.max_frames = max_frames

        self.data_dict = {}
        self.data_list = []
        self.nFiles = 0

        # 40-band mel front end at 16 kHz; InstanceNorm normalizes features
        # per utterance.
        self.torchfb = transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, f_min=0.0, f_max=8000, pad=0, n_mels=40)
        self.instancenorm = nn.InstanceNorm1d(40)

        self.noisetypes = ['noise', 'speech', 'music']

        # Per-category SNR ranges (dB) used when mixing in noise.
        self.noisesnr = {'noise': [0, 15], 'speech': [13, 20], 'music': [5, 15]}
        self.noiselist = {}
        self.augment_anchor = augment_anchor
        self.augment_type = augment_type

        augment_files = glob.glob(os.path.join(musan_path, '*/*/*/*.wav'))

        # Group MUSAN files by category (4th path component from the end),
        # computing the split once per file instead of three times.
        for file in augment_files:
            category = file.split('/')[-4]
            self.noiselist.setdefault(category, []).append(file)

        # Room impulse responses for reverberation augmentation.
        self.rir = numpy.load('rir.npy')

        ### Read Training Files...
        with open(dataset_file_name) as dataset_file:
            for line in dataset_file:
                data = line.split()
                if not data:
                    # Skip blank lines instead of crashing on data[0].
                    continue
                filename = os.path.join(train_path, data[0])
                self.data_list.append(filename)
Beispiel #6
0
    def __init__(
        self,
        n_fft,
        hop_length,
        sample_rate=16000,
        n_mels=128,
        n_mfcc=30,
        gru_units=512,
        z_units=16,
        bidirectional=False,
    ):
        """MFCC front end, per-channel affine InstanceNorm, a single-layer
        GRU, and a linear projection down to z_units."""
        super().__init__()
        mel_options = {
            "n_fft": n_fft,
            "hop_length": hop_length,
            "n_mels": n_mels,
            "f_min": 20.0,
            "f_max": 8000.0,
        }
        self.mfcc = torchaudio.transforms.MFCC(
            sample_rate=sample_rate,
            n_mfcc=n_mfcc,
            log_mels=True,
            melkwargs=mel_options,
        )

        # Per-channel affine normalization of the MFCC features.
        self.norm = nn.InstanceNorm1d(n_mfcc, affine=True)
        # Reorder (batch, n_mfcc, time) -> (batch, time, n_mfcc) for the GRU.
        self.permute = lambda t: t.permute(0, 2, 1)
        self.gru = nn.GRU(
            input_size=n_mfcc,
            hidden_size=gru_units,
            num_layers=1,
            batch_first=True,
            bidirectional=bidirectional,
        )
        # A bidirectional GRU doubles the feature size fed to the dense layer.
        dense_in = gru_units * 2 if bidirectional else gru_units
        self.dense = nn.Linear(dense_in, z_units)
Beispiel #7
0
    def __init__(self,
                 in_features,
                 out_features,
                 activation=None,
                 normalization=None,
                 momentum=0.1,
                 bn_momentum_decay_step=None,
                 bn_momentum_decay=1):
        """Linear layer with optional normalization and activation.

        Args:
            in_features / out_features: linear layer sizes.
            activation: None | 'relu' | 'elu' | 'swish' | 'leakyrelu' | 'selu'.
            normalization: None | 'batch' | 'instance'.
            momentum: momentum for the normalization layer.
            bn_momentum_decay_step / bn_momentum_decay: momentum-decay
                schedule forwarded to MyBatchNorm1d.

        NOTE(review): when `normalization`/`activation` is an unrecognized
        string, no `self.norm`/`self.act` attribute is created at all (this
        matches the original behavior); forward code must tolerate that.
        """
        super(MyLinear, self).__init__()
        self.activation = activation
        self.normalization = normalization

        self.linear = nn.Linear(in_features, out_features, bias=True)
        if self.normalization == 'batch':
            self.norm = MyBatchNorm1d(
                out_features,
                momentum=momentum,
                affine=True,
                momentum_decay_step=bn_momentum_decay_step,
                momentum_decay=bn_momentum_decay)
        elif self.normalization == 'instance':
            self.norm = nn.InstanceNorm1d(out_features,
                                          momentum=momentum,
                                          affine=True)
        # Consistent comparison style: the original mixed `x == 'y'` with
        # Yoda-style `'y' == x` and compared against both `activation` and
        # `self.activation` (identical values here).
        if self.activation == 'relu':
            self.act = nn.ReLU()
        elif self.activation == 'elu':
            self.act = nn.ELU(alpha=1.0)
        elif self.activation == 'swish':
            self.act = Swish()
        elif self.activation == 'leakyrelu':
            self.act = nn.LeakyReLU(0.01)
        elif self.activation == 'selu':
            self.act = nn.SELU()

        self.weight_init()
Beispiel #8
0
def get_normalization_1d(name, channels):
    """Build a 1-d normalization layer from a spec string.

    `name` is either a plain layer name ('batch', 'group', 'layer',
    'instance', 'none') or "name; key=value [key=value ...]" where the flags
    parameterize the layer (currently only group's `groups`).

    Args:
        name: normalization spec string.
        channels: number of channels to normalize.

    Returns:
        An nn.Module instance (nn.Identity for 'none').

    Raises:
        ValueError: if the spec names an unknown normalization.
    """
    flags = {}
    if name.find(';') >= 0:
        parts = name.split(';')
        name = parts[0].strip()
        # split() with no argument is robust to leading/repeated whitespace,
        # and split('=', 1) keeps values that themselves contain '='.
        flags = dict(item.split('=', 1) for item in parts[1].split())

    if name == 'batch':
        return nn.BatchNorm1d(channels)

    if name == 'group':
        groups = int(flags['groups']) if 'groups' in flags else 16
        return nn.GroupNorm(groups, channels)

    if name == 'layer':
        return nn.LayerNorm((channels,))

    if name == 'instance':
        return nn.InstanceNorm1d(channels)

    if name == 'none':
        return nn.Identity()

    # Was `assert False`, which is stripped under `python -O` and would make
    # this function silently return None for unknown names.
    raise ValueError('Unsupported normalization: {!r}'.format(name))
Beispiel #9
0
def build_encoder(input_filts,
                  conv_filts,
                  conv_strides,
                  conv_filt_lens,
                  activation,
                  out_norm=True,
                  z_filts=25,
                  output_z=True,
                  init=True,
                  use_cuda=True):
    """Assemble a 1-d convolutional encoder as an nn.Sequential.

    Args:
        input_filts: number of input channels.
        conv_filts / conv_strides / conv_filt_lens: per-layer channel counts,
            strides and kernel lengths, iterated in lockstep (the shortest
            sequence determines the depth).
        activation: activation module inserted after every conv layer.
            NOTE(review): the same module instance is reused for every layer;
            fine for stateless activations such as nn.ReLU.
        out_norm: if True (and output_z), append InstanceNorm1d to the latent.
        z_filts: number of latent channels.
        output_z: if True, append the 1x1 latent conv (and optional norm).
        init: if True, apply `init_weights` to the model.
        use_cuda: if True, move the model to the GPU.

    Returns:
        torch.nn.Sequential encoder.
    """

    layers = []

    # Conv layers
    for filts, strides, filter_length in zip(conv_filts, conv_strides,
                                             conv_filt_lens):
        layers.append(nn.Conv1d(input_filts, filts, filter_length, strides))
        layers.append(activation)
        input_filts = filts

    if output_z:  # idiomatic truth test instead of `== True`
        # Latent output
        layers.append(nn.Conv1d(input_filts, z_filts, 1, 1))
        if out_norm:
            layers.append(nn.InstanceNorm1d(z_filts))
    model = torch.nn.Sequential(*layers)

    if init:
        # Initialize weights and biases
        model.apply(init_weights)

    if use_cuda:
        # Switch to GPU
        model = model.cuda()

    return model
 def __init__(
     self,
     c_in,
     c_h,
     c_out,
     kernel_size,
     bank_size,
     bank_scale,
     c_bank,
     n_conv_blocks,
     subsample,
     act,
     dropout_rate,
 ):
     """Content encoder: conv bank + strided conv blocks producing mean and
     std feature maps."""
     super(ContentEncoder, self).__init__()
     self.n_conv_blocks = n_conv_blocks
     self.subsample = subsample
     self.act = get_act(act)
     # Bank of parallel convs with kernel sizes bank_scale, 2*bank_scale, ...
     bank_kernels = range(bank_scale, bank_size + 1, bank_scale)
     self.conv_bank = nn.ModuleList(
         [nn.Conv1d(c_in, c_bank, kernel_size=ks) for ks in bank_kernels]
     )
     # Bank outputs get concatenated with the raw input before the 1x1 conv.
     fused_channels = c_bank * (bank_size // bank_scale) + c_in
     self.in_conv_layer = nn.Conv1d(fused_channels, c_h, kernel_size=1)
     self.first_conv_layers = nn.ModuleList(
         [nn.Conv1d(c_h, c_h, kernel_size=kernel_size) for _ in range(n_conv_blocks)]
     )
     # Second conv of each block optionally subsamples along time.
     self.second_conv_layers = nn.ModuleList(
         [
             nn.Conv1d(c_h, c_h, kernel_size=kernel_size, stride=stride)
             for stride, _ in zip(subsample, range(n_conv_blocks))
         ]
     )
     self.norm_layer = nn.InstanceNorm1d(c_h, affine=False)
     self.mean_layer = nn.Conv1d(c_h, c_out, kernel_size=1)
     self.std_layer = nn.Conv1d(c_h, c_out, kernel_size=1)
     self.dropout_layer = nn.Dropout(p=dropout_rate)
Beispiel #11
0
def normalization(fn_name, input_shape):
    """
        :param input_shape: (C) or (C,L) or (C,H,W)
    """
    num_channels = input_shape[0]
    rank = len(input_shape)
    if fn_name == 'batch':
        # 2d variant for image-shaped input (N,C,H,W), 1d for (N,C)/(N,C,L).
        return nn.BatchNorm2d(num_channels) if rank == 3 else nn.BatchNorm1d(num_channels)
    if fn_name == 'instance':
        if rank == 2:
            return nn.InstanceNorm1d(num_channels)  # Input: (N,C,L)
        if rank == 3:
            return nn.InstanceNorm2d(num_channels)  # Input: (N,C,H,W)
        return None  # a bare (C) shape has no instance-norm variant here
    if fn_name == 'layer':
        # Normalize over every axis after the batch axis. Input: (N,*)
        return nn.LayerNorm(input_shape[1:])
    if fn_name == 'group':
        # num_groups = 1: equivalent to LayerNorm along all axes;
        # num_groups = num_channels would match InstanceNorm1d/2d.
        return nn.GroupNorm(1, num_channels)  # Input: (N,C,*)
    return None
Beispiel #12
0
    def __init__(self,
                 hidden_size,
                 num_layers,
                 input_size,
                 device='cpu',
                 drop_prob=0,
                 lstm=True,
                 feature_norm=False,
                 output_size=300,
                 bidirectional=True):
        """Recurrent head: an LSTM (or GRU) over paired features followed by
        a linear output layer."""
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

        # Select the recurrent cell class up front.
        cell_cls = nn.LSTM if lstm else nn.GRU
        # Inter-layer dropout must be 0 when there is only one layer.
        inter_layer_dropout = drop_prob * (num_layers != 1)
        self.memory_cell = cell_cls(
            input_size=hidden_size * 2,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
            bidirectional=bidirectional)

        # deprecated arg: optional per-feature normalization, identity otherwise
        if feature_norm:
            self.norm = nn.InstanceNorm1d(num_features=input_size)
        else:
            self.norm = nn.Identity()

        self.linear = nn.Linear(hidden_size * 2, output_size)
Beispiel #13
0
def createMLP(dims, norm='bn', activation='relu', last_op=nn.Tanh(), dropout=False):
    """Build an MLP as a plain list of modules from the widths in `dims`.

    Hidden layers get the activation (and optional dropout); the final layer
    does not. `last_op` is appended at the very end when not None.
    NOTE: the default `last_op` module is created once at definition time and
    shared between calls (standard Python default-argument semantics).
    """
    act_classes = {
        'relu': nn.ReLU,
        'lrelu': nn.LeakyReLU,
        'selu': nn.SELU,
        'elu': nn.ELU,
        'prelu': nn.PReLU,
    }
    # Single activation instance reused between layers (as in the original).
    act = act_classes[activation]() if activation in act_classes else None

    mlp = []
    last_index = len(dims) - 1
    for i in range(1, len(dims)):
        linear = nn.Linear(dims[i - 1], dims[i])
        if norm == 'bn':
            mlp += [linear, nn.BatchNorm1d(dims[i])]
        elif norm == 'in':
            mlp += [linear, nn.InstanceNorm1d(dims[i])]
        elif norm == 'wn':
            mlp += [nn.utils.weight_norm(linear, name='weight')]
        elif norm == 'none':
            mlp += [linear]

        # Hidden layers (everything but the last) get activation + dropout.
        if i != last_index:
            if act is not None:
                mlp += [act]
            if dropout:
                mlp += [nn.Dropout(0.2)]

    if last_op is not None:
        mlp += [last_op]

    return mlp
Beispiel #14
0
    def __init__(
        self,
        input_dim,
        proj_dim=256,
        lstm_dim=768,
        num_lstm_layers=3,
        use_lstm_with_projection=True,
        use_torch_spec=False,
        audio_config=None,
    ):
        """Speaker encoder: stacked LSTMs (with or without projection) plus
        an optional torch-based spectrogram front end."""
        super().__init__()
        self.use_lstm_with_projection = use_lstm_with_projection
        self.use_torch_spec = use_torch_spec
        self.audio_config = audio_config
        self.proj_dim = proj_dim

        # choose the LSTM stack flavor
        if use_lstm_with_projection:
            # First layer consumes input_dim, later layers consume proj_dim.
            input_sizes = [input_dim] + [proj_dim] * (num_lstm_layers - 1)
            self.layers = nn.Sequential(
                *[LSTMWithProjection(size, lstm_dim, proj_dim) for size in input_sizes]
            )
        else:
            self.layers = LSTMWithoutProjection(input_dim, lstm_dim, proj_dim,
                                                num_lstm_layers)

        self.instancenorm = nn.InstanceNorm1d(input_dim)

        # Optional on-the-fly mel spectrogram extraction.
        if self.use_torch_spec:
            self.torch_spec = self.get_torch_mel_spectrogram_class(
                audio_config)
        else:
            self.torch_spec = None

        self._init_layers()
Beispiel #15
0
    def __init__(
        self,
        input_shape=None,
        input_size=None,
        eps=1e-05,
        momentum=0.1,
        track_running_stats=True,
        affine=False,
    ):
        """Thin wrapper around nn.InstanceNorm1d.

        Either `input_size` (the channel count) or `input_shape` (whose last
        dimension is taken as the channel count) must be provided.

        Raises:
            ValueError: if neither input_shape nor input_size is given.
        """
        super().__init__()

        if input_shape is None and input_size is None:
            raise ValueError("Expected input_shape or input_size as input")

        # Derive the channel count from the shape when not given explicitly.
        channels = input_size if input_size is not None else input_shape[-1]

        self.norm = nn.InstanceNorm1d(
            channels,
            eps=eps,
            momentum=momentum,
            track_running_stats=track_running_stats,
            affine=affine,
        )
 def __init__(self,
              c_in=None,
              c_h1=128,
              c_h2=512,
              c_h3=128,
              ns=0.2,
              dp=0.5):
     """Speech encoder: multi-width conv bank, strided conv stack, dense
     layers and a bidirectional GRU, with instance norm and dropout."""
     super(Encoder, self).__init__()
     self.ns = ns
     # Parallel convolutions with kernel sizes 1..7 form a conv bank.
     self.conv1s = nn.ModuleList(
         [nn.Conv1d(c_in, c_h1, kernel_size=width) for width in range(1, 8)])
     # Bank outputs are concatenated with the raw input (hence the + c_in).
     self.conv2 = nn.Conv1d(len(self.conv1s) * c_h1 + c_in,
                            c_h2,
                            kernel_size=1)
     # Alternating plain / stride-2 convolutions progressively downsample.
     self.conv3 = nn.Conv1d(c_h2, c_h2, kernel_size=5)
     self.conv4 = nn.Conv1d(c_h2, c_h2, kernel_size=5, stride=2)
     self.conv5 = nn.Conv1d(c_h2, c_h2, kernel_size=5)
     self.conv6 = nn.Conv1d(c_h2, c_h2, kernel_size=5, stride=2)
     self.conv7 = nn.Conv1d(c_h2, c_h2, kernel_size=5)
     self.conv8 = nn.Conv1d(c_h2, c_h2, kernel_size=5, stride=2)
     # dense1..dense4 (registration order matches the original)
     for idx in range(1, 5):
         setattr(self, 'dense{}'.format(idx), nn.Linear(c_h2, c_h2))
     self.RNN = nn.GRU(input_size=c_h2,
                       hidden_size=c_h3,
                       num_layers=1,
                       bidirectional=True)
     # Bidirectional GRU output (2 * c_h3) is combined with its c_h2 input.
     self.linear = nn.Linear(c_h2 + 2 * c_h3, c_h2)
     # normalization layers: ins_norm1..ins_norm6
     for idx in range(1, 7):
         setattr(self, 'ins_norm{}'.format(idx), nn.InstanceNorm1d(c_h2))
     # dropout layers: drop1..drop6
     for idx in range(1, 7):
         setattr(self, 'drop{}'.format(idx), nn.Dropout(p=dp))
Beispiel #17
0
    def __init__(self, in_size, num_features, num_heads):
        """Self-attention block: linear + InstanceNorm projections for the
        query, key and value paths, plus standalone norms for each.

        NOTE(review): `num_heads` is accepted but never stored or used here,
        matching the original — confirm whether that is intentional.
        """
        super(SelfAttentionBlock, self).__init__()

        self.in_size = in_size
        self.num_features = num_features

        def make_projection():
            # A fresh linear + norm pair per call; modules must not be shared.
            return nn.Sequential(
                nn.Linear(in_size, num_features),
                nn.InstanceNorm1d(num_features)
            )

        self.QueryLayer = make_projection()
        self.KeyLayer = make_projection()
        self.ValueLayer = make_projection()

        self.QueryNorm = nn.InstanceNorm1d(num_features)
        self.KeyNorm = nn.InstanceNorm1d(num_features)
        self.ValueNorm = nn.InstanceNorm1d(num_features)
Beispiel #18
0
    def __init__(self, nOut=1024, stride=1):
        """Audio-visual sync/speaker network.

        Builds a 2-D CNN over spectrograms (netcnnaud), a 3-D CNN over video
        clips (netcnnlip), four 1x1-conv heads projecting 512-dim features to
        nOut (audio, lip, speaker, face), and a mel-spectrogram front end
        with per-utterance instance normalization.

        Args:
            nOut: output embedding dimension of the four projection heads.
            stride: temporal stride used in the last audio conv and the first
                video conv.
        """
        super(SyncNetModel, self).__init__()

        # Audio feature extractor (2-D CNN over the spectrogram).
        self.netcnnaud = nn.Sequential(
            # (b, 1, 128, time)
            nn.Conv2d(1, 96, kernel_size=(5, 7), stride=(1, 1),
                      padding=(2, 2)),
            # (b, 96, 128, t-2)
            nn.BatchNorm2d(96),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
            # (b, 96,
            nn.Conv2d(96,
                      256,
                      kernel_size=(5, 5),
                      stride=(2, 1),
                      padding=(1, 1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 1)),
            nn.Conv2d(256, 384, kernel_size=(3, 3), padding=(1, 1)),
            nn.BatchNorm2d(384),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=(3, 3), padding=(1, 1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=(3, 3), padding=(1, 1)),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(3, 3), stride=(2, 2)),
            # Final conv collapses the frequency axis; `stride` controls the
            # temporal resolution of the audio features.
            nn.Conv2d(256,
                      512,
                      kernel_size=(4, 1),
                      padding=(0, 0),
                      stride=(1, stride)),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
        )

        # Audio projection head: 512 -> nOut via 1x1 convolutions.
        self.netfcaud = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Conv1d(512, nOut, kernel_size=1),
        )

        # Lip-sync projection head (same shape as the audio head).
        self.netfclip = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Conv1d(512, nOut, kernel_size=1),
        )

        # Speaker-identity projection head.
        self.netfcspk = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Conv1d(512, nOut, kernel_size=1),
        )

        # Face-identity projection head.
        self.netfcface = nn.Sequential(
            nn.Conv1d(512, 512, kernel_size=1),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Conv1d(512, nOut, kernel_size=1),
        )

        # Video feature extractor (3-D CNN over RGB clips); `stride` applies
        # to the temporal axis of the first convolution.
        self.netcnnlip = nn.Sequential(
            nn.Conv3d(3,
                      96,
                      kernel_size=(5, 7, 7),
                      stride=(stride, 2, 2),
                      padding=0),
            nn.BatchNorm3d(96),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2)),
            nn.Conv3d(96,
                      256,
                      kernel_size=(1, 5, 5),
                      stride=(1, 2, 2),
                      padding=(0, 1, 1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1, 3, 3),
                         stride=(1, 2, 2),
                         padding=(0, 1, 1)),
            nn.Conv3d(256, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.Conv3d(256, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.Conv3d(256, 256, kernel_size=(1, 3, 3), padding=(0, 1, 1)),
            nn.BatchNorm3d(256),
            nn.ReLU(inplace=True),
            nn.MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2)),
            nn.Conv3d(256, 512, kernel_size=(1, 6, 6), padding=0),
            nn.BatchNorm3d(512),
            nn.ReLU(inplace=True),
        )

        # 40-band mel spectrogram front end at 16 kHz with per-utterance
        # instance normalization. NOTE(review): the netcnnaud comment above
        # says "(b, 1, 128, time)" but n_mels here is 40 — confirm which is
        # the actual input height.
        self.instancenorm = nn.InstanceNorm1d(40)
        self.torchfb = torchaudio.transforms.MelSpectrogram(sample_rate=16000,
                                                            n_fft=512,
                                                            win_length=400,
                                                            hop_length=160,
                                                            f_min=0.0,
                                                            f_max=8000,
                                                            pad=0,
                                                            n_mels=40)
Beispiel #19
0
    def __init__(self):
        # isize=512, nz=100, nc=20, ndf=32
        """
        Args:
            isize: Image size (must be a power of two; defaults to 256).
            nz: Dimensionality of latent variable ``z``.
            nc: Dimensionality of input variable ``x``.
            ndf:

        Notes:
            Use non-overlapping stride to avoid checkboard artifacts:
                https://distill.pub/2016/deconv-checkerboard/

            NOTE(review): this constructor reads ``self.x_in``,
            ``self.x_hidden``, ``self.x_out``, ``self.kernel_size``,
            ``self.stride``, ``self.padding``, ``self.z_out`` and
            ``self.n_layers`` — these must be class attributes defined
            elsewhere on this class; confirm they exist before ``__init__``
            runs.
        """
        super().__init__()

        # Layers are collected in an OrderedDict, then registered both as
        # attributes (for nn.Module parameter tracking) and as self.model.
        model = OrderedDict()

        # 20 x 64+
        # First stage: adjacency conv + plain conv, no normalization.
        for i in range(1):
            model[f"adjacency_conv_{i}"] = AdjacencyConv(self.x_in, self.x_hidden)
            model[f"conv_{i}"] = nn.Conv1d(
                self.x_hidden,
                self.x_out,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                bias=False,
            )
            model[f"leaky_relu_{i}"] = nn.LeakyReLU(0.2)
            in_feat = self.x_out

        # Middle stages: adjacency conv + channel-doubling conv with
        # affine instance normalization.
        for i in range(1, 4):
            out_feat = in_feat * 2
            model[f"adjacency_conv_{i}"] = AdjacencyConv(in_feat, in_feat)
            model[f"conv_{i}"] = nn.Conv1d(
                in_feat,
                out_feat,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                bias=False,
            )
            model[f"instance_norm_{i}"] = nn.InstanceNorm1d(out_feat, affine=True)
            model[f"leaky_relu_{i}"] = nn.LeakyReLU(0.2)
            in_feat = out_feat

        # Late stages: channel-doubling convs without adjacency convs.
        for i in range(4, 6):
            out_feat = in_feat * 2
            model[f"conv_{i}"] = nn.Conv1d(
                in_feat,
                out_feat,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                bias=False,
            )
            model[f"instance_norm_{i}"] = nn.InstanceNorm1d(out_feat, affine=True)
            model[f"leaky_relu_{i}"] = nn.LeakyReLU(0.2)
            in_feat = out_feat

        # Sanity check: channel count must match the expected latent width.
        assert in_feat == self.z_out, in_feat

        # Output stage: 1x1 conv down to a single channel + sigmoid score.
        for i in range(6, 7):
            out_feat = 1
            model[f"conv_{i}"] = nn.Conv1d(
                in_feat, out_feat, kernel_size=1, stride=1, padding=0, bias=False
            )
            model[f"sigmoid_{i}"] = nn.Sigmoid()

        # Sanity check: total stage count must match the declared depth.
        assert (i + 1) == self.n_layers

        # Register each layer as an attribute so nn.Module tracks parameters.
        for key, value in model.items():
            setattr(self, key, value)
        self.model = model
Beispiel #20
0
    def __init__(self, in_channels=2048, out_channels=2048):
        """Adaptive instance normalization whose scale and bias are predicted
        from a conditioning vector by two linear maps."""
        super(DGAdaIN, self).__init__()

        # Affine parameters predicted from the conditioning input.
        self.affine_scale = nn.Linear(in_channels, out_channels, bias=True)
        self.affine_bias = nn.Linear(in_channels, out_channels, bias=True)
        # Parameter-free instance norm; statistics are computed per sample.
        self.norm = nn.InstanceNorm1d(in_channels, affine=False, momentum=0.9,
                                      track_running_stats=False)
Beispiel #21
0
def get_norm(norm_type, size):
	if(norm_type == 'batchnorm'):
		return nn.BatchNorm1d(size)
	elif(norm_type == 'instancenorm'):
		return nn.InstanceNorm1d(size)
Beispiel #22
0
def make_block(insize, arch, last=False):
    """Build a small fully connected block of width ``insize``.

    Args:
        insize: feature width of the block's input (and output).
        arch: architecture code. 'a'-'f' select multi-layer norm/act/linear
            stacks; 'linear' gives a bare Linear; the remaining names give
            a Linear followed by a single activation or normalisation.
        last: when True, any arch not matched by 'a'-'f' collapses to a
            plain Linear layer.

    Returns:
        An ``nn.Sequential``, or ``None`` when nothing matches (original
        fall-through behaviour).
    """
    bn, act, fc = nn.BatchNorm1d, nn.PReLU, nn.Linear
    half = insize // 2

    # Multi-layer stacks: 'a'-'c' are pre-activation (BPF...), 'd'-'f'
    # post-activation (FBP...); 'c' and 'f' contain a bottleneck.
    structured = {
        'a': lambda: nn.Sequential(bn(insize), act(), fc(insize, insize)),
        'b': lambda: nn.Sequential(bn(insize), act(), fc(insize, insize),
                                   bn(insize), fc(insize, insize)),
        'c': lambda: nn.Sequential(bn(insize), act(), fc(insize, insize),
                                   bn(insize), act(), fc(insize, half),
                                   bn(half), fc(half, insize)),
        'd': lambda: nn.Sequential(fc(insize, insize), bn(insize), act()),
        'e': lambda: nn.Sequential(fc(insize, insize), bn(insize), act(),
                                   fc(insize, insize), bn(insize)),
        'f': lambda: nn.Sequential(fc(insize, insize), bn(insize), act(),
                                   fc(insize, half), bn(half), act(),
                                   fc(half, insize), bn(insize)),
    }
    if arch in structured:
        return structured[arch]()

    # A bare Linear, either requested explicitly or forced for a last layer.
    if arch == 'linear' or last:
        return nn.Sequential(fc(insize, insize), )

    # Linear followed by one activation / normalisation tail.
    tails = {
        'relu': lambda: nn.ReLU(),
        'elu': lambda: nn.ELU(),
        'prelu': lambda: nn.PReLU(),
        'sigmoid': lambda: nn.Sigmoid(),
        'tanh': lambda: nn.Tanh(),
        'batchnorm': lambda: nn.BatchNorm1d(insize),
        'groupnorm': lambda: nn.GroupNorm(6, 1),
        'instancenorm': lambda: nn.InstanceNorm1d(insize),
        'layernorm': lambda: nn.LayerNorm(insize),
    }
    if arch in tails:
        return nn.Sequential(fc(insize, insize), tails[arch]())
 def __init__(self, dim_in, dim_out):
     # Build a 1-D residual unit: conv -> instance norm -> gated linear unit.
     super(ResidualBlock, self).__init__()
     # 3-wide, stride-1, same-padding conv; bias omitted because the
     # following instance norm has its own affine parameters.
     self.conv = nn.Conv1d(dim_in, dim_out, kernel_size=3, stride=1, padding=1, bias=False)
     self.inst_norm = nn.InstanceNorm1d(dim_out, affine = True)
     # GLU is a project-local gating module (defined elsewhere in the project).
     self.glu = GLU()
     '''
Beispiel #24
0
 def __init__(self,
              num_inputs=1,
              sincnet=True,
              kwidths=[251, 10, 5, 5, 5, 5, 5, 5],
              strides=[1, 10, 2, 1, 2, 1, 2, 2],
              dilations=[1, 1, 1, 1, 1, 1, 1, 1],
              fmaps=[64, 64, 128, 128, 256, 256, 512, 512],
              norm_type='bnorm',
              pad_mode='reflect',
              sr=16000,
              emb_dim=256,
              activation=None,
              rnn_pool=False,
              rnn_layers=1,
              rnn_dropout=0,
              rnn_type='qrnn',
              vq_K=None,
              vq_beta=0.25,
              vq_gamma=0.99,
              norm_out=False,
              tanh_out=False,
              resblocks=False,
              denseskips=False,
              densemerge='sum',
              name='WaveFe'):
     """Convolutional waveform front-end (feature extractor).

     Builds len(kwidths) conv blocks (the first optionally a SincNet layer,
     later ones optionally residual), then a 1x1 conv projection to
     `emb_dim`, with optional bidirectional-RNN pooling, dense skip
     adapters, VQ-EMA quantization and an output normalisation layer.

     NOTE(review): the list defaults (kwidths/strides/dilations/fmaps) are
     mutable default arguments; they appear only to be read here, but
     tuples would be safer.
     """
     super().__init__(name=name)
     # apply sincnet at first layer
     self.sincnet = sincnet
     self.kwidths = kwidths
     self.strides = strides
     self.fmaps = fmaps
     self.densemerge = densemerge
     if denseskips:
         self.denseskips = nn.ModuleList()
     self.blocks = nn.ModuleList()
     # one kernel width / stride / fmap per conv stage
     assert len(kwidths) == len(strides)
     assert len(strides) == len(fmaps)
     concat_emb_dim = emb_dim
     ninp = num_inputs
     for n, (kwidth, stride, dilation,
             fmap) in enumerate(zip(kwidths, strides, dilations, fmaps),
                                start=1):
         if n > 1:
             # make sure sincnet is deactivated after first layer
             sincnet = False
         if resblocks and not sincnet:
             # residual conv block (note: `stride` is not forwarded here)
             feblock = FeResBlock(ninp,
                                  fmap,
                                  kwidth,
                                  dilation,
                                  act=activation,
                                  pad_mode=pad_mode,
                                  norm_type=norm_type)
         else:
             feblock = FeBlock(ninp,
                               fmap,
                               kwidth,
                               stride,
                               dilation,
                               act=activation,
                               pad_mode=pad_mode,
                               norm_type=norm_type,
                               sincnet=sincnet,
                               sr=sr)
         self.blocks.append(feblock)
         if denseskips and n < len(kwidths):
             # add projection adapter
             self.denseskips.append(nn.Conv1d(fmap, emb_dim, 1, bias=False))
             if densemerge == 'concat':
                 # each concatenated skip widens the output embedding
                 concat_emb_dim += emb_dim
         ninp = fmap
     # last projection
     if rnn_pool:
         # bidirectional, so emb_dim // 2 per direction
         self.rnn = build_rnn_block(fmap,
                                    emb_dim // 2,
                                    rnn_layers=rnn_layers,
                                    rnn_type=rnn_type,
                                    bidirectional=True,
                                    dropout=rnn_dropout)
         self.W = nn.Conv1d(emb_dim, emb_dim, 1)
     else:
         self.W = nn.Conv1d(fmap, emb_dim, 1)
     self.emb_dim = concat_emb_dim
     self.rnn_pool = rnn_pool
     # optional EMA vector quantizer over the output embeddings
     if vq_K is not None and vq_K > 0:
         self.quantizer = VQEMA(vq_K, self.emb_dim, vq_beta, vq_gamma)
     else:
         self.quantizer = None
     # output vectors are normalized to norm^2 1
     if norm_out:
         if norm_type == 'bnorm':
             self.norm_out = nn.BatchNorm1d(self.emb_dim, affine=False)
         else:
             self.norm_out = nn.InstanceNorm1d(self.emb_dim)
     self.tanh_out = tanh_out
Beispiel #25
0
    def __init__(self,
                 block,
                 layers,
                 num_filters,
                 nOut,
                 encoder_type='SAP',
                 n_mels=40,
                 log_input=True,
                 **kwargs):
        """Speaker-embedding CNN over a mel-spectrogram front-end.

        Args:
            block: residual block class passed through to ``_make_layer``.
            layers: number of blocks per stage (length 4).
            num_filters: channel width per stage (length 4).
            nOut: output embedding dimensionality.
            encoder_type: 'SAP' or 'ASP' pooling; anything else raises.
            n_mels: number of mel bands produced by the front-end.
            log_input: stored flag (read elsewhere, e.g. in forward).
        """
        super(STDUCNN_H, self).__init__()

        print('Embedding size is %d, encoder %s.' % (nOut, encoder_type))

        self.inplanes = num_filters[0]
        self.encoder_type = encoder_type
        self.n_mels = n_mels
        self.log_input = log_input

        # Stem modules: 3x3 same-padding conv, ReLU and batch norm.
        self.conv1 = nn.Conv2d(1, num_filters[0], kernel_size=3, stride=1,
                               padding=1)
        self.relu = nn.ReLU(inplace=True)
        self.bn1 = nn.BatchNorm2d(num_filters[0])

        # Four residual stages; stages 2-4 downsample by (2, 2).
        stage_strides = (None, (2, 2), (2, 2), (2, 2))
        for idx, (planes, depth, stride) in enumerate(
                zip(num_filters, layers, stage_strides), start=1):
            if stride is None:
                stage = self._make_layer(block, planes, depth)
            else:
                stage = self._make_layer(block, planes, depth, stride=stride)
            setattr(self, 'layer%d' % idx, stage)

        self.instancenorm = nn.InstanceNorm1d(n_mels)
        # Waveform -> pre-emphasis -> mel spectrogram.
        self.torchfb = torch.nn.Sequential(
            PreEmphasis(),
            torchaudio.transforms.MelSpectrogram(
                sample_rate=16000,
                n_fft=512,
                win_length=400,
                hop_length=160,
                window_fn=torch.hamming_window,
                n_mels=n_mels))

        # Three stride-2 stages shrink the mel axis by a factor of 8.
        outmap_size = int(self.n_mels / 8)
        attn_width = num_filters[3] * outmap_size

        # 1x1-conv bottleneck producing per-frame attention weights.
        self.attention = nn.Sequential(
            nn.Conv1d(attn_width, 128, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Conv1d(128, attn_width, kernel_size=1),
            nn.Softmax(dim=2),
        )

        if self.encoder_type == "SAP":
            out_dim = attn_width
        elif self.encoder_type == "ASP":
            # ASP doubles the pooled feature size (mean + std statistics).
            out_dim = attn_width * 2
        else:
            raise ValueError('Undefined encoder')

        self.fc = nn.Linear(out_dim, nOut)

        # Kaiming init for convolutions, constant init for batch norms.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out',
                                        nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
Beispiel #26
0
    def __init__(self,
                 word_feature,
                 interaction_item,
                 interaction_word,
                 use_user_feature,
                 use_item_feature,
                 use_word_feature,
                 n_units=[16, 16],
                 n_heads=[8, 8, 1],
                 item_dim=200,
                 user_dim=200,
                 dropout=0.1,
                 attn_dropout=0.0,
                 fine_tune=False,
                 instance_normalization=False):
        """Batched multi-head graph-attention network over user/item/word
        features.

        Builds frozen word embeddings, optional attention-projection
        parameters for item/word feature aggregation, and a stack of
        ``BatchMultiHeadGraphAttention`` layers sized by ``n_units`` and
        ``n_heads``.

        NOTE(review): `fine_tune` is accepted but never read in this
        constructor — confirm whether it should unfreeze the embeddings.
        """
        super(BatchGAT, self).__init__()
        self.n_layer = len(n_units)
        self.dropout = dropout
        self.inst_norm = instance_normalization

        f_item, f_user = item_dim, user_dim
        # Prepend the input width so n_units[i] -> n_units[i + 1] describes
        # layer i of the attention stack.
        n_units = [f_user] + n_units

        if self.inst_norm:
            self.norm = nn.InstanceNorm1d(f_user, momentum=0.0, affine=True)

        self.use_user_feature = use_user_feature
        self.use_item_feature = use_item_feature
        self.use_word_feature = use_word_feature

        self.interaction_item = interaction_item
        self.interaction_word = interaction_word

        # Word embedding table initialised from the given tensor and frozen.
        self.word_feature = nn.Embedding(word_feature.size(0),
                                         word_feature.size(1))
        self.word_feature.weight = nn.Parameter(word_feature)
        self.word_feature.weight.requires_grad = False

        if self.use_item_feature:
            # Projection word-dim -> user-dim plus an attention vector.
            self.w = Parameter(torch.Tensor(word_feature.size(1), f_user))
            self.bias = Parameter(torch.Tensor(f_user))
            self.attn = Parameter(torch.Tensor(f_user, 1))

            self.softmax = nn.Softmax(dim=-1)

            init.xavier_uniform_(self.w)
            init.constant_(self.bias, 0)
            init.xavier_uniform_(self.attn)

        if self.use_word_feature:
            # Two-stage projection: word-dim -> item-dim -> user-dim, each
            # with its own bias and attention vector.
            self.w1 = Parameter(torch.Tensor(word_feature.size(1), f_item))
            self.bias1 = Parameter(torch.Tensor(f_item))
            self.attn1 = Parameter(torch.Tensor(f_item, 1))

            self.w2 = Parameter(torch.Tensor(f_item, f_user))
            self.bias2 = Parameter(torch.Tensor(f_user))
            self.attn2 = Parameter(torch.Tensor(f_user, 1))

            # May overwrite the softmax assigned above; both are dim=-1.
            self.softmax = nn.Softmax(dim=-1)

            init.xavier_uniform_(self.w1)
            init.constant_(self.bias1, 0)
            init.xavier_uniform_(self.attn1)

            init.xavier_uniform_(self.w2)
            init.constant_(self.bias2, 0)
            init.xavier_uniform_(self.attn2)

        # Attention stack: layer i consumes n_heads[i - 1] concatenated
        # head outputs of the previous layer (the input layer is unsplit).
        self.layer_stack = nn.ModuleList()
        for i in range(self.n_layer):
            f_in = n_units[i] * n_heads[i - 1] if i else n_units[i]
            self.layer_stack.append(
                BatchMultiHeadGraphAttention(n_heads[i],
                                             f_in=f_in,
                                             f_out=n_units[i + 1],
                                             attn_dropout=attn_dropout))
Beispiel #27
0
    def __init__(self,
                 num_classes,
                 embedding_size,
                 input_dim,
                 alpha=0.,
                 input_norm='',
                 channels=[512, 512, 512, 512, 512, 1536],
                 context=[5, 3, 3, 5],
                 downsample=None,
                 resnet_size=17,
                 stride=[1],
                 dropout_p=0.0,
                 dropout_layer=False,
                 encoder_type='STAP',
                 block_type='Basic',
                 mask='None',
                 mask_len=20,
                 **kwargs):
        """TDNN-style speaker-embedding network with residual TDNN blocks.

        Args:
            num_classes: number of output classes for the classifier head.
            embedding_size: size of the speaker embedding.
            input_dim: feature dimension of the input frames.
            alpha: if non-zero, apply L2 normalisation with this scale.
            input_norm: 'Instance', 'Mean' or '' (no input normalisation).
            channels: output widths of the six TDNN stages.
            context: stored context sizes (kept for interface parity).
            downsample: passed through to the block factory.
            resnet_size: 14 or 17, selects the per-stage block counts.
            stride: per-stage strides; a single entry is replicated to 4.
            dropout_p: dropout probability before pooling.
            dropout_layer: stored flag.
            encoder_type: 'STAP' or 'SASP' pooling; anything else raises.
            mask: 'time', 'freq', 'time_freq' or anything else for no mask.
            mask_len: mask length for the masking layers.
        """
        super(RET_v2, self).__init__()
        self.num_classes = num_classes
        self.dropout_p = dropout_p
        self.dropout_layer = dropout_layer
        self.input_dim = input_dim
        self.alpha = alpha
        self.mask = mask
        self.channels = channels
        self.context = context
        # Copy before padding: the original appended to the caller's list in
        # place, which also corrupted the shared mutable default `[1]`.
        self.stride = list(stride)
        if len(self.stride) == 1:
            while len(self.stride) < 4:
                self.stride.append(self.stride[0])

        self.tdnn_size = resnet_size
        tdnn_type = {14: [1, 1, 1, 0], 17: [1, 1, 1, 1]}
        self.layers = tdnn_type[
            resnet_size] if resnet_size in tdnn_type else tdnn_type[17]

        # Optional input normalisation.
        if input_norm == 'Instance':
            self.inst_layer = nn.InstanceNorm1d(input_dim)
        elif input_norm == 'Mean':
            self.inst_layer = Mean_Norm()
        else:
            self.inst_layer = None

        # Optional time/frequency masking. Every branch now sets
        # self.mask_layer: the original set a misspelled `self.maks_layer`
        # for "time" and overwrote `self.mask` for "freq", leaving
        # `self.mask_layer` undefined in those two cases.
        if self.mask == "time":
            self.mask_layer = TimeMaskLayer(mask_len=mask_len)
        elif self.mask == "freq":
            self.mask_layer = FreqMaskLayer(mask_len=mask_len)
        elif self.mask == "time_freq":
            self.mask_layer = nn.Sequential(TimeMaskLayer(mask_len=mask_len),
                                            FreqMaskLayer(mask_len=mask_len))
        else:
            self.mask_layer = None

        # Select TDNN layer and residual block implementations.
        TDNN_layer = TimeDelayLayer_v5
        if block_type == 'Basic':
            Blocks = TDNNBlock
        elif block_type == 'Basic_v6':
            Blocks = TDNNBlock_v6
            TDNN_layer = TimeDelayLayer_v6
        elif block_type == 'Agg':
            Blocks = TDNNBottleBlock
        elif block_type == 'cbam':
            Blocks = TDNNCBAMBlock
        else:
            raise ValueError(block_type)

        # Stage 1: context-5 TDNN followed by residual blocks.
        self.frame1 = TDNN_layer(input_dim=self.input_dim,
                                 output_dim=self.channels[0],
                                 context_size=5,
                                 dilation=1,
                                 stride=self.stride[0])
        self.frame2 = self._make_block(block=Blocks,
                                       inplanes=self.channels[0],
                                       planes=self.channels[0],
                                       downsample=downsample,
                                       dilation=1,
                                       blocks=self.layers[0])

        # Stage 2: context-3 TDNN + blocks.
        self.frame4 = TDNN_layer(input_dim=self.channels[0],
                                 output_dim=self.channels[1],
                                 context_size=3,
                                 dilation=1,
                                 stride=self.stride[1])
        self.frame5 = self._make_block(block=Blocks,
                                       inplanes=self.channels[1],
                                       planes=self.channels[1],
                                       downsample=downsample,
                                       dilation=1,
                                       blocks=self.layers[1])

        # Stage 3: context-3 TDNN + blocks.
        self.frame7 = TDNN_layer(input_dim=self.channels[1],
                                 output_dim=self.channels[2],
                                 context_size=3,
                                 dilation=1,
                                 stride=self.stride[2])
        self.frame8 = self._make_block(block=Blocks,
                                       inplanes=self.channels[2],
                                       planes=self.channels[2],
                                       downsample=downsample,
                                       dilation=1,
                                       blocks=self.layers[2])

        # Stage 4 exists only for resnet_size == 17 (layers[3] != 0).
        if self.layers[3] != 0:
            self.frame10 = TDNN_layer(input_dim=self.channels[2],
                                      output_dim=self.channels[3],
                                      context_size=5,
                                      dilation=1,
                                      stride=self.stride[3])
            self.frame11 = self._make_block(block=Blocks,
                                            inplanes=self.channels[3],
                                            planes=self.channels[3],
                                            downsample=downsample,
                                            dilation=1,
                                            blocks=self.layers[3])

        # Two 1x1 TDNN projections up to the pooling width.
        # NOTE(review): frame13 reads channels[3] even when stage 4 is
        # skipped; this only works when channels[2] == channels[3].
        self.frame13 = TDNN_layer(input_dim=self.channels[3],
                                  output_dim=self.channels[4],
                                  context_size=1,
                                  dilation=1)
        self.frame14 = TDNN_layer(input_dim=self.channels[4],
                                  output_dim=self.channels[5],
                                  context_size=1,
                                  dilation=1)

        self.drop = nn.Dropout(p=self.dropout_p)

        # Statistics pooling (mean + std doubles the feature width below).
        if encoder_type == 'STAP':
            self.encoder = StatisticPooling(input_dim=self.channels[5])
        elif encoder_type == 'SASP':
            self.encoder = AttentionStatisticPooling(
                input_dim=self.channels[5], hidden_dim=512)
        else:
            raise ValueError(encoder_type)

        self.segment1 = nn.Sequential(nn.Linear(self.channels[5] * 2, 512),
                                      nn.ReLU(), nn.BatchNorm1d(512))

        self.segment2 = nn.Sequential(nn.Linear(512,
                                                embedding_size), nn.ReLU(),
                                      nn.BatchNorm1d(embedding_size))

        if self.alpha:
            self.l2_norm = L2_Norm(self.alpha)

        self.classifier = nn.Linear(embedding_size, num_classes)
        # self.bn = nn.BatchNorm1d(num_classes)

        # Parameter initialisation for each layer.
        for m in self.modules():
            if isinstance(m, nn.BatchNorm1d):  # weight = 1, bias = 0
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, TimeDelayLayer_v5):
                # nn.init.normal(m.kernel.weight, mean=0., std=1.)
                nn.init.kaiming_normal_(m.kernel.weight,
                                        mode='fan_out',
                                        nonlinearity='relu')
Beispiel #28
0
 def __init__(self, _unused=0, affine=True):
     """Wrap a single-channel InstanceNorm1d.

     Args:
         _unused: ignored; kept for interface compatibility.
         affine: whether the norm has learnable scale/shift parameters.
     """
     super().__init__()
     # Normalises exactly one feature channel.
     self.norm = nn.InstanceNorm1d(num_features=1, affine=affine)
    def __init__(self,
                 conv_dim=64,
                 num_speakers=70,
                 repeat_num=6):
        """Down-sample / residual-bottleneck / up-sample generator.

        Args:
            conv_dim: accepted for interface compatibility (not read here).
            num_speakers: speaker-condition count passed to each
                ``ResidualBlock``.
            repeat_num: accepted for interface compatibility (not read here).
        """
        super(Generator, self).__init__()

        # 2-D down-sampling stack.
        self.downsample = nn.Sequential(
            Down2d_initial(1, 128, (3, 9), (1, 1), (1, 4)),
            Down2d(64, 256, (4, 8), (2, 2), (1, 3)),
            Down2d(128, 512, (4, 8), (2, 2), (1, 3)),
        )

        # Down-conversion: 1x1 conv squeezes 2304 -> 256 channels, then
        # instance-normalises them.
        self.down_conversion = nn.Sequential(
            nn.Conv1d(2304, 256, kernel_size=1, stride=1, padding=0,
                      bias=False),
            nn.InstanceNorm1d(256, affine=True),
        )

        # Bottleneck: nine identical speaker-conditioned residual blocks,
        # registered as residual_1 .. residual_9.
        for idx in range(1, 10):
            res = ResidualBlock(in_channel=256,
                                out_channel=512,
                                n_speakers=num_speakers)
            setattr(self, 'residual_%d' % idx, res)

        # Up-conversion: 1x1 conv expands 256 -> 2304 channels.
        self.up_conversion = nn.Conv1d(256, 2304, kernel_size=1, stride=1,
                                       padding=0, bias=False)

        # 2-D up-sampling stack and final 7x7 projection to one channel.
        self.up1 = Up2d(256, 256, (4, 4), (2, 2), (1, 1))
        self.up2 = Up2d(128, 128, (4, 4), (2, 2), (1, 1))
        self.deconv = nn.Conv2d(64, 1, kernel_size=7, stride=1, padding=3,
                                bias=False)
 def __init__(self, planes, ratio=0.5):
     # Split `planes` channels into a batch-normalised part and an
     # instance-normalised part (IBN-style hybrid normalisation).
     super(IBN, self).__init__()
     # `half` is the BN share: planes * (1 - ratio); the IN part gets the
     # remainder. NOTE(review): confirm the intended split direction —
     # some IBN variants instance-normalise the first half instead.
     self.half = int(planes * (1 - ratio))
     self.BN = nn.BatchNorm1d(self.half)
     self.IN = nn.InstanceNorm1d(planes - self.half, affine=True)