def __init__(self, num_classes, embedding_size, input_dim, alpha=0., input_norm='',
             channels=[512, 512, 512, 512, 512, 1536], context=[5, 3, 3, 5],
             downsample=None, resnet_size=17, stride=[1],
             dropout_p=0.0, dropout_layer=False,
             encoder_type='STAP', block_type='Basic',
             mask='None', mask_len=20, **kwargs):
    super(RET_v2, self).__init__()
    self.num_classes = num_classes
    self.dropout_p = dropout_p
    self.dropout_layer = dropout_layer
    self.input_dim = input_dim
    self.alpha = alpha
    self.mask = mask
    self.channels = channels
    self.context = context
    self.stride = stride

    # Broadcast a single stride value to all four TDNN stages.
    if len(self.stride) == 1:
        while len(self.stride) < 4:
            self.stride.append(self.stride[0])

    self.tdnn_size = resnet_size
    tdnn_type = {14: [1, 1, 1, 0], 17: [1, 1, 1, 1]}
    self.layers = tdnn_type[resnet_size] if resnet_size in tdnn_type else tdnn_type[17]

    if input_norm == 'Instance':
        self.inst_layer = nn.InstanceNorm1d(input_dim)
    elif input_norm == 'Mean':
        self.inst_layer = Mean_Norm()
    else:
        self.inst_layer = None

    if self.mask == "time":
        self.mask_layer = TimeMaskLayer(mask_len=mask_len)
    elif self.mask == "freq":
        self.mask_layer = FreqMaskLayer(mask_len=mask_len)
    elif self.mask == "time_freq":
        self.mask_layer = nn.Sequential(TimeMaskLayer(mask_len=mask_len),
                                        FreqMaskLayer(mask_len=mask_len))
    else:
        self.mask_layer = None

    TDNN_layer = TimeDelayLayer_v5
    if block_type == 'Basic':
        Blocks = TDNNBlock
    elif block_type == 'Basic_v6':
        Blocks = TDNNBlock_v6
        TDNN_layer = TimeDelayLayer_v6
    elif block_type == 'Agg':
        Blocks = TDNNBottleBlock
    elif block_type == 'cbam':
        Blocks = TDNNCBAMBlock
    else:
        raise ValueError(block_type)

    self.frame1 = TDNN_layer(input_dim=self.input_dim, output_dim=self.channels[0],
                             context_size=5, dilation=1, stride=self.stride[0])
    self.frame2 = self._make_block(block=Blocks, inplanes=self.channels[0], planes=self.channels[0],
                                   downsample=downsample, dilation=1, blocks=self.layers[0])
    self.frame4 = TDNN_layer(input_dim=self.channels[0], output_dim=self.channels[1],
                             context_size=3, dilation=1, stride=self.stride[1])
    self.frame5 = self._make_block(block=Blocks, inplanes=self.channels[1], planes=self.channels[1],
                                   downsample=downsample, dilation=1, blocks=self.layers[1])
    self.frame7 = TDNN_layer(input_dim=self.channels[1], output_dim=self.channels[2],
                             context_size=3, dilation=1, stride=self.stride[2])
    self.frame8 = self._make_block(block=Blocks, inplanes=self.channels[2], planes=self.channels[2],
                                   downsample=downsample, dilation=1, blocks=self.layers[2])

    # The fourth stage is only built for layouts that request it (layers[3] != 0).
    if self.layers[3] != 0:
        self.frame10 = TDNN_layer(input_dim=self.channels[2], output_dim=self.channels[3],
                                  context_size=5, dilation=1, stride=self.stride[3])
        self.frame11 = self._make_block(block=Blocks, inplanes=self.channels[3], planes=self.channels[3],
                                        downsample=downsample, dilation=1, blocks=self.layers[3])

    self.frame13 = TDNN_layer(input_dim=self.channels[3], output_dim=self.channels[4],
                              context_size=1, dilation=1)
    self.frame14 = TDNN_layer(input_dim=self.channels[4], output_dim=self.channels[5],
                              context_size=1, dilation=1)

    self.drop = nn.Dropout(p=self.dropout_p)

    if encoder_type == 'STAP':
        self.encoder = StatisticPooling(input_dim=self.channels[5])
    elif encoder_type == 'SASP':
        self.encoder = AttentionStatisticPooling(input_dim=self.channels[5], hidden_dim=512)
    else:
        raise ValueError(encoder_type)

    self.segment1 = nn.Sequential(nn.Linear(self.channels[5] * 2, 512),
                                  nn.ReLU(),
                                  nn.BatchNorm1d(512))
    self.segment2 = nn.Sequential(nn.Linear(512, embedding_size),
                                  nn.ReLU(),
                                  nn.BatchNorm1d(embedding_size))

    if self.alpha:
        self.l2_norm = L2_Norm(self.alpha)

    self.classifier = nn.Linear(embedding_size, num_classes)
    # self.bn = nn.BatchNorm1d(num_classes)

    # Initialize the parameters of each layer.
    for m in self.modules():
        if isinstance(m, nn.BatchNorm1d):
            # Set BatchNorm weight to 1 and bias to 0.
            m.weight.data.fill_(1)
            m.bias.data.zero_()
        elif isinstance(m, TimeDelayLayer_v5):
            # nn.init.normal(m.kernel.weight, mean=0., std=1.)
            nn.init.kaiming_normal_(m.kernel.weight, mode='fan_out', nonlinearity='relu')
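# --- Added sketch (not part of the original file) ------------------------------
# Illustrates how resnet_size selects the RET_v2 block layout built above:
# resnet_size=14 maps to layers [1, 1, 1, 0], so the fourth stage
# (frame10 / frame11) is skipped, while resnet_size=17 keeps all four stages;
# any unknown value falls back to the 17-layer layout. The helper name is
# illustrative only.
def _ret_v2_layer_layout_demo():
    tdnn_type = {14: [1, 1, 1, 0], 17: [1, 1, 1, 1]}
    for resnet_size in (14, 17, 50):
        layers = tdnn_type[resnet_size] if resnet_size in tdnn_type else tdnn_type[17]
        kept = 'fourth stage kept' if layers[3] else 'fourth stage skipped'
        print(resnet_size, layers, kept)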
def __init__(self, num_classes, embedding_size, input_dim, alpha=0., input_norm='',
             filter=None, sr=16000, feat_dim=64, exp=False, filter_fix=False,
             dropout_p=0.0, dropout_layer=False, encoder_type='STAP', num_classes_b=0,
             block_type='basic', first_2d=False, stride=[1],
             mask='None', mask_len=20,
             channels=[512, 512, 512, 512, 1500], **kwargs):
    super(TDNN_v5, self).__init__()
    self.num_classes = num_classes
    self.num_classes_b = num_classes_b
    self.dropout_p = dropout_p
    self.dropout_layer = dropout_layer
    self.input_dim = input_dim
    self.channels = channels
    self.alpha = alpha
    self.mask = mask
    self.filter = filter
    self.feat_dim = feat_dim
    self.block_type = block_type.lower()
    self.stride = stride

    # Broadcast a single stride value to all four striding positions.
    if len(self.stride) == 1:
        while len(self.stride) < 4:
            self.stride.append(self.stride[0])
    if np.sum(self.stride) > 4:
        print('The stride for tdnn layers are: ', str(self.stride))

    # Optional learnable filterbank front-end.
    if self.filter == 'fDLR':
        self.filter_layer = fDLR(input_dim=input_dim, sr=sr, num_filter=feat_dim,
                                 exp=exp, filter_fix=filter_fix)
    elif self.filter == 'fBLayer':
        self.filter_layer = fBLayer(input_dim=input_dim, sr=sr, num_filter=feat_dim,
                                    exp=exp, filter_fix=filter_fix)
    elif self.filter == 'fBPLayer':
        self.filter_layer = fBPLayer(input_dim=input_dim, sr=sr, num_filter=feat_dim,
                                     exp=exp, filter_fix=filter_fix)
    elif self.filter == 'fLLayer':
        self.filter_layer = fLLayer(input_dim=input_dim, num_filter=feat_dim, exp=exp)
    elif self.filter == 'Avg':
        self.filter_layer = nn.AvgPool2d(kernel_size=(1, 7), stride=(1, 3))
    else:
        self.filter_layer = None

    if input_norm == 'Instance':
        self.inst_layer = nn.InstanceNorm1d(input_dim)
    elif input_norm == 'Mean':
        self.inst_layer = Mean_Norm()
    else:
        self.inst_layer = None

    if self.mask == "time":
        self.mask_layer = TimeMaskLayer(mask_len=mask_len)
    elif self.mask == "freq":
        self.mask_layer = FreqMaskLayer(mask_len=mask_len)
    elif self.mask == "time_freq":
        self.mask_layer = nn.Sequential(TimeMaskLayer(mask_len=mask_len),
                                        FreqMaskLayer(mask_len=mask_len))
    else:
        self.mask_layer = None

    # If a filter front-end is used, the TDNN stack sees feat_dim features.
    if self.filter_layer is not None:
        self.input_dim = feat_dim

    if self.block_type == 'basic':
        TDlayer = TimeDelayLayer_v5
    elif self.block_type == 'basic_v6':
        TDlayer = TimeDelayLayer_v6
    elif self.block_type == 'shuffle':
        TDlayer = ShuffleTDLayer
    else:
        raise ValueError(self.block_type)

    if not first_2d:
        self.frame1 = TimeDelayLayer_v5(input_dim=self.input_dim, output_dim=self.channels[0],
                                        context_size=5, stride=self.stride[0], dilation=1)
    else:
        self.frame1 = Conv2DLayer(input_dim=self.input_dim, output_dim=self.channels[0],
                                  stride=self.stride[0])

    self.frame2 = TDlayer(input_dim=self.channels[0], output_dim=self.channels[1],
                          context_size=3, stride=self.stride[1], dilation=2)
    self.frame3 = TDlayer(input_dim=self.channels[1], output_dim=self.channels[2],
                          context_size=3, stride=self.stride[2], dilation=3)
    self.frame4 = TDlayer(input_dim=self.channels[2], output_dim=self.channels[3],
                          context_size=1, stride=self.stride[0], dilation=1)
    self.frame5 = TimeDelayLayer_v5(input_dim=self.channels[3], output_dim=self.channels[4],
                                    context_size=1, stride=self.stride[3], dilation=1)

    self.drop = nn.Dropout(p=self.dropout_p)

    if encoder_type == 'STAP':
        self.encoder = StatisticPooling(input_dim=self.channels[4])
        self.encoder_output = self.channels[4] * 2
    elif encoder_type == 'ASP':
        self.encoder = AttentionStatisticPooling(input_dim=self.channels[4],
                                                 hidden_dim=self.channels[4])
        self.encoder_output = self.channels[4] * 2
    elif encoder_type == 'SAP':
        self.encoder = SelfAttentionPooling(input_dim=self.channels[4],
                                            hidden_dim=self.channels[4])
        self.encoder_output = self.channels[4]
    elif encoder_type == 'Ghos_v3':
        self.encoder = GhostVLAD_v3(num_clusters=self.num_classes_b, gost=1,
                                    dim=self.channels[4])
        self.encoder_output = self.channels[4] * 2
    else:
        raise ValueError(encoder_type)

    self.segment6 = nn.Sequential(nn.Linear(self.encoder_output, 512),
                                  nn.ReLU(),
                                  nn.BatchNorm1d(512))
    self.segment7 = nn.Sequential(nn.Linear(512, embedding_size),
                                  nn.ReLU(),
                                  nn.BatchNorm1d(embedding_size))

    if self.alpha:
        self.l2_norm = L2_Norm(self.alpha)

    self.classifier = nn.Linear(embedding_size, num_classes)
    # self.bn = nn.BatchNorm1d(num_classes)

    # Initialize the parameters of each layer.
    for m in self.modules():
        if isinstance(m, nn.BatchNorm1d):
            # Set BatchNorm weight to 1 and bias to 0.
            m.weight.data.fill_(1)
            m.bias.data.zero_()
        elif isinstance(m, TimeDelayLayer_v5):
            # nn.init.normal(m.kernel.weight, mean=0., std=1.)
            nn.init.kaiming_normal_(m.kernel.weight, mode='fan_out', nonlinearity='relu')
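# --- Added usage sketch (not part of the original file) ------------------------
# Minimal smoke test for TDNN_v5 as constructed above. The forward() call is an
# assumption: the x-vector models in this file are typically fed features of
# shape (batch, frames, input_dim); treat the (logits, embedding) return
# signature below as hypothetical. The helper name is illustrative only.
def _tdnn_v5_smoke_test():
    import torch
    model = TDNN_v5(num_classes=1211, embedding_size=512, input_dim=64,
                    input_norm='Mean', encoder_type='STAP')
    x = torch.randn(2, 300, 64)            # 2 utterances, 300 frames, 64-dim features
    logits, embedding = model(x)           # hypothetical forward() signature
    print(logits.shape, embedding.shape)   # expected: (2, 1211) and (2, 512)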
def __init__(self, num_classes, embedding_size, input_dim, alpha=0., input_norm='',
             dropout_p=0.0, encoder_type='STAP', **kwargs):
    super(TDNN_v4, self).__init__()
    self.num_classes = num_classes
    self.dropout_p = dropout_p
    self.input_dim = input_dim
    self.alpha = alpha

    if input_norm == 'Instance':
        self.inst_layer = nn.InstanceNorm1d(input_dim)
    elif input_norm == 'Mean':
        self.inst_layer = Mean_Norm()
    else:
        self.inst_layer = None

    self.frame1 = TimeDelayLayer_v4(input_dim=self.input_dim, output_dim=512, context_size=5, dilation=1)
    self.frame2 = TimeDelayLayer_v4(input_dim=512, output_dim=512, context_size=3, dilation=2)
    self.frame3 = TimeDelayLayer_v4(input_dim=512, output_dim=512, context_size=3, dilation=3)
    self.frame4 = TimeDelayLayer_v4(input_dim=512, output_dim=512, context_size=1, dilation=1)
    self.frame5 = TimeDelayLayer_v4(input_dim=512, output_dim=1500, context_size=1, dilation=1)

    self.drop = nn.Dropout(p=self.dropout_p)

    if encoder_type == 'STAP':
        self.encoder = StatisticPooling(input_dim=1500)
    elif encoder_type == 'SASP':
        self.encoder = AttentionStatisticPooling(input_dim=1500, hidden_dim=512)
    else:
        raise ValueError(encoder_type)

    self.segment6 = nn.Sequential(nn.Linear(3000, 512),
                                  nn.ReLU(),
                                  nn.BatchNorm1d(512))
    self.segment7 = nn.Sequential(nn.Linear(512, embedding_size),
                                  nn.ReLU(),
                                  nn.BatchNorm1d(embedding_size))

    if self.alpha:
        self.l2_norm = L2_Norm(self.alpha)

    self.classifier = nn.Linear(embedding_size, num_classes)
    # self.bn = nn.BatchNorm1d(num_classes)

    # Initialize the parameters of each layer.
    for m in self.modules():
        if isinstance(m, nn.BatchNorm1d):
            # Set BatchNorm weight to 1 and bias to 0.
            m.weight.data.fill_(1)
            m.bias.data.zero_()
        elif isinstance(m, TimeDelayLayer_v2):
            # nn.init.normal(m.kernel.weight, mean=0., std=1.)
            nn.init.kaiming_normal_(m.kernel.weight, mode='fan_out', nonlinearity='relu')
def __init__(self, num_classes, embedding_size=256, batch_norm=True, input_norm='Mean',
             input_dim=80, dropout_p=0.0, encoder_type='STAP', activation='leakyrelu', **kwargs):
    super(ETDNN_v5, self).__init__()
    self.num_classes = num_classes
    self.input_dim = input_dim

    if input_norm == 'Instance':
        self.inst_layer = nn.InstanceNorm1d(input_dim)
    elif input_norm == 'Mean':
        self.inst_layer = Mean_Norm()
    else:
        self.inst_layer = None

    self.dropout_p = dropout_p

    # Extended-TDNN frame-level stack: dilated context layers interleaved with
    # 1-frame affine layers.
    self.frame1 = TimeDelayLayer_v5(input_dim=input_dim, output_dim=512, context_size=5, dilation=1,
                                    activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)
    self.affine2 = TimeDelayLayer_v5(input_dim=512, output_dim=512, context_size=1, dilation=1,
                                     activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame3 = TimeDelayLayer_v5(input_dim=512, output_dim=512, context_size=3, dilation=2,
                                    activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)
    self.affine4 = TimeDelayLayer_v5(input_dim=512, output_dim=512, context_size=1, dilation=1,
                                     activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame5 = TimeDelayLayer_v5(input_dim=512, output_dim=512, context_size=3, dilation=3,
                                    activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)
    self.affine6 = TimeDelayLayer_v5(input_dim=512, output_dim=512, context_size=1, dilation=1,
                                     activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame7 = TimeDelayLayer_v5(input_dim=512, output_dim=512, context_size=3, dilation=4,
                                    activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame8 = TimeDelayLayer_v5(input_dim=512, output_dim=512, context_size=1, dilation=1,
                                    activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame9 = TimeDelayLayer_v5(input_dim=512, output_dim=512, context_size=1, dilation=1,
                                    activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame10 = TimeDelayLayer_v5(input_dim=512, output_dim=1500, context_size=1, dilation=1,
                                     activation=activation, batch_norm=batch_norm, dropout_p=dropout_p)

    # self.segment11 = nn.Linear(3000, embedding_size)
    # self.leakyrelu = nn.LeakyReLU()
    # self.batchnorm = nn.BatchNorm1d(embedding_size)

    if encoder_type == 'STAP':
        self.encoder = StatisticPooling(input_dim=1500)
    else:
        self.encoder = nn.AdaptiveAvgPool2d((1, None))

    self.segment12 = nn.Sequential(nn.Linear(3000, embedding_size))
    self.segment13 = nn.Sequential(nn.Linear(embedding_size, embedding_size))

    if activation == 'relu':
        act_fn = nn.ReLU
    elif activation == 'leakyrelu':
        act_fn = nn.LeakyReLU
    elif activation == 'prelu':
        act_fn = nn.PReLU
    else:
        raise ValueError(activation)

    self.segment12.add_module('seg12_act', act_fn())
    self.segment13.add_module('seg13_act', act_fn())
    self.segment12.add_module('seg12_bn', nn.BatchNorm1d(embedding_size))
    self.segment13.add_module('seg13_bn', nn.BatchNorm1d(embedding_size))

    self.classifier = nn.Linear(embedding_size, num_classes)

    # Initialize the parameters of each layer.
    for m in self.modules():
        if isinstance(m, nn.BatchNorm1d):
            # Set BatchNorm weight to 1 and bias to 0.
            m.weight.data.fill_(1)
            m.bias.data.zero_()
def __init__(self, num_classes, embedding_size=256, batch_norm=True, input_norm='Mean',
             input_dim=80, dropout_p=0.0, dropout_layer=False, encoder_type='STAP', **kwargs):
    super(ETDNN, self).__init__()
    self.num_classes = num_classes
    self.input_dim = input_dim

    if input_norm == 'Instance':
        self.inst_layer = nn.InstanceNorm1d(input_dim)
    elif input_norm == 'Mean':
        self.inst_layer = Mean_Norm()
    else:
        self.inst_layer = None

    self.dropout_layer = dropout_layer
    self.dropout_p = dropout_p

    self.frame1 = TimeDelayLayer_v2(input_dim=input_dim, output_dim=512, context_size=5, dilation=1,
                                    activation='leakyrelu', batch_norm=batch_norm, dropout_p=dropout_p)
    self.affine2 = TimeDelayLayer_v2(input_dim=512, output_dim=512, context_size=1, dilation=1,
                                     activation='leakyrelu', batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame3 = TimeDelayLayer_v2(input_dim=512, output_dim=512, context_size=3, dilation=2,
                                    activation='leakyrelu', batch_norm=batch_norm, dropout_p=dropout_p)
    self.affine4 = TimeDelayLayer_v2(input_dim=512, output_dim=512, context_size=1, dilation=1,
                                     activation='leakyrelu', batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame5 = TimeDelayLayer_v2(input_dim=512, output_dim=512, context_size=3, dilation=3,
                                    activation='leakyrelu', batch_norm=batch_norm, dropout_p=dropout_p)
    self.affine6 = TimeDelayLayer_v2(input_dim=512, output_dim=512, context_size=1, dilation=1,
                                     activation='leakyrelu', batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame7 = TimeDelayLayer_v2(input_dim=512, output_dim=512, context_size=3, dilation=4,
                                    activation='leakyrelu', batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame8 = TimeDelayLayer_v2(input_dim=512, output_dim=512, context_size=1, dilation=1,
                                    activation='leakyrelu', batch_norm=batch_norm, dropout_p=dropout_p)
    self.frame9 = TimeDelayLayer_v2(input_dim=512, output_dim=1500, context_size=1, dilation=1,
                                    activation='leakyrelu', batch_norm=batch_norm, dropout_p=dropout_p)

    # self.segment11 = nn.Linear(3000, embedding_size)
    # self.leakyrelu = nn.LeakyReLU()
    # self.batchnorm = nn.BatchNorm1d(embedding_size)

    self.drop = nn.Dropout(p=self.dropout_p)

    if encoder_type == 'STAP':
        self.encoder = StatisticPooling(input_dim=1500)
    else:
        self.encoder = nn.AdaptiveAvgPool2d((1, None))

    self.segment11 = nn.Sequential(nn.Linear(3000, embedding_size),
                                   nn.LeakyReLU(),
                                   nn.BatchNorm1d(embedding_size))

    self.classifier = nn.Linear(embedding_size, num_classes)

    # Initialize the parameters of each layer.
    for m in self.modules():
        if isinstance(m, nn.BatchNorm1d):
            # Set BatchNorm weight to 1 and bias to 0.
            m.weight.data.fill_(1)
            m.bias.data.zero_()
        elif isinstance(m, TimeDelayLayer_v2):
            nn.init.kaiming_normal_(m.kernel.weight, mode='fan_out', nonlinearity='leaky_relu')
def __init__(self, num_classes, embedding_size=512, input_dim=80, input_norm='',
             filter=None, sr=16000, feat_dim=64, exp=False, filter_fix=False,
             dropout_p=0.0, dropout_layer=False, encoder_type='STAP', num_classes_b=0,
             block_type='basic', alpha=0., mask='None', mask_len=20,
             channels=[512, 512, 512, 512, 1536], **kwargs):
    super().__init__()
    self.num_classes = num_classes
    self.num_classes_b = num_classes_b
    self.dropout_p = dropout_p
    self.dropout_layer = dropout_layer
    self.input_dim = input_dim
    self.channels = channels
    self.alpha = alpha
    self.mask = mask
    self.filter = filter
    self.feat_dim = feat_dim
    self.block_type = block_type.lower()
    self.embedding_size = embedding_size

    if input_norm == 'Inst':
        self.inst_layer = nn.InstanceNorm1d(input_dim)
    elif input_norm == 'Mean':
        self.inst_layer = Mean_Norm()
    else:
        self.inst_layer = None

    # Frame-level stack: a Conv1d front-end followed by three SE-Res2Blocks
    # with increasing dilation.
    self.layer1 = Conv1dReluBn(input_dim, self.channels[0], kernel_size=5, padding=2)
    self.layer2 = SE_Res2Block(self.channels[1], kernel_size=3, stride=1, padding=2, dilation=2, scale=8)
    self.layer3 = SE_Res2Block(self.channels[2], kernel_size=3, stride=1, padding=3, dilation=3, scale=8)
    self.layer4 = SE_Res2Block(self.channels[3], kernel_size=3, stride=1, padding=4, dilation=4, scale=8)

    self.conv = nn.Conv1d(self.channels[4], self.channels[4], kernel_size=1)
    self.pooling = AttentiveStatsPool(self.channels[4], 128)
    self.bn0 = nn.BatchNorm1d(self.channels[4] * 2)

    self.fc1 = nn.Linear(self.channels[4] * 2, self.embedding_size)
    self.bn1 = nn.BatchNorm1d(self.embedding_size)

    self.classifier = nn.Linear(self.embedding_size, self.num_classes)
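# --- Added sketch (not part of the original file) ------------------------------
# Shape check for the attentive-statistics pooling stage above: a (batch, C, T)
# frame-level tensor is reduced to (batch, 2*C) by concatenating the
# attention-weighted mean and standard deviation over time, which is why bn0
# and fc1 expect channels[4] * 2 inputs. The toy softmax weights below stand in
# for AttentiveStatsPool's learned attention; the helper name is illustrative.
def _attentive_stats_shape_demo():
    import torch
    frames = torch.randn(4, 1536, 300)                   # channels[4] = 1536, 300 frames
    attn = torch.softmax(torch.randn(4, 1, 300), dim=2)  # attention weights over time
    mean = torch.sum(frames * attn, dim=2)               # (4, 1536)
    var = torch.sum(frames ** 2 * attn, dim=2) - mean ** 2
    std = torch.sqrt(var.clamp(min=1e-9))
    pooled = torch.cat([mean, std], dim=1)               # (4, 3072) == channels[4] * 2
    print(pooled.shape)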