Exemple #1
0
    def __init__(self, in_dim, out_dim, args, mean_std=None):
        """Create the LFCC front-end(s) and the convolutional classifier(s).

        One front-end / classifier pair is built per entry of the front-end
        configuration lists (a single entry by default).

        Args:
            in_dim: forwarded to ``self.prepare_mean_std``.
            out_dim: forwarded to ``self.prepare_mean_std``.
            args: forwarded to ``self.prepare_mean_std``.
            mean_std: forwarded to ``self.prepare_mean_std``
                (``None`` by default).
        """
        super(Model, self).__init__()

        ##### required part, no need to change #####

        # mean/std of input and output, kept as non-trainable parameters
        in_m, in_s, out_m, out_s = self.prepare_mean_std(
            in_dim, out_dim, args, mean_std)
        self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
        self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
        self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
        self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

        # flags for debugging / validation (by default False)
        self.model_debug = False
        self.validation = False
        #####

        # target data: the protocol file is the first optional argument
        # of the project configuration
        protocol_file = prj_conf.optional_argument[0]
        self.protocol_parser = protocol_parse(protocol_file)

        # working sampling rate, torchaudio is used to change sampling rate
        self.m_target_sr = 16000

        # re-sampling (optional)
        self.m_resampler = torchaudio.transforms.Resample(
            prj_conf.wav_samp_rate, self.m_target_sr)

        # vad (optional)
        self.m_vad = torchaudio.transforms.Vad(sample_rate=self.m_target_sr)

        # flag for balanced class (temporary use)
        self.v_flag = 1

        # frame shift (number of points)
        self.frame_hops = [160]
        # frame length
        self.frame_lens = [320]
        # FFT length
        self.fft_n = [512]

        # LFCC dim (base component)
        self.lfcc_dim = [20]
        self.lfcc_with_delta = True

        # window type
        self.win = torch.hann_window
        # floor in log-spectrum-amplitude calculating
        self.amp_floor = 0.00001

        # number of frames kept per trial: 10 * 16 * 750 waveform points
        # divided by the frame shift, i.e. 750 frames for a shift of 160
        self.v_truncate_lens = [10 * 16 * 750 // x for x in self.frame_hops]

        # number of sub-models
        self.v_submodels = len(self.frame_lens)

        # dimension of embedding vectors
        self.v_emd_dim = 1

        # output class
        self.v_out_class = 1

        # Sub-modules are collected in local lists and registered once as
        # ModuleLists below (same registration order as before).
        transforms = []
        output_acts = []
        frontends = []

        for idx, (trunc_len, _, lfcc_dim) in enumerate(
                zip(self.v_truncate_lens, self.fft_n, self.lfcc_dim)):

            # feature dimension seen by the classifier; presumably static +
            # delta + delta-delta coefficients -- confirm against forward()
            if self.lfcc_with_delta:
                lfcc_dim = lfcc_dim * 3

            # Convolutional part. Every MaxFeatureMap2D is followed by a
            # layer that takes half as many channels as the preceding
            # convolution produced (presumably MFM halves the channels --
            # confirm in nii_nn).
            transforms.append(
                torch_nn.Sequential(
                    torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(64, affine=False),
                    torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2])))

            # Output part. The "// 16" factors presumably mirror the four
            # 2x2 max-pooling layers above, and 32 the final channel count
            # -- confirm against forward().
            output_acts.append(
                torch_nn.Sequential(
                    torch_nn.Dropout(0.7),
                    torch_nn.Linear(
                        (trunc_len // 16) * (lfcc_dim // 16) * 32, 160),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.Linear(80, self.v_emd_dim)))

            # LFCC front-end for this configuration
            frontends.append(
                nii_front_end.LFCC(self.frame_lens[idx],
                                   self.frame_hops[idx],
                                   self.fft_n[idx],
                                   self.m_target_sr,
                                   self.lfcc_dim[idx],
                                   with_energy=True))

        self.m_transform = torch_nn.ModuleList(transforms)
        self.m_output_act = torch_nn.ModuleList(output_acts)
        self.m_frontend = torch_nn.ModuleList(frontends)
    def __init__(self, in_dim, out_dim, args, mean_std=None):
        """Create the LFCC front-end(s), conv + BLSTM classifier(s) and
        the OC angle layer(s).

        One set of sub-modules is built per entry of the front-end
        configuration lists (a single entry by default). No truncation
        length is configured (``v_truncate_lens`` holds ``None``).

        Args:
            in_dim: forwarded to ``self.prepare_mean_std``.
            out_dim: forwarded to ``self.prepare_mean_std``.
            args: forwarded to ``self.prepare_mean_std``.
            mean_std: forwarded to ``self.prepare_mean_std``
                (``None`` by default).
        """
        super(Model, self).__init__()

        ##### required part, no need to change #####

        # mean/std of input and output, kept as non-trainable parameters
        in_m, in_s, out_m, out_s = self.prepare_mean_std(
            in_dim, out_dim, args, mean_std)
        self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
        self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
        self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
        self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

        # a flag for debugging (by default False)
        #self.model_debug = False
        #self.validation = False
        #####

        ####
        # on input waveform and output target
        ####
        # Load protocol and prepare the target data for network training;
        # the protocol file is the first optional argument of the project
        # configuration
        protocol_file = prj_conf.optional_argument[0]
        self.protocol_parser = protocol_parse(protocol_file)

        # Working sampling rate
        #  torchaudio may be used to change sampling rate
        self.m_target_sr = 16000

        ####
        # optional configs (not used)
        ####
        # re-sampling (optional)
        #self.m_resampler = torchaudio.transforms.Resample(
        #    prj_conf.wav_samp_rate, self.m_target_sr)

        # vad (optional)
        #self.m_vad = torchaudio.transforms.Vad(sample_rate = self.m_target_sr)

        # flag for balanced class (temporary use)
        #self.v_flag = 1

        ####
        # front-end configuration
        #  multiple front-end configurations may be used
        #  by default, use a single front-end
        ####
        # frame shift (number of waveform points)
        self.frame_hops = [160]
        # frame length
        self.frame_lens = [320]
        # FFT length
        self.fft_n = [512]

        # LFCC dim (base component)
        self.lfcc_dim = [20]
        self.lfcc_with_delta = True

        # window type
        self.win = torch.hann_window
        # floor in log-spectrum-amplitude calculating (not used)
        self.amp_floor = 0.00001

        # number of frames to be kept for each trial
        # no truncation: one None per front-end configuration
        self.v_truncate_lens = [None for x in self.frame_hops]

        # number of sub-models (by default, a single model)
        self.v_submodels = len(self.frame_lens)

        # dimension of embedding vectors
        self.v_emd_dim = 64

        # output classes
        self.v_out_class = 1

        ####
        # create network
        ####
        # Sub-modules are collected in local lists and registered once as
        # ModuleLists below (same registration order as before).
        # 1st part of the classifier
        transforms = []
        # recurrent layers applied before pooling
        before_pooling = []
        # 2nd part of the classifier
        output_acts = []
        # front-end
        frontends = []
        # final part on training
        angle_layers = []

        # it can handle models with multiple front-end configuration
        # by default, only a single front-end
        # (v_truncate_lens and fft_n only take part in the zip; their
        #  values are not needed to size any layer here)
        for idx, (_, _, lfcc_dim) in enumerate(
                zip(self.v_truncate_lens, self.fft_n, self.lfcc_dim)):

            # feature dimension seen by the classifier; presumably static +
            # delta + delta-delta coefficients -- confirm against forward()
            if self.lfcc_with_delta:
                lfcc_dim = lfcc_dim * 3

            # width shared by the BLSTM layers and the embedding layer; the
            # "// 16" presumably mirrors the four 2x2 max-pooling layers and
            # 32 the final channel count -- confirm against forward()
            hidden_dim = (lfcc_dim // 16) * 32

            # Convolutional part. Every MaxFeatureMap2D is followed by a
            # layer that takes half as many channels as the preceding
            # convolution produced (presumably MFM halves the channels --
            # confirm in nii_nn).
            transforms.append(
                torch_nn.Sequential(
                    torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(64, affine=False),
                    torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Dropout(0.7)))

            # two stacked BLSTM layers with equal input and output width
            before_pooling.append(
                torch_nn.Sequential(
                    nii_nn.BLSTMLayer(hidden_dim, hidden_dim),
                    nii_nn.BLSTMLayer(hidden_dim, hidden_dim)))

            # linear projection to the embedding vector
            output_acts.append(torch_nn.Linear(hidden_dim, self.v_emd_dim))

            # angle layer operating on the embedding vector
            angle_layers.append(nii_ocsoftmax.OCAngleLayer(self.v_emd_dim))

            # LFCC front-end for this configuration
            frontends.append(
                nii_front_end.LFCC(self.frame_lens[idx],
                                   self.frame_hops[idx],
                                   self.fft_n[idx],
                                   self.m_target_sr,
                                   self.lfcc_dim[idx],
                                   with_energy=True))

        self.m_frontend = torch_nn.ModuleList(frontends)
        self.m_transform = torch_nn.ModuleList(transforms)
        self.m_output_act = torch_nn.ModuleList(output_acts)
        self.m_angle = torch_nn.ModuleList(angle_layers)
        self.m_before_pooling = torch_nn.ModuleList(before_pooling)
    def __init__(self, in_dim, out_dim, args, mean_std=None):
        """Create the LFCC front-end(s), ResNet back-end(s) and OC angle
        layer(s).

        One front-end / back-end / angle-layer triple is built per entry of
        the front-end configuration lists (a single entry by default).

        Args:
            in_dim: forwarded to ``self.prepare_mean_std``.
            out_dim: forwarded to ``self.prepare_mean_std``.
            args: forwarded to ``self.prepare_mean_std``.
            mean_std: forwarded to ``self.prepare_mean_std``
                (``None`` by default).
        """
        super(Model, self).__init__()

        ##### required part, no need to change #####

        # mean/std of input and output, kept as non-trainable parameters
        in_m, in_s, out_m, out_s = self.prepare_mean_std(
            in_dim, out_dim, args, mean_std)
        self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
        self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
        self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
        self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

        # flags for debugging / validation (by default False)
        self.model_debug = False
        self.flag_validation = False
        #####

        ####
        # on input waveform and output target
        ####
        # Load protocol and prepare the target data for network training;
        # the protocol file is the first optional argument of the project
        # configuration
        protocol_file = prj_conf.optional_argument[0]
        self.protocol_parser = protocol_parse(protocol_file)

        # Working sampling rate
        #  torchaudio may be used to change sampling rate
        self.m_target_sr = 16000

        ####
        # optional modules
        # NOTE(review): they are instantiated here; whether they are
        # actually applied is not visible from __init__ -- confirm
        ####
        # re-sampling (optional)
        self.m_resampler = torchaudio.transforms.Resample(
            prj_conf.wav_samp_rate, self.m_target_sr)

        # vad (optional)
        self.m_vad = torchaudio.transforms.Vad(sample_rate=self.m_target_sr)

        # flag for balanced class (temporary use)
        self.v_flag = 1

        ####
        # front-end configuration
        #  multiple front-end configurations may be used
        #  by default, use a single front-end
        ####
        # frame shift (number of waveform points)
        self.frame_hops = [160]
        # frame length
        self.frame_lens = [320]
        # FFT length
        self.fft_n = [512]

        # LFCC dim (base component)
        self.lfcc_dim = [20]
        self.lfcc_with_delta = True

        # window type
        self.win = torch.hann_window
        # floor in log-spectrum-amplitude calculating (not used)
        self.amp_floor = 0.00001

        # number of frames to be kept for each trial
        # 750 frames are quite long for ASVspoof2019 LA with frame_shift = 10ms
        self.v_truncate_lens = [10 * 16 * 750 // x for x in self.frame_hops]

        # number of sub-models (by default, a single model)
        self.v_submodels = len(self.frame_lens)

        # dimension of embedding vectors, which are fed into the
        # oc-softmax layer
        self.v_emd_dim = 256

        # output class (1 for one-class softmax)
        self.v_out_class = 1

        ####
        # create network
        ####
        # Sub-modules are collected in local lists and registered once as
        # ModuleLists below (same registration order as before).
        # back-end
        backends = []
        # front-end
        frontends = []
        # softmax layer for back-end
        angle_layers = []

        # The zipped configuration lists only determine how many sub-models
        # are built; the back-end is sized by the embedding dimension alone.
        for idx, _ in enumerate(
                zip(self.v_truncate_lens, self.fft_n, self.lfcc_dim)):

            backends.append(nii_resnet.ResNet(self.v_emd_dim))

            # LFCC front-end for this configuration
            frontends.append(
                nii_front_end.LFCC(self.frame_lens[idx],
                                   self.frame_hops[idx],
                                   self.fft_n[idx],
                                   self.m_target_sr,
                                   self.lfcc_dim[idx],
                                   with_energy=True))

            angle_layers.append(nii_oc_softmax.OCAngleLayer(self.v_emd_dim))

        self.m_model = torch_nn.ModuleList(backends)
        self.m_frontend = torch_nn.ModuleList(frontends)
        self.m_a_softmax = torch_nn.ModuleList(angle_layers)