Example 1
    def __init__(self, in_dim, out_dim, args, mean_std=None):
        super(Model, self).__init__()

        ##### required part, no need to change #####

        # mean std of input and output
        in_m, in_s, out_m, out_s = self.prepare_mean_std(
            in_dim, out_dim, args, mean_std)
        self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
        self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
        self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
        self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

        # flags for debugging (by default False)
        #self.model_debug = False
        #self.validation = False
        #####

        ####
        # on input waveform and output target
        ####
        # Load protocol and prepare the target data for network training
        protocol_file = prj_conf.optional_argument[0]
        self.protocol_parser = protocol_parse(protocol_file)

        # working sampling rate; torchaudio may be used to change the sampling rate
        self.m_target_sr = 16000

        ####
        # optional configs (not used)
        ####

        # re-sampling (optional)
        #self.m_resampler = torchaudio.transforms.Resample(
        #    prj_conf.wav_samp_rate, self.m_target_sr)

        # vad (optional)
        #self.m_vad = torchaudio.transforms.Vad(sample_rate = self.m_target_sr)

        # flag for balanced class (temporary use)
        #self.v_flag = 1

        ####
        # front-end configuration
        #  multiple front-end configurations may be used
        #  by default, use a single front-end
        ####
        # frame shift (number of points)
        self.frame_hops = [160]
        # frame length
        self.frame_lens = [320]
        # FFT length
        self.fft_n = [512]

        self.spec_with_delta = False
        self.spec_fb_dim = 60

        # window type
        self.win = torch.hann_window
        # floor value used when computing the log spectrum amplitude
        self.amp_floor = 0.00001

        # manually keep only the first 750 frames of each trial
        # (10 * 16 * 750 = 120000 samples; 120000 // 160 = 750 frames, 7.5 s)
        self.v_truncate_lens = [10 * 16 * 750 // x for x in self.frame_hops]

        # number of sub-models
        self.v_submodels = len(self.frame_lens)

        # dimension of embedding vectors
        self.v_emd_dim = 64

        # number of output classes
        self.v_out_class = 2

        ####
        # create network
        ####
        self.m_transform = []
        self.m_output_act = []
        self.m_frontend = []
        self.m_angle = []

        for idx, (trunc_len, fft_n) in enumerate(
                zip(self.v_truncate_lens, self.fft_n)):

            fft_n_bins = fft_n // 2 + 1

            self.m_transform.append(
                torch_nn.Sequential(
                    TrainableLinearFb(fft_n, self.m_target_sr,
                                      self.spec_fb_dim),
                    torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(64, affine=False),
                    torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2])))
            self.m_output_act.append(
                torch_nn.Sequential(
                    torch_nn.Dropout(0.7),
                    torch_nn.Linear(
                        (trunc_len // 16) * (self.spec_fb_dim // 16) * 32,
                        512),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.Linear(256, self.v_emd_dim)))

            self.m_frontend.append(
                nii_front_end.Spectrogram(self.frame_lens[idx],
                                          self.frame_hops[idx],
                                          self.fft_n[idx], self.m_target_sr))

            self.m_angle.append(
                nii_amsoftmax.AMAngleLayer(self.v_emd_dim, self.v_out_class))

        self.m_transform = torch_nn.ModuleList(self.m_transform)
        self.m_output_act = torch_nn.ModuleList(self.m_output_act)
        self.m_frontend = torch_nn.ModuleList(self.m_frontend)
        self.m_angle = torch_nn.ModuleList(self.m_angle)

        # done
        return
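In all three examples, every Conv2d that outputs 2N channels is immediately followed by nii_nn.MaxFeatureMap2D, and the next layer expects only N input channels (Conv2d(1, 64) is followed by Conv2d(32, ...), and so on). A minimal sketch of a max-feature-map activation that accounts for this halving is shown below; it assumes the standard LCNN-style MFM definition, and MaxFeatureMap2DSketch is a hypothetical stand-in, not the project's actual nii_nn implementation.

import torch

class MaxFeatureMap2DSketch(torch.nn.Module):
    # Max-feature-map (MFM): split the channel axis into two halves
    # and keep the element-wise maximum, halving the channel count.
    def forward(self, x):
        # x: (batch, 2 * C, height, width)
        half_a, half_b = torch.chunk(x, 2, dim=1)
        return torch.max(half_a, half_b)

# a (1, 64, 10, 10) activation becomes (1, 32, 10, 10), which matches
# the Conv2d(1, 64) -> MFM -> Conv2d(32, ...) pattern in the models
mfm = MaxFeatureMap2DSketch()
print(mfm(torch.randn(1, 64, 10, 10)).shape)  # torch.Size([1, 32, 10, 10])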
Example 2
    def __init__(self, in_dim, out_dim, args, mean_std=None):
        super(Model, self).__init__()

        ##### required part, no need to change #####

        # mean std of input and output
        in_m, in_s, out_m, out_s = self.prepare_mean_std(
            in_dim, out_dim, args, mean_std)
        self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
        self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
        self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
        self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

        # flags for debugging (by default False)
        # self.model_debug = False
        # self.flag_validation = False
        #####

        ####
        # on input waveform and output target
        ####
        # Load protocol and prepare the target data for network training
        protocol_file = prj_conf.optional_argument[0]
        self.protocol_parser = protocol_parse(protocol_file)

        # Working sampling rate
        #  torchaudio may be used to change sampling rate
        self.m_target_sr = 16000

        ####
        # optional configs (not used)
        ####
        # re-sampling (optional)
        #self.m_resampler = torchaudio.transforms.Resample(
        #    prj_conf.wav_samp_rate, self.m_target_sr)

        # vad (optional)
        #self.m_vad = torchaudio.transforms.Vad(sample_rate = self.m_target_sr)

        # flag for balanced class (temporary use)
        #self.v_flag = 1

        ####
        # front-end configuration
        #  multiple front-end configurations may be used
        #  by default, use a single front-end
        ####
        # frame shift (number of waveform points)
        self.frame_hops = [160]
        # frame length
        self.frame_lens = [320]
        # FFT length
        self.fft_n = [512]

        # LFB dim (base component)
        self.lfb_dim = [60]
        self.lfb_with_delta = False

        # window type
        self.win = torch.hann_window
        # floor value used when computing the log spectrum amplitude (not used)
        self.amp_floor = 0.00001

        # number of frames to be kept for each trial
        # no truncation
        self.v_truncate_lens = [None for x in self.frame_hops]

        # number of sub-models (by default, a single model)
        self.v_submodels = len(self.frame_lens)

        # dimension of embedding vectors
        self.v_emd_dim = 64

        # output classes
        self.v_out_class = 2

        ####
        # create network
        ####
        # 1st part of the classifier
        self.m_transform = []
        # pooling layer
        self.m_pooling = []
        # 2nd part of the classifier
        self.m_output_act = []
        # front-end
        self.m_frontend = []
        # final part for output layer
        self.m_angle = []

        # it can handle models with multiple front-end configuration
        # by default, only a single front-end
        for idx, (trunc_len, fft_n, lfb_dim) in enumerate(
                zip(self.v_truncate_lens, self.fft_n, self.lfb_dim)):

            fft_n_bins = fft_n // 2 + 1
            if self.lfb_with_delta:
                lfb_dim = lfb_dim * 3

            self.m_transform.append(
                torch_nn.Sequential(
                    torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(64, affine=False),
                    torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Dropout(0.7)))

            self.m_pooling.append(
                nii_nn.SelfWeightedPooling((lfb_dim // 16) * 32))

            self.m_output_act.append(
                torch_nn.Linear((lfb_dim // 16) * 32 * 2, self.v_emd_dim))

            self.m_angle.append(
                nii_amsoftmax.AMAngleLayer(self.v_emd_dim,
                                           self.v_out_class,
                                           s=10,
                                           m=0.35))

            self.m_frontend.append(
                nii_front_end.LFB(self.frame_lens[idx],
                                  self.frame_hops[idx],
                                  self.fft_n[idx],
                                  self.m_target_sr,
                                  self.lfb_dim[idx],
                                  with_energy=False,
                                  with_emphasis=True,
                                  with_delta=self.lfb_with_delta))

        self.m_frontend = torch_nn.ModuleList(self.m_frontend)
        self.m_transform = torch_nn.ModuleList(self.m_transform)
        self.m_output_act = torch_nn.ModuleList(self.m_output_act)
        self.m_pooling = torch_nn.ModuleList(self.m_pooling)
        self.m_angle = torch_nn.ModuleList(self.m_angle)

        # done
        return
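In Example 2, the output Linear layer takes (lfb_dim // 16) * 32 * 2 inputs: the feature size left after the four pooling stages, doubled. That factor of 2 is consistent with an attentive pooling layer that concatenates a weighted mean and a weighted standard deviation over the time axis; the sketch below illustrates that idea under this assumption and is not necessarily how nii_nn.SelfWeightedPooling is actually implemented. (In the AMAngleLayer call, s=10 and m=0.35 are the usual AM-softmax scale and additive margin.)

import torch

class SelfWeightedPoolingSketch(torch.nn.Module):
    # Attentive pooling sketch: learn one attention weight per frame,
    # then pool the frames into weighted mean and weighted std vectors.
    def __init__(self, feat_dim):
        super().__init__()
        self.att = torch.nn.Linear(feat_dim, 1)

    def forward(self, x):
        # x: (batch, n_frames, feat_dim)
        w = torch.softmax(self.att(x), dim=1)          # (batch, n_frames, 1)
        mean = torch.sum(w * x, dim=1)                 # weighted mean
        var = torch.sum(w * x * x, dim=1) - mean ** 2  # weighted variance
        std = torch.sqrt(torch.clamp(var, min=1e-8))
        # concatenating mean and std doubles the feature dimension,
        # matching the "* 2" in the Linear layer above
        return torch.cat([mean, std], dim=1)           # (batch, 2 * feat_dim)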
Example 3
    def __init__(self, in_dim, out_dim, args, mean_std=None):
        super(Model, self).__init__()

        ##### required part, no need to change #####

        # mean std of input and output
        in_m, in_s, out_m, out_s = self.prepare_mean_std(
            in_dim, out_dim, args, mean_std)
        self.input_mean = torch_nn.Parameter(in_m, requires_grad=False)
        self.input_std = torch_nn.Parameter(in_s, requires_grad=False)
        self.output_mean = torch_nn.Parameter(out_m, requires_grad=False)
        self.output_std = torch_nn.Parameter(out_s, requires_grad=False)

        # flags for debugging (by default False)
        self.model_debug = False
        self.validation = False
        #####

        # target data
        protocol_file = prj_conf.optional_argument[0]
        self.protocol_parser = protocol_parse(protocol_file)

        # working sampling rate, torchaudio is used to change sampling rate
        self.m_target_sr = 16000

        # re-sampling (optional)
        self.m_resampler = torchaudio.transforms.Resample(
            prj_conf.wav_samp_rate, self.m_target_sr)

        # vad (optional)
        self.m_vad = torchaudio.transforms.Vad(sample_rate=self.m_target_sr)

        # flag for balanced class (temporary use)
        self.v_flag = 1

        # frame shift (number of points)
        self.frame_hops = [160]
        # frame length
        self.frame_lens = [320]
        # FFT length
        self.fft_n = [512]

        # LFCC dim (base component)
        self.lfcc_dim = [20]
        self.lfcc_with_delta = True

        # window type
        self.win = torch.hann_window
        # floor value used when computing the log spectrum amplitude
        self.amp_floor = 0.00001

        # manually keep only the first 750 frames of each trial
        # (10 * 16 * 750 = 120000 samples; 120000 // 160 = 750 frames, 7.5 s)
        self.v_truncate_lens = [10 * 16 * 750 // x for x in self.frame_hops]

        # number of sub-models
        self.v_submodels = len(self.frame_lens)

        # dimension of embedding vectors
        self.v_emd_dim = 1

        # number of output classes
        self.v_out_class = 1

        self.m_transform = []
        self.m_output_act = []
        self.m_frontend = []
        #self.m_a_softmax = []

        for idx, (trunc_len, fft_n, lfcc_dim) in enumerate(
                zip(self.v_truncate_lens, self.fft_n, self.lfcc_dim)):

            fft_n_bins = fft_n // 2 + 1
            if self.lfcc_with_delta:
                lfcc_dim = lfcc_dim * 3

            self.m_transform.append(
                torch_nn.Sequential(
                    torch_nn.Conv2d(1, 64, [5, 5], 1, padding=[2, 2]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 96, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 96, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(48, affine=False),
                    torch_nn.Conv2d(48, 128, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2]),
                    torch_nn.Conv2d(64, 128, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(64, affine=False),
                    torch_nn.Conv2d(64, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [1, 1], 1, padding=[0, 0]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.BatchNorm2d(32, affine=False),
                    torch_nn.Conv2d(32, 64, [3, 3], 1, padding=[1, 1]),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.MaxPool2d([2, 2], [2, 2])))
            self.m_output_act.append(
                torch_nn.Sequential(
                    torch_nn.Dropout(0.7),
                    torch_nn.Linear(
                        (trunc_len // 16) * (lfcc_dim // 16) * 32, 160),
                    nii_nn.MaxFeatureMap2D(),
                    torch_nn.Linear(80, self.v_emd_dim)))

            self.m_frontend.append(
                nii_front_end.LFCC(self.frame_lens[idx],
                                   self.frame_hops[idx],
                                   self.fft_n[idx],
                                   self.m_target_sr,
                                   self.lfcc_dim[idx],
                                   with_energy=True))

            #self.m_a_softmax.append(
            #    nii_a_softmax.AngleLayer(self.v_emd_dim, self.v_out_class)
            #)

        self.m_transform = torch_nn.ModuleList(self.m_transform)
        self.m_output_act = torch_nn.ModuleList(self.m_output_act)
        self.m_frontend = torch_nn.ModuleList(self.m_frontend)
        #self.m_a_softmax = torch_nn.ModuleList(self.m_a_softmax)

        # done
        return
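As a sanity check on the sizes used in Examples 1 and 3: the truncation length comes out to 750 frames, and after the four 2x2 max-pooling stages (the convolutions preserve resolution through their padding) the flattened classifier input is (trunc_len // 16) * (feature_dim // 16) * 32. The snippet below verifies this arithmetic for Example 3's settings; it assumes the pooling layers are the only stages that change the time/frequency resolution.

# Example 3 settings: 16 kHz audio, 160-point frame hop, 20 LFCC + deltas
frame_hop = 160
trunc_len = 10 * 16 * 750 // frame_hop   # 120000 samples / 160 = 750 frames
lfcc_dim = 20 * 3                        # base LFCC + delta + delta-delta

t, f = trunc_len, lfcc_dim
for _ in range(4):                       # four MaxPool2d([2, 2], [2, 2])
    t, f = t // 2, f // 2                # (750, 60) -> ... -> (46, 3)

# repeated floor-halving is the same as one floor division by 16
assert (t, f) == (trunc_len // 16, lfcc_dim // 16)
print(t * f * 32)                        # 4416, the first Linear input size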