def __init__(self, idim, odim, args):
    """Initialize multi-speaker transducer modules.

    Args:
        idim (int): dimension of inputs
        odim (int): dimension of outputs (last index is used as sos/eos ID)
        args (Namespace): argument Namespace containing options

    """
    torch.nn.Module.__init__(self)
    self.mtlalpha = args.mtlalpha
    assert 0.0 <= self.mtlalpha <= 1.0, "mtlalpha should be [0.0, 1.0]"
    self.rnnt_mode = args.rnnt_mode
    self.etype = args.etype
    self.verbose = args.verbose
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.reporter = Reporter()
    self.num_spkrs = args.num_spkrs
    self.spa = args.spa
    self.pit = PIT(self.num_spkrs)

    # below means the last number becomes eos/sos ID
    # note that sos/eos IDs are identical
    self.sos = odim - 1
    self.eos = odim - 1

    # subsample info
    # +1 means input (+1) and layers outputs (args.elayers_sd + args.elayers)
    # NOTE: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the exact equivalent (np.int was an alias for it).
    subsample = np.ones(args.elayers_sd + args.elayers + 1, dtype=int)
    if args.etype.endswith("p") and not args.etype.startswith("vgg"):
        ss = args.subsample.split("_")
        for j in range(min(args.elayers_sd + args.elayers + 1, len(ss))):
            subsample[j] = int(ss[j])
    else:
        logging.warning(
            'Subsampling is not performed for vgg*. It is performed in max pooling layers at CNN.'
        )
    logging.info('subsample: ' + ' '.join([str(x) for x in subsample]))
    self.subsample = subsample

    # label smoothing info
    if args.lsm_type and os.path.isfile(args.train_json):
        logging.info("Use label smoothing with " + args.lsm_type)
        # NOTE(review): labeldist is computed here but never assigned to
        # self or used below in this constructor — confirm whether it
        # should be stored (e.g. self.labeldist) or the computation dropped.
        labeldist = label_smoothing_dist(odim, args.lsm_type, transcript=args.train_json)
    else:
        labeldist = None

    if getattr(args, "use_frontend", False):  # use getattr to keep compatibility
        # Relative importing because of using python3 syntax
        from espnet.nets.pytorch_backend.frontends.feature_transform \
            import feature_transform_for
        from espnet.nets.pytorch_backend.frontends.frontend \
            import frontend_for

        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the frontend replaces the raw input features with n_mels features
        idim = args.n_mels
    else:
        self.frontend = None

    # encoder
    self.enc = encoder_for(args, idim, self.subsample)
    # ctc (reduce=False keeps per-utterance losses for PIT permutation search)
    self.ctc = ctc_for(args, odim, reduce=False)

    if args.rnnt_mode == 'rnnt-att':
        # attention: one module per speaker when speaker-parallel attention
        # (spa) is enabled, otherwise a single shared module
        num_att = self.num_spkrs if args.spa else 1
        self.att = att_for(args, num_att)
        # decoder
        self.dec = decoder_for(args, odim, self.att)
    else:
        # prediction network (attention-free transducer)
        self.dec = decoder_for(args, odim)

    # weight initialization
    self.init_like_chainer()

    # options for beam search during training-time CER/WER reporting
    if 'report_cer' in vars(args) and (args.report_cer or args.report_wer):
        recog_args = {
            'beam_size': args.beam_size,
            'penalty': args.penalty,
            'ctc_weight': args.ctc_weight,
            'maxlenratio': args.maxlenratio,
            'minlenratio': args.minlenratio,
            'lm_weight': args.lm_weight,
            'rnnlm': args.rnnlm,
            'nbest': args.nbest,
            'space': args.sym_space,
            'blank': args.sym_blank
        }
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False
    self.rnnlm = None

    self.logzero = -10000000000.0
    self.loss = None
    self.acc = None
def __init__(self, idim, odim, args):
    """Initialize transducer modules.

    Args:
        idim (int): dimension of inputs
        odim (int): dimension of outputs (last index is used as sos/eos ID)
        args (Namespace): argument Namespace containing options

    """
    super(E2E, self).__init__()
    # NOTE(review): this explicit call is redundant if super().__init__()
    # already reaches torch.nn.Module in the MRO — kept for safety since
    # the full class hierarchy is not visible here.
    torch.nn.Module.__init__(self)
    self.rnnt_mode = args.rnnt_mode
    self.etype = args.etype
    self.verbose = args.verbose
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.space = args.sym_space
    self.blank = args.sym_blank
    self.reporter = Reporter()

    # note that eos is the same as sos (equivalent ID)
    self.sos = odim - 1
    self.eos = odim - 1

    # subsample info
    # +1 means input (+1) and layers outputs (args.elayers)
    # NOTE: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin int is the exact equivalent (np.int was an alias for it).
    subsample = np.ones(args.elayers + 1, dtype=int)
    if args.etype.endswith("p") and not args.etype.startswith("vgg"):
        ss = args.subsample.split("_")
        for j in range(min(args.elayers + 1, len(ss))):
            subsample[j] = int(ss[j])
    else:
        logging.warning(
            'Subsampling is not performed for vgg*. It is performed in max pooling layers at CNN.')
    logging.info('subsample: ' + ' '.join([str(x) for x in subsample]))
    self.subsample = subsample

    # use getattr to keep compatibility with configs predating the
    # frontend option (matches the other constructors in this file)
    if getattr(args, "use_frontend", False):
        # Relative importing because of using python3 syntax
        from espnet.nets.pytorch_backend.frontends.feature_transform \
            import feature_transform_for
        from espnet.nets.pytorch_backend.frontends.frontend \
            import frontend_for

        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(args, (idim - 1) * 2)
        # the frontend replaces the raw input features with n_mels features
        idim = args.n_mels
    else:
        self.frontend = None

    # encoder
    self.enc = encoder_for(args, idim, self.subsample)

    if args.rnnt_mode == 'rnnt-att':
        # attention
        self.att = att_for(args)
        # decoder
        self.dec = decoder_for(args, odim, self.att)
    else:
        # prediction network (attention-free transducer)
        self.dec = decoder_for(args, odim)

    # weight initialization
    self.init_like_chainer()

    # options for beam search during training-time CER/WER reporting
    if 'report_cer' in vars(args) and (args.report_cer or args.report_wer):
        recog_args = {'beam_size': args.beam_size, 'nbest': args.nbest,
                      'space': args.sym_space,
                      'score_norm_transducer': args.score_norm_transducer}
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False

    self.logzero = -10000000000.0
    self.rnnlm = None
    self.loss = None
def __init__(self, idim, odim, args):
    """Initialize transducer modules.

    Args:
        idim (int): dimension of inputs
        odim (int): dimension of outputs (last index is used as sos/eos ID)
        args (Namespace): argument Namespace containing options

    """
    super(E2E, self).__init__()
    # NOTE(review): this explicit call is redundant if super().__init__()
    # already reaches torch.nn.Module in the MRO — kept for safety since
    # the full class hierarchy is not visible here.
    torch.nn.Module.__init__(self)
    self.rnnt_mode = args.rnnt_mode
    self.etype = args.etype
    self.verbose = args.verbose
    self.char_list = args.char_list
    self.outdir = args.outdir
    self.space = args.sym_space
    self.blank = args.sym_blank
    self.reporter = Reporter()
    self.beam_size = args.beam_size

    # note that eos is the same as sos (equivalent ID)
    self.sos = odim - 1
    self.eos = odim - 1

    # subsample info
    self.subsample = get_subsample(args, mode='asr', arch='rnn-t')

    # use getattr to keep compatibility with configs predating the
    # frontend option (matches the other constructors in this file)
    if getattr(args, "use_frontend", False):
        # Relative importing because of using python3 syntax
        from espnet.nets.pytorch_backend.frontends.feature_transform \
            import feature_transform_for
        from espnet.nets.pytorch_backend.frontends.frontend \
            import frontend_for

        self.frontend = frontend_for(args, idim)
        self.feature_transform = feature_transform_for(
            args, (idim - 1) * 2)
        # the frontend replaces the raw input features with n_mels features
        idim = args.n_mels
    else:
        self.frontend = None

    # encoder
    self.enc = encoder_for(args, idim, self.subsample)

    if args.rnnt_mode == 'rnnt-att':
        # attention
        self.att = att_for(args)
        # decoder
        self.dec = decoder_for(args, odim, self.att)
    else:
        # prediction network (attention-free transducer)
        self.dec = decoder_for(args, odim)

    # weight initialization
    self.init_like_chainer()

    # options for beam search during training-time CER/WER reporting
    if 'report_cer' in vars(args) and (args.report_cer or args.report_wer):
        recog_args = {
            'beam_size': args.beam_size,
            'nbest': args.nbest,
            'space': args.sym_space,
            'score_norm_transducer': args.score_norm_transducer
        }
        self.recog_args = argparse.Namespace(**recog_args)
        self.report_cer = args.report_cer
        self.report_wer = args.report_wer
    else:
        self.report_cer = False
        self.report_wer = False

    self.logzero = -10000000000.0
    self.rnnlm = None
    self.loss = None