def build_model(cls, args, task):
    """Build a new SpeechFConv encoder-decoder model instance.

    Parses the conv front-end spec from ``args``, derives the flattened
    post-conv feature size, and assembles the fconv encoder/decoder pair.
    """
    # make sure that all args are properly defaulted (in case there are any new ones)
    base_architecture(args)

    # Optionally preload decoder token embeddings from a file.
    decoder_embed_dict = None
    if args.decoder_embed_path:
        decoder_embed_dict = utils.parse_embedding(args.decoder_embed_path)
        utils.print_embed_overlap(decoder_embed_dict, task.target_dictionary)

    out_channels = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_channels, type=int)
    kernel_sizes = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_kernel_sizes, type=int)
    strides = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_strides, type=int)
    logger.info('input feature dimension: {}, channels: {}'.format(
        task.feat_dim, task.feat_in_channels))
    assert task.feat_dim % task.feat_in_channels == 0

    conv_layers = ConvBNReLU(
        out_channels, kernel_sizes, strides, in_channels=task.feat_in_channels,
    ) if out_channels is not None else None

    fconv_encoder_input_size = task.feat_dim // task.feat_in_channels
    if conv_layers is not None:
        # Each conv stride downsamples the feature (frequency) axis with
        # ceiling division; mirror that here to get the post-conv size,
        # then flatten across the final channel count.
        for stride in strides:
            if isinstance(stride, (list, tuple)):
                assert len(stride) > 0
                s = stride[1] if len(stride) > 1 else stride[0]
            else:
                assert isinstance(stride, int)
                s = stride
            fconv_encoder_input_size = (fconv_encoder_input_size + s - 1) // s
        fconv_encoder_input_size *= out_channels[-1]
    else:
        # FIX: without a conv front-end the encoder consumes the raw
        # features, so the input size is the full feature dimension —
        # consistent with the sibling model builders in this project.
        fconv_encoder_input_size = task.feat_dim

    encoder = SpeechFConvEncoder(
        conv_layers_before=conv_layers,
        input_size=fconv_encoder_input_size,
        embed_dim=args.encoder_embed_dim,
        # NOTE: eval() on a layer-spec string follows the fairseq fconv
        # convention; args come from trusted CLI config, not external input.
        convolutions=eval(args.encoder_layers),
        dropout=args.dropout,
    )
    decoder = SpeechFConvDecoder(
        dictionary=task.target_dictionary,
        embed_dim=args.decoder_embed_dim,
        embed_dict=decoder_embed_dict,
        convolutions=eval(args.decoder_layers),
        out_embed_dim=args.decoder_out_embed_dim,
        attention=eval(args.decoder_attention),
        dropout=args.dropout,
        max_positions=args.max_target_positions,
        share_embed=args.share_input_output_embed,
        positional_embeddings=args.decoder_positional_embed,
    )
    return cls(encoder, decoder)
def build_model(cls, args, task):
    """Build a new model instance."""
    # Give every (possibly newly introduced) argument its default value.
    base_architecture(args)

    max_source_positions = getattr(
        args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
    )

    conv_channels = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_channels, type=int)
    conv_kernel_sizes = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_kernel_sizes, type=int)
    conv_strides = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_strides, type=int)
    logger.info("input feature dimension: {}, channels: {}".format(
        task.feat_dim, task.feat_in_channels))
    assert task.feat_dim % task.feat_in_channels == 0

    if conv_channels is None:
        # No conv front-end: the RNN sees the raw feature dimension.
        conv_layers = None
        encoder_input_size = task.feat_dim
    else:
        conv_layers = ConvBNReLU(
            conv_channels, conv_kernel_sizes, conv_strides,
            in_channels=task.feat_in_channels,
        )
        # Each stride ceil-divides the per-channel feature dimension;
        # the final size is flattened across the last channel count.
        encoder_input_size = task.feat_dim // task.feat_in_channels
        for stride in conv_strides:
            if isinstance(stride, (list, tuple)):
                assert len(stride) > 0
                step = stride[1] if len(stride) > 1 else stride[0]
            else:
                assert isinstance(stride, int)
                step = stride
            encoder_input_size = (encoder_input_size + step - 1) // step
        encoder_input_size *= conv_channels[-1]

    encoder = SpeechChunkLSTMEncoder(
        conv_layers_before=conv_layers,
        input_size=encoder_input_size,
        hidden_size=args.encoder_rnn_hidden_size,
        num_layers=args.encoder_rnn_layers,
        dropout_in=args.encoder_rnn_dropout_in,
        dropout_out=args.encoder_rnn_dropout_out,
        bidirectional=args.encoder_rnn_bidirectional,
        residual=args.encoder_rnn_residual,
        src_bucketed=(getattr(task.args, "num_batch_buckets", 0) > 0),
        # targets for encoder-only model
        num_targets=getattr(task, "num_targets", None),
        chunk_width=getattr(task, "chunk_width", None),
        chunk_left_context=getattr(task, "chunk_left_context", 0),
        training_stage=getattr(task, "training_stage", True),
        max_source_positions=max_source_positions,
    )
    return cls(encoder, state_prior=getattr(task, "initial_state_prior", None))
def build_model(cls, args, task):
    """Build a new model instance."""
    # make sure all arguments are present in older models
    base_architecture(args)

    if args.encoder_layers_to_keep:
        args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
    if args.decoder_layers_to_keep:
        args.decoder_layers = len(args.decoder_layers_to_keep.split(","))
    if getattr(args, "max_source_positions", None) is None:
        args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
    if getattr(args, "max_target_positions", None) is None:
        args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS

    tgt_dict = task.target_dictionary
    decoder_embed_tokens = cls.build_embedding(
        args, tgt_dict, args.decoder_input_dim, args.decoder_embed_path)

    conv_channels = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_channels, type=int)
    conv_kernel_sizes = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_kernel_sizes, type=int)
    conv_strides = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_strides, type=int)
    logger.info("input feature dimension: {}, channels: {}".format(
        task.feat_dim, task.feat_in_channels))
    assert task.feat_dim % task.feat_in_channels == 0

    if conv_channels is None:
        # No conv front-end: the transformer sees the raw feature dimension.
        conv_layers = None
        encoder_input_size = task.feat_dim
    else:
        conv_layers = ConvBNReLU(
            conv_channels, conv_kernel_sizes, conv_strides,
            in_channels=task.feat_in_channels,
        )
        # Each stride ceil-divides the per-channel feature dimension;
        # flatten across the last channel count afterwards.
        encoder_input_size = task.feat_dim // task.feat_in_channels
        for stride in conv_strides:
            if isinstance(stride, (list, tuple)):
                assert len(stride) > 0
                step = stride[1] if len(stride) > 1 else stride[0]
            else:
                assert isinstance(stride, int)
                step = stride
            encoder_input_size = (encoder_input_size + step - 1) // step
        encoder_input_size *= conv_channels[-1]

    # Optional (left, right) attention context limits for the encoder.
    transformer_context = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_transformer_context, type=int,
    )
    if transformer_context is not None:
        assert len(transformer_context) == 2
        for ctx in transformer_context:
            assert ctx is None or (isinstance(ctx, int) and ctx >= 0)

    scheduled_sampling_rate_scheduler = ScheduledSamplingRateScheduler(
        args.scheduled_sampling_probs, args.start_scheduled_sampling_epoch,
    )

    encoder = cls.build_encoder(
        args, conv_layers_before=conv_layers,
        input_size=encoder_input_size,
        transformer_context=transformer_context,
    )
    decoder = cls.build_decoder(
        args, tgt_dict, decoder_embed_tokens,
        scheduled_sampling_rate_scheduler=scheduled_sampling_rate_scheduler,
    )
    return cls(args, encoder, decoder)
def build_model(cls, args, task):
    """Build a new model instance."""
    # Give every (possibly newly introduced) argument its default value.
    base_architecture(args)

    if args.encoder_layers_to_keep:
        args.encoder_layers = len(args.encoder_layers_to_keep.split(","))
    if getattr(args, "max_source_positions", None) is None:
        args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS

    conv_channels = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_channels, type=int)
    conv_kernel_sizes = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_kernel_sizes, type=int)
    conv_strides = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_strides, type=int)
    logger.info("input feature dimension: {}, channels: {}".format(
        task.feat_dim, task.feat_in_channels))
    assert task.feat_dim % task.feat_in_channels == 0

    if conv_channels is None:
        # No conv front-end: the transformer sees the raw feature dimension.
        conv_layers = None
        encoder_input_size = task.feat_dim
    else:
        conv_layers = ConvBNReLU(
            conv_channels, conv_kernel_sizes, conv_strides,
            in_channels=task.feat_in_channels,
        )
        # Each stride ceil-divides the per-channel feature dimension;
        # flatten across the last channel count afterwards.
        encoder_input_size = task.feat_dim // task.feat_in_channels
        for stride in conv_strides:
            if isinstance(stride, (list, tuple)):
                assert len(stride) > 0
                step = stride[1] if len(stride) > 1 else stride[0]
            else:
                assert isinstance(stride, int)
                step = stride
            encoder_input_size = (encoder_input_size + step - 1) // step
        encoder_input_size *= conv_channels[-1]

    # Optional (left, right) attention context limits for the encoder.
    transformer_context = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_transformer_context, type=int,
    )
    if transformer_context is not None:
        assert len(transformer_context) == 2
        for ctx in transformer_context:
            assert ctx is None or (isinstance(ctx, int) and ctx >= 0)

    encoder = cls.build_encoder(
        args, conv_layers_before=conv_layers,
        input_size=encoder_input_size,
        transformer_context=transformer_context,
        # targets for encoder-only model
        num_targets=getattr(task, "num_targets", None),
        chunk_width=getattr(task, "chunk_width", None),
        chunk_left_context=getattr(task, "chunk_left_context", 0),
        training_stage=getattr(task, "training_stage", True),
    )
    return cls(args, encoder, state_prior=getattr(task, "initial_state_prior", None))
def build_model(cls, args, task):
    """Build a new transformer encoder-decoder model instance.

    Parses the conv front-end spec from ``args``, derives the flattened
    post-conv feature size, builds the decoder token embeddings, and
    assembles the encoder/decoder pair.
    """
    # make sure all arguments are present in older models
    base_architecture(args)

    if not hasattr(args, 'max_source_positions'):
        args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
    if not hasattr(args, 'max_target_positions'):
        args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS

    # FIX: renamed from `dict`, which shadowed the builtin type.
    tgt_dict = task.target_dictionary

    def build_embedding(dictionary, embed_dim, path=None):
        # Create decoder token embeddings, optionally preloading them
        # from a pretrained embedding file.
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        emb = Embedding(num_embeddings, embed_dim, padding_idx)
        # if provided, load from preloaded dictionaries
        if path:
            embed_dict = utils.parse_embedding(path)
            utils.load_embedding(embed_dict, dictionary, emb)
        return emb

    decoder_embed_tokens = build_embedding(
        tgt_dict, args.decoder_embed_dim, args.decoder_embed_path)

    out_channels = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_channels, type=int)
    kernel_sizes = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_kernel_sizes, type=int)
    strides = speech_utils.eval_str_nested_list_or_tuple(
        args.encoder_conv_strides, type=int)
    logger.info('input feature dimension: {}, channels: {}'.format(
        task.feat_dim, task.feat_in_channels))
    assert task.feat_dim % task.feat_in_channels == 0

    conv_layers = ConvBNReLU(
        out_channels, kernel_sizes, strides, in_channels=task.feat_in_channels,
    ) if out_channels is not None else None

    transformer_encoder_input_size = task.feat_dim // task.feat_in_channels
    if conv_layers is not None:
        # Each conv stride downsamples the feature (frequency) axis with
        # ceiling division; the result is flattened across the final
        # channel count.
        for stride in strides:
            if isinstance(stride, (list, tuple)):
                assert len(stride) > 0
                s = stride[1] if len(stride) > 1 else stride[0]
            else:
                assert isinstance(stride, int)
                s = stride
            transformer_encoder_input_size = \
                (transformer_encoder_input_size + s - 1) // s
        transformer_encoder_input_size *= out_channels[-1]
    else:
        # FIX: without a conv front-end the transformer consumes the raw
        # features, so the input size is the full feature dimension —
        # consistent with the sibling model builders in this project.
        transformer_encoder_input_size = task.feat_dim

    encoder = cls.build_encoder(
        args, conv_layers_before=conv_layers,
        input_size=transformer_encoder_input_size,
    )
    decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
    return cls(encoder, decoder)