def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    # set language pair
    args.source_lang = "char"
    args.target_lang = "label"
    # load dictionaries
    src_dict = cls.load_dictionary(
        os.path.join(paths[0], "dict.{}.txt".format(args.source_lang))
    )
    tgt_dict = cls.load_dictionary(
        os.path.join(paths[0], "dict.{}.txt".format(args.target_lang))
    )
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    logger.info("[{}] dictionary: {} types".format(args.source_lang, len(src_dict)))
    logger.info("[{}] dictionary: {} types".format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)

def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument("--dropout", type=float, metavar="D",
                        help="dropout probability")
    parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR",
                        help="list of encoder convolution's out channels")
    parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR",
                        help="list of encoder convolution's kernel sizes")
    parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR",
                        help="list of encoder convolution's strides")
    parser.add_argument("--encoder-rnn-hidden-size", type=int, metavar="N",
                        help="encoder rnn's hidden size")
    parser.add_argument("--encoder-rnn-layers", type=int, metavar="N",
                        help="number of rnn encoder layers")
    parser.add_argument("--encoder-rnn-bidirectional",
                        type=lambda x: utils.eval_bool(x),
                        help="make all rnn layers of encoder bidirectional")
    parser.add_argument("--encoder-rnn-residual",
                        type=lambda x: utils.eval_bool(x),
                        help="create residual connections for rnn encoder "
                             "layers (starting from the 2nd layer), i.e., the actual "
                             "output of such layer is the sum of its input and output")
    parser.add_argument("--encoder-multilayer-rnn-as-single-module",
                        type=lambda x: utils.eval_bool(x),
                        help="if True use a single nn.Module.LSTM for multilayer LSTMs "
                             "(faster and may fix a possible cuDNN error); otherwise use "
                             "nn.ModuleList (for back-compatibility). Note: if True then "
                             "encoder_rnn_residual is set to False")
    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument("--encoder-rnn-dropout-in", type=float, metavar="D",
                        help="dropout probability for encoder rnn's input")
    parser.add_argument("--encoder-rnn-dropout-out", type=float, metavar="D",
                        help="dropout probability for encoder rnn's output")

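# A self-contained sketch of the "--flag True/False" parsing pattern used in the
# add_args() functions above. eval_bool() below is a simplified stand-in for
# fairseq's utils.eval_bool (an assumption, not the original implementation); the
# point is only that the command-line string "True"/"False" becomes a Python bool.
import argparse


def eval_bool(x, default=False):
    # stand-in: map the string to a bool, falling back to `default` on bad input
    try:
        return bool(eval(x))
    except (NameError, SyntaxError, TypeError):
        return default


_parser = argparse.ArgumentParser()
_parser.add_argument("--encoder-rnn-bidirectional", type=lambda x: eval_bool(x))
_args = _parser.parse_args(["--encoder-rnn-bidirectional", "True"])
print(_args.encoder_rnn_bidirectional)  # True
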
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            "Could not infer language pair, please provide it explicitly"
        )
    dictionary = cls.load_dictionary(os.path.join(paths[0], "dict.txt"))
    # langs: "en-zh,my-en"
    logger.info("args.add_lang_token: {}".format(args.add_lang_token))
    if args.add_lang_token and len(args.langs) > 0:
        languages = args.langs.split(",")
        for lang_pair in languages:
            if lang_pair == "-":
                continue
            logger.info("{} was added to the dictionary".format(lang_pair))
            lang = lang_pair.split("-")
            dictionary.add_symbol("[{}]".format(lang[0]))
            dictionary.add_symbol("[{}]".format(lang[1]))
    return cls(args, dictionary, dictionary)

def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # load dictionaries
    src_dict = cls.load_dictionary(
        os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = cls.load_dictionary(
        os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))
    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    logger.info('[{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    logger.info('[{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)

class LSTMLanguageModelEspressoConfig(FairseqDataclass):
    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
    decoder_embed_dim: int = field(
        default=48, metadata={"help": "decoder embedding dimension"}
    )
    decoder_embed_path: Optional[str] = field(
        default=None, metadata={"help": "path to pre-trained decoder embedding"}
    )
    decoder_freeze_embed: bool = field(
        default=False, metadata={"help": "freeze decoder embeddings"}
    )
    decoder_hidden_size: int = field(
        default=650, metadata={"help": "decoder hidden size"}
    )
    decoder_layers: int = field(
        default=2, metadata={"help": "number of decoder layers"}
    )
    decoder_out_embed_dim: int = field(
        default=650, metadata={"help": "decoder output embedding dimension"}
    )
    # note: the type annotation must be a real type (bool), not a parsing lambda
    decoder_rnn_residual: bool = field(
        default=False,
        metadata={
            "help": "create residual connections for rnn decoder layers "
            "(starting from the 2nd layer), i.e., the actual output of such "
            "layer is the sum of its input and output"
        },
    )
    adaptive_softmax_cutoff: Optional[str] = field(
        default=None,
        metadata={
            "help": "comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion"
        },
    )
    share_embed: bool = field(
        default=False, metadata={"help": "share input and output embeddings"}
    )
    is_wordlm: bool = field(
        default=False,
        metadata={
            "help": "whether it is word LM or subword LM. Only relevant for ASR decoding "
            "with LM, and it determines how the underlying decoder instance gets the "
            "dictionary from the task instance when calling cls.build_model()"
        },
    )
    decoder_dropout_in: float = field(
        default=0.1, metadata={"help": "dropout probability for decoder input embedding"}
    )
    decoder_dropout_out: float = field(
        default=0.1, metadata={"help": "dropout probability for decoder output"}
    )
    # TODO: common vars; add to parent
    add_bos_token: bool = II("task.add_bos_token")
    tokens_per_sample: int = II("task.tokens_per_sample")
    max_target_positions: Optional[int] = II("task.max_target_positions")
    tpu: bool = II("params.common.tpu")

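# The II(...) fields above are OmegaConf interpolations: the value is pulled from
# another part of the resolved config (e.g. the task section) at access time.
# A minimal sketch of that mechanism, assuming only that omegaconf is installed;
# the config keys below are illustrative stand-ins, not the original config tree.
from omegaconf import II, OmegaConf

_cfg = OmegaConf.create(
    {
        "task": {"add_bos_token": True},
        "model": {"add_bos_token": II("task.add_bos_token")},  # "${task.add_bos_token}"
    }
)
print(_cfg.model.add_bos_token)  # True, resolved from _cfg.task.add_bos_token
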
def prepare(cls, load_dictionary, args, **kargs):
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    if not hasattr(args, "shuffle_instance"):
        args.shuffle_instance = False
    if args.langtoks is None:
        args.langtoks = {}
    if "main" not in args.langtoks:
        src_langtok_spec = args.encoder_langtok if args.encoder_langtok else None
        tgt_langtok_spec = "tgt" if args.decoder_langtok else None
        args.langtoks["main"] = (src_langtok_spec, tgt_langtok_spec)

    def check_langs(langs, pairs):
        messages = []
        for src, tgt in pairs:
            if src not in langs or tgt not in langs:
                messages.append(
                    f"language pair {src}-{tgt} contains languages "
                    "that are not in the language dictionary"
                )
        if len(messages) > 0:
            raise ValueError(" ".join(messages) + f"; langs: {langs}")

    if args.lang_pairs is None:
        raise ValueError(
            "--lang-pairs is required. List all the language pairs in the training objective."
        )
    if isinstance(args.lang_pairs, str):
        args.lang_pairs = args.lang_pairs.split(",")
    if args.source_lang is not None or args.target_lang is not None:
        training = False
    else:
        training = True
    language_list = cls.load_langs(args, **kargs)
    check_langs(
        language_list,
        (
            [p.split("-") for p in args.lang_pairs]
            if training
            else [(args.source_lang, args.target_lang)]
        ),
    )

    def load_dictionary_and_postproc(path):
        d = load_dictionary(path)
        augment_dictionary(
            dictionary=d,
            language_list=language_list,
            lang_tok_style=args.lang_tok_style,
            langtoks_specs=args.langtoks_specs,
            extra_data=args.extra_data,
        )
        return d

    dicts = cls.load_all_dictionaries(
        args, language_list, load_dictionary_and_postproc, training
    )
    return language_list, dicts, training

def update_args(cls, args):
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    if args.lang_pairs is None:
        raise ValueError(
            "--lang-pairs is required. List all the language pairs in the training objective."
        )
    if isinstance(args.lang_pairs, str):
        args.lang_pairs = args.lang_pairs.split(",")

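# Hypothetical usage of the update_args() classmethod above; the task class name
# and the field values are illustrative, not taken from the original code. Given
# an argparse namespace with string-valued flags, it normalizes the fields in place:
#
#   args = argparse.Namespace(left_pad_source="True", left_pad_target="False",
#                             lang_pairs="en-de,en-fr")
#   SomeMultilingualTask.update_args(args)
#   # args.left_pad_source is True, args.lang_pairs == ["en-de", "en-fr"]
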
def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument("--dropout", type=float, metavar="D",
                        help="dropout probability")
    parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR",
                        help="list of encoder convolution's out channels")
    parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR",
                        help="list of encoder convolution's kernel sizes")
    parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR",
                        help="list of encoder convolution's strides")
    parser.add_argument("--encoder-rnn-hidden-size", type=int, metavar="N",
                        help="encoder rnn's hidden size")
    parser.add_argument("--encoder-rnn-layers", type=int, metavar="N",
                        help="number of rnn encoder layers")
    parser.add_argument("--encoder-rnn-bidirectional",
                        type=lambda x: utils.eval_bool(x),
                        help="make all rnn layers of encoder bidirectional")
    parser.add_argument("--encoder-rnn-residual",
                        type=lambda x: utils.eval_bool(x),
                        help="create residual connections for rnn encoder "
                             "layers (starting from the 2nd layer), i.e., the actual "
                             "output of such layer is the sum of its input and output")
    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument("--encoder-rnn-dropout-in", type=float, metavar="D",
                        help="dropout probability for encoder rnn's input")
    parser.add_argument("--encoder-rnn-dropout-out", type=float, metavar="D",
                        help="dropout probability for encoder rnn's output")

def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument("--dropout", type=float, metavar="D",
                        help="dropout probability")
    parser.add_argument("--decoder-embed-dim", type=int, metavar="N",
                        help="decoder embedding dimension")
    parser.add_argument("--decoder-embed-path", type=str, metavar="STR",
                        help="path to pre-trained decoder embedding")
    parser.add_argument("--decoder-freeze-embed", action="store_true",
                        help="freeze decoder embeddings")
    parser.add_argument("--decoder-hidden-size", type=int, metavar="N",
                        help="decoder hidden size")
    parser.add_argument("--decoder-layers", type=int, metavar="N",
                        help="number of decoder layers")
    parser.add_argument("--decoder-out-embed-dim", type=int, metavar="N",
                        help="decoder output embedding dimension")
    parser.add_argument("--adaptive-softmax-cutoff", metavar="EXPR",
                        help="comma separated list of adaptive softmax cutoff points. "
                             "Must be used with adaptive_loss criterion")
    parser.add_argument("--share-embed",
                        type=lambda x: utils.eval_bool(x),
                        help="share input and output embeddings")
    parser.add_argument("--is-wordlm", action="store_true",
                        help="whether it is word LM or subword LM. Only "
                             "relevant for ASR decoding with LM, and it determines "
                             "how the underlying decoder instance gets the dictionary "
                             "from the task instance when calling cls.build_model()")
    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument("--decoder-dropout-in", type=float, metavar="D",
                        help="dropout probability for decoder input embedding")
    parser.add_argument("--decoder-dropout-out", type=float, metavar="D",
                        help="dropout probability for decoder output")

def prepare(cls, args, **kargs):
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    if args.lang_pairs is None:
        raise ValueError(
            '--lang-pairs is required. List all the language pairs in the training objective.'
        )
    if isinstance(args.lang_pairs, str):
        args.lang_pairs = args.lang_pairs.split(',')
    sorted_langs = sorted(
        list({x for lang_pair in args.lang_pairs for x in lang_pair.split('-')})
    )
    if args.source_lang is not None or args.target_lang is not None:
        training = False
    else:
        training = True
    # load dictionaries
    dicts = OrderedDict()
    for lang in sorted_langs:
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        dicts[lang] = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(lang)))
        if len(dicts) > 0:
            assert dicts[lang].pad() == dicts[sorted_langs[0]].pad()
            assert dicts[lang].eos() == dicts[sorted_langs[0]].eos()
            assert dicts[lang].unk() == dicts[sorted_langs[0]].unk()
        if args.encoder_langtok is not None or args.decoder_langtok:
            for lang_to_add in sorted_langs:
                dicts[lang].add_symbol(_lang_token(lang_to_add))
        logger.info('[{}] dictionary: {} types'.format(lang, len(dicts[lang])))
    return dicts, training

def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    # get padding settings
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
    # load the shared dictionary from <data_dir>/Dicts/dict.txt
    dict_path = os.path.join(paths[0], "Dicts", "dict.txt")
    logger.info("loading dictionary from {}".format(dict_path))
    dictionary = cls.load_dictionary(dict_path)
    return cls(args, dictionary, paths)

def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument("--dropout", type=float, metavar="D",
                        help="dropout probability")
    parser.add_argument("--hidden-sizes", type=str, metavar="EXPR",
                        help="list of hidden sizes for all Tdnn layers")
    parser.add_argument("--kernel-sizes", type=str, metavar="EXPR",
                        help="list of all Tdnn layers' kernel sizes")
    parser.add_argument("--strides", type=str, metavar="EXPR",
                        help="list of all Tdnn layers' strides")
    parser.add_argument("--dilations", type=str, metavar="EXPR",
                        help="list of all Tdnn layers' dilations")
    parser.add_argument("--num-layers", type=int, metavar="N",
                        help="number of Tdnn layers")
    parser.add_argument("--residual",
                        type=lambda x: utils.eval_bool(x),
                        help="create residual connections for encoder "
                             "layers (starting from the 2nd layer), i.e., the actual "
                             "output of such layer is the sum of its input and output")
    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument("--dropout-in", type=float, metavar="D",
                        help="dropout probability for encoder's input")
    parser.add_argument("--dropout-out", type=float, metavar="D",
                        help="dropout probability for Tdnn layers' output")

def prepare(cls, load_dictionary, args, **kargs):
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    if not hasattr(args, "shuffle_instance"):
        args.shuffle_instance = False
    if args.langtoks is None:
        args.langtoks = {}
    if "main" not in args.langtoks:
        src_langtok_spec = args.encoder_langtok if args.encoder_langtok else None
        tgt_langtok_spec = "tgt" if args.decoder_langtok else None
        args.langtoks["main"] = (src_langtok_spec, tgt_langtok_spec)

    def check_langs(langs, pairs):
        messages = []
        for src, tgt in pairs:
            if src not in langs or tgt not in langs:
                messages.append(
                    f"language pair {src}-{tgt} contains languages "
                    "that are not in the language dictionary"
                )
        if len(messages) > 0:
            raise ValueError(" ".join(messages) + f"; langs: {langs}")

    if args.lang_pairs is None:
        raise ValueError(
            "--lang-pairs is required. List all the language pairs in the training objective."
        )
    if isinstance(args.lang_pairs, str):
        args.lang_pairs = args.lang_pairs.split(",")
    if args.source_lang is not None or args.target_lang is not None:
        training = False
    else:
        training = True
    language_list = cls.load_langs(args, **kargs)
    check_langs(
        language_list,
        (
            [p.split("-") for p in args.lang_pairs]
            if training
            else [(args.source_lang, args.target_lang)]
        ),
    )

    # load dictionaries
    if training:
        extra_lang_pairs = (
            list(
                {p for _, v in args.extra_lang_pairs.items() for p in v.split(",")}
            )
            if args.extra_lang_pairs
            else []
        )
        langs_to_load_dicts = sorted(
            {x for p in args.lang_pairs + extra_lang_pairs for x in p.split("-")}
        )
    else:
        langs_to_load_dicts = sorted([args.source_lang, args.target_lang])

    dicts = OrderedDict()
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    for lang in langs_to_load_dicts:
        dicts[lang] = load_dictionary(
            os.path.join(paths[0], "dict.{}.txt".format(lang))
        )
        augment_dictionary(
            dictionary=dicts[lang],
            language_list=language_list,
            lang_tok_style=args.lang_tok_style,
            langtoks_specs=args.langtoks_specs,
            extra_data=args.extra_data,
        )
        if len(dicts) > 0:
            assert dicts[lang].pad() == dicts[langs_to_load_dicts[0]].pad()
            assert dicts[lang].eos() == dicts[langs_to_load_dicts[0]].eos()
            assert dicts[lang].unk() == dicts[langs_to_load_dicts[0]].unk()
        logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang])))
    return language_list, dicts, training

def build_model(cls, args, task):
    """Build a new model instance."""
    # make sure that all args are properly defaulted (in case there are any new ones)
    base_architecture(args)

    if args.encoder_layers != args.decoder_layers:
        raise ValueError("--encoder-layers must match --decoder-layers")

    max_source_positions = getattr(
        args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
    )
    max_target_positions = getattr(
        args, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
    )

    def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
        embed_dict = utils.parse_embedding(embed_path)
        utils.print_embed_overlap(embed_dict, dictionary)
        return utils.load_embedding(embed_dict, dictionary, embed_tokens)

    if args.encoder_embed_path:
        pretrained_encoder_embed = load_pretrained_embedding_from_file(
            args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim
        )
    else:
        num_embeddings = len(task.source_dictionary)
        pretrained_encoder_embed = Embedding(
            num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
        )

    if args.share_all_embeddings:
        # double check all parameters combinations are valid
        if task.source_dictionary != task.target_dictionary:
            raise ValueError("--share-all-embeddings requires a joint dictionary")
        if args.decoder_embed_path and (
            args.decoder_embed_path != args.encoder_embed_path
        ):
            raise ValueError(
                "--share-all-embed not compatible with --decoder-embed-path"
            )
        if args.encoder_embed_dim != args.decoder_embed_dim:
            raise ValueError(
                "--share-all-embeddings requires --encoder-embed-dim to "
                "match --decoder-embed-dim"
            )
        pretrained_decoder_embed = pretrained_encoder_embed
        args.share_decoder_input_output_embed = True
    else:
        # separate decoder input embeddings
        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path,
                task.target_dictionary,
                args.decoder_embed_dim,
            )
    # one last double check of parameter combinations
    if args.share_decoder_input_output_embed and (
        args.decoder_embed_dim != args.decoder_out_embed_dim
    ):
        raise ValueError(
            "--share-decoder-input-output-embeddings requires "
            "--decoder-embed-dim to match --decoder-out-embed-dim"
        )

    if args.encoder_freeze_embed:
        pretrained_encoder_embed.weight.requires_grad = False
    if args.decoder_freeze_embed:
        pretrained_decoder_embed.weight.requires_grad = False

    encoder = LSTMEncoder(
        dictionary=task.source_dictionary,
        embed_dim=args.encoder_embed_dim,
        hidden_size=args.encoder_hidden_size,
        num_layers=args.encoder_layers,
        dropout_in=args.encoder_dropout_in,
        dropout_out=args.encoder_dropout_out,
        bidirectional=args.encoder_bidirectional,
        pretrained_embed=pretrained_encoder_embed,
        max_source_positions=max_source_positions,
    )
    decoder = LSTMDecoder(
        dictionary=task.target_dictionary,
        embed_dim=args.decoder_embed_dim,
        hidden_size=args.decoder_hidden_size,
        out_embed_dim=args.decoder_out_embed_dim,
        num_layers=args.decoder_layers,
        dropout_in=args.decoder_dropout_in,
        dropout_out=args.decoder_dropout_out,
        attention=utils.eval_bool(args.decoder_attention),
        encoder_output_units=encoder.output_units,
        pretrained_embed=pretrained_decoder_embed,
        share_input_output_embed=args.share_decoder_input_output_embed,
        adaptive_softmax_cutoff=(
            utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
            if args.criterion == "adaptive_loss"
            else None
        ),
        max_target_positions=max_target_positions,
        residuals=False,
    )
    return cls(encoder, decoder)

def load_dataset_only(self, split, lang_pairs, do_mask=True, epoch=1, combine=False):
    paths = utils.split_paths(self.args.data)
    assert len(paths) > 0
    data_path = paths[(epoch - 1) % len(paths)]

    # TODO: unk token will be considered as first word too, though it might be an
    # unknown phoneme within a word
    # get_whole_word_mask returns a tensor (size V by 1) to indicate if a token is
    # a word start token
    mask_whole_src_words = gen_whole_word_mask(self.args, self.src_dict)
    language_without_segmentations = self.args.no_whole_word_mask_langs.split(",")
    lang_datasets = []
    eos_bos = []
    lang_pairs = lang_pairs.split(",") if lang_pairs != "" else []
    assert len(lang_pairs) > 0
    for lp in lang_pairs:
        src, tgt = lp.split("-")
        lang_mask_whole_src_words = (
            mask_whole_src_words
            if src not in language_without_segmentations
            else None
        )
        end_token = (
            self.source_dictionary.index(
                PairedDenoisingTask.LANG_TAG_TEMPLATE.format(src)
            )
            if self.args.add_src_lang_token
            else None
        )
        bos_token = (
            self.target_dictionary.index(
                PairedDenoisingTask.LANG_TAG_TEMPLATE.format(tgt)
            )
            if self.args.add_tgt_lang_token
            else None
        )
        src_lang_id = None

        if self.args.add_src_lang_token or self.args.add_tgt_lang_token:
            eos_bos.append((end_token, bos_token))

        dataset = PairedDenoisingTask.language_pair_denoising_dataset(
            data_path,
            do_mask,
            split,
            src,
            self.source_dictionary,
            tgt,
            self.target_dictionary,
            self.mask_idx,
            lang_mask_whole_src_words,
            self.args.seed,
            self.args,
            self.args.dataset_impl,
            combine=combine,
            left_pad_source=utils.eval_bool(self.args.left_pad_source),
            left_pad_target=utils.eval_bool(self.args.left_pad_target),
            max_source_positions=self.args.max_source_positions,
            max_target_positions=self.args.max_target_positions,
            src_lang_id=src_lang_id,
        )
        lang_datasets.append(dataset)

    if len(lang_datasets) == 0:
        return
    elif len(lang_datasets) == 1:
        dataset = lang_datasets[0]
        if self.args.add_src_lang_token or self.args.add_tgt_lang_token:
            end_token, bos_token = eos_bos[0]
            dataset = TransformEosLangPairDataset(
                dataset,
                src_eos=self.source_dictionary.eos(),
                new_src_eos=end_token,
                tgt_bos=self.target_dictionary.eos(),
                new_tgt_bos=bos_token,
            )
    else:
        end_tokens = [item[0] for item in eos_bos if item[0] is not None]
        bos_tokens = [item[1] for item in eos_bos if item[1] is not None]
        lang_datasets = self.resample_datasets(lang_datasets, lang_pairs, epoch)
        dataset = TransformEosConcatLangPairDataset(
            lang_datasets,
            self.source_dictionary.eos(),
            self.target_dictionary.eos(),
            new_src_eos=end_tokens,
            new_tgt_bos=bos_tokens,
        )
    return dataset

def prepare(cls, load_dictionary, args, **kargs):
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    if not hasattr(args, "shuffle_instance"):
        args.shuffle_instance = False
    if args.langtoks is None:
        args.langtoks = {}
    if "main" not in args.langtoks:
        src_langtok_spec = args.encoder_langtok if args.encoder_langtok else None
        tgt_langtok_spec = "tgt" if args.decoder_langtok else None
        args.langtoks["main"] = (src_langtok_spec, tgt_langtok_spec)

    def check_langs(langs, pairs):
        messages = []
        for src, tgt in pairs:
            if src not in langs or tgt not in langs:
                messages.append(
                    f"language pair {src}-{tgt} contains languages "
                    "that are not in the language dictionary"
                )
        if len(messages) > 0:
            raise ValueError(" ".join(messages) + f"; langs: {langs}")

    if args.lang_pairs is None:
        raise ValueError(
            "--lang-pairs is required. List all the language pairs in the training objective."
        )
    if isinstance(args.lang_pairs, str):
        args.lang_pairs = args.lang_pairs.split(",")
    if args.source_lang is not None or args.target_lang is not None:
        training = False
    else:
        training = True
    sorted_langs = cls.load_langs(args, **kargs)
    check_langs(
        sorted_langs,
        (
            [p.split("-") for p in args.lang_pairs]
            if training
            else [(args.source_lang, args.target_lang)]
        ),
    )

    # load dictionaries
    if training:
        extra_lang_pairs = (
            list(
                {p for _, v in args.extra_lang_pairs.items() for p in v.split(",")}
            )
            if args.extra_lang_pairs
            else []
        )
        langs_to_load_dicts = sorted(
            {x for p in args.lang_pairs + extra_lang_pairs for x in p.split("-")}
        )
    else:
        langs_to_load_dicts = sorted([args.source_lang, args.target_lang])

    dicts = OrderedDict()
    supported_langtok_specs = args.langtoks_specs
    for lang in langs_to_load_dicts:
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        dicts[lang] = load_dictionary(
            os.path.join(paths[0], "dict.{}.txt".format(lang))
        )
        if len(dicts) > 0:
            assert dicts[lang].pad() == dicts[langs_to_load_dicts[0]].pad()
            assert dicts[lang].eos() == dicts[langs_to_load_dicts[0]].eos()
            assert dicts[lang].unk() == dicts[langs_to_load_dicts[0]].unk()
        # keep the langs consistent for all experiments with the same lang dict;
        # for finetuning, just add the tokens to the dicts regardless of whether
        # lang_tok is required or not
        for spec in supported_langtok_specs:
            for lang_to_add in sorted_langs:
                dicts[lang].add_symbol(
                    MultilingualDatasetManager.get_lang_tok(lang_to_add, args, spec)
                )
        if args.lang_tok_style == "mbart" or (
            args.extra_data and "mono_dae" in args.extra_data
        ):
            dicts[lang].add_symbol("<mask>")
        logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang])))
    return sorted_langs, dicts, training

def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = utils.eval_bool(args.left_pad_source)
    args.left_pad_target = utils.eval_bool(args.left_pad_target)
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    # find language pair automatically
    if args.source_lang is None or args.target_lang is None:
        args.source_lang, args.target_lang = data_utils.infer_language_pair(paths[0])
    if args.source_lang is None or args.target_lang is None:
        raise Exception(
            'Could not infer language pair, please provide it explicitly')
    # load dictionaries; when using a BERT model, load the target dictionary first
    # so its custom special-symbol ids can be propagated to the source dictionary
    tgt_first = bool(args.use_bert_model)
    if tgt_first:
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)),
            custom_bos=args.bos,
            custom_pad=args.pad,
            custom_eos=args.eos,
            custom_unk=args.unk,
            add_sentence_limit_words_after=True)
        bos_id_tgt = tgt_dict.bos()
        pad_id_tgt = tgt_dict.pad()
        eos_id_tgt = tgt_dict.eos()
        unk_id_tgt = tgt_dict.unk()
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)),
            custom_bos=args.bos,
            custom_pad=args.pad,
            custom_eos=args.eos,
            custom_unk=args.unk,
            add_sentence_limit_words_after=True,
            tgt_first=tgt_first,
            bos_id_tgt=bos_id_tgt,
            pad_id_tgt=pad_id_tgt,
            eos_id_tgt=eos_id_tgt,
            unk_id_tgt=unk_id_tgt)
    else:
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))

    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    logger.info('[{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    logger.info('[{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    return cls(args, src_dict, tgt_dict)

def add_args(parser):
    """Add model-specific arguments to the parser."""
    # fmt: off
    parser.add_argument("--dropout", type=float, metavar="D",
                        help="dropout probability")
    parser.add_argument("--encoder-conv-channels", type=str, metavar="EXPR",
                        help="list of encoder convolution's out channels")
    parser.add_argument("--encoder-conv-kernel-sizes", type=str, metavar="EXPR",
                        help="list of encoder convolution's kernel sizes")
    parser.add_argument("--encoder-conv-strides", type=str, metavar="EXPR",
                        help="list of encoder convolution's strides")
    parser.add_argument("--encoder-rnn-hidden-size", type=int, metavar="N",
                        help="encoder rnn's hidden size")
    parser.add_argument("--encoder-rnn-layers", type=int, metavar="N",
                        help="number of rnn encoder layers")
    parser.add_argument("--encoder-rnn-bidirectional",
                        type=lambda x: utils.eval_bool(x),
                        help="make all rnn layers of encoder bidirectional")
    parser.add_argument("--encoder-rnn-residual",
                        type=lambda x: utils.eval_bool(x),
                        help="create residual connections for rnn encoder "
                             "layers (starting from the 2nd layer), i.e., the actual "
                             "output of such layer is the sum of its input and output")
    parser.add_argument("--decoder-embed-dim", type=int, metavar="N",
                        help="decoder embedding dimension")
    parser.add_argument("--decoder-embed-path", type=str, metavar="STR",
                        help="path to pre-trained decoder embedding")
    parser.add_argument("--decoder-freeze-embed", action="store_true",
                        help="freeze decoder embeddings")
    parser.add_argument("--decoder-hidden-size", type=int, metavar="N",
                        help="decoder hidden size")
    parser.add_argument("--decoder-layers", type=int, metavar="N",
                        help="number of decoder layers")
    parser.add_argument("--decoder-out-embed-dim", type=int, metavar="N",
                        help="decoder output embedding dimension")
    parser.add_argument("--decoder-rnn-residual",
                        type=lambda x: utils.eval_bool(x),
                        help="create residual connections for rnn decoder "
                             "layers (starting from the 2nd layer), i.e., the actual "
                             "output of such layer is the sum of its input and output")
    parser.add_argument("--attention-type", type=str, metavar="STR",
                        choices=["bahdanau", "luong"],
                        help="attention type")
    parser.add_argument("--attention-dim", type=int, metavar="N",
                        help="attention dimension")
    parser.add_argument("--need-attention", action="store_true",
                        help="need to return attention tensor for the caller")
    parser.add_argument("--adaptive-softmax-cutoff", metavar="EXPR",
                        help="comma separated list of adaptive softmax cutoff points. "
                             "Must be used with adaptive_loss criterion")
    parser.add_argument("--share-decoder-input-output-embed",
                        type=lambda x: utils.eval_bool(x),
                        help="share decoder input and output embeddings")
    parser.add_argument("--pretrained-lm-checkpoint", type=str, metavar="STR",
                        help="path to load checkpoint from pretrained language model (LM), "
                             "which will be present and kept fixed during training.")
    # Granular dropout settings (if not specified these default to --dropout)
    parser.add_argument("--encoder-rnn-dropout-in", type=float, metavar="D",
                        help="dropout probability for encoder rnn's input")
    parser.add_argument("--encoder-rnn-dropout-out", type=float, metavar="D",
                        help="dropout probability for encoder rnn's output")
    parser.add_argument("--decoder-dropout-in", type=float, metavar="D",
                        help="dropout probability for decoder input embedding")
    parser.add_argument("--decoder-dropout-out", type=float, metavar="D",
                        help="dropout probability for decoder output")
    # Scheduled sampling options
    parser.add_argument("--scheduled-sampling-probs",
                        type=lambda p: utils.eval_str_list(p),
                        metavar="P_1,P_2,...,P_N", default=[1.0],
                        help="scheduled sampling probabilities of sampling the truth "
                             "labels for N epochs starting from --start-scheduled-sampling-epoch; "
                             "all later epochs using P_N")
    parser.add_argument("--start-scheduled-sampling-epoch", type=int,
                        metavar="N", default=1,
                        help="start scheduled sampling from the specified epoch")