# Helper types/actions and language-token enums used below; these import
# paths follow fairseq's module layout.
from fairseq.data.multilingual.multilingual_utils import (
    EncoderLangtok,
    LangTokSpec,
    LangTokStyle,
)
from fairseq.utils import FileContentsAction, csv_str_list, eval_str_dict


def add_args(parser):
    parser.add_argument(
        "data",
        help="colon-separated list of paths to data directories, "
        "which will be iterated over across epochs in round-robin manner",
        action=FileContentsAction,
    )
    parser.add_argument(
        "--langs",
        default=None,
        type=csv_str_list,
        help="a comma-separated list of languages which can appear in lang-pairs; "
        "note that the ordering determines language token IDs",
        action=FileContentsAction,
    )
    parser.add_argument(
        "--lang-dict",
        default=None,
        type=str,
        help="an external file which contains a list of "
        "languages which can appear in lang-pairs; "
        "note that the ordering determines language token IDs; "
        "--langs and --lang-dict are mutually exclusive options",
    )
    parser.add_argument(
        "--lang-tok-style",
        default=LangTokStyle.multilingual.value,
        type=str,
        choices=[LangTokStyle.multilingual.value, LangTokStyle.mbart.value],
        help="language token styles",
    )
    parser.add_argument(
        "--load-alignments",
        action="store_true",
        help="load the binarized alignments",
    )
    parser.add_argument(
        "--left-pad-source",
        default="True",
        type=str,
        metavar="BOOL",
        help="pad the source on the left",
    )
    parser.add_argument(
        "--left-pad-target",
        default="False",
        type=str,
        metavar="BOOL",
        help="pad the target on the left",
    )
    parser.add_argument(
        "--max-source-positions",
        default=1024,
        type=int,
        metavar="N",
        help="max number of tokens in the source sequence",
    )
    parser.add_argument(
        "--max-target-positions",
        default=1024,
        type=int,
        metavar="N",
        help="max number of tokens in the target sequence",
    )
    parser.add_argument(
        "--upsample-primary",
        default=1,
        type=int,
        help="amount to upsample primary dataset",
    )
    parser.add_argument(
        "--truncate-source",
        action="store_true",
        default=False,
        help="truncate source to max-source-positions",
    )
    parser.add_argument(
        "--encoder-langtok",
        default=None,
        type=str,
        choices=[EncoderLangtok.src.value, EncoderLangtok.tgt.value],
        metavar="SRCTGT",
        help="prepend the source or target language token "
        "to the beginning of the source sentence (src/tgt)",
    )
    parser.add_argument(
        "--decoder-langtok",
        action="store_true",
        help="prepend the target language token to the beginning of the target sentence",
    )
    # No help string upstream; when enabled, language tokens replace the
    # default bos/eos tokens instead of being prepended.
    parser.add_argument(
        "--lang-tok-replacing-bos-eos", action="store_true", default=False
    )
    parser.add_argument(
        "--enable-lang-ids",
        default=False,
        action="store_true",
        help="whether to include language IDs in samples",
    )
    parser.add_argument(
        "--enable-reservsed-directions-shared-datasets",
        default=False,
        action="store_true",
        help="whether to allow datasets to be used in reversed directions",
    )
    parser.add_argument(
        "--extra-data",
        help='a dictionary mapping data names to their paths, '
        'e.g. {"mined": path_to_mined_data, "denoised": path_to_denoised_data}',
        type=lambda uf: eval_str_dict(uf, type=str),
        default=None,
    )
    parser.add_argument(
        "--extra-lang-pairs",
        help='a dictionary mapping data names to the language pairs they serve, '
        'e.g. {"mined": comma-separated-lang-pairs, "denoised": comma-separated-lang-pairs}',
        type=lambda uf: eval_str_dict(uf, type=str),
        default=None,
    )
    parser.add_argument(
        "--langtoks-specs",
        help='a comma-separated list of data types for which sets of language tokens are '
        'to be specialized, e.g. "main,dae,mined". A set of language tokens is added to '
        'the vocab to distinguish languages in different training data types. If not '
        'specified, default language tokens per language will be added',
        default=LangTokSpec.main.value,
        type=csv_str_list,
    )
    parser.add_argument(
        "--langtoks",
        help='a dictionary of how to add language tokens, '
        'e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), '
        '"main": ("src", "tgt")}, or {"mined": ("src.mined", "tgt")}',
        default=None,
        type=lambda uf: eval_str_dict(uf, type=str),
    )
    parser.add_argument(
        "--sampling-weights-from-file",
        help='a file containing a Python dictionary of how to sample data sets, '
        'e.g. {"main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, '
        '"mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8}',
        default=None,
        type=str,
    )
    parser.add_argument(
        "--sampling-weights",
        help='a dictionary of how to sample data sets, '
        'e.g. {"main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, '
        '"mono_dae:es_XX-es_XX": 0.3, "main:en_XX-fr_XX": 0.8}',
        default=None,
        type=lambda uf: eval_str_dict(uf, type=str),
    )
    parser.add_argument(
        "--virtual-epoch-size",
        default=1000000,
        type=int,
        help="virtual epoch size to speed up data loading",
    )
    parser.add_argument(
        "--virtual-data-size",
        default=None,
        type=int,
        help="virtual data size of the whole joint dataset to speed "
        "up data loading and have specific dynamic sampling strategy interval",
    )