def add_preprocess_args(parser): group = parser.add_argument_group("Preprocessing") # fmt: off group.add_argument("-s", "--source-lang", default=None, metavar="SRC", help="source language") group.add_argument("-t", "--target-lang", default=None, metavar="TARGET", help="target language") group.add_argument("--trainpref", metavar="FP", default=None, help="train file prefix (also used to build dictionaries)") group.add_argument("--validpref", metavar="FP", default=None, help="comma separated, valid file prefixes " "(words missing from train set are replaced with <unk>)") group.add_argument("--testpref", metavar="FP", default=None, help="comma separated, test file prefixes " "(words missing from train set are replaced with <unk>)") group.add_argument("--align-suffix", metavar="FP", default=None, help="alignment file suffix") group.add_argument("--destdir", metavar="DIR", default="data-bin", help="destination dir") group.add_argument("--thresholdtgt", metavar="N", default=0, type=int, help="map words appearing less than threshold times to unknown") group.add_argument("--thresholdsrc", metavar="N", default=0, type=int, help="map words appearing less than threshold times to unknown") group.add_argument("--tgtdict", metavar="FP", help="reuse given target dictionary") group.add_argument("--srcdict", metavar="FP", help="reuse given source dictionary") group.add_argument("--nwordstgt", metavar="N", default=-1, type=int, help="number of target words to retain") group.add_argument("--nwordssrc", metavar="N", default=-1, type=int, help="number of source words to retain") group.add_argument("--alignfile", metavar="ALIGN", default=None, help="an alignment file (optional)") parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap', choices=get_available_dataset_impl(), help='output dataset implementation') group.add_argument("--joined-dictionary", action="store_true", help="Generate joined dictionary") group.add_argument("--only-source", action="store_true", help="Only process the source language") group.add_argument("--padding-factor", metavar="N", default=8, type=int, help="Pad dictionary size to be multiple of N") group.add_argument("--workers", metavar="N", default=1, type=int, help="number of parallel workers") #--mult-teachers --avoid-tokenize-extras --input-mapping # --bert-model-name /mnt/yardcephfs/mmyard/g_wxg_td_prc/mt/v_xyvhuang/data/bert-base-cased-new # --bart-model-name /mnt/yardcephfs/mmyard/g_wxg_td_prc/mt/v_xyvhuang/data/bart-base #parser.add_argument('--avoid-tokenize-extras', action='store_true', help='...') group.add_argument("--mult-teachers", action="store_true", help="...") group.add_argument("--avoid-tokenize-extras", action="store_true", help="...") group.add_argument("--input-mapping", action="store_true", help="...") group.add_argument('--bert-model-name', default='', type=str) group.add_argument('--bart-model-name', default='', type=str) group.add_argument('--electra-model-name', default='', type=str) # fmt: on return parser
def add_preprocess_args(parser): group = parser.add_argument_group("Preprocessing") # fmt: off group.add_argument("-s", "--source-lang", default=None, metavar="SRC", help="source language") group.add_argument("-t", "--target-lang", default=None, metavar="TARGET", help="target language") group.add_argument("--trainpref", metavar="FP", default=None, help="train file prefix") group.add_argument("--validpref", metavar="FP", default=None, help="comma separated, valid file prefixes") group.add_argument("--testpref", metavar="FP", default=None, help="comma separated, test file prefixes") group.add_argument("--align-suffix", metavar="FP", default=None, help="alignment file suffix") group.add_argument("--destdir", metavar="DIR", default="data-bin", help="destination dir") group.add_argument("--thresholdtgt", metavar="N", default=0, type=int, help="map words appearing less than threshold times to unknown") group.add_argument("--thresholdsrc", metavar="N", default=0, type=int, help="map words appearing less than threshold times to unknown") group.add_argument("--tgtdict", metavar="FP", help="reuse given target dictionary") # group.add_argument("--bos", default="<s>", type=str, # help="Specify bos token from the dictionary.") # group.add_argument("--pad", default="<pad>", type=str, # help="Specify bos token from the dictionary.") # group.add_argument("--eos", default="</s>", type=str, # help="Specify bos token from the dictionary.") # group.add_argument("--unk", default="<unk>", type=str, # help="Specify bos token from the dictionary.") # group.add_argument("--tgtdict_add_sentence_limit_words_after", action="store_true", # help="Add sentence limit words (i.e. bos, eos, pad, unk) after loading tgtdict.") group.add_argument("--srcdict", metavar="FP", help="reuse given source dictionary") group.add_argument("--nwordstgt", metavar="N", default=-1, type=int, help="number of target words to retain") group.add_argument("--nwordssrc", metavar="N", default=-1, type=int, help="number of source words to retain") group.add_argument("--alignfile", metavar="ALIGN", default=None, help="an alignment file (optional)") parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap', choices=get_available_dataset_impl(), help='output dataset implementation') group.add_argument("--joined-dictionary", action="store_true", help="Generate joined dictionary") group.add_argument("--only-source", action="store_true", help="Only process the source language") group.add_argument("--padding-factor", metavar="N", default=8, type=int, help="Pad dictionary size to be multiple of N") group.add_argument("--workers", metavar="N", default=1, type=int, help="number of parallel workers") # group.add_argument("--use-bert-in-target", action="store_true", # help="Whether to use BERT target model or not.") # group.add_argument("--target-bert-model", metavar="N", default='bert-base-uncased', type=str, # help="Pre-trained BERT model used in the BERT loss" # "and needed to preprocess target language sentences.") # fmt: on return parser
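
# Minimal usage sketch for the preprocessing CLI above (not part of the
# original source; flag values are illustrative and assume
# get_available_dataset_impl() is importable in this module).
import argparse

def _demo_preprocess_cli():
    parser = argparse.ArgumentParser(description="preprocess demo")
    add_preprocess_args(parser)
    args = parser.parse_args([
        "-s", "de", "-t", "en",
        "--trainpref", "data/train",
        "--destdir", "data-bin/demo",
        "--workers", "4",
    ])
    print(args.source_lang, args.target_lang, args.destdir, args.workers)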
class LanguageModelingConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None, metadata={"help": "path to data directory"}
    )
    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
        default="none",
        metadata={
            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.'
        },
    )
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sample for LM dataset"},
    )
    output_dictionary_size: int = field(
        default=-1, metadata={"help": "limit the size of output dictionary"}
    )
    self_target: bool = field(default=False, metadata={"help": "include self target"})
    future_target: bool = field(
        default=False, metadata={"help": "include future target"}
    )
    past_target: bool = field(default=False, metadata={"help": "include past target"})
    add_bos_token: bool = field(
        default=False, metadata={"help": "prepend beginning of sentence token (<s>)"}
    )
    max_target_positions: Optional[int] = field(
        default=None, metadata={"help": "max number of tokens in the target sequence"}
    )
    shorten_method: SHORTEN_METHOD_CHOICES = field(
        default="none",
        metadata={
            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help": "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )
    # TODO common vars below add to parent
    seed: int = II("common.seed")
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
    data_buffer_size: int = II("dataset.data_buffer_size")
    tpu: bool = II("common.tpu")
    use_plasma_view: bool = II("common.use_plasma_view")
    plasma_path: str = II("common.plasma_path")
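
# Sketch of how the II("...") interpolations above resolve (not from the
# original source; assumes omegaconf is installed, and the values placed
# under "common"/"dataset" are illustrative): nest the structured config
# under a root that provides the referenced groups.
from omegaconf import OmegaConf

def _demo_lm_config_resolution():
    root = OmegaConf.create({
        "common": {"seed": 1, "tpu": False,
                   "use_plasma_view": False, "plasma_path": "/tmp/plasma"},
        "dataset": {"dataset_impl": "mmap", "data_buffer_size": 10},
        "task": OmegaConf.structured(LanguageModelingConfig),
    })
    print(root.task.tokens_per_sample)  # 1024 (plain default)
    print(root.task.seed)               # 1 (resolved from common.seed)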
def add_dataset_args(parser, train=False, gen=False):
    group = parser.add_argument_group("Dataset and data loading")
    # fmt: off
    group.add_argument('--num-workers', default=1, type=int, metavar='N',
                       help='how many subprocesses to use for data loading')
    group.add_argument('--skip-invalid-size-inputs-valid-test', action='store_true',
                       help='ignore too long or too short lines in valid and test set')
    group.add_argument('--max-tokens', type=int, metavar='N',
                       help='maximum number of tokens in a batch')
    group.add_argument('--max-sentences', '--batch-size', type=int, metavar='N',
                       help='maximum number of sentences in a batch')
    group.add_argument('--required-batch-size-multiple', default=8, type=int, metavar='N',
                       help='batch size will either be less than this value, '
                            'or a multiple of this value')
    group.add_argument('--required-seq-len-multiple', default=1, type=int, metavar='N',
                       help='maximum sequence length in batch will be a multiple of this value')
    parser.add_argument('--dataset-impl', metavar='FORMAT',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    group.add_argument('--data-buffer-size', default=10, type=int, metavar='N',
                       help='number of batches to preload')
    if train:
        group.add_argument('--train-subset', default='train', metavar='SPLIT',
                           help='data subset to use for training (e.g. train, valid, test)')
        group.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                           help='comma separated list of data subsets to use for validation'
                                ' (e.g. train, valid, test)')
        group.add_argument('--validate-interval', type=int, default=1, metavar='N',
                           help='validate every N epochs')
        group.add_argument('--validate-interval-updates', type=int, default=0, metavar='N',
                           help='validate every N updates')
        group.add_argument('--validate-after-updates', type=int, default=0, metavar='N',
                           help="don't validate until reaching this many updates")
        group.add_argument('--fixed-validation-seed', default=None, type=int, metavar='N',
                           help='specified random seed for validation')
        group.add_argument('--disable-validation', action='store_true',
                           help='disable validation')
        group.add_argument('--max-tokens-valid', type=int, metavar='N',
                           help='maximum number of tokens in a validation batch'
                                ' (defaults to --max-tokens)')
        group.add_argument('--max-sentences-valid', type=int, metavar='N',
                           help='maximum number of sentences in a validation batch'
                                ' (defaults to --max-sentences)')
        group.add_argument('--curriculum', default=0, type=int, metavar='N',
                           help="don't shuffle batches for first N epochs")
    if gen:
        group.add_argument('--gen-subset', default='test', metavar='SPLIT',
                           help='data subset to generate (train, valid, test)')
        group.add_argument('--num-shards', default=1, type=int, metavar='N',
                           help='shard generation over N shards')
        group.add_argument('--shard-id', default=0, type=int, metavar='ID',
                           help='id of the shard to generate (id < num_shards)')
    # fmt: on
    return group
def get_parser():
    parser = argparse.ArgumentParser(
        description="writes text from binarized file to stdout"
    )
    # fmt: off
    parser.add_argument('--dataset-impl', help='dataset implementation',
                        choices=indexed_dataset.get_available_dataset_impl())
    parser.add_argument('--dict', metavar='FP', default=None,
                        help='dictionary containing known words')
    parser.add_argument('--input', metavar='FP', required=True,
                        help='binarized file to read')
    # fmt: on
    return parser
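
# Hypothetical invocation of the parser above (not from the original source;
# the --input path is made up, and indexed_dataset must be importable for the
# --dataset-impl choices).
def _demo_read_binarized_cli():
    args = get_parser().parse_args(
        ["--input", "data-bin/train.src-tgt.src", "--dataset-impl", "mmap"]
    )
    print(args.input, args.dataset_impl, args.dict)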
def add_preprocess_args(parser): group = parser.add_argument_group("Preprocessing") # fmt: off group.add_argument("-s", "--source-lang", default=None, metavar="SRC", help="source language") group.add_argument("-t", "--target-lang", default=None, metavar="TARGET", help="target language") group.add_argument("--trainpref", metavar="FP", default=None, help="train file prefix") group.add_argument("--validpref", metavar="FP", default=None, help="comma separated, valid file prefixes") group.add_argument("--testpref", metavar="FP", default=None, help="comma separated, test file prefixes") group.add_argument("--align-suffix", metavar="FP", default=None, help="alignment file suffix") group.add_argument("--destdir", metavar="DIR", default="data-bin", help="destination dir") group.add_argument("--thresholdtgt", metavar="N", default=0, type=int, help="map words appearing less than threshold times to unknown") group.add_argument("--thresholdsrc", metavar="N", default=0, type=int, help="map words appearing less than threshold times to unknown") group.add_argument("--tgtdict", metavar="FP", help="reuse given target dictionary") group.add_argument("--srcdict", metavar="FP", help="reuse given source dictionary") group.add_argument("--nwordstgt", metavar="N", default=-1, type=int, help="number of target words to retain") group.add_argument("--nwordssrc", metavar="N", default=-1, type=int, help="number of source words to retain") group.add_argument("--alignfile", metavar="ALIGN", default=None, help="an alignment file (optional)") parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap', choices=get_available_dataset_impl(), help='output dataset implementation') group.add_argument("--joined-dictionary", action="store_true", help="Generate joined dictionary") group.add_argument("--only-source", action="store_true", help="Only process the source language") group.add_argument("--padding-factor", metavar="N", default=8, type=int, help="Pad dictionary size to be multiple of N") group.add_argument("--workers", metavar="N", default=1, type=int, help="number of parallel workers") #TODO: Change here #group.add_argument("--multi-views", action='store_true', # help="Load Multi-views") # fmt: on return parser
def add_dataset_args(parser, train=False, gen=False):
    group = parser.add_argument_group('Dataset and data loading')
    # fmt: off
    group.add_argument('--num-workers', default=1, type=int, metavar='N',
                       help='how many subprocesses to use for data loading')
    group.add_argument('--skip-invalid-size-inputs-valid-test', action='store_true',
                       help='ignore too long or too short lines in valid and test set')
    group.add_argument('--max-tokens', type=int, metavar='N',
                       help='maximum number of tokens in a batch')
    group.add_argument('--max-sentences', '--batch-size', type=int, metavar='N',
                       help='maximum number of sentences in a batch')
    group.add_argument('--required-batch-size-multiple', default=8, type=int, metavar='N',
                       help='batch size will be a multiple of this value')
    parser.add_argument('--dataset-impl', metavar='FORMAT',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    if train:
        group.add_argument('--train-subset', default='train', metavar='SPLIT',
                           choices=['train', 'valid', 'test'],
                           help='data subset to use for training (train, valid, test)')
        group.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                           help='comma separated list of data subsets to use for validation'
                                ' (train, valid, valid1, test, test1)')
        group.add_argument('--validate-interval', type=int, default=1, metavar='N',
                           help='validate every N epochs')
        group.add_argument('--disable-validation', action='store_true',
                           help='disable validation')
        group.add_argument('--max-tokens-valid', type=int, metavar='N',
                           help='maximum number of tokens in a validation batch'
                                ' (defaults to --max-tokens)')
        group.add_argument('--max-sentences-valid', type=int, metavar='N',
                           help='maximum number of sentences in a validation batch'
                                ' (defaults to --max-sentences)')
        group.add_argument('--curriculum', default=0, type=int, metavar='N',
                           help="don't shuffle batches for first N epochs")
    if gen:
        group.add_argument('--gen-subset', default='test', metavar='SPLIT',
                           help='data subset to generate (train, valid, test)')
        group.add_argument('--num-shards', default=1, type=int, metavar='N',
                           help='shard generation over N shards')
        group.add_argument('--shard-id', default=0, type=int, metavar='ID',
                           help='id of the shard to generate (id < num_shards)')
    # fmt: on
    return group
class DatasetParams(FairseqDataclass):
    num_workers: int = field(
        default=1, metadata={"help": "how many subprocesses to use for data loading"}
    )
    skip_invalid_size_inputs_valid_test: bool = field(
        default=False,
        metadata={"help": "ignore too long or too short lines in valid and test set"},
    )
    max_tokens: Optional[int] = field(
        default=None, metadata={"help": "maximum number of tokens in a batch"}
    )
    batch_size: Optional[int] = field(
        default=None, metadata={"help": "number of examples in a batch"}
    )
    required_batch_size_multiple: int = field(
        default=8, metadata={"help": "batch size will be a multiple of this value"}
    )
    required_seq_len_multiple: int = field(
        default=1,
        metadata={
            "help": "maximum sequence length in batch will be a multiple of this value"
        },
    )
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = field(
        default=None, metadata={"help": "output dataset implementation"}
    )
    data_buffer_size: int = field(
        default=10, metadata={"help": "number of batches to preload"}
    )
    train_subset: str = field(
        default="train",
        metadata={"help": "data subset to use for training (e.g. train, valid, test)"},
    )
    valid_subset: str = field(
        default="valid",
        metadata={
            "help": "comma separated list of data subsets to use for validation"
            " (e.g. train, valid, test)"
        },
    )
    validate_interval: int = field(
        default=1, metadata={"help": "validate every N epochs"}
    )
    validate_interval_updates: int = field(
        default=0, metadata={"help": "validate every N updates"}
    )
    validate_after_updates: int = field(
        default=0, metadata={"help": "don't validate until reaching this many updates"}
    )
    fixed_validation_seed: Optional[int] = field(
        default=None, metadata={"help": "specified random seed for validation"}
    )
    disable_validation: bool = field(
        default=False, metadata={"help": "disable validation"}
    )
    max_tokens_valid: Optional[int] = field(
        default=None,
        metadata={
            "help": "maximum number of tokens in a validation batch"
            " (defaults to --max-tokens)"
        },
    )
    batch_size_valid: Optional[int] = field(
        default=None,
        metadata={
            "help": "batch size of the validation batch (defaults to --batch-size)"
        },
    )
    curriculum: int = field(
        default=0, metadata={"help": "don't shuffle batches for first N epochs"}
    )
    gen_subset: str = field(
        default="test",
        metadata={"help": "data subset to generate (train, valid, test)"},
    )
    num_shards: int = field(
        default=1, metadata={"help": "shard generation over N shards"}
    )
    shard_id: int = field(
        default=0, metadata={"help": "id of the shard to generate (id < num_shards)"}
    )
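
# Sketch (not from the original source; assumes omegaconf): materialize
# DatasetParams as a structured config so the defaults are type-checked and
# can be overridden programmatically or from YAML/CLI.
from omegaconf import OmegaConf

def _demo_dataset_params():
    cfg = OmegaConf.structured(DatasetParams)
    cfg.max_tokens = 4096            # override an Optional[int] default
    cfg.valid_subset = "valid,test"  # comma-separated list, as documented
    print(cfg.required_batch_size_multiple)  # 8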
def add_preprocess_args(parser):
    group = parser.add_argument_group('Preprocessing')
    # fmt: off
    group.add_argument("-s", "--source-lang", default=None, metavar="SRC",
                       help="source language")
    group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
                       help="target language")
    group.add_argument("--trainpref", metavar="FP", default=None,
                       help="train file prefix")
    group.add_argument("--validpref", metavar="FP", default=None,
                       help="comma separated, valid file prefixes")
    group.add_argument("--testpref", metavar="FP", default=None,
                       help="comma separated, test file prefixes")
    group.add_argument(
        "--destdir", metavar="DIR", default="data-bin",
        help="destination dir for target actions and states information")
    group.add_argument(
        "--embdir", metavar="DIR", default="data-bin",
        help="destination dir for pre-trained source embeddings")
    group.add_argument(
        "--thresholdtgt", metavar="N", default=0, type=int,
        help="map words appearing less than threshold times to unknown")
    group.add_argument(
        "--thresholdsrc", metavar="N", default=0, type=int,
        help="map words appearing less than threshold times to unknown")
    group.add_argument("--tgtdict", metavar="FP",
                       help="reuse given target dictionary")
    group.add_argument("--srcdict", metavar="FP",
                       help="reuse given source dictionary")
    group.add_argument("--nwordstgt", metavar="N", default=-1, type=int,
                       help="number of target words to retain")
    group.add_argument("--nwordssrc", metavar="N", default=-1, type=int,
                       help="number of source words to retain")
    group.add_argument("--alignfile", metavar="ALIGN", default=None,
                       help="an alignment file (optional)")
    parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    group.add_argument("--joined-dictionary", action="store_true",
                       help="Generate joined dictionary")
    group.add_argument("--only-source", action="store_true",
                       help="Only process the source language")
    group.add_argument("--padding-factor", metavar="N", default=8, type=int,
                       help="Pad dictionary size to be multiple of N")
    group.add_argument("--workers", metavar="N", default=1, type=int,
                       help="number of parallel workers")
    # for extending BART vocabulary with actions specific symbols
    parser.add_argument(
        '--node-freq-min', default=5, type=int,
        help='minimum frequency of node names to add to vocabulary')
    # for pretrained external embeddings
    group.add_argument("--pretrained-embed", default='roberta.base',
                       help="Type of pretrained embedding")
    # NOTE: Previous default "17 18 19 20 21 22 23 24"
    group.add_argument('--bert-layers', nargs='+', type=int,
                       help='RoBERTa layers to extract (default last)')
    # for stack-transformer
    add_state_machine_args(group)
    # fmt: on
    return parser
dataset_builder.finalize(os.path.join(output_dir, split + '.text.idx'))
relations_builder.finalize(
    os.path.join(output_dir, split + '.relations.idx'))

annotations_list = np.concatenate(annotations_list)
np.save(os.path.join(output_dir, split + '.annotations'), annotations_list)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Data preparation for SemEval 2010 Task 8 dataset')
    parser.add_argument('--split', type=str, help='Dataset split',
                        choices=['train', 'dev', 'test'])
    parser.add_argument('--root-dir', type=str,
                        default='../data/SemEval2010_task8_all_data',
                        help='SemEval 2010 Task 8 root directory')
    parser.add_argument('--roberta-dir', type=str, default='../data/roberta',
                        help='RoBERTa directory with all dictionaries.')
    parser.add_argument('--append-eos', default=False, action='store_true')
    parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap',
                        choices=indexed_dataset.get_available_dataset_impl(),
                        help='output dataset implementation')

    args = parser.parse_args()
    main(args)
class TranslationConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None,
        metadata={
            "help": "colon separated path to data directories list, will be iterated upon during epochs "
            "in round-robin manner; however, valid and test data are always in the first directory "
            "to avoid the need for repeating them in all directories"
        },
    )
    source_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "source language",
            "argparse_alias": "-s",
        },
    )
    target_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "target language",
            "argparse_alias": "-t",
        },
    )
    load_alignments: bool = field(
        default=False, metadata={"help": "load the binarized alignments"}
    )
    left_pad_source: bool = field(
        default=True, metadata={"help": "pad the source on the left"}
    )
    left_pad_target: bool = field(
        default=False, metadata={"help": "pad the target on the left"}
    )
    max_source_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the source sequence"}
    )
    max_target_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the target sequence"}
    )
    upsample_primary: int = field(
        default=-1, metadata={"help": "the amount of upsample primary dataset"}
    )
    truncate_source: bool = field(
        default=False, metadata={"help": "truncate source to max-source-positions"}
    )
    num_batch_buckets: int = field(
        default=0,
        metadata={
            "help": "if >0, then bucket source and target lengths into "
            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
        },
    )
    train_subset: str = II("dataset.train_subset")
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")

    # options for reporting BLEU during validation
    eval_bleu: bool = field(
        default=False, metadata={"help": "evaluation with BLEU scores"}
    )
    eval_bleu_args: Optional[str] = field(
        default="{}",
        metadata={
            "help": 'generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_bleu_detok: str = field(
        default="space",
        metadata={
            "help": "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: Optional[str] = field(
        default="{}",
        metadata={"help": "args for building the tokenizer, if needed, as JSON string"},
    )
    eval_tokenized_bleu: bool = field(
        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
    )
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None,
        metadata={
            "help": "remove BPE before computing BLEU",
            "argparse_const": "@@ ",
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False, metadata={"help": "print sample generations during validation"}
    )

    extra_data: bool = field(default=False, metadata={"help": "..."})
    denoising: bool = field(default=False, metadata={"help": "..."})
    masking: bool = field(default=False, metadata={"help": "..."})
    electra_pretrain: bool = field(default=False, metadata={"help": "..."})
    input_mapping: bool = field(default=False, metadata={"help": "..."})
    electra_model_name: str = field(default='electra-base', metadata={"help": "..."})
    bart_model_name: str = field(default='bart-base', metadata={"help": "..."})
    bert_model_name: str = field(default='bert-base-cased', metadata={"help": "..."})
    finetune_bert: bool = field(default=False, metadata={"help": "..."})
    use_bertinput: bool = field(default=False, metadata={"help": "..."})
    use_bartinput: bool = field(default=False, metadata={"help": "..."})
    use_electrainput: bool = field(default=False, metadata={"help": "..."})
    # mask_ratio = None, random_ratio = None, insert_ratio = None,
    # rotate_ratio = None, permute_sentence_ratio = None
    mask_ratio: float = field(default=0.3, metadata={"help": "..."})
    random_ratio: float = field(default=0.1, metadata={"help": "..."})
    insert_ratio: float = field(default=0.0, metadata={"help": "..."})
    rotate_ratio: float = field(default=0.5, metadata={"help": "..."})
    permute_sentence_ratio: float = field(default=1.0, metadata={"help": "..."})
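
# Sketch of consuming eval_bleu_args from the TranslationConfig above (not
# from the original source; assumes omegaconf, and mirrors how a task would
# JSON-decode the generation args before building a sequence generator).
import json
from omegaconf import OmegaConf

def _demo_eval_bleu_args():
    cfg = OmegaConf.structured(TranslationConfig)
    cfg.eval_bleu = True
    cfg.eval_bleu_args = '{"beam": 4, "lenpen": 0.6}'
    gen_args = json.loads(cfg.eval_bleu_args)
    print(gen_args["beam"], gen_args["lenpen"])  # 4 0.6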
class TranslationlfConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None,
        metadata={
            "help": "colon separated path to data directories list, will be iterated upon during epochs "
            "in round-robin manner; however, valid and test data are always in the first directory "
            "to avoid the need for repeating them in all directories"
        },
    )
    source_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "source language",
            "argparse_alias": "-s",
        },
    )
    target_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "target language",
            "argparse_alias": "-t",
        },
    )
    load_alignments: bool = field(
        default=False, metadata={"help": "load the binarized alignments"}
    )
    left_pad_source: bool = field(
        default=True, metadata={"help": "pad the source on the left"}
    )
    left_pad_target: bool = field(
        default=False, metadata={"help": "pad the target on the left"}
    )
    max_source_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the source sequence"}
    )
    max_target_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the target sequence"}
    )
    upsample_primary: int = field(
        default=-1, metadata={"help": "the amount of upsample primary dataset"}
    )
    truncate_source: bool = field(
        default=False, metadata={"help": "truncate source to max-source-positions"}
    )
    num_batch_buckets: int = field(
        default=0,
        metadata={
            "help": "if >0, then bucket source and target lengths into "
            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
        },
    )
    train_subset: str = II("dataset.train_subset")
    # valid_subset: str = II("dataset.valid_subset")
    # test_subset: str = II("dataset.test_subset")
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")

    # options for reporting BLEU during validation
    eval_bleu: bool = field(
        default=False, metadata={"help": "evaluation with BLEU scores"}
    )
    eval_bleu_args: Optional[str] = field(
        default="{}",
        metadata={
            "help": 'generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_bleu_detok: str = field(
        default="space",
        metadata={
            "help": "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: Optional[str] = field(
        default="{}",
        metadata={"help": "args for building the tokenizer, if needed, as JSON string"},
    )
    eval_tokenized_bleu: bool = field(
        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
    )
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None,
        metadata={
            "help": "remove BPE before computing BLEU",
            "argparse_const": "@@ ",
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False, metadata={"help": "print sample generations during validation"}
    )

    # todo check!
    lf_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Longformer representations path",
            "argparse_alias": "-lf-path",
        },
    )
    encoding_path: Optional[str] = field(
        default="encodings.json",
        metadata={
            "help": "encodings output path",
            "argparse_alias": "-enc-path",
        },
    )
    # todo (next) three paths for h5py files
    sen_doc: Optional[str] = field(
        default=None,
        metadata={
            "help": "sentence-document alignment path, mapping each sentence to its document",
            "argparse_alias": "-sen-doc",
        },
    )
def add_dataset_args(parser, train=False, gen=False):
    group = parser.add_argument_group('Dataset and data loading')
    # fmt: off
    group.add_argument('--num-workers', default=1, type=int, metavar='N',
                       help='how many subprocesses to use for data loading')
    group.add_argument('--skip-invalid-size-inputs-valid-test', action='store_true',
                       help='ignore too long or too short lines in valid and test set')
    group.add_argument('--max-tokens', type=int, metavar='N',
                       help='maximum number of tokens in a batch')
    group.add_argument('--max-sentences', '--batch-size', type=int, metavar='N',
                       help='maximum number of sentences in a batch')
    group.add_argument('--required-batch-size-multiple', default=8, type=int, metavar='N',
                       help='batch size will be a multiple of this value')
    parser.add_argument('--dataset-impl', metavar='FORMAT',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    parser.add_argument('--set-add-align-head', action='store_true',
                        help='if True, add a head in each decoder layer for alignment')
    parser.add_argument('--set-shift', action='store_true',
                        help='if True, train and test shifted attention.')
    parser.add_argument('--alignment-task', default='vanilla', type=str,
                        choices=['vanilla', 'usehead', 'addhead', 'supalign', 'ptrnet', 'dual'],
                        help='train and test shifted attention.')
    parser.add_argument('--set-src-bow-loss', action='store_true',
                        help='start to train with src bag-of-words loss')
    group.add_argument('--beam', default=5, type=int, metavar='N',
                       help='beam size')
    # if 'alignment_layer' in parser:
    parser.add_argument('--alignment-layer', default=2, type=int,
                        help='train and test shifted attention.')
    parser.add_argument('--alignment-heads', type=int, metavar='D',
                        help='number of cross attention heads per layer to supervise with alignments')
    parser.add_argument('--cons-type', type=str, metavar='D',
                        help='number of cross attention heads per layer to supervise with alignments')
    parser.add_argument('--set-dual-trans', action='store_true',
                        help='train attention agreement model')
    if train:
        group.add_argument('--train-subset', default='train', metavar='SPLIT',
                           choices=['train', 'valid', 'test'],
                           help='data subset to use for training (train, valid, test)')
        group.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                           help='comma separated list of data subsets to use for validation'
                                ' (train, valid, valid1, test, test1)')
        group.add_argument('--validate-interval', type=int, default=1, metavar='N',
                           help='validate every N epochs')
        group.add_argument('--fixed-validation-seed', default=None, type=int, metavar='N',
                           help='specified random seed for validation')
        group.add_argument('--disable-validation', action='store_true',
                           help='disable validation')
        group.add_argument('--max-tokens-valid', type=int, metavar='N',
                           help='maximum number of tokens in a validation batch'
                                ' (defaults to --max-tokens)')
        group.add_argument('--max-sentences-valid', type=int, metavar='N',
                           help='maximum number of sentences in a validation batch'
                                ' (defaults to --max-sentences)')
        group.add_argument('--curriculum', default=0, type=int, metavar='N',
                           help="don't shuffle batches for first N epochs")
        group.add_argument('--model-overrides', default="{}", type=str, metavar='DICT',
                           help='a dictionary used to override model args at generation '
                                'that were used during model training')
    if gen:
        group.add_argument('--gen-subset', default='test', metavar='SPLIT',
                           help='data subset to generate (train, valid, test)')
        group.add_argument('--num-shards', default=1, type=int, metavar='N',
                           help='shard generation over N shards')
        group.add_argument('--shard-id', default=0, type=int, metavar='ID',
                           help='id of the shard to generate (id < num_shards)')
        group.add_argument('--print-vanilla-alignment', action="store_true",
                           help='use shifted attention to extract alignment')
    # fmt: on
    return group
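
# Usage sketch (not from the original source): the train/gen flags gate which
# argument groups are attached; flag values below are illustrative.
import argparse

def _demo_alignment_cli():
    parser = argparse.ArgumentParser()
    add_dataset_args(parser, train=True)
    args = parser.parse_args(
        ["--max-tokens", "4096", "--alignment-task", "supalign"]
    )
    print(args.max_tokens, args.alignment_task, args.curriculum)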
class MultilingualLanguageModelingConfig(FairseqDataclass):
    # TODO common var add to parent
    data: Optional[str] = field(
        default=None, metadata={"help": "path to data directory"}
    )
    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
        default="none",
        metadata={
            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.'
        },
    )
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sample for LM dataset"},
    )
    output_dictionary_size: int = field(
        default=-1, metadata={"help": "limit the size of output dictionary"}
    )
    self_target: bool = field(default=False, metadata={"help": "include self target"})
    future_target: bool = field(
        default=False, metadata={"help": "include future target"}
    )
    past_target: bool = field(default=False, metadata={"help": "include past target"})
    add_bos_token: bool = field(
        default=False, metadata={"help": "prepend lang id token <dialect>"}
    )
    max_source_positions: Optional[int] = field(
        default=None, metadata={"help": "max number of tokens in the source sequence"}
    )
    max_target_positions: Optional[int] = field(
        default=None, metadata={"help": "max number of tokens in the target sequence"}
    )
    pad_to_fixed_length: Optional[bool] = field(
        default=False, metadata={"help": "pad to fixed length"}
    )
    pad_to_fixed_bsz: Optional[bool] = field(
        default=False, metadata={"help": "boolean to pad to fixed batch size"}
    )
    multilang_sampling_alpha: Optional[float] = field(
        default=1.0,
        metadata={
            "help": "smoothing alpha for sample ratios across multiple datasets"
        },
    )
    shorten_method: SHORTEN_METHOD_CHOICES = field(
        default="none",
        metadata={
            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help": "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )
    langs: str = field(
        default="",
        metadata={
            "help": "comma-separated list of languages (default: all directories in data path)"
        },
    )
    baseline_model_langs: str = field(
        default="",
        metadata={
            "help": "comma-separated list of languages in the baseline model (default: none)"
        },
    )
    # TODO: legacy parameter kept for compatibility
    baseline_model: str = field(
        default="",
        metadata={"help": "path to the baseline model (default: none)"},
    )
    lang_to_offline_shard_ratio: str = field(
        default="",
        metadata={
            "help": "absolute path of tsv file location to indicate lang to offline shard ratio.",
        },
    )
    # TODO common vars below add to parent
    seed: int = II("common.seed")
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
    data_buffer_size: int = II("dataset.data_buffer_size")
    tpu: bool = II("common.tpu")
    batch_size: Optional[int] = II("dataset.batch_size")
    batch_size_valid: Optional[int] = II("dataset.batch_size_valid")
    train_subset: str = II("dataset.train_subset")
    valid_subset: str = II("dataset.valid_subset")
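
# Sketch of the smoothing that multilang_sampling_alpha controls (not from
# the original source; assumes fairseq's usual temperature-based resampling,
# p_lang proportional to (n_lang / N) ** alpha, renormalized over languages;
# alpha=1.0 keeps the natural distribution, smaller alpha upweights
# low-resource languages).
def smoothed_sampling_probs(sizes, alpha=1.0):
    total = sum(sizes.values())
    unnorm = {lang: (n / total) ** alpha for lang, n in sizes.items()}
    z = sum(unnorm.values())
    return {lang: p / z for lang, p in unnorm.items()}

# smoothed_sampling_probs({"en": 900, "sw": 100}, alpha=0.3)
# -> {'en': ~0.66, 'sw': ~0.34}, vs. {'en': 0.9, 'sw': 0.1} at alpha=1.0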
class TranslationConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None,
        metadata={
            "help": "colon separated path to data directories list, will be iterated upon during epochs "
            "in round-robin manner; however, valid and test data are always in the first directory "
            "to avoid the need for repeating them in all directories"
        },
    )
    source_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "source language",
            "argparse_alias": "-s",
        },
    )
    target_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "target language",
            "argparse_alias": "-t",
        },
    )
    load_alignments: bool = field(
        default=False, metadata={"help": "load the binarized alignments"}
    )
    left_pad_source: bool = field(
        default=True, metadata={"help": "pad the source on the left"}
    )
    left_pad_target: bool = field(
        default=False, metadata={"help": "pad the target on the left"}
    )
    max_source_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the source sequence"}
    )
    max_target_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the target sequence"}
    )
    upsample_primary: int = field(
        default=-1, metadata={"help": "the amount of upsample primary dataset"}
    )
    truncate_source: bool = field(
        default=False, metadata={"help": "truncate source to max-source-positions"}
    )
    num_batch_buckets: int = field(
        default=0,
        metadata={
            "help": "if >0, then bucket source and target lengths into "
            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
        },
    )
    train_subset: str = II("dataset.train_subset")
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")

    # options for reporting BLEU during validation
    eval_bleu: bool = field(
        default=False, metadata={"help": "evaluation with BLEU scores"}
    )
    eval_bleu_args: Optional[str] = field(
        default="{}",
        metadata={
            "help": 'generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_bleu_detok: str = field(
        default="space",
        metadata={
            "help": "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: Optional[str] = field(
        default="{}",
        metadata={"help": "args for building the tokenizer, if needed, as JSON string"},
    )
    eval_tokenized_bleu: bool = field(
        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
    )
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None,
        metadata={
            "help": "remove BPE before computing BLEU",
            "argparse_const": "@@ ",
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False, metadata={"help": "print sample generations during validation"}
    )
    dataset_add_token_type_ids: bool = field(
        default=False, metadata={"help": "if set, append token_type_ids (all zeros)"}
    )
    freeze_encoder: bool = field(
        default=False, metadata={"help": "if set, freeze encoder's weights"}
    )

    # options fields for mask
    sample_break_mode: Optional[ChoiceEnum(
        ["none", "complete", "complete_doc", "eos"]
    )] = field(
        default="complete",
        metadata={
            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.'
        },
    )
    tokens_per_sample: int = field(
        default=512,
        metadata={
            "help": "max number of total tokens over all segments "
            "per sample for BERT dataset"
        },
    )
    mask_prob: float = field(
        default=0.15, metadata={"help": "probability of replacing a token with mask"}
    )
    leave_unmasked_prob: float = field(
        default=0.1, metadata={"help": "probability that a masked token is unmasked"}
    )
    random_token_prob: float = field(
        default=0.1,
        metadata={"help": "probability of replacing a token with a random token"},
    )
    freq_weighted_replacement: bool = field(
        default=False,
        metadata={"help": "sample random replacement words based on word frequencies"},
    )
    mask_whole_words: bool = field(
        default=False,
        metadata={"help": "mask whole words; you may also want to set --bpe"},
    )
    mask_multiple_length: int = field(
        default=1, metadata={"help": "repeat the mask indices multiple times"}
    )
    mask_stdev: float = field(
        default=0.0, metadata={"help": "stdev of the mask length"}
    )
    shorten_dataset: Optional[ChoiceEnum(
        ["none", "truncate", "random_crop"]
    )] = field(
        default="none",
        metadata={
            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help": "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )
    # Custom for bert
    use_bert_dict: str = field(
        default="",
        metadata={"help": "specify a custom dictionary to use"},
    )
    load_bert_path: str = field(
        default="",
        metadata={
            "help": "if set, load BERT weights from this path into the encoder; "
            "also add --add-token-type-embeddings and --type-vocab-size"
        },
    )
def add_preprocess_args(parser):
    group = parser.add_argument_group('Preprocessing')
    # fmt: off
    group.add_argument("-s", "--source-lang", default=None, metavar="SRC",
                       help="source language")
    group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
                       help="target language")
    group.add_argument("--trainpref", metavar="FP", default=None,
                       help="train file prefix")
    group.add_argument("--validpref", metavar="FP", default=None,
                       help="comma separated, valid file prefixes")
    group.add_argument("--testpref", metavar="FP", default=None,
                       help="comma separated, test file prefixes")
    group.add_argument("--align-suffix", metavar="FP", default=None,
                       help="alignment file suffix")
    group.add_argument("--destdir", metavar="DIR", default="data-bin",
                       help="destination dir")
    group.add_argument("--trainoutf", metavar="FILENAME", default="train",
                       help="output filename for the preprocessed training data")
    group.add_argument("--validoutf", metavar="FILENAME", default="valid",
                       help="output filename for the preprocessed validation data")
    group.add_argument("--testoutf", metavar="FILENAME", default="test",
                       help="output filename for the preprocessed test data")
    group.add_argument(
        "--thresholdtgt", metavar="N", default=0, type=int,
        help="map words appearing less than threshold times to unknown")
    group.add_argument(
        "--thresholdsrc", metavar="N", default=0, type=int,
        help="map words appearing less than threshold times to unknown")
    group.add_argument("--tgtdict", metavar="FP",
                       help="reuse given target dictionary")
    group.add_argument("--srcdict", metavar="FP",
                       help="reuse given source dictionary")
    group.add_argument("--nwordstgt", metavar="N", default=-1, type=int,
                       help="number of target words to retain")
    group.add_argument("--nwordssrc", metavar="N", default=-1, type=int,
                       help="number of source words to retain")
    group.add_argument("--alignfile", metavar="ALIGN", default=None,
                       help="an alignment file (optional)")
    parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    parser.add_argument(
        '--file-format', metavar='FORMAT', default=None, choices=['smiles'],
        help="dataset file format; 'smiles' means no spaces between symbols")
    group.add_argument("--joined-dictionary", action="store_true",
                       help="Generate joined dictionary")
    group.add_argument("--only-source", action="store_true",
                       help="Only process the source language")
    group.add_argument("--padding-factor", metavar="N", default=8, type=int,
                       help="Pad dictionary size to be multiple of N")
    group.add_argument("--workers", metavar="N", default=1, type=int,
                       help="number of parallel workers")
    # fmt: on
    return parser
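
# Usage sketch for the SMILES-aware preprocessing flags above (not from the
# original source; paths and values are illustrative).
import argparse

def _demo_smiles_preprocess_cli():
    parser = argparse.ArgumentParser()
    add_preprocess_args(parser)
    args = parser.parse_args([
        "--trainpref", "raw/train", "--trainoutf", "train",
        "--file-format", "smiles", "--joined-dictionary",
    ])
    print(args.file_format, args.trainoutf, args.joined_dictionary)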
class TranslationConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None,
        metadata={
            "help": "colon separated path to data directories list, will be iterated upon during epochs "
            "in round-robin manner; however, valid and test data are always in the first directory "
            "to avoid the need for repeating them in all directories"
        },
    )
    source_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "source language",
            "argparse_alias": "-s",
        },
    )
    target_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "target language",
            "argparse_alias": "-t",
        },
    )
    load_alignments: bool = field(
        default=False, metadata={"help": "load the binarized alignments"}
    )
    left_pad_source: bool = field(
        default=False, metadata={"help": "pad the source on the left"}
    )
    left_pad_target: bool = field(
        default=False, metadata={"help": "pad the target on the left"}
    )
    max_source_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the source sequence"}
    )
    max_target_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the target sequence"}
    )
    upsample_primary: int = field(
        default=-1, metadata={"help": "the amount of upsample primary dataset"}
    )
    truncate_source: bool = field(
        default=False, metadata={"help": "truncate source to max-source-positions"}
    )
    num_batch_buckets: int = field(
        default=0,
        metadata={
            "help": "if >0, then bucket source and target lengths into "
            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
        },
    )
    train_subset: str = II("dataset.train_subset")
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")

    # Additional options
    use_bert_dict: str = field(
        default="",
        metadata={"help": "specify a custom dictionary to use"},
    )
    load_bert_path: str = field(
        default="",
        metadata={
            "help": "if set, load BERT weights from this path into the encoder; "
            "also add --add-token-type-embeddings and --type-vocab-size"
        },
    )
    prepend_bos_to_src: bool = field(
        default=False, metadata={"help": "prepend bos token to src dataset"}
    )
    freeze_encoder: bool = field(
        default=False, metadata={"help": "if set, freeze encoder's weights"}
    )

    # options for reporting BLEU during validation
    eval_bleu: bool = field(
        default=False, metadata={"help": "evaluation with BLEU scores"}
    )
    eval_bleu_args: Optional[str] = field(
        default="{}",
        metadata={
            "help": 'generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_bleu_detok: str = field(
        default="space",
        metadata={
            "help": "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: Optional[str] = field(
        default="{}",
        metadata={"help": "args for building the tokenizer, if needed, as JSON string"},
    )
    eval_tokenized_bleu: bool = field(
        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
    )
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None,
        metadata={
            "help": "remove BPE before computing BLEU",
            "argparse_const": "@@ ",
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False, metadata={"help": "print sample generations during validation"}
    )
    dataset_add_token_type_ids: bool = field(
        default=False, metadata={"help": "if set, append token_type_ids (all zeros)"}
    )
def add_dataset_args(parser, train=False, gen=False):
    """Same as fairseq.options.add_dataset_args but with the "data"
    argument made optional (kept only for backward compatibility)."""
    group = parser.add_argument_group("Dataset and data loading")
    group.add_argument(
        "data",
        metavar="DIR",
        nargs="?",
        help="path to data directory. "
        "This is not needed but kept for backward compatibility",
    )
    group.add_argument(
        "--num-workers",
        default=0,
        type=int,
        metavar="N",
        help="how many subprocesses to use for data loading",
    )
    group.add_argument(
        "--skip-invalid-size-inputs-valid-test",
        action="store_true",
        help="ignore too long or too short lines in valid and test set",
    )
    group.add_argument(
        "--max-tokens",
        default=5000,
        type=int,
        metavar="N",
        help="maximum number of tokens in a batch",
    )
    group.add_argument(
        "--max-sentences",
        "--batch-size",
        type=int,
        metavar="N",
        help="maximum number of sentences in a batch",
    )
    group.add_argument(
        "--dataset-impl",
        metavar="FORMAT",
        choices=get_available_dataset_impl(),
        help="output dataset implementation",
    )
    if train:
        group.add_argument(
            "--train-subset",
            default="train",
            metavar="SPLIT",
            choices=["train", "valid", "test"],
            help="data subset to use for training (train, valid, test)",
        )
        group.add_argument(
            "--valid-subset",
            default="valid",
            metavar="SPLIT",
            help="comma separated list of data subsets to use"
            " for validation (train, valid, valid1, test, test1)",
        )
        group.add_argument(
            "--max-sentences-valid",
            type=int,
            metavar="N",
            help="maximum number of sentences in a validation batch"
            " (defaults to --max-sentences)",
        )
    if gen:
        group.add_argument(
            "--gen-subset",
            default="test",
            metavar="SPLIT",
            help="data subset to generate (train, valid, test)",
        )
        group.add_argument(
            "--num-shards",
            default=1,
            type=int,
            metavar="N",
            help="shard generation over N shards",
        )
        group.add_argument(
            "--shard-id",
            default=0,
            type=int,
            metavar="ID",
            help="id of the shard to generate (id < num_shards)",
        )
    return group
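
# Usage sketch (not from the original source): because "data" is declared
# with nargs="?", both invocations below parse; the second fills the
# backward-compatibility positional slot.
import argparse

def _demo_optional_data_cli():
    parser = argparse.ArgumentParser()
    add_dataset_args(parser, gen=True)
    print(parser.parse_args(["--gen-subset", "valid"]).data)          # None
    print(parser.parse_args(["data-bin", "--num-shards", "2"]).data)  # data-bin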