Example no. 1
0
def add_preprocess_args(parser):
    group = parser.add_argument_group("Preprocessing")
    # fmt: off
    group.add_argument("-s", "--source-lang", default=None, metavar="SRC",
                       help="source language")
    group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
                       help="target language")
    group.add_argument("--trainpref", metavar="FP", default=None,
                       help="train file prefix (also used to build dictionaries)")
    group.add_argument("--validpref", metavar="FP", default=None,
                       help="comma separated, valid file prefixes "
                            "(words missing from train set are replaced with <unk>)")
    group.add_argument("--testpref", metavar="FP", default=None,
                       help="comma separated, test file prefixes "
                            "(words missing from train set are replaced with <unk>)")
    group.add_argument("--align-suffix", metavar="FP", default=None,
                       help="alignment file suffix")
    group.add_argument("--destdir", metavar="DIR", default="data-bin",
                       help="destination dir")
    group.add_argument("--thresholdtgt", metavar="N", default=0, type=int,
                       help="map words appearing less than threshold times to unknown")
    group.add_argument("--thresholdsrc", metavar="N", default=0, type=int,
                       help="map words appearing less than threshold times to unknown")
    group.add_argument("--tgtdict", metavar="FP",
                       help="reuse given target dictionary")
    group.add_argument("--srcdict", metavar="FP",
                       help="reuse given source dictionary")
    group.add_argument("--nwordstgt", metavar="N", default=-1, type=int,
                       help="number of target words to retain")
    group.add_argument("--nwordssrc", metavar="N", default=-1, type=int,
                       help="number of source words to retain")
    group.add_argument("--alignfile", metavar="ALIGN", default=None,
                       help="an alignment file (optional)")
    parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    group.add_argument("--joined-dictionary", action="store_true",
                       help="Generate joined dictionary")
    group.add_argument("--only-source", action="store_true",
                       help="Only process the source language")
    group.add_argument("--padding-factor", metavar="N", default=8, type=int,
                       help="Pad dictionary size to be multiple of N")
    group.add_argument("--workers", metavar="N", default=1, type=int,
                       help="number of parallel workers")
    #--mult-teachers --avoid-tokenize-extras --input-mapping
    # --bert-model-name /mnt/yardcephfs/mmyard/g_wxg_td_prc/mt/v_xyvhuang/data/bert-base-cased-new
    # --bart-model-name /mnt/yardcephfs/mmyard/g_wxg_td_prc/mt/v_xyvhuang/data/bart-base
    #parser.add_argument('--avoid-tokenize-extras', action='store_true', help='...')
    group.add_argument("--mult-teachers", action="store_true",
                       help="...")
    group.add_argument("--avoid-tokenize-extras", action="store_true",
                       help="...")
    group.add_argument("--input-mapping", action="store_true",
                       help="...")
    group.add_argument('--bert-model-name', default='', type=str)
    group.add_argument('--bart-model-name', default='', type=str)
    group.add_argument('--electra-model-name', default='', type=str)
    # fmt: on
    return parser
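A minimal usage sketch, not part of the original example: assuming this add_preprocess_args and fairseq's get_available_dataset_impl are importable, it builds a parser and parses a hypothetical command line.

import argparse

parser = argparse.ArgumentParser(description="preprocess")
add_preprocess_args(parser)
args = parser.parse_args([
    "--source-lang", "de", "--target-lang", "en",
    "--trainpref", "train.bpe", "--destdir", "data-bin/demo",
    "--workers", "4",
])
# --dataset-impl falls back to its "mmap" default; --workers is parsed as int
print(args.source_lang, args.destdir, args.dataset_impl, args.workers)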
Example no. 2
0
def add_preprocess_args(parser):
    group = parser.add_argument_group("Preprocessing")
    # fmt: off
    group.add_argument("-s", "--source-lang", default=None, metavar="SRC",
                       help="source language")
    group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
                       help="target language")
    group.add_argument("--trainpref", metavar="FP", default=None,
                       help="train file prefix")
    group.add_argument("--validpref", metavar="FP", default=None,
                       help="comma separated, valid file prefixes")
    group.add_argument("--testpref", metavar="FP", default=None,
                       help="comma separated, test file prefixes")
    group.add_argument("--align-suffix", metavar="FP", default=None,
                       help="alignment file suffix")
    group.add_argument("--destdir", metavar="DIR", default="data-bin",
                       help="destination dir")
    group.add_argument("--thresholdtgt", metavar="N", default=0, type=int,
                       help="map words appearing less than threshold times to unknown")
    group.add_argument("--thresholdsrc", metavar="N", default=0, type=int,
                       help="map words appearing less than threshold times to unknown")
    group.add_argument("--tgtdict", metavar="FP",
                       help="reuse given target dictionary")
    # group.add_argument("--bos", default="<s>", type=str,
    #                    help="Specify bos token from the dictionary.")
    # group.add_argument("--pad", default="<pad>", type=str,
    #                    help="Specify pad token from the dictionary.")
    # group.add_argument("--eos", default="</s>", type=str,
    #                    help="Specify eos token from the dictionary.")
    # group.add_argument("--unk", default="<unk>", type=str,
    #                    help="Specify unk token from the dictionary.")
    # group.add_argument("--tgtdict_add_sentence_limit_words_after", action="store_true",
    #                    help="Add sentence limit words (i.e. bos, eos, pad, unk) after loading tgtdict.")
    group.add_argument("--srcdict", metavar="FP",
                       help="reuse given source dictionary")
    group.add_argument("--nwordstgt", metavar="N", default=-1, type=int,
                       help="number of target words to retain")
    group.add_argument("--nwordssrc", metavar="N", default=-1, type=int,
                       help="number of source words to retain")
    group.add_argument("--alignfile", metavar="ALIGN", default=None,
                       help="an alignment file (optional)")
    parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    group.add_argument("--joined-dictionary", action="store_true",
                       help="Generate joined dictionary")
    group.add_argument("--only-source", action="store_true",
                       help="Only process the source language")
    group.add_argument("--padding-factor", metavar="N", default=8, type=int,
                       help="Pad dictionary size to be multiple of N")
    group.add_argument("--workers", metavar="N", default=1, type=int,
                       help="number of parallel workers")
    # group.add_argument("--use-bert-in-target", action="store_true",
    #                    help="Whether to use BERT target model or not.")
    # group.add_argument("--target-bert-model", metavar="N", default='bert-base-uncased', type=str,
    #                    help="Pre-trained BERT model used in the BERT loss"
    #                         "and needed to preprocess target language sentences.")
    # fmt: on
    return parser
Example no. 3
0
class LanguageModelingConfig(FairseqDataclass):
    data: Optional[str] = field(default=None,
                                metadata={"help": "path to data directory"})
    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
        default="none",
        metadata={
            "help":
            'If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.'
        },
    )
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sample for LM dataset"},
    )
    output_dictionary_size: int = field(
        default=-1, metadata={"help": "limit the size of output dictionary"})
    self_target: bool = field(default=False,
                              metadata={"help": "include self target"})
    future_target: bool = field(default=False,
                                metadata={"help": "include future target"})
    past_target: bool = field(default=False,
                              metadata={"help": "include past target"})
    add_bos_token: bool = field(
        default=False,
        metadata={"help": "prepend beginning of sentence token (<s>)"})
    max_target_positions: Optional[int] = field(
        default=None,
        metadata={"help": "max number of tokens in the target sequence"})
    shorten_method: SHORTEN_METHOD_CHOICES = field(
        default="none",
        metadata={
            "help":
            "if not none, shorten sequences that exceed --tokens-per-sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help":
            "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )
    # TODO common vars below add to parent
    seed: int = II("common.seed")
    dataset_impl: Optional[ChoiceEnum(
        get_available_dataset_impl())] = II("dataset.dataset_impl")
    data_buffer_size: int = II("dataset.data_buffer_size")
    tpu: bool = II("common.tpu")
    use_plasma_view: bool = II("common.use_plasma_view")
    plasma_path: str = II("common.plasma_path")
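An illustrative sketch, not from the source: FairseqDataclass-style configs rely on dataclasses.field metadata for their help text, and the II("...") values seen above are OmegaConf interpolations resolved against a parent config. The tiny stand-in class below is hypothetical and only mirrors the pattern.

from dataclasses import dataclass, field, fields
from typing import Optional

@dataclass
class TinyLMConfig:  # hypothetical stand-in, not the real LanguageModelingConfig
    data: Optional[str] = field(default=None, metadata={"help": "path to data directory"})
    tokens_per_sample: int = field(default=1024, metadata={"help": "max number of tokens per sample"})
    add_bos_token: bool = field(default=False, metadata={"help": "prepend <s>"})

# Tooling can recover CLI-style flags and help text from the field metadata.
for f in fields(TinyLMConfig):
    flag = "--" + f.name.replace("_", "-")
    print(f"{flag}: {f.metadata['help']} (default: {f.default})")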
Example no. 4
0
def add_dataset_args(parser, train=False, gen=False):
    group = parser.add_argument_group("Dataset and data loading")
    # fmt: off
    group.add_argument('--num-workers', default=1, type=int, metavar='N',
                       help='how many subprocesses to use for data loading')
    group.add_argument('--skip-invalid-size-inputs-valid-test', action='store_true',
                       help='ignore too long or too short lines in valid and test set')
    group.add_argument('--max-tokens', type=int, metavar='N',
                       help='maximum number of tokens in a batch')
    group.add_argument('--max-sentences', '--batch-size', type=int, metavar='N',
                       help='maximum number of sentences in a batch')
    group.add_argument('--required-batch-size-multiple', default=8, type=int, metavar='N',
                       help='batch size will either be less than this value, '
                            'or a multiple of this value')
    group.add_argument('--required-seq-len-multiple', default=1, type=int, metavar='N',
                       help='maximum sequence length in batch will be a multiple of this value')
    parser.add_argument('--dataset-impl', metavar='FORMAT',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    group.add_argument('--data-buffer-size', default=10, type=int, metavar='N',
                        help='number of batches to preload')
    if train:
        group.add_argument('--train-subset', default='train', metavar='SPLIT',
                           help='data subset to use for training (e.g. train, valid, test)')
        group.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                           help='comma separated list of data subsets to use for validation'
                                ' (e.g. train, valid, test)')
        group.add_argument('--validate-interval', type=int, default=1, metavar='N',
                           help='validate every N epochs')
        group.add_argument('--validate-interval-updates', type=int, default=0, metavar='N',
                           help='validate every N updates')
        group.add_argument('--validate-after-updates', type=int, default=0, metavar='N',
                           help='don\'t validate until reaching this many updates')
        group.add_argument('--fixed-validation-seed', default=None, type=int, metavar='N',
                           help='specified random seed for validation')
        group.add_argument('--disable-validation', action='store_true',
                           help='disable validation')
        group.add_argument('--max-tokens-valid', type=int, metavar='N',
                           help='maximum number of tokens in a validation batch'
                                ' (defaults to --max-tokens)')
        group.add_argument('--max-sentences-valid', type=int, metavar='N',
                           help='maximum number of sentences in a validation batch'
                                ' (defaults to --max-sentences)')
        group.add_argument('--curriculum', default=0, type=int, metavar='N',
                           help='don\'t shuffle batches for first N epochs')
    if gen:
        group.add_argument('--gen-subset', default='test', metavar='SPLIT',
                           help='data subset to generate (train, valid, test)')
        group.add_argument('--num-shards', default=1, type=int, metavar='N',
                           help='shard generation over N shards')
        group.add_argument('--shard-id', default=0, type=int, metavar='ID',
                           help='id of the shard to generate (id < num_shards)')
    # fmt: on
    return group
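A usage sketch, not part of the original example, assuming this add_dataset_args and get_available_dataset_impl are importable: the train flag controls whether the validation-related options exist, and the --batch-size alias is stored under the max_sentences destination.

import argparse

parser = argparse.ArgumentParser()
add_dataset_args(parser, train=True)
args = parser.parse_args(["--max-tokens", "4096", "--batch-size", "64",
                          "--valid-subset", "valid,valid1"])
# --batch-size is an alias of --max-sentences, so the value lands on args.max_sentences
print(args.max_tokens, args.max_sentences, args.valid_subset)  # 4096 64 valid,valid1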
Example no. 5
0
def get_parser():
    parser = argparse.ArgumentParser(
        description="writes text from binarized file to stdout"
    )
    # fmt: off
    parser.add_argument('--dataset-impl', help='dataset implementation',
                        choices=indexed_dataset.get_available_dataset_impl())
    parser.add_argument('--dict', metavar='FP', help='dictionary containing known words', default=None)
    parser.add_argument('--input', metavar='FP', required=True, help='binarized file to read')
    # fmt: on

    return parser
Example no. 6
0
def add_preprocess_args(parser):
    group = parser.add_argument_group("Preprocessing")
    # fmt: off
    group.add_argument("-s", "--source-lang", default=None, metavar="SRC",
                       help="source language")
    group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
                       help="target language")
    group.add_argument("--trainpref", metavar="FP", default=None,
                       help="train file prefix")
    group.add_argument("--validpref", metavar="FP", default=None,
                       help="comma separated, valid file prefixes")
    group.add_argument("--testpref", metavar="FP", default=None,
                       help="comma separated, test file prefixes")
    group.add_argument("--align-suffix", metavar="FP", default=None,
                       help="alignment file suffix")
    group.add_argument("--destdir", metavar="DIR", default="data-bin",
                       help="destination dir")
    group.add_argument("--thresholdtgt", metavar="N", default=0, type=int,
                       help="map words appearing less than threshold times to unknown")
    group.add_argument("--thresholdsrc", metavar="N", default=0, type=int,
                       help="map words appearing less than threshold times to unknown")
    group.add_argument("--tgtdict", metavar="FP",
                       help="reuse given target dictionary")
    group.add_argument("--srcdict", metavar="FP",
                       help="reuse given source dictionary")
    group.add_argument("--nwordstgt", metavar="N", default=-1, type=int,
                       help="number of target words to retain")
    group.add_argument("--nwordssrc", metavar="N", default=-1, type=int,
                       help="number of source words to retain")
    group.add_argument("--alignfile", metavar="ALIGN", default=None,
                       help="an alignment file (optional)")
    parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    group.add_argument("--joined-dictionary", action="store_true",
                       help="Generate joined dictionary")
    group.add_argument("--only-source", action="store_true",
                       help="Only process the source language")
    group.add_argument("--padding-factor", metavar="N", default=8, type=int,
                       help="Pad dictionary size to be multiple of N")
    group.add_argument("--workers", metavar="N", default=1, type=int,
                       help="number of parallel workers")

    #TODO: Change here
    #group.add_argument("--multi-views",  action='store_true',
    #                   help="Load Multi-views")
    # fmt: on
    return parser
Example no. 7
0
def add_dataset_args(parser, train=False, gen=False):
    group = parser.add_argument_group('Dataset and data loading')
    # fmt: off
    group.add_argument('--num-workers', default=1, type=int, metavar='N',
                       help='how many subprocesses to use for data loading')
    group.add_argument('--skip-invalid-size-inputs-valid-test', action='store_true',
                       help='ignore too long or too short lines in valid and test set')
    group.add_argument('--max-tokens', type=int, metavar='N',
                       help='maximum number of tokens in a batch')
    group.add_argument('--max-sentences', '--batch-size', type=int, metavar='N',
                       help='maximum number of sentences in a batch')
    group.add_argument('--required-batch-size-multiple', default=8, type=int, metavar='N',
                       help='batch size will be a multiple of this value')
    parser.add_argument('--dataset-impl', metavar='FORMAT',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    if train:
        group.add_argument('--train-subset', default='train', metavar='SPLIT',
                           choices=['train', 'valid', 'test'],
                           help='data subset to use for training (train, valid, test)')
        group.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                           help='comma separated list of data subsets to use for validation'
                                ' (train, valid, valid1, test, test1)')
        group.add_argument('--validate-interval', type=int, default=1, metavar='N',
                           help='validate every N epochs')
        group.add_argument('--disable-validation', action='store_true',
                           help='disable validation')
        group.add_argument('--max-tokens-valid', type=int, metavar='N',
                           help='maximum number of tokens in a validation batch'
                                ' (defaults to --max-tokens)')
        group.add_argument('--max-sentences-valid', type=int, metavar='N',
                           help='maximum number of sentences in a validation batch'
                                ' (defaults to --max-sentences)')
        group.add_argument('--curriculum', default=0, type=int, metavar='N',
                           help='don\'t shuffle batches for first N epochs')
    if gen:
        group.add_argument('--gen-subset', default='test', metavar='SPLIT',
                           help='data subset to generate (train, valid, test)')
        group.add_argument('--num-shards', default=1, type=int, metavar='N',
                           help='shard generation over N shards')
        group.add_argument('--shard-id', default=0, type=int, metavar='ID',
                           help='id of the shard to generate (id < num_shards)')
    # fmt: on
    return group
Example no. 8
0
class DatasetParams(FairseqDataclass):
    num_workers: int = field(
        default=1, metadata={"help": "how many subprocesses to use for data loading"}
    )
    skip_invalid_size_inputs_valid_test: bool = field(
        default=False,
        metadata={"help": "ignore too long or too short lines in valid and test set"},
    )
    max_tokens: Optional[int] = field(
        default=None, metadata={"help": "maximum number of tokens in a batch"}
    )
    batch_size: Optional[int] = field(
        default=None, metadata={"help": "number of examples in a batch"}
    )
    required_batch_size_multiple: int = field(
        default=8, metadata={"help": "batch size will be a multiple of this value"}
    )
    required_seq_len_multiple: int = field(
        default=1, metadata={"help": "maximum sequence length in batch will be a multiple of this value"}
    )
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = field(
        default=None, metadata={"help": "output dataset implementation"}
    )
    data_buffer_size: int = field(
        default=10, metadata={"help": "Number of batches to preload"}
    )
    train_subset: str = field(
        default="train",
        metadata={"help": "data subset to use for training (e.g. train, valid, test)"},
    )
    valid_subset: str = field(
        default="valid",
        metadata={
            "help": "comma separated list of data subsets to use for validation"
            " (e.g. train, valid, test)"
        },
    )
    validate_interval: int = field(
        default=1, metadata={"help": "validate every N epochs"}
    )
    validate_interval_updates: int = field(
        default=0, metadata={"help": "validate every N updates"}
    )
    validate_after_updates: int = field(
        default=0, metadata={"help": "don't validate until reaching this many updates"}
    )
    fixed_validation_seed: Optional[int] = field(
        default=None, metadata={"help": "specified random seed for validation"}
    )
    disable_validation: bool = field(
        default=False, metadata={"help": "disable validation"}
    )
    max_tokens_valid: Optional[int] = field(
        default=None,
        metadata={
            "help": "maximum number of tokens in a validation batch"
            " (defaults to --max-tokens)"
        },
    )
    batch_size_valid: Optional[int] = field(
        default=None,
        metadata={
            "help": "batch size of the validation batch"
            " (defaults to --batch-size)"
        },
    )
    curriculum: int = field(
        default=0, metadata={"help": "don't shuffle batches for first N epochs"}
    )
    gen_subset: str = field(
        default="test",
        metadata={"help": "data subset to generate (train, valid, test)"},
    )
    num_shards: int = field(
        default=1, metadata={"help": "shard generation over N shards"}
    )
    shard_id: int = field(
        default=0, metadata={"help": "id of the shard to generate (id < num_shards)"}
    )
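A minimal sketch, assuming omegaconf is installed; the trimmed stand-in below is hypothetical. Dataclass configs like DatasetParams are typically consumed as OmegaConf structured configs, so overrides can be merged in from dotlist-style strings.

from dataclasses import dataclass, field
from typing import Optional
from omegaconf import OmegaConf

@dataclass
class TinyDatasetParams:  # hypothetical subset of DatasetParams
    num_workers: int = field(default=1, metadata={"help": "subprocesses for data loading"})
    max_tokens: Optional[int] = field(default=None, metadata={"help": "max tokens in a batch"})
    train_subset: str = field(default="train", metadata={"help": "training split"})

cfg = OmegaConf.structured(TinyDatasetParams)
cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["num_workers=4", "max_tokens=4096"]))
print(cfg.num_workers, cfg.max_tokens, cfg.train_subset)  # 4 4096 train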
Example no. 9
0
def add_preprocess_args(parser):
    group = parser.add_argument_group('Preprocessing')
    # fmt: off
    group.add_argument("-s",
                       "--source-lang",
                       default=None,
                       metavar="SRC",
                       help="source language")
    group.add_argument("-t",
                       "--target-lang",
                       default=None,
                       metavar="TARGET",
                       help="target language")
    group.add_argument("--trainpref",
                       metavar="FP",
                       default=None,
                       help="train file prefix")
    group.add_argument("--validpref",
                       metavar="FP",
                       default=None,
                       help="comma separated, valid file prefixes")
    group.add_argument("--testpref",
                       metavar="FP",
                       default=None,
                       help="comma separated, test file prefixes")
    group.add_argument(
        "--destdir",
        metavar="DIR",
        default="data-bin",
        help="destination dir for target actions and states information")
    group.add_argument(
        "--embdir",
        metavar="DIR",
        default="data-bin",
        help="destination dir for pre-trained source embeddings")
    group.add_argument(
        "--thresholdtgt",
        metavar="N",
        default=0,
        type=int,
        help="map words appearing less than threshold times to unknown")
    group.add_argument(
        "--thresholdsrc",
        metavar="N",
        default=0,
        type=int,
        help="map words appearing less than threshold times to unknown")
    group.add_argument("--tgtdict",
                       metavar="FP",
                       help="reuse given target dictionary")
    group.add_argument("--srcdict",
                       metavar="FP",
                       help="reuse given source dictionary")
    group.add_argument("--nwordstgt",
                       metavar="N",
                       default=-1,
                       type=int,
                       help="number of target words to retain")
    group.add_argument("--nwordssrc",
                       metavar="N",
                       default=-1,
                       type=int,
                       help="number of source words to retain")
    group.add_argument("--alignfile",
                       metavar="ALIGN",
                       default=None,
                       help="an alignment file (optional)")
    parser.add_argument('--dataset-impl',
                        metavar='FORMAT',
                        default='mmap',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    group.add_argument("--joined-dictionary",
                       action="store_true",
                       help="Generate joined dictionary")
    group.add_argument("--only-source",
                       action="store_true",
                       help="Only process the source language")
    group.add_argument("--padding-factor",
                       metavar="N",
                       default=8,
                       type=int,
                       help="Pad dictionary size to be multiple of N")
    group.add_argument("--workers",
                       metavar="N",
                       default=1,
                       type=int,
                       help="number of parallel workers")

    # for extending BART vocabulary with actions specific symbols
    parser.add_argument(
        '--node-freq-min',
        default=5,
        type=int,
        help='minimum frequency of node names to add to vocabulary')

    # for pretrained external embeddings
    group.add_argument("--pretrained-embed",
                       default='roberta.base',
                       help="Type of pretrained embedding")
    # NOTE: Previous default "17 18 19 20 21 22 23 24"
    group.add_argument('--bert-layers',
                       nargs='+',
                       type=int,
                       help='RoBERTa layers to extract (default last)')

    # for stack-transformer
    add_state_machine_args(group)

    # fmt: on
    return parser
Example no. 10
0
    dataset_builder.finalize(os.path.join(output_dir, split + '.text.idx'))
    relations_builder.finalize(
        os.path.join(output_dir, split + '.relations.idx'))
    annotations_list = np.concatenate(annotations_list)
    np.save(os.path.join(output_dir, split + '.annotations'), annotations_list)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Data preparation for SemEval 2010 Task 8 dataset')
    parser.add_argument('--split',
                        type=str,
                        help='Dataset split',
                        choices=['train', 'dev', 'test'])
    parser.add_argument('--root-dir',
                        type=str,
                        default='../data/SemEval2010_task8_all_data',
                        help='SemEval 2010 Task 8 root directory')
    parser.add_argument('--roberta-dir',
                        type=str,
                        default='../data/roberta',
                        help='RoBERTa directory with all dictionaries.')
    parser.add_argument('--append-eos', default=False, action='store_true')
    parser.add_argument('--dataset-impl',
                        metavar='FORMAT',
                        default='mmap',
                        choices=indexed_dataset.get_available_dataset_impl(),
                        help='output dataset implementation')
    args = parser.parse_args()
    main(args)
Example no. 11
0
class TranslationConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "colon separated path to data directories list, will be iterated upon during epochs "
            "in round-robin manner; however, valid and test data are always in the first directory "
            "to avoid the need for repeating them in all directories"
        },
    )
    source_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "source language",
            "argparse_alias": "-s",
        },
    )
    target_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "target language",
            "argparse_alias": "-t",
        },
    )
    load_alignments: bool = field(
        default=False, metadata={"help": "load the binarized alignments"})
    left_pad_source: bool = field(
        default=True, metadata={"help": "pad the source on the left"})
    left_pad_target: bool = field(
        default=False, metadata={"help": "pad the target on the left"})
    max_source_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the source sequence"})
    max_target_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the target sequence"})
    upsample_primary: int = field(
        default=-1,
        metadata={"help": "the amount of upsample primary dataset"})
    truncate_source: bool = field(
        default=False,
        metadata={"help": "truncate source to max-source-positions"})
    num_batch_buckets: int = field(
        default=0,
        metadata={
            "help":
            "if >0, then bucket source and target lengths into "
            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
        },
    )
    train_subset: str = II("dataset.train_subset")
    dataset_impl: Optional[ChoiceEnum(
        get_available_dataset_impl())] = II("dataset.dataset_impl")
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")

    # options for reporting BLEU during validation
    eval_bleu: bool = field(default=False,
                            metadata={"help": "evaluation with BLEU scores"})
    eval_bleu_args: Optional[str] = field(
        default="{}",
        metadata={
            "help":
            'generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_bleu_detok: str = field(
        default="space",
        metadata={
            "help":
            "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: Optional[str] = field(
        default="{}",
        metadata={
            "help":
            "args for building the tokenizer, if needed, as JSON string"
        },
    )
    eval_tokenized_bleu: bool = field(
        default=False,
        metadata={"help": "compute tokenized BLEU instead of sacrebleu"})
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None,
        metadata={
            "help": "remove BPE before computing BLEU",
            "argparse_const": "@@ ",
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False,
        metadata={"help": "print sample generations during validation"})
    extra_data: bool = field(default=False, metadata={"help": "..."})
    denoising: bool = field(default=False, metadata={"help": "..."})
    masking: bool = field(default=False, metadata={"help": "..."})
    electra_pretrain: bool = field(default=False, metadata={"help": "..."})
    input_mapping: bool = field(default=False, metadata={"help": "..."})
    electra_model_name: str = field(default='electra-base',
                                    metadata={"help": "..."})
    bart_model_name: str = field(default='bart-base', metadata={"help": "..."})
    bert_model_name: str = field(default='bert-base-cased',
                                 metadata={"help": "..."})
    finetune_bert: bool = field(default=False, metadata={"help": "..."})
    use_bertinput: bool = field(default=False, metadata={"help": "..."})
    use_bartinput: bool = field(default=False, metadata={"help": "..."})
    use_electrainput: bool = field(default=False, metadata={"help": "..."})
    # mask_ratio = None, random_ratio = None, insert_ratio = None, rotate_ratio = None, permute_sentence_ratio = None
    mask_ratio: float = field(default=0.3, metadata={"help": "..."})
    random_ratio: float = field(default=0.1, metadata={"help": "..."})
    insert_ratio: float = field(default=0.0, metadata={"help": "..."})
    rotate_ratio: float = field(default=0.5, metadata={"help": "..."})
    permute_sentence_ratio: float = field(default=1.0,
                                          metadata={"help": "..."})
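A sketch of how the JSON-valued fields above are typically consumed; the variable names are illustrative, not taken from the source. eval_bleu_args and eval_bleu_detok_args arrive as JSON strings and are decoded before building the generation and detokenization settings.

import json

eval_bleu_args = '{"beam": 4, "lenpen": 0.6}'  # example value from the help text
eval_bleu_detok_args = "{}"                    # the field's default

gen_args = json.loads(eval_bleu_args)
detok_args = json.loads(eval_bleu_detok_args)
print(gen_args["beam"], gen_args["lenpen"], detok_args)  # 4 0.6 {}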
Example no. 12
0
class TranslationlfConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "colon separated path to data directories list, will be iterated upon during epochs "
            "in round-robin manner; however, valid and test data are always in the first directory "
            "to avoid the need for repeating them in all directories"
        },
    )

    source_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "source language",
            "argparse_alias": "-s",
        },
    )
    target_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "target language",
            "argparse_alias": "-t",
        },
    )
    load_alignments: bool = field(
        default=False, metadata={"help": "load the binarized alignments"})
    left_pad_source: bool = field(
        default=True, metadata={"help": "pad the source on the left"})
    left_pad_target: bool = field(
        default=False, metadata={"help": "pad the target on the left"})
    max_source_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the source sequence"})
    max_target_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the target sequence"})
    upsample_primary: int = field(
        default=-1,
        metadata={"help": "the amount of upsample primary dataset"})
    truncate_source: bool = field(
        default=False,
        metadata={"help": "truncate source to max-source-positions"})
    num_batch_buckets: int = field(
        default=0,
        metadata={
            "help":
            "if >0, then bucket source and target lengths into "
            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
        },
    )
    train_subset: str = II("dataset.train_subset")
    #valid_subset: str = II("dataset.valid_subset")
    #test_subset: str = II("dataset.test_subset")
    dataset_impl: Optional[ChoiceEnum(
        get_available_dataset_impl())] = II("dataset.dataset_impl")
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")

    # options for reporting BLEU during validation
    eval_bleu: bool = field(default=False,
                            metadata={"help": "evaluation with BLEU scores"})
    eval_bleu_args: Optional[str] = field(
        default="{}",
        metadata={
            "help":
            'generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_bleu_detok: str = field(
        default="space",
        metadata={
            "help":
            "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: Optional[str] = field(
        default="{}",
        metadata={
            "help":
            "args for building the tokenizer, if needed, as JSON string"
        },
    )
    eval_tokenized_bleu: bool = field(
        default=False,
        metadata={"help": "compute tokenized BLEU instead of sacrebleu"})
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None,
        metadata={
            "help": "remove BPE before computing BLEU",
            "argparse_const": "@@ ",
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False,
        metadata={"help": "print sample generations during validation"})
    # todo check!
    lf_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "Longformer representations path",
            "argparse_alias": "-lf-path",
        },
    )

    encoding_path: Optional[str] = field(
        default="encodings.json",
        metadata={
            "help": "encodings output path",
            "argparse_alias": "-enc-path",
        },
    )

    # todo (next) three paths for h5py files
    sen_doc: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "sentence-document alignment path indicating which sentence belongs to which document",
            "argparse_alias": "-sen-doc",
        },
    )
Example no. 13
0
def add_dataset_args(parser, train=False, gen=False):
    group = parser.add_argument_group('Dataset and data loading')
    # fmt: off
    group.add_argument('--num-workers', default=1, type=int, metavar='N',
                       help='how many subprocesses to use for data loading')
    group.add_argument('--skip-invalid-size-inputs-valid-test', action='store_true',
                       help='ignore too long or too short lines in valid and test set')
    group.add_argument('--max-tokens', type=int, metavar='N',
                       help='maximum number of tokens in a batch')
    group.add_argument('--max-sentences', '--batch-size', type=int, metavar='N',
                       help='maximum number of sentences in a batch')
    group.add_argument('--required-batch-size-multiple', default=8, type=int, metavar='N',
                       help='batch size will be a multiple of this value')
    parser.add_argument('--dataset-impl', metavar='FORMAT',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    parser.add_argument('--set-add-align-head', action='store_true',
                            help='if True, add a head in each decoder layer for alignment')
    parser.add_argument('--set-shift', action='store_true',
                            help='if True, train and test shifted attention.')
    parser.add_argument('--alignment-task', default='vanilla', type=str, choices=['vanilla', 'usehead', 'addhead', 'supalign', 'ptrnet','dual'],
                            help='train and test shifted attention.') 
    parser.add_argument('--set-src-bow-loss', action='store_true',
                            help='start to train with src bag-of-words loss')
    group.add_argument('--beam', default=5, type=int, metavar='N',
                       help='beam size')
    # if 'alignment_layer' in parser:
    parser.add_argument('--alignment-layer', default=2, type=int,
                            help='train and test shifted attention.')
    parser.add_argument('--alignment-heads', type=int, metavar='D',
                            help='Number of cross attention heads per layer to supervised with alignments')
    parser.add_argument('--cons-type', type=str, metavar='D',
                            help='Number of cross attention heads per layer to supervised with alignments')
    parser.add_argument('--set-dual-trans', action='store_true',
                            help='train attention agreement model')
    if train:
        group.add_argument('--train-subset', default='train', metavar='SPLIT',
                           choices=['train', 'valid', 'test'],
                           help='data subset to use for training (train, valid, test)')
        group.add_argument('--valid-subset', default='valid', metavar='SPLIT',
                           help='comma separated list of data subsets to use for validation'
                                ' (train, valid, valid1, test, test1)')
        group.add_argument('--validate-interval', type=int, default=1, metavar='N',
                           help='validate every N epochs')
        group.add_argument('--fixed-validation-seed', default=None, type=int, metavar='N',
                           help='specified random seed for validation')
        group.add_argument('--disable-validation', action='store_true',
                           help='disable validation')
        group.add_argument('--max-tokens-valid', type=int, metavar='N',
                           help='maximum number of tokens in a validation batch'
                                ' (defaults to --max-tokens)')
        group.add_argument('--max-sentences-valid', type=int, metavar='N',
                           help='maximum number of sentences in a validation batch'
                                ' (defaults to --max-sentences)')
        group.add_argument('--curriculum', default=0, type=int, metavar='N',
                           help='don\'t shuffle batches for first N epochs')
        group.add_argument('--model-overrides', default="{}", type=str, metavar='DICT',
                       help='a dictionary used to override model args at generation '
                            'that were used during model training')
    if gen:
        group.add_argument('--gen-subset', default='test', metavar='SPLIT',
                           help='data subset to generate (train, valid, test)')
        group.add_argument('--num-shards', default=1, type=int, metavar='N',
                           help='shard generation over N shards')
        group.add_argument('--shard-id', default=0, type=int, metavar='ID',
                           help='id of the shard to generate (id < num_shards)')
        group.add_argument('--print-vanilla-alignment', action="store_true",
                           help='use shifted attention to extract alignment')
        
        
    # fmt: on
    return group
Example no. 14
0
class MultilingualLanguageModelingConfig(FairseqDataclass):
    # TODO common var add to parent
    data: Optional[str] = field(default=None,
                                metadata={"help": "path to data directory"})
    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
        default="none",
        metadata={
            "help":
            'If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.'
        },
    )
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sample for LM dataset"},
    )
    output_dictionary_size: int = field(
        default=-1, metadata={"help": "limit the size of output dictionary"})
    self_target: bool = field(default=False,
                              metadata={"help": "include self target"})
    future_target: bool = field(default=False,
                                metadata={"help": "include future target"})
    past_target: bool = field(default=False,
                              metadata={"help": "include past target"})
    add_bos_token: bool = field(
        default=False, metadata={"help": "prepend lang id token <dialect>"})
    max_source_positions: Optional[int] = field(
        default=None,
        metadata={"help": "max number of tokens in the source sequence"})
    max_target_positions: Optional[int] = field(
        default=None,
        metadata={"help": "max number of tokens in the target sequence"})
    pad_to_fixed_length: Optional[bool] = field(
        default=False, metadata={"help": "pad to fixed length"})
    pad_to_fixed_bsz: Optional[bool] = field(
        default=False, metadata={"help": "boolean to pad to fixed batch size"})

    multilang_sampling_alpha: Optional[float] = field(
        default=1.0,
        metadata={
            "help":
            "smoothing alpha for sample ratios across multiple datasets"
        },
    )

    shorten_method: SHORTEN_METHOD_CHOICES = field(
        default="none",
        metadata={
            "help":
            "if not none, shorten sequences that exceed --tokens-per-sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help":
            "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )

    langs: str = field(
        default="",
        metadata={
            "help":
            "comma-separated list of languages (default: all directories in data path)"
        },
    )
    baseline_model_langs: str = field(
        default="",
        metadata={
            "help":
            "comma-separated list of languages in the baseline model (default: none)"
        },
    )
    # TODO: legacy parameter kept for compatibility
    baseline_model: str = field(
        default="",
        metadata={"help": "path to the baseline model (default: none)"},
    )

    lang_to_offline_shard_ratio: str = field(
        default="",
        metadata={
            "help":
            "absolute path of tsv file location to indicate lang to offline shard ratio.",
        },
    )
    # TODO common vars below add to parent
    seed: int = II("common.seed")
    dataset_impl: Optional[ChoiceEnum(
        get_available_dataset_impl())] = II("dataset.dataset_impl")
    data_buffer_size: int = II("dataset.data_buffer_size")
    tpu: bool = II("common.tpu")
    batch_size: Optional[int] = II("dataset.batch_size")
    batch_size_valid: Optional[int] = II("dataset.batch_size_valid")
    train_subset: str = II("common.train_subset")
    valid_subset: str = II("common.valid_subset")
Example no. 15
0
class TranslationConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "colon separated path to data directories list, will be iterated upon during epochs "
            "in round-robin manner; however, valid and test data are always in the first directory "
            "to avoid the need for repeating them in all directories"
        },
    )
    source_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "source language",
            "argparse_alias": "-s",
        },
    )
    target_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "target language",
            "argparse_alias": "-t",
        },
    )
    load_alignments: bool = field(
        default=False, metadata={"help": "load the binarized alignments"})
    left_pad_source: bool = field(
        default=True, metadata={"help": "pad the source on the left"})
    left_pad_target: bool = field(
        default=False, metadata={"help": "pad the target on the left"})
    max_source_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the source sequence"})
    max_target_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the target sequence"})
    upsample_primary: int = field(
        default=-1,
        metadata={"help": "the amount of upsample primary dataset"})
    truncate_source: bool = field(
        default=False,
        metadata={"help": "truncate source to max-source-positions"})
    num_batch_buckets: int = field(
        default=0,
        metadata={
            "help":
            "if >0, then bucket source and target lengths into "
            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
        },
    )
    train_subset: str = II("dataset.train_subset")
    dataset_impl: Optional[ChoiceEnum(
        get_available_dataset_impl())] = II("dataset.dataset_impl")
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")

    # options for reporting BLEU during validation
    eval_bleu: bool = field(default=False,
                            metadata={"help": "evaluation with BLEU scores"})
    eval_bleu_args: Optional[str] = field(
        default="{}",
        metadata={
            "help":
            'generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_bleu_detok: str = field(
        default="space",
        metadata={
            "help":
            "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: Optional[str] = field(
        default="{}",
        metadata={
            "help":
            "args for building the tokenizer, if needed, as JSON string"
        },
    )
    eval_tokenized_bleu: bool = field(
        default=False,
        metadata={"help": "compute tokenized BLEU instead of sacrebleu"})
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None,
        metadata={
            "help": "remove BPE before computing BLEU",
            "argparse_const": "@@ ",
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False,
        metadata={"help": "print sample generations during validation"})
    dataset_add_token_type_ids: bool = field(
        default=False,
        metadata={"help": "if set, append token_type_ids (all zeros)"})
    freeze_encoder: bool = field(
        default=False, metadata={"help": "if set, freeze encoder's weight"})

    # options fields for mask
    sample_break_mode: Optional[ChoiceEnum([
        "none", "complete", "complete_doc", "eos"
    ])] = field(
        default="complete",
        metadata={
            "help":
            'If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.',
        })
    tokens_per_sample: int = field(
        default=512,
        metadata={
            "help":
            "max number of total tokens over all segments "
            "per sample for BERT dataset",
        })
    mask_prob: float = field(
        default=0.15,
        metadata={"help": "probability of replacing a token with mask"})
    leave_unmasked_prob: float = field(
        default=0.1,
        metadata={"help": "probability that a masked token is unmasked"})
    random_token_prob: float = field(
        default=0.1,
        metadata={
            "help": "probability of replacing a token with a random token"
        })
    freq_weighted_replacement: bool = field(
        default=False,
        metadata={
            "help": "sample random replacement words based on word frequencies"
        })
    mask_whole_words: bool = field(
        default=False,
        metadata={"help": "mask whole words; you may also want to set --bpe"})
    mask_multiple_length: int = field(
        default=1, metadata={"help": "repeat the mask indices multiple times"})
    mask_stdev: float = field(default=0.0,
                              metadata={"help": "stdev of the mask length"})
    shorten_dataset: Optional[ChoiceEnum([
        "none", "truncate", "random_crop"
    ])] = field(
        default="none",
        metadata={
            "help":
            "if not none, shorten sequences that exceed --tokens-per-sample"
        })
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help":
            "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)',
        })

    # Custom for bert
    use_bert_dict: str = field(
        default="",
        metadata={"help": "Specify which custom dictionary to use"},
    )
    load_bert_path: str = field(
        default="",
        metadata={
            "help":
            "If set, load BERT weights from this path into the encoder. Also set --add-token-type-embeddings and --type-vocab-size."
        },
    )
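A small illustrative calculation, not from the source but matching the usual BERT-style masking recipe these fields describe: of the tokens selected with mask_prob, a leave_unmasked_prob fraction is left unchanged, a random_token_prob fraction becomes a random token, and the remainder becomes the mask symbol.

# defaults from the fields above
mask_prob, leave_unmasked_prob, random_token_prob = 0.15, 0.1, 0.1
p_mask_symbol = mask_prob * (1 - leave_unmasked_prob - random_token_prob)
p_kept = mask_prob * leave_unmasked_prob
p_random = mask_prob * random_token_prob
print(round(p_mask_symbol, 3), round(p_kept, 3), round(p_random, 3))  # 0.12 0.015 0.015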
Example no. 16
0
def add_preprocess_args(parser):
    group = parser.add_argument_group('Preprocessing')
    # fmt: off
    group.add_argument("-s",
                       "--source-lang",
                       default=None,
                       metavar="SRC",
                       help="source language")
    group.add_argument("-t",
                       "--target-lang",
                       default=None,
                       metavar="TARGET",
                       help="target language")
    group.add_argument("--trainpref",
                       metavar="FP",
                       default=None,
                       help="train file prefix")
    group.add_argument("--validpref",
                       metavar="FP",
                       default=None,
                       help="comma separated, valid file prefixes")
    group.add_argument("--testpref",
                       metavar="FP",
                       default=None,
                       help="comma separated, test file prefixes")
    group.add_argument("--align-suffix",
                       metavar="FP",
                       default=None,
                       help="alignment file suffix")
    group.add_argument("--destdir",
                       metavar="DIR",
                       default="data-bin",
                       help="destination dir")
    group.add_argument("--trainoutf",
                       metavar="FILENAME",
                       default="train",
                       help="Output filename for the preprocessed training data")
    group.add_argument("--validoutf",
                       metavar="FILENAME",
                       default="valid",
                       help="Output filename for the preprocessed validation data")
    group.add_argument("--testoutf",
                       metavar="FILENAME",
                       default="test",
                       help="Output filename for the preprocessed test data")
    group.add_argument(
        "--thresholdtgt",
        metavar="N",
        default=0,
        type=int,
        help="map words appearing less than threshold times to unknown")
    group.add_argument(
        "--thresholdsrc",
        metavar="N",
        default=0,
        type=int,
        help="map words appearing less than threshold times to unknown")
    group.add_argument("--tgtdict",
                       metavar="FP",
                       help="reuse given target dictionary")
    group.add_argument("--srcdict",
                       metavar="FP",
                       help="reuse given source dictionary")
    group.add_argument("--nwordstgt",
                       metavar="N",
                       default=-1,
                       type=int,
                       help="number of target words to retain")
    group.add_argument("--nwordssrc",
                       metavar="N",
                       default=-1,
                       type=int,
                       help="number of source words to retain")
    group.add_argument("--alignfile",
                       metavar="ALIGN",
                       default=None,
                       help="an alignment file (optional)")
    parser.add_argument('--dataset-impl',
                        metavar='FORMAT',
                        default='mmap',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    parser.add_argument(
        '--file-format',
        metavar='FORMAT',
        default=None,
        choices=['smiles'],
        help='dataset file format; "smiles" means no spaces between symbols')
    group.add_argument("--joined-dictionary",
                       action="store_true",
                       help="Generate joined dictionary")
    group.add_argument("--only-source",
                       action="store_true",
                       help="Only process the source language")
    group.add_argument("--padding-factor",
                       metavar="N",
                       default=8,
                       type=int,
                       help="Pad dictionary size to be multiple of N")
    group.add_argument("--workers",
                       metavar="N",
                       default=1,
                       type=int,
                       help="number of parallel workers")
    # fmt: on
    return parser
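
A minimal usage sketch follows (not part of the original example): it assumes add_preprocess_args and fairseq's get_available_dataset_impl are importable in the current scope, and the file names and language pair are purely illustrative.

# Illustrative usage; "train" and "data-bin/demo" are hypothetical paths.
import argparse

parser = argparse.ArgumentParser(description="preprocess demo")
add_preprocess_args(parser)
args = parser.parse_args([
    "--source-lang", "en",
    "--target-lang", "de",
    "--trainpref", "train",       # expects train.en / train.de alongside
    "--destdir", "data-bin/demo",
    "--workers", "4",
])
print(args.source_lang, args.target_lang, args.destdir)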
Example n. 17
class TranslationConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None,
        metadata={
            "help":
            "colon separated path to data directories list, will be iterated upon during epochs "
            "in round-robin manner; however, valid and test data are always in the first directory "
            "to avoid the need for repeating them in all directories"
        },
    )
    source_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "source language",
            "argparse_alias": "-s",
        },
    )
    target_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "target language",
            "argparse_alias": "-t",
        },
    )
    load_alignments: bool = field(
        default=False, metadata={"help": "load the binarized alignments"})
    left_pad_source: bool = field(
        default=False, metadata={"help": "pad the source on the left"})
    left_pad_target: bool = field(
        default=False, metadata={"help": "pad the target on the left"})
    max_source_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the source sequence"})
    max_target_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the target sequence"})
    upsample_primary: int = field(
        default=-1,
        metadata={"help": "the amount of upsample primary dataset"})
    truncate_source: bool = field(
        default=False,
        metadata={"help": "truncate source to max-source-positions"})
    num_batch_buckets: int = field(
        default=0,
        metadata={
            "help":
            "if >0, then bucket source and target lengths into "
            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
        },
    )
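    # Note: the II("...") defaults below are omegaconf interpolations; they
    # resolve against the top-level config (the "dataset" group) when the
    # full fairseq config is composed, not at dataclass definition time.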
    train_subset: str = II("dataset.train_subset")
    dataset_impl: Optional[ChoiceEnum(
        get_available_dataset_impl())] = II("dataset.dataset_impl")
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")

    # Additional options
    use_bert_dict: str = field(
        default="",
        metadata={"help": "Specify which dictonary use custome one"},
    )
    load_bert_path: str = field(
        default="",
        metadata={
            "help":
            "If set, load bert weight in path to encoder. Please add --add-token-type-embeddings and --type-vocab-size."
        },
    )
    prepend_bos_to_src: bool = field(
        default=False, metadata={"help": "prepend bos token to src dataset"})
    freeze_encoder: bool = field(
        default=False, metadata={"help": "if set, freeze encoder's weight"})

    # options for reporting BLEU during validation
    eval_bleu: bool = field(default=False,
                            metadata={"help": "evaluation with BLEU scores"})
    eval_bleu_args: Optional[str] = field(
        default="{}",
        metadata={
            "help":
            'generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_bleu_detok: str = field(
        default="space",
        metadata={
            "help":
            "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: Optional[str] = field(
        default="{}",
        metadata={
            "help":
            "args for building the tokenizer, if needed, as JSON string"
        },
    )
    eval_tokenized_bleu: bool = field(
        default=False,
        metadata={"help": "compute tokenized BLEU instead of sacrebleu"})
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None,
        metadata={
            "help": "remove BPE before computing BLEU",
            "argparse_const": "@@ ",
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False,
        metadata={"help": "print sample generations during validation"})
    dataset_add_token_type_ids: bool = field(
        default=False,
        metadata={"help": "if set, append token_type_ids (all zeros)"})
Example n. 18
def add_dataset_args(parser, train=False, gen=False):
    """Same as fairseq.options.add_dataset_args but without
    the "data" argument"""
    group = parser.add_argument_group("Dataset and data loading")
    group.add_argument(
        "data",
        metavar="DIR",
        nargs="?",
        help="path to data directory. "
        "This is not needed but kept for backward compatibility",
    )
    group.add_argument(
        "--num-workers",
        default=0,
        type=int,
        metavar="N",
        help="how many subprocesses to use for data loading",
    )
    group.add_argument(
        "--skip-invalid-size-inputs-valid-test",
        action="store_true",
        help="Ignore too long or too short lines in valid and test set",
    )
    group.add_argument(
        "--max-tokens",
        default=5000,
        type=int,
        metavar="N",
        help="maximum number of tokens in a batch",
    )
    group.add_argument(
        "--max-sentences",
        "--batch-size",
        type=int,
        metavar="N",
        help="maximum number of sentences in a batch",
    )
    group.add_argument(
        "--dataset-impl",
        metavar="FORMAT",
        choices=get_available_dataset_impl(),
        help="output dataset implementation",
    )
    if train:
        group.add_argument(
            "--train-subset",
            default="train",
            metavar="SPLIT",
            choices=["train", "valid", "test"],
            help="data subset to use for training (train, valid, test)",
        )
        group.add_argument(
            "--valid-subset",
            default="valid",
            metavar="SPLIT",
            help="comma separated list of data subsets to use"
            " for validation (train, valid, valid1,test, test1)",
        )
        group.add_argument(
            "--max-sentences-valid",
            type=int,
            metavar="N",
            help="maximum number of sentences in a validation batch"
            " (defaults to --max-sentences)",
        )
    if gen:
        group.add_argument(
            "--gen-subset",
            default="test",
            metavar="SPLIT",
            help="data subset to generate (train, valid, test)",
        )
        group.add_argument(
            "--num-shards",
            default=1,
            type=int,
            metavar="N",
            help="shard generation over N shards",
        )
        group.add_argument(
            "--shard-id",
            default=0,
            type=int,
            metavar="ID",
            help="id of the shard to generate (id < num_shards)",
        )
    return group
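
A short usage sketch for the helper above (illustrative values; assumes get_available_dataset_impl is importable, as in the earlier snippets):

import argparse

parser = argparse.ArgumentParser("dataset demo")
add_dataset_args(parser, train=True)
args = parser.parse_args(["--max-tokens", "4096", "--num-workers", "2"])
print(args.data)          # None -- the positional "data" argument is optional
print(args.max_tokens)    # 4096
print(args.train_subset)  # "train"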