def parse_args(argv=None):
    parser = argparse.ArgumentParser(description='Train the neural model', prog='mmt train')
    parser.add_argument('data_path', metavar='DATA_FOLDER',
                        help='data folder holding binarized training and validation sets')
    parser.add_argument('output_path', metavar='OUTPUT', help='the model output path')
    parser.add_argument('-n', '--checkpoints-num', dest='num_checkpoints', type=int, default=10,
                        help='number of checkpoints to average (default is 10)')
    parser.add_argument('-w', '--working-dir', metavar='WORKING_DIR', dest='wdir', default=None,
                        help='the working directory for temporary files (default is os temp folder)')
    parser.add_argument('-d', '--debug', action='store_true', dest='debug', default=False,
                        help='prevents temporary files from being removed after execution')
    parser.add_argument('--log', dest='log_file', default=None, help='detailed log file')
    parser.add_argument('--resume', action='store_true', dest='resume', default=False,
                        help='resume training from the last saved checkpoint, even after training completion')
    parser.add_argument('--from-model', dest='init_model', default=None,
                        help='start the training from the specified model.pt file')
    parser.add_argument('--gpus', dest='gpus', nargs='+', type=int, default=None,
                        help='the list of GPUs available for training (default is all available GPUs)')
    parser.add_argument('--tensorboard-port', dest='tensorboard_port', type=int, default=None,
                        help='if specified, starts a tensorboard instance during training on the given port')
    parser.add_argument('--train-steps', dest='train_steps', type=int, default=None,
                        help='by default the training stops when the validation loss reaches a plateau; with '
                             'this option instead, the training process stops after the specified number of steps')

    args, extra_argv = parser.parse_known_args(argv)

    if args.debug and args.wdir is None:
        raise CLIArgsException(parser, '"--debug" option requires an explicit working dir with "--working-dir"')

    if args.tensorboard_port is not None:
        verify_tensorboard_dependencies(parser)

    return args, parse_extra_argv(parser, extra_argv)
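
# Illustrative example (not part of the original module) of the known/extra
# split performed above by parse_known_args:
#
#   args, extra = parse_args(['data', 'model', '--gpus', '0', '--max-tokens', '4096'])
#
# '--gpus 0' is consumed by this parser (args.gpus == [0]), while the unknown
# '--max-tokens 4096' is passed through to fairseq via parse_extra_argv, which
# also appends defaults for any fairseq option not explicitly given (see
# parse_extra_argv below).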
def parse_args(argv=None):
    parser = argparse.ArgumentParser(description='Generate archives for neural training', prog='mmt datagen')
    parser.add_argument('lang_pairs', metavar='LANGUAGE_PAIRS',
                        help='the language pair list encoded as <s1>:<t1>[,<sn>:<tn>] (e.g. en:it,it:en,en:fr)')
    parser.add_argument('output_path', metavar='OUTPUT', help='the destination folder')
    parser.add_argument('input_paths', nargs='+', metavar='INPUT_PATHS', help='the paths to the training corpora')
    parser.add_argument('-w', '--working-dir', metavar='WORKING_DIR', dest='wdir', default=None,
                        help='the working directory for temporary files (default is os temp folder)')
    parser.add_argument('-d', '--debug', action='store_true', dest='debug', default=False,
                        help='prevents temporary files from being removed after execution')
    parser.add_argument('-s', '--voc-size', dest='voc_size', default=32768, type=int,
                        help='the vocabulary size to use (default is 32768)')
    parser.add_argument('-T', '--threads', dest='threads', default=2, type=int,
                        help='the number of threads used to find the bounds for vocabulary creation (default is 2)')
    parser.add_argument('--count-threshold', dest='count_threshold', default=None, type=int,
                        help='all tokens with a count less than this threshold will be used '
                             'only for alphabet generation in vocabulary creation; useful for very large corpora')
    parser.add_argument('--vocabulary', metavar='VOCABULARY_PATH', dest='vocabulary_path', default=None,
                        help='use the specified BPE vocabulary model instead of training a new one from scratch')
    parser.add_argument('--log', dest='log_file', default=None, help='detailed log file')
    parser.add_argument('--test', metavar='TEST_SET_DIR', dest='test_dir', default=None,
                        help='optional directory where to store a small subset of training data for testing')

    args = parser.parse_args(argv)

    if args.debug and args.wdir is None:
        raise CLIArgsException(parser, '"--debug" option requires an explicit working dir with "--working-dir"')

    return args
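
# Illustrative helper (not part of the original module): decodes the
# LANGUAGE_PAIRS string documented above into (source, target) tuples.
def decode_lang_pairs(lang_pairs):
    # 'en:it,it:en,en:fr' -> [('en', 'it'), ('it', 'en'), ('en', 'fr')]
    return [tuple(pair.split(':', 1)) for pair in lang_pairs.split(',')]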
def verify_tensorboard_dependencies(parser):
    try:
        import tensorflow
        import tensorboard
    except ImportError:
        raise CLIArgsException(parser, '"--tensorboard-port" option requires the "tensorflow" and "tensorboard" '
                                       'python modules, but they could not be found; please install them using pip3')
def parse_extra_argv(parser, extra_argv):
    for reserved_opt in ['--save-dir', '--user-dir', '--task', '--no-progress-bar', '--share-all-embeddings',
                         '--tensorboard-logdir', '--max-update']:
        if argv_has(extra_argv, reserved_opt):
            raise CLIArgsException(parser, 'overriding option "%s" is not allowed' % reserved_opt)

    cmd_extra_args = extra_argv[:]

    if not argv_has(cmd_extra_args, '-a', '--arch'):
        cmd_extra_args.extend(['--arch', 'transformer_mmt_base'])
    if not argv_has(cmd_extra_args, '--clip-norm'):
        cmd_extra_args.extend(['--clip-norm', '0.0'])
    if not argv_has(cmd_extra_args, '--label-smoothing'):
        cmd_extra_args.extend(['--label-smoothing', '0.1'])
    if not argv_has(cmd_extra_args, '--attention-dropout'):
        cmd_extra_args.extend(['--attention-dropout', '0.1'])
    if not argv_has(cmd_extra_args, '--dropout'):
        cmd_extra_args.extend(['--dropout', '0.3'])
    if not argv_has(cmd_extra_args, '--wd', '--weight-decay'):
        cmd_extra_args.extend(['--weight-decay', '0.0'])
    if not argv_has(cmd_extra_args, '--criterion'):
        cmd_extra_args.extend(['--criterion', 'label_smoothed_cross_entropy'])
    if not argv_has(cmd_extra_args, '--optimizer'):
        cmd_extra_args.extend(['--optimizer', 'adam'])
    if not argv_has(cmd_extra_args, '--adam-betas'):
        cmd_extra_args.extend(['--adam-betas', '(0.9, 0.98)'])
    if not argv_has(cmd_extra_args, '--log-interval'):
        cmd_extra_args.extend(['--log-interval', '100'])
    if not argv_has(cmd_extra_args, '--lr', '--learning-rate'):
        cmd_extra_args.extend(['--lr', '0.0005'])
    if not argv_has(cmd_extra_args, '--lr-scheduler'):
        cmd_extra_args.extend(['--lr-scheduler', 'inverse_sqrt'])
    if not argv_has(cmd_extra_args, '--min-lr'):
        cmd_extra_args.extend(['--min-lr', '1e-09'])
    if not argv_has(cmd_extra_args, '--warmup-init-lr'):
        cmd_extra_args.extend(['--warmup-init-lr', '1e-07'])
    if not argv_has(cmd_extra_args, '--warmup-updates'):
        cmd_extra_args.extend(['--warmup-updates', '4000'])
    if not argv_has(cmd_extra_args, '--max-tokens'):
        cmd_extra_args.extend(['--max-tokens', '3072'])
    if not argv_has(cmd_extra_args, '--update-freq'):
        cmd_extra_args.extend(['--update-freq', '4'])
    if not argv_has(cmd_extra_args, '--save-interval-updates'):
        cmd_extra_args.extend(['--save-interval-updates', '1000'])
    if not argv_has(cmd_extra_args, '--keep-interval-updates'):
        cmd_extra_args.extend(['--keep-interval-updates', '10'])
    if not argv_has(cmd_extra_args, '--no-epoch-checkpoints') and not argv_has(cmd_extra_args, '--keep-last-epochs'):
        cmd_extra_args.extend(['--keep-last-epochs', '10'])

    return cmd_extra_args
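
# `argv_has` is used above but not defined in this excerpt. A minimal sketch,
# assuming it only needs to report whether any of the given option strings
# occurs in the argument list (it does not inspect option values):
def argv_has(argv, *opts):
    return any(opt in argv for opt in opts)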
def parse_args(argv=None):
    parser = argparse.ArgumentParser(description='Evaluate a ModernMT engine', prog='mmt evaluate')
    parser.add_argument('-s', '--source', dest='src_lang', metavar='SOURCE_LANGUAGE', default=None,
                        help='the source language (ISO 639-1); can be omitted if the engine is monolingual')
    parser.add_argument('-t', '--target', dest='tgt_lang', metavar='TARGET_LANGUAGE', default=None,
                        help='the target language (ISO 639-1); can be omitted if the engine is monolingual')
    parser.add_argument('--path', dest='test_set', metavar='CORPORA', default=None,
                        help='the path to the test corpora (default is the automatically extracted sample)')
    parser.add_argument('-e', '--engine', dest='engine', default='default',
                        help='the engine name, "default" will be used if absent')
    parser.add_argument('--gt-key', dest='google_key', metavar='GT_API_KEY', default=None,
                        help='a custom Google Translate API key to use during evaluation')
    parser.add_argument('--human-eval', dest='human_eval_path', metavar='OUTPUT', default=None,
                        help='the output folder for the tab-separated files needed to set up a human evaluation benchmark')
    parser.add_argument('-d', '--debug', action='store_true', dest='debug',
                        help='if set, prevents temporary files from being removed after execution')

    # Context arguments
    parser.add_argument('--context', metavar='CONTEXT', dest='context',
                        help='a string to be used as translation context')
    parser.add_argument('--context-file', metavar='CONTEXT_FILE', dest='context_file',
                        help='a local file to be used as translation context')
    parser.add_argument('--context-vector', metavar='CONTEXT_VECTOR', dest='context_vector',
                        help='the context vector with format: <document 1>:<score 1>[,<document N>:<score N>]')

    args = parser.parse_args(argv)

    engine = Engine(args.engine)

    if args.src_lang is None or args.tgt_lang is None:
        if len(engine.languages) > 1:
            raise CLIArgsException(parser,
                                   'Missing language. Options "-s" and "-t" are mandatory for multilingual engines.')
        args.src_lang, args.tgt_lang = engine.languages[0]

    if args.test_set is None:
        args.test_set = engine.get_test_path(args.src_lang, args.tgt_lang)

    if len(ParallelFileFormat.list(args.src_lang, args.tgt_lang, args.test_set)) == 0:
        raise CLIArgsException(parser, 'No parallel corpora found in path: ' + args.test_set)

    return args
def main_add(argv=None):
    parser = argparse.ArgumentParser(description='Add a contribution to an existing memory', prog='mmt memory add')
    parser.add_argument('memory', help='the id of the memory', type=int)
    parser.add_argument('source', metavar='SOURCE_SENTENCE', help='the source sentence of the contribution')
    parser.add_argument('target', metavar='TARGET_SENTENCE', help='the target sentence of the contribution')
    parser.add_argument('-s', '--source', dest='source_lang', metavar='SOURCE_LANGUAGE', default=None,
                        help='the source language (ISO 639-1), can be omitted if engine is monolingual')
    parser.add_argument('-t', '--target', dest='target_lang', metavar='TARGET_LANGUAGE', default=None,
                        help='the target language (ISO 639-1), can be omitted if engine is monolingual')
    parser.add_argument('-e', '--engine', dest='engine', default='default',
                        help='the engine name, "default" will be used if absent')

    args = parser.parse_args(argv)

    node = _load_node(args.engine)

    # Infer default arguments
    if args.source_lang is None or args.target_lang is None:
        if len(node.engine.languages) > 1:
            raise CLIArgsException(parser,
                                   'Missing language. Options "-s" and "-t" are mandatory for multilingual engines.')
        args.source_lang, args.target_lang = node.engine.languages[0]

    node.api.append_to_memory(args.source_lang, args.target_lang, args.memory, args.source, args.target)
    print('Contribution added to memory %d' % args.memory)
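
# Illustrative invocation of the command parsed above (the memory id 12 and
# the sentences are hypothetical):
#
#   mmt memory add 12 'Hello world' 'Ciao mondo' -s en -t it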
def parse_args(argv=None):
    parser = argparse.ArgumentParser(description='Clean parallel corpora before training', prog='mmt clean')
    parser.add_argument('src_lang', metavar='SOURCE_LANGUAGE', help='the source language (ISO 639-1)')
    parser.add_argument('tgt_lang', metavar='TARGET_LANGUAGE', help='the target language (ISO 639-1)')
    parser.add_argument('input_path', metavar='INPUT', help='the path to the corpora to clean')
    parser.add_argument('output_path', metavar='OUTPUT', help='the destination folder')
    parser.add_argument('--dedup-sort', metavar='SUBSTRING', dest='dedup_sort', default=None, nargs='+',
                        help='list of substrings used to sort corpora during deduplication')
    parser.add_argument('-w', '--working-dir', metavar='WORKING_DIR', dest='wdir', default=None,
                        help='the working directory for temporary files (default is os temp folder)')
    parser.add_argument('-d', '--debug', action='store_true', dest='debug', default=False,
                        help='prevents temporary files from being removed after execution')
    parser.add_argument('--log', dest='log_file', default=None, help='detailed log file')

    args = parser.parse_args(argv)

    if args.debug and args.wdir is None:
        raise CLIArgsException(parser, '"--debug" option requires an explicit working dir with "--working-dir"')

    return args
def parse_args(argv=None):
    parser = argparse.ArgumentParser(description='Translate text with ModernMT', prog='mmt translate')
    parser.add_argument('text', metavar='TEXT', help='text to be translated (optional)', default=None, nargs='?')
    parser.add_argument('-s', '--source', dest='source_lang', metavar='SOURCE_LANGUAGE', default=None,
                        help='the source language (ISO 639-1); can be omitted if the engine is monolingual')
    parser.add_argument('-t', '--target', dest='target_lang', metavar='TARGET_LANGUAGE', default=None,
                        help='the target language (ISO 639-1); can be omitted if the engine is monolingual')

    # Context arguments
    parser.add_argument('--context', metavar='CONTEXT', dest='context',
                        help='a string to be used as translation context')
    parser.add_argument('--context-file', metavar='CONTEXT_FILE', dest='context_file',
                        help='a local file to be used as translation context')
    parser.add_argument('--context-vector', metavar='CONTEXT_VECTOR', dest='context_vector',
                        help='the context vector with format: <document 1>:<score 1>[,<document N>:<score N>]')

    # Mixed arguments
    parser.add_argument('-e', '--engine', dest='engine', default='default',
                        help='the engine name, "default" will be used if absent')
    parser.add_argument('--batch', action='store_true', dest='batch', default=False,
                        help='if set, the script will read the whole stdin before sending translations to MMT; '
                             'this can be used to execute translations in parallel for faster processing')
    parser.add_argument('--threads', dest='threads', default=None, type=int,
                        help='number of concurrent translation requests')
    parser.add_argument('--xliff', dest='is_xliff', action='store_true', default=False,
                        help='if set, the input is an XLIFF file')
    parser.add_argument('--split-lines', dest='split_lines', action='store_true', default=False,
                        help='if set, ModernMT will split the input text by carriage-return char')

    args = parser.parse_args(argv)

    engine = Engine(args.engine)

    if args.source_lang is None or args.target_lang is None:
        if len(engine.languages) > 1:
            raise CLIArgsException(parser,
                                   'Missing language. Options "-s" and "-t" are mandatory for multilingual engines.')
        args.source_lang, args.target_lang = engine.languages[0]

    return args
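
# Illustrative helper (not part of the original module): decodes the
# CONTEXT_VECTOR string documented above into (document, score) pairs,
# assuming document ids are integers and scores are floats.
def decode_context_vector(context_vector):
    # '1:0.75,2:0.25' -> [(1, 0.75), (2, 0.25)]
    entries = []
    for entry in context_vector.split(','):
        document, score = entry.rsplit(':', 1)
        entries.append((int(document), float(score)))
    return entries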
def main_import(argv=None):
    parser = argparse.ArgumentParser(description='Import content, TMX or parallel files, into a new or existing memory')
    parser.add_argument('-x', '--tmx-file', dest='tmx', metavar='TMX_FILE', default=None, help='TMX file to import')
    parser.add_argument('-p', '--parallel-files', dest='parallel_file', default=None, nargs=2,
                        help='source and target file (file extensions must be the source and target languages)')
    parser.add_argument('-e', '--engine', dest='engine', default='default',
                        help='the engine name, "default" will be used if absent')
    parser.add_argument('--id', type=int, default=None, dest='memory',
                        help='the optional destination memory id (by default, a new memory is created)')

    args = parser.parse_args(argv)
    if args.tmx is None and args.parallel_file is None:
        raise CLIArgsException(parser, 'missing one of the following options: "-x" or "-p"')

    node = _load_node(args.engine)

    corpus_name = os.path.splitext(os.path.basename(args.tmx or args.parallel_file[0]))[0]

    new_memory = None
    if args.memory is None:
        new_memory = node.api.create_memory(corpus_name)
        args.memory = new_memory['id']

    progressbar = Progressbar(label='Importing %s' % corpus_name)
    progressbar.start()

    try:
        if args.tmx is not None:
            job = node.api.import_into_memory(args.memory, tmx=args.tmx)
        else:
            src_file, tgt_file = args.parallel_file
            # The source and target languages are derived from the file extensions
            src_lang, tgt_lang = os.path.splitext(src_file)[1][1:], os.path.splitext(tgt_file)[1][1:]
            job = node.api.import_into_memory(args.memory,
                                              source_file=src_file, target_file=tgt_file,
                                              source_lang=src_lang, target_lang=tgt_lang)

        progressbar.set_progress(job['progress'])

        # Poll the import job until it reports completion
        while job['progress'] != 1.0:
            time.sleep(1)
            job = node.api.get_import_job(job['id'])
            progressbar.set_progress(job['progress'])

        progressbar.complete()
        print('IMPORT SUCCESS')
    except BaseException as e:
        # On failure, delete the memory if it was created by this run
        if new_memory is not None:
            try:
                node.api.delete_memory(new_memory['id'])
            except BaseException:
                pass

        progressbar.abort(repr(e))
        print('IMPORT FAILED')
        raise
def parse_args(argv=None):
    parser = argparse.ArgumentParser(description='Create a new ModernMT engine from scratch', prog='mmt create')
    parser.add_argument('src_lang', metavar='SOURCE_LANGUAGE', help='the source language (ISO 639-1)')
    parser.add_argument('tgt_lang', metavar='TARGET_LANGUAGE', help='the target language (ISO 639-1)')
    parser.add_argument('input_path', metavar='INPUT', help='the path to the parallel corpora collection')
    parser.add_argument('-e', '--engine', dest='engine', default='default',
                        help='the engine name, "default" will be used if absent')
    parser.add_argument('-d', '--debug', action='store_true', dest='debug', default=False,
                        help='prevents temporary files from being removed after execution')
    parser.add_argument('-y', '--yes', action='store_true', dest='force_delete', default=False,
                        help='if set, skip engine overwrite confirmation check')
    parser.add_argument('--resume', action='store_true', dest='resume', default=False,
                        help='resume an interrupted training; it can also be used to resume a training after its completion')

    cleaning_args = parser.add_argument_group('Data cleaning arguments')
    cleaning_args.add_argument('--skip-cleaning', action='store_true', dest='skip_cleaning', default=False,
                               help='skip the cleaning step (input corpora MUST be in plain-text parallel format)')

    datagen_args = parser.add_argument_group('Data generation arguments')
    datagen_args.add_argument('--voc-size', dest='voc_size', default=32768, type=int,
                              help='the vocabulary size to use (default is 32768)')
    datagen_args.add_argument('-T', '--threads', dest='threads', default=2, type=int,
                              help='the number of threads used in bounds search for vocabulary creation (default is 2)')
    datagen_args.add_argument('--count-threshold', dest='count_threshold', default=None, type=int,
                              help='all tokens with a count less than this threshold will be used '
                                   'only for alphabet generation in vocabulary creation; useful for very large corpora')
    datagen_args.add_argument('--vocabulary', metavar='VOCABULARY_PATH', dest='vocabulary_path', default=None,
                              help='use the specified BPE vocabulary model instead of training a new one from scratch')
    datagen_args.add_argument('--no-test', action='store_false', dest='test_set', default=True,
                              help='skip the automatic extraction of a test set from the provided training corpora')

    train_args = parser.add_argument_group('Train arguments (note: you can use all fairseq cli options)')
    train_args.add_argument('--from-model', dest='init_model', default=None,
                            help='start the training from the specified model; '
                                 'the path must contain "model.pt" and "model.vcb" files')
    train_args.add_argument('-n', '--checkpoints-num', dest='num_checkpoints', type=int, default=10,
                            help='number of checkpoints to average (default is 10)')
    train_args.add_argument('--gpus', dest='gpus', nargs='+', type=int, default=None,
                            help='the list of GPUs available for training (default is all available GPUs)')
    train_args.add_argument('--tensorboard-port', dest='tensorboard_port', type=int, default=None,
                            help='if specified, starts a tensorboard instance during training on the given port')
    train_args.add_argument('--train-steps', dest='train_steps', type=int, default=None,
                            help='by default the training stops when the validation loss reaches a plateau; with '
                                 'this option instead, the training process stops after the specified number of steps')

    args, extra_argv = parser.parse_known_args(argv)

    if args.vocabulary_path is not None and args.init_model is not None:
        raise CLIArgsException(parser, 'Cannot specify both options: "--vocabulary" and "--from-model"')

    if args.tensorboard_port is not None:
        train.verify_tensorboard_dependencies(parser)

    return args, train.parse_extra_argv(parser, extra_argv)