Ejemplo n.º 1
0
def parse_args(argv=None):
    """Parse the command line of ``mmt train``.

    Options that are not declared here are collected by ``parse_known_args``
    and handed over to ``parse_extra_argv``.

    Args:
        argv: the list of argument strings; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        A ``(args, extra_args)`` tuple: the parsed namespace and the value
        returned by ``parse_extra_argv`` for the unrecognized options.

    Raises:
        CLIArgsException: if ``--debug`` is used without ``--working-dir``.
            ``verify_tensorboard_dependencies`` and ``parse_extra_argv`` may
            raise it as well.
    """
    cli = argparse.ArgumentParser(prog='mmt train', description='Train the neural model')

    # Positional arguments
    cli.add_argument(
        'data_path', metavar='DATA_FOLDER',
        help='data folder holding binarized training and validation sets')
    cli.add_argument('output_path', metavar='OUTPUT', help='the model output path')

    # General options
    cli.add_argument(
        '-n', '--checkpoints-num', dest='num_checkpoints', default=10, type=int,
        help='number of checkpoints to average (default is 10)')
    cli.add_argument(
        '-w', '--working-dir', dest='wdir', metavar='WORKING_DIR', default=None,
        help='the working directory for temporary files (default is os temp folder)')
    cli.add_argument(
        '-d', '--debug', dest='debug', action='store_true', default=False,
        help='prevents temporary files to be removed after execution')
    cli.add_argument('--log', dest='log_file', default=None, help='detailed log file')

    # Training control options
    cli.add_argument(
        '--resume', dest='resume', action='store_true', default=False,
        help='resume training from last saved checkpoint even after training completion')
    cli.add_argument(
        '--from-model', dest='init_model', default=None,
        help='start the training from the specified model.pt file')
    cli.add_argument(
        '--gpus', dest='gpus', type=int, nargs='+', default=None,
        help='the list of GPUs available for training (default is all available GPUs)')
    cli.add_argument(
        '--tensorboard-port', dest='tensorboard_port', type=int, default=None,
        help='if specified, starts a tensorboard instance during training on the given port')
    cli.add_argument(
        '--train-steps', dest='train_steps', type=int, default=None,
        help='by default the training stops when the validation loss reaches a plateau, with '
             'this option instead, the training process stops after the specified amount of steps')

    namespace, unknown_argv = cli.parse_known_args(argv)

    # Temporary files are only kept in a working dir chosen by the user.
    debug_without_wdir = namespace.debug and namespace.wdir is None
    if debug_without_wdir:
        raise CLIArgsException(cli, '"--debug" options requires explicit working dir with "--working-dir"')

    wants_tensorboard = namespace.tensorboard_port is not None
    if wants_tensorboard:
        verify_tensorboard_dependencies(cli)

    extra_args = parse_extra_argv(cli, unknown_argv)
    return namespace, extra_args
Ejemplo n.º 2
0
def parse_args(argv=None):
    """Parse the command line of ``mmt datagen``.

    Args:
        argv: the list of argument strings; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace``.

    Raises:
        CLIArgsException: if ``--debug`` is used without ``--working-dir``.
    """
    cli = argparse.ArgumentParser(prog='mmt datagen', description='Generate archives for neural training')

    # Positional arguments
    cli.add_argument(
        'lang_pairs', metavar='LANGUAGE_PAIRS',
        help='the language pair list encoded as <ls1>:<t1>[,<lsn>:<ltn>] (i.e. en:it,it:en,en:fr)')
    cli.add_argument('output_path', metavar='OUTPUT', help='the destination folder')
    cli.add_argument('input_paths', metavar='INPUT_PATHS', nargs='+', help='the paths to the training corpora')

    # Optional arguments
    cli.add_argument(
        '-w', '--working-dir', dest='wdir', metavar='WORKING_DIR', default=None,
        help='the working directory for temporary files (default is os temp folder)')
    cli.add_argument(
        '-d', '--debug', dest='debug', action='store_true', default=False,
        help='prevents temporary files to be removed after execution')
    cli.add_argument(
        '-s', '--voc-size', dest='voc_size', type=int, default=32768,
        help='the vocabulary size to use (default is 32768)')
    cli.add_argument(
        '-T', '--threads', dest='threads', type=int, default=2,
        help='the number of threads used to find the bounds for vocabulary creation (default is 2)')
    cli.add_argument(
        '--count-threshold', dest='count_threshold', type=int, default=None,
        help='all tokens with a count less than this threshold will be used '
             'only for alphabet generation in vocabulary creation, useful for very large corpus')
    cli.add_argument(
        '--vocabulary', dest='vocabulary_path', metavar='VOCABULARY_PATH', default=None,
        help='use the specified bpe vocabulary model instead of re-train a new one from scratch')
    cli.add_argument('--log', dest='log_file', default=None, help='detailed log file')
    cli.add_argument(
        '--test', dest='test_dir', metavar='TEST_SET_DIR', default=None,
        help='optional directory where to store a small subset of training data for testing')

    namespace = cli.parse_args(argv)

    # Temporary files are only kept in a working dir chosen by the user.
    debug_without_wdir = namespace.debug and namespace.wdir is None
    if debug_without_wdir:
        raise CLIArgsException(cli, '"--debug" options requires explicit working dir with "--working-dir"')

    return namespace
Ejemplo n.º 3
0
def verify_tensorboard_dependencies(parser):
    """Check that the python modules needed by ``--tensorboard-port`` can be imported.

    Args:
        parser: the argument parser, forwarded to ``CLIArgsException``.

    Raises:
        CLIArgsException: if importing "tensorflow" or "tensorboard" fails
            with an ``ImportError``.
    """
    required_modules = ('tensorflow', 'tensorboard')

    try:
        # The modules are imported only to prove they are installed; they are not used here.
        for module_name in required_modules:
            __import__(module_name)
    except ImportError:
        message = ('"--tensorboard-port" options requires "tensorflow" and "tensorboard" '
                   'python modules, but they could not be found, please install them using pip3')
        raise CLIArgsException(parser, message)
Ejemplo n.º 4
0
def parse_extra_argv(parser, extra_argv):
    """Validate the pass-through options and complete them with default values.

    Reserved options are rejected. Then, for every option in the default
    tables below that ``argv_has`` does not report as present, the option and
    its default value are appended to a copy of ``extra_argv``.

    Args:
        parser: the argument parser, forwarded to ``CLIArgsException``.
        extra_argv: the command line options not recognized by the parser.

    Returns:
        A copy of ``extra_argv`` followed by the defaults that were added.

    Raises:
        CLIArgsException: if ``extra_argv`` contains one of the reserved options.
    """
    reserved_opts = (
        '--save-dir', '--user-dir', '--task', '--no-progress-bar',
        '--share-all-embeddings', '--tensorboard-logdir', '--max-update',
    )
    forbidden = next((opt for opt in reserved_opts if argv_has(extra_argv, opt)), None)
    if forbidden is not None:
        raise CLIArgsException(parser, 'overriding option "%s" is not allowed' % forbidden)

    # Work on a copy so the caller's list is left untouched.
    cmd_args = extra_argv[:]

    def apply_defaults(table):
        # Each entry is (option spellings to look for, [option, value] to append when absent).
        for spellings, fallback in table:
            if not argv_has(cmd_args, *spellings):
                cmd_args.extend(fallback)

    apply_defaults((
        (('-a', '--arch'), ['--arch', 'transformer_mmt_base']),
        (('--clip-norm',), ['--clip-norm', '0.0']),
        (('--label-smoothing',), ['--label-smoothing', '0.1']),
        (('--attention-dropout',), ['--attention-dropout', '0.1']),
        (('--dropout',), ['--dropout', '0.3']),
        (('--wd', '--weight-decay'), ['--weight-decay', '0.0']),
        (('--criterion',), ['--criterion', 'label_smoothed_cross_entropy']),
    ))

    # The adam betas default is considered only when the optimizer itself is defaulted.
    if not argv_has(cmd_args, '--optimizer'):
        cmd_args.extend(['--optimizer', 'adam'])
        apply_defaults((
            (('--adam-betas',), ['--adam-betas', '(0.9, 0.98)']),
        ))

    apply_defaults((
        (('--log-interval',), ['--log-interval', '100']),
        (('--lr', '--learning-rate'), ['--lr', '0.0005']),
        (('--lr-scheduler',), ['--lr-scheduler', 'inverse_sqrt']),
        (('--min-lr',), ['--min-lr', '1e-09']),
        (('--warmup-init-lr',), ['--warmup-init-lr', '1e-07']),
        (('--warmup-updates',), ['--warmup-updates', '4000']),
        (('--max-tokens',), ['--max-tokens', '3072']),
        (('--update-freq',), ['--update-freq', '4']),
        (('--save-interval-updates',), ['--save-interval-updates', '1000']),
        (('--keep-interval-updates',), ['--keep-interval-updates', '10']),
    ))

    # Epoch checkpoints get a retention default only if they are neither disabled nor configured.
    epoch_checkpoints_configured = (
        argv_has(cmd_args, '--no-epoch-checkpoints')
        or argv_has(cmd_args, '--keep-last-epochs')
    )
    if not epoch_checkpoints_configured:
        cmd_args.extend(['--keep-last-epochs', '10'])

    return cmd_args
Ejemplo n.º 5
0
def parse_args(argv=None):
    """Parse and complete the command line of ``mmt evaluate``.

    After parsing, a missing language pair is taken from the engine when it
    has a single language pair, and a missing test set path is requested from
    the engine.

    Args:
        argv: the list of argument strings; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace`` with ``src_lang``, ``tgt_lang`` and
        ``test_set`` filled in.

    Raises:
        CLIArgsException: if a language is missing and the engine has more
            than one language pair, or if no parallel corpora are found in
            the test set path.
    """
    cli = argparse.ArgumentParser(prog='mmt evaluate', description='Evaluate a ModernMT engine')

    # Language pair and test corpora
    cli.add_argument(
        '-s', '--source', metavar='SOURCE_LANGUAGE', dest='src_lang', default=None,
        help='the source language (ISO 639-1). Can be omitted if engine is monolingual.')
    cli.add_argument(
        '-t', '--target', metavar='TARGET_LANGUAGE', dest='tgt_lang', default=None,
        help='the target language (ISO 639-1). Can be omitted if engine is monolingual.')
    cli.add_argument(
        '--path', metavar='CORPORA', dest='test_set', default=None,
        help='the path to the test corpora (default is the automatically extracted sample)')

    # Engine and evaluation options
    cli.add_argument(
        '-e', '--engine', dest='engine', default='default',
        help="the engine name, 'default' will be used if absent")
    cli.add_argument(
        '--gt-key', metavar='GT_API_KEY', dest='google_key', default=None,
        help='A custom Google Translate API Key to use during evaluation')
    cli.add_argument(
        '--human-eval', metavar='OUTPUT', dest='human_eval_path', default=None,
        help='the output folder for the tab-spaced files needed to setup a Human Evaluation benchmark')
    cli.add_argument(
        '-d', '--debug', dest='debug', action='store_true',
        help='if debug is set, prevents temporary files to be removed after execution')

    # Context arguments
    cli.add_argument(
        '--context', dest='context', metavar='CONTEXT',
        help='A string to be used as translation context')
    cli.add_argument(
        '--context-file', dest='context_file', metavar='CONTEXT_FILE',
        help='A local file to be used as translation context')
    cli.add_argument(
        '--context-vector', dest='context_vector', metavar='CONTEXT_VECTOR',
        help='The context vector with format: <document 1>:<score 1>[,<document N>:<score N>]')

    opts = cli.parse_args(argv)
    engine = Engine(opts.engine)

    # Infer the language pair from the engine when the user did not give both languages.
    language_missing = opts.src_lang is None or opts.tgt_lang is None
    if language_missing and len(engine.languages) > 1:
        raise CLIArgsException(
            cli, 'Missing language. Options "-s" and "-t" are mandatory for multilingual engines.')
    if language_missing:
        opts.src_lang, opts.tgt_lang = engine.languages[0]

    if opts.test_set is None:
        opts.test_set = engine.get_test_path(opts.src_lang, opts.tgt_lang)

    corpora = ParallelFileFormat.list(opts.src_lang, opts.tgt_lang, opts.test_set)
    if len(corpora) == 0:
        raise CLIArgsException(cli, 'No parallel corpora found in path: ' + opts.test_set)

    return opts
Ejemplo n.º 6
0
def main_add(argv=None):
    """Entry point of ``mmt memory add``: append one contribution to a memory.

    Parses the command line, loads the node of the selected engine, infers
    the language pair from the engine when it was not fully specified, then
    calls ``node.api.append_to_memory`` and prints a confirmation.

    Args:
        argv: the list of argument strings; ``None`` makes argparse read
            ``sys.argv``.

    Raises:
        CLIArgsException: if a language is missing and the engine has more
            than one language pair.
    """
    cli = argparse.ArgumentParser(prog='mmt memory add',
                                  description='Add contribution to an existent memory')

    # Positional arguments
    cli.add_argument('memory', type=int, help='the id of the memory')
    cli.add_argument('source', metavar='SOURCE_SENTENCE',
                     help='the source sentence of the contribution')
    cli.add_argument('target', metavar='TARGET_SENTENCE',
                     help='the target sentence of the contribution')

    # Optional arguments
    cli.add_argument('-s', '--source', metavar='SOURCE_LANGUAGE', dest='source_lang', default=None,
                     help='the source language (ISO 639-1), can be omitted if engine is monolingual')
    cli.add_argument('-t', '--target', metavar='TARGET_LANGUAGE', dest='target_lang', default=None,
                     help='the target language (ISO 639-1), can be omitted if engine is monolingual')
    cli.add_argument('-e', '--engine', dest='engine', default='default',
                     help="the engine name, 'default' will be used if absent")

    opts = cli.parse_args(argv)
    node = _load_node(opts.engine)

    # Infer the language pair from the engine when the user did not give both languages.
    language_missing = opts.source_lang is None or opts.target_lang is None
    if language_missing and len(node.engine.languages) > 1:
        raise CLIArgsException(
            cli, 'Missing language. Options "-s" and "-t" are mandatory for multilingual engines.')
    if language_missing:
        opts.source_lang, opts.target_lang = node.engine.languages[0]

    node.api.append_to_memory(
        opts.source_lang, opts.target_lang, opts.memory, opts.source, opts.target)

    print('Contribution added to memory %d' % opts.memory)
Ejemplo n.º 7
0
def parse_args(argv=None):
    """Parse the command line of ``mmt clean``.

    Args:
        argv: the list of argument strings; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace``.

    Raises:
        CLIArgsException: if ``--debug`` is used without ``--working-dir``.
    """
    cli = argparse.ArgumentParser(prog='mmt clean',
                                  description='Clean parallel corpora before training')

    # Positional arguments
    cli.add_argument('src_lang', metavar='SOURCE_LANGUAGE', help='the source language (ISO 639-1)')
    cli.add_argument('tgt_lang', metavar='TARGET_LANGUAGE', help='the target language (ISO 639-1)')
    cli.add_argument('input_path', metavar='INPUT', help='the path to the corpora to clean')
    cli.add_argument('output_path', metavar='OUTPUT', help='the destination folder')

    # Optional arguments
    cli.add_argument('--dedup-sort', dest='dedup_sort', metavar='SUBSTRING', nargs='+', default=None,
                     help='list of substrings to use to sort corpora during deduplication')
    cli.add_argument('-w', '--working-dir', dest='wdir', metavar='WORKING_DIR', default=None,
                     help='the working directory for temporary files (default is os temp folder)')
    cli.add_argument('-d', '--debug', dest='debug', action='store_true', default=False,
                     help='prevents temporary files to be removed after execution')
    cli.add_argument('--log', dest='log_file', default=None, help='detailed log file')

    namespace = cli.parse_args(argv)

    # Temporary files are only kept in a working dir chosen by the user.
    debug_without_wdir = namespace.debug and namespace.wdir is None
    if debug_without_wdir:
        raise CLIArgsException(
            cli, '"--debug" options requires explicit working dir with "--working-dir"')

    return namespace
Ejemplo n.º 8
0
def parse_args(argv=None):
    """Parse and complete the command line of ``mmt translate``.

    When the source or target language is not given, the language pair is
    taken from the engine, provided the engine has a single language pair.

    Args:
        argv: the list of argument strings; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        The parsed ``argparse.Namespace``, with ``source_lang`` and
        ``target_lang`` filled in.

    Raises:
        CLIArgsException: if a language is missing and the engine has more
            than one language pair.
    """
    parser = argparse.ArgumentParser(
        description='Translate text with ModernMT', prog='mmt translate')
    parser.add_argument('text',
                        metavar='TEXT',
                        help='text to be translated (optional)',
                        default=None,
                        nargs='?')
    parser.add_argument(
        '-s',
        '--source',
        dest='source_lang',
        metavar='SOURCE_LANGUAGE',
        default=None,
        help=
        'the source language (ISO 639-1). Can be omitted if engine is monolingual.'
    )
    parser.add_argument(
        '-t',
        '--target',
        dest='target_lang',
        metavar='TARGET_LANGUAGE',
        default=None,
        help=
        'the target language (ISO 639-1). Can be omitted if engine is monolingual.'
    )

    # Context arguments
    parser.add_argument('--context',
                        metavar='CONTEXT',
                        dest='context',
                        help='A string to be used as translation context')
    parser.add_argument('--context-file',
                        metavar='CONTEXT_FILE',
                        dest='context_file',
                        help='A local file to be used as translation context')
    parser.add_argument(
        '--context-vector',
        metavar='CONTEXT_VECTOR',
        dest='context_vector',
        help=
        'The context vector with format: <document 1>:<score 1>[,<document N>:<score N>]'
    )

    # Mixed arguments
    parser.add_argument(
        '-e',
        '--engine',
        dest='engine',
        help='the engine name, \'default\' will be used if absent',
        default='default')
    parser.add_argument(
        '--batch',
        action='store_true',
        dest='batch',
        default=False,
        help=
        'if set, the script will read the whole stdin before send translations to MMT. '
        'This can be used to execute translation in parallel for a faster translation.'
    )
    parser.add_argument('--threads',
                        dest='threads',
                        default=None,
                        type=int,
                        help='number of concurrent translation requests.')
    parser.add_argument('--xliff',
                        dest='is_xliff',
                        action='store_true',
                        default=False,
                        help='if set, the input is a XLIFF file.')
    parser.add_argument(
        '--split-lines',
        dest='split_lines',
        action='store_true',
        default=False,
        help='if set, ModernMT will split input text by carriage-return char')

    args = parser.parse_args(argv)

    engine = Engine(args.engine)

    # Infer the language pair from the engine when the user did not give both languages.
    if args.source_lang is None or args.target_lang is None:
        if len(engine.languages) > 1:
            raise CLIArgsException(
                parser,
                'Missing language. Options "-s" and "-t" are mandatory for multilingual engines.'
            )
        args.source_lang, args.target_lang = engine.languages[0]

    # Return the completed namespace: parsing argv a second time here would
    # throw away the language pair inferred above.
    return args
Ejemplo n.º 9
0
def main_import(argv=None):
    """Import a TMX file or a pair of parallel files into a memory.

    Parses the command line and loads the node of the selected engine. If no
    ``--id`` is given, a new memory named after the input file is created.
    The import job is then started and polled once per second until its
    progress reaches 1.0, with a progress bar updated along the way.

    If anything fails, including an interruption by the user, a memory that
    was created by this call is deleted (best effort), the progress bar is
    aborted and the original exception is re-raised.

    Args:
        argv: the list of argument strings; ``None`` makes argparse read
            ``sys.argv``.

    Raises:
        CLIArgsException: if neither ``-x`` nor ``-p`` is specified.
    """
    parser = argparse.ArgumentParser(
        description=
        'Import content, TMX or Parallel files, into a new or existing memory')
    parser.add_argument('-x',
                        '--tmx-file',
                        dest='tmx',
                        metavar='TMX_FILE',
                        help='TMX file to import',
                        default=None)
    parser.add_argument(
        '-p',
        '--parallel-files',
        dest='parallel_file',
        default=None,
        nargs=2,
        help=
        'source and target file (file extension must be source and target languages)'
    )
    parser.add_argument(
        '-e',
        '--engine',
        dest='engine',
        help='the engine name, \'default\' will be used if absent',
        default='default')
    parser.add_argument(
        '--id',
        type=int,
        default=None,
        dest='memory',
        help=
        'the optional destination memory id (by default, a new Memory is created)'
    )

    args = parser.parse_args(argv)

    if args.tmx is None and args.parallel_file is None:
        raise CLIArgsException(
            parser, 'missing one of the following options: "-x" or "-p"')

    node = _load_node(args.engine)

    # The TMX file takes precedence when both inputs are given; use the same
    # "is not None" test as the import branch below so the two always agree.
    input_file = args.tmx if args.tmx is not None else args.parallel_file[0]
    corpus_name = os.path.splitext(os.path.basename(input_file))[0]

    new_memory = None
    if args.memory is None:
        new_memory = node.api.create_memory(corpus_name)
        args.memory = new_memory['id']

    progressbar = Progressbar(label='Importing %s' % corpus_name)
    progressbar.start()

    try:
        if args.tmx is not None:
            job = node.api.import_into_memory(args.memory, tmx=args.tmx)
        else:
            src_file, tgt_file = args.parallel_file
            # The languages are the file extensions without the leading dot.
            src_lang = os.path.splitext(src_file)[1][1:]
            tgt_lang = os.path.splitext(tgt_file)[1][1:]

            job = node.api.import_into_memory(args.memory,
                                              source_file=src_file,
                                              target_file=tgt_file,
                                              source_lang=src_lang,
                                              target_lang=tgt_lang)

        progressbar.set_progress(job['progress'])

        while job['progress'] != 1.0:
            time.sleep(1)
            job = node.api.get_import_job(job['id'])
            progressbar.set_progress(job['progress'])

        progressbar.complete()
        print('IMPORT SUCCESS')
    except BaseException as e:
        # BaseException on purpose: the rollback must also run when the user
        # interrupts the import; the exception is always re-raised below.
        if new_memory is not None:
            try:
                node.api.delete_memory(new_memory['id'])
            except Exception:
                # Best-effort rollback: a failed delete must not hide the
                # original error, which is re-raised below.
                pass

        progressbar.abort(repr(e))
        print('IMPORT FAILED')

        raise
Ejemplo n.º 10
0
def parse_args(argv=None):
    """Parse the command line of ``mmt create``.

    Options that are not declared here are collected by ``parse_known_args``
    and handed over to ``train.parse_extra_argv``.

    Args:
        argv: the list of argument strings; ``None`` makes argparse read
            ``sys.argv``.

    Returns:
        A ``(args, extra_args)`` tuple: the parsed namespace and the value
        returned by ``train.parse_extra_argv`` for the unrecognized options.

    Raises:
        CLIArgsException: if both ``--vocabulary`` and ``--from-model`` are
            specified. ``train.verify_tensorboard_dependencies`` and
            ``train.parse_extra_argv`` may raise it as well.
    """
    cli = argparse.ArgumentParser(prog='mmt create',
                                  description='Create a new ModernMT engine from scratch')

    # Positional arguments
    cli.add_argument('src_lang', metavar='SOURCE_LANGUAGE', help='the source language (ISO 639-1)')
    cli.add_argument('tgt_lang', metavar='TARGET_LANGUAGE', help='the target language (ISO 639-1)')
    cli.add_argument('input_path', metavar='INPUT', help='the path to the parallel corpora collection')

    # General options
    cli.add_argument('-e', '--engine', dest='engine', default='default',
                     help='the engine name, "default" will be used if absent')
    cli.add_argument('-d', '--debug', dest='debug', action='store_true', default=False,
                     help='prevents temporary files to be removed after execution')
    cli.add_argument('-y', '--yes', dest='force_delete', action='store_true', default=False,
                     help='if set, skip engine overwrite confirmation check')
    cli.add_argument('--resume', dest='resume', action='store_true', default=False,
                     help='resume an interrupted training, '
                          'it can be used also to resume a training after its completion')

    # Data cleaning options
    cleaning = cli.add_argument_group('Data cleaning arguments')
    cleaning.add_argument('--skip-cleaning', dest='skip_cleaning', action='store_true', default=False,
                          help='skip the cleaning step (input corpora MUST be in plain text parallel format)')

    # Data generation options
    datagen = cli.add_argument_group('Data generation arguments')
    datagen.add_argument('--voc-size', dest='voc_size', type=int, default=32768,
                         help='the vocabulary size to use (default is 32768)')
    datagen.add_argument('-T', '--threads', dest='threads', type=int, default=2,
                         help='the number of threads used in bounds search for vocabulary creation (default is 2)')
    datagen.add_argument('--count-threshold', dest='count_threshold', type=int, default=None,
                         help='all tokens with a count less than this threshold will be used '
                              'only for alphabet generation in vocabulary creation, useful for very large corpus')
    datagen.add_argument('--vocabulary', dest='vocabulary_path', metavar='VOCABULARY_PATH', default=None,
                         help='use the specified bpe vocabulary model instead of re-train a new one from scratch')
    datagen.add_argument('--no-test', dest='test_set', action='store_false', default=True,
                         help='skip automatically extraction of a test set from the provided training corpora')

    # Training options
    training = cli.add_argument_group('Train arguments (note: you can use all fairseq cli options)')
    training.add_argument('--from-model', dest='init_model', default=None,
                          help='start the training from the specified model, '
                               'the path must contain "model.pt" and "model.vcb" files')
    training.add_argument('-n', '--checkpoints-num', dest='num_checkpoints', type=int, default=10,
                          help='number of checkpoints to average (default is 10)')
    training.add_argument('--gpus', dest='gpus', type=int, nargs='+', default=None,
                          help='the list of GPUs available for training (default is all available GPUs)')
    training.add_argument('--tensorboard-port', dest='tensorboard_port', type=int, default=None,
                          help='if specified, starts a tensorboard instance during training on the given port')
    training.add_argument('--train-steps', dest='train_steps', type=int, default=None,
                          help='by default the training stops when the validation loss reaches a plateau, with '
                               'this option instead, the training process stops after the specified amount of steps')

    namespace, unknown_argv = cli.parse_known_args(argv)

    # "--vocabulary" and "--from-model" are mutually exclusive.
    has_vocabulary = namespace.vocabulary_path is not None
    has_init_model = namespace.init_model is not None
    if has_vocabulary and has_init_model:
        raise CLIArgsException(
            cli, 'Cannot specify both options: "--vocabulary" and "--from-model"')

    wants_tensorboard = namespace.tensorboard_port is not None
    if wants_tensorboard:
        train.verify_tensorboard_dependencies(cli)

    extra_args = train.parse_extra_argv(cli, unknown_argv)
    return namespace, extra_args