Example #1
def add_arguments(parser):
    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        'model_path', metavar='MODEL-FILE', type=str,
        help='the model file that will be used to score text')
    argument_group.add_argument(
        'input_file', metavar='TEXT-FILE', type=TextFileType('r'),
        help='text file containing text to be scored (UTF-8, one sentence per '
             'line, assumed to be compressed if the name ends in ".gz")')
    argument_group.add_argument(
        '--output-file', metavar='FILE', type=TextFileType('w'), default='-',
        help='where to write the statistics (default stdout, will be '
             'compressed if the name ends in ".gz")')

    argument_group = parser.add_argument_group("scoring")
    argument_group.add_argument(
        '--output', metavar='DETAIL', type=str, default='perplexity',
        help='what to output, one of "perplexity", "utterance-scores", '
             '"word-scores" (default "perplexity")')
    argument_group.add_argument(
        '--log-base', metavar='B', type=int, default=None,
        help='convert output log probabilities to base B (default is the '
             'natural logarithm)')
    argument_group.add_argument(
        '--unk-penalty', metavar='LOGPROB', type=float, default=None,
        help="if LOGPROB is zero, do not include <unk> tokens in perplexity "
             "computation; otherwise use constant LOGPROB as <unk> token score "
             "(default is to use the network to predict <unk> probability)")
Example #2
def add_arguments(parser):
    """Specifies the command line arguments supported by the "theanolm sample"
    command.

    :type parser: argparse.ArgumentParser
    :param parser: a command line argument parser
    """

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        'model_path', metavar='MODEL-FILE', type=str,
        help='the model file that will be used to generate text')
    argument_group.add_argument(
        '--output-file', metavar='FILE', type=TextFileType('w'), default='-',
        help='where to write the generated sentences (default stdout, will be '
             'compressed if the name ends in ".gz")')

    argument_group = parser.add_argument_group("sampling")
    argument_group.add_argument(
        '--num-sentences', metavar='N', type=int, default=10,
        help='generate N sentences')
    argument_group.add_argument(
        '--random-seed', metavar='N', type=int, default=None,
        help='seed to initialize the random state (default is to seed from a '
             'random source provided by the operating system)')

    argument_group = parser.add_argument_group("debugging")
    argument_group.add_argument(
        '--debug', action="store_true",
        help='enables debugging Theano errors')
Example #3
def main():
    parser = argparse.ArgumentParser(prog='wctool')

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        '--training-set',
        metavar='FILE',
        type=TextFileType('r'),
        nargs='+',
        required=True,
        help='text or .gz files containing training data (one sentence per '
        'line)')
    argument_group.add_argument(
        '--vocabulary',
        metavar='FILE',
        type=TextFileType('r'),
        default=None,
        help='text or .gz file containing a list of words to include in class '
        'forming, and possibly their initial classes')
    argument_group.add_argument(
        '--vocabulary-format',
        metavar='FORMAT',
        type=str,
        default='words',
        help='vocabulary format, one of "words" (one word per line, default), '
        '"classes" (word and class ID per line), "srilm-classes" (class '
        'name, membership probability, and word per line)')
    argument_group.add_argument(
        '--output-file',
        metavar='FILE',
        type=TextFileType('w'),
        default='-',
        help='where to write the word classes (default stdout)')
    argument_group.add_argument(
        '--output-format',
        metavar='FORMAT',
        type=str,
        default='srilm-classes',
        help='format of the output file, one of "classes" (word and class ID '
        'per line), "srilm-classes" (default; class name, membership '
        'probability, and word per line)')
    argument_group.add_argument(
        '--output-frequency',
        metavar='N',
        type=int,
        default=1,
        help='save classes N times per optimization iteration (default 1)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--num-classes',
        metavar='N',
        type=int,
        default=2000,
        help='number of classes to form, if vocabulary is not specified '
        '(default 2000)')
    argument_group.add_argument(
        '--method',
        metavar='NAME',
        type=str,
        default='bigram-theano',
        help='method for creating word classes, one of "bigram-theano", '
        '"bigram-numpy" (default "bigram-theano")')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file',
        metavar='FILE',
        type=str,
        default='-',
        help='path where to write the log file (default is standard output)')
    argument_group.add_argument(
        '--log-level',
        metavar='LEVEL',
        type=str,
        default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
        '(default "info")')
    argument_group.add_argument(
        '--log-interval',
        metavar='N',
        type=int,
        default=1000,
        help='print statistics after every Nth word; quiet if less than one '
        '(default 1000)')

    args = parser.parse_args()

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout,
                            format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file,
                            format=log_format,
                            level=log_level)

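    # Build the vocabulary either from the training corpus (forming
    # --num-classes classes) or from the given vocabulary file; in the corpus
    # case the training files are rewound for the statistics pass below.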
    if args.vocabulary is None:
        vocabulary = Vocabulary.from_corpus(args.training_set,
                                            args.num_classes)
        for subset_file in args.training_set:
            subset_file.seek(0)
    else:
        vocabulary = Vocabulary.from_file(args.vocabulary,
                                          args.vocabulary_format)

    print("Number of words in vocabulary:", vocabulary.num_words())
    print("Number of word classes:", vocabulary.num_classes())
    print("Number of normal word classes:", vocabulary.num_normal_classes)

    logging.info("Reading word unigram and bigram statistics.")
    statistics = WordStatistics(args.training_set, vocabulary)

    if args.method == 'bigram-theano':
        optimizer = TheanoBigramOptimizer(statistics, vocabulary)
    elif args.method == 'bigram-numpy':
        optimizer = NumpyBigramOptimizer(statistics, vocabulary)
    else:
        raise ValueError("Invalid method requested: " + args.method)

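    # Exchange-style optimization: on each iteration, try to move every word to
    # the class that gives the best log likelihood, and stop once a full pass
    # over the vocabulary produces no moves.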
    iteration = 1
    while True:
        logging.info("Starting iteration %d.", iteration)
        num_words = 0
        num_moves = 0
        for word in vocabulary.words():
            start_time = time()
            num_words += 1
            if optimizer.move_to_best_class(word):
                num_moves += 1
            duration = time() - start_time
            if (args.log_interval >= 1) and \
               (num_words % args.log_interval == 0):
                logging.info(
                    "[%d] (%.1f %%) of iteration %d -- moves = %d, cost = %.2f, duration = %.1f ms",
                    num_words, num_words / vocabulary.num_words() * 100,
                    iteration, num_moves, optimizer.log_likelihood(),
                    duration * 1000)
            if is_scheduled(num_words, args.output_frequency,
                            vocabulary.num_words()):
                save(optimizer, args.output_file, args.output_format)

        if num_moves == 0:
            break
        iteration += 1

    logging.info("Optimization finished.")
    save(optimizer, args.output_file, args.output_format)
Example #4
def add_arguments(parser):
    """Specifies the command line arguments supported by the "theanolm train"
    command.

    :type parser: argparse.ArgumentParser
    :param parser: a command line argument parser
    """

    argument_group = parser.add_argument_group("data")
    argument_group.add_argument(
        'model_path',
        metavar='MODEL-FILE',
        type=str,
        help='path where the best model state will be saved in HDF5 binary '
        'data format')
    argument_group.add_argument(
        '--training-set',
        metavar='FILE',
        type=TextFileType('r'),
        nargs='+',
        required=True,
        help='text files containing training data (UTF-8, one sentence per '
        'line, assumed to be compressed if the name ends in ".gz")')
    argument_group.add_argument(
        '--validation-file',
        metavar='VALID-FILE',
        type=TextFileType('r'),
        default=None,
        help='text file containing validation data for early stopping (UTF-8, '
        'one sentence per line, assumed to be compressed if the name ends '
        'in ".gz")')

    argument_group = parser.add_argument_group("vocabulary")
    argument_group.add_argument(
        '--vocabulary',
        metavar='FILE',
        type=str,
        default=None,
        help='word or class vocabulary to be used in the neural network input '
        'and output, in the format specified by the --vocabulary-format '
        'argument (UTF-8 text, default is to use all the words from the '
        'training data)')
    argument_group.add_argument(
        '--vocabulary-format',
        metavar='FORMAT',
        type=str,
        default='words',
        help='format of the file specified with --vocabulary argument, one of '
        '"words" (one word per line, default), "classes" (word and class '
        'ID per line), "srilm-classes" (class name, membership '
        'probability, and word per line)')
    argument_group.add_argument(
        '--num-classes',
        metavar='N',
        type=int,
        default=None,
        help='generate N classes using a simple word frequency based algorithm '
        'when --vocabulary argument is not given (default is to not use '
        'word classes)')

    argument_group = parser.add_argument_group("network architecture")
    argument_group.add_argument(
        '--architecture',
        metavar='FILE',
        type=str,
        default='lstm300',
        help='path to neural network architecture description, or a standard '
        'architecture name, "lstm300" or "lstm1500" (default "lstm300")')

    argument_group = parser.add_argument_group("training process")
    argument_group.add_argument(
        '--sampling',
        metavar='FRACTION',
        type=float,
        nargs='*',
        default=[],
        help='randomly sample only FRACTION of each training file on each '
        'epoch (list the fractions in the same order as the training '
        'files)')
    argument_group.add_argument(
        '--sequence-length',
        metavar='N',
        type=int,
        default=100,
        help='ignore sentences longer than N words (default 100)')
    argument_group.add_argument(
        '--batch-size',
        metavar='N',
        type=int,
        default=16,
        help='each mini-batch will contain N sentences (default 16)')
    argument_group.add_argument(
        '--validation-frequency',
        metavar='N',
        type=int,
        default=5,
        help='cross-validate for reducing learning rate or early stopping N '
        'times per training epoch (default 5)')
    argument_group.add_argument(
        '--patience',
        metavar='N',
        type=int,
        default=4,
        help='allow perplexity to increase for N consecutive cross-validations '
        'before decreasing learning rate; if less than zero, never '
        'decrease learning rate (default 4)')
    argument_group.add_argument(
        '--random-seed',
        metavar='N',
        type=int,
        default=None,
        help='seed to initialize the random state (default is to seed from a '
        'random source provided by the operating system)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--optimization-method',
        metavar='NAME',
        type=str,
        default='adagrad',
        help='optimization method, one of "sgd", "nesterov", "adagrad", '
        '"adadelta", "rmsprop-sgd", "rmsprop-nesterov", "adam" '
        '(default "adagrad")')
    argument_group.add_argument('--learning-rate',
                                metavar='ALPHA',
                                type=float,
                                default=0.1,
                                help='initial learning rate (default 0.1)')
    argument_group.add_argument(
        '--momentum',
        metavar='BETA',
        type=float,
        default=0.9,
        help='momentum coefficient for momentum optimization methods (default '
        '0.9)')
    argument_group.add_argument(
        '--gradient-decay-rate',
        metavar='GAMMA',
        type=float,
        default=0.9,
        help='geometric rate for averaging gradients (default 0.9)')
    argument_group.add_argument(
        '--sqr-gradient-decay-rate',
        metavar='GAMMA',
        type=float,
        default=0.999,
        help='geometric rate for averaging squared gradients in Adam optimizer '
        '(default 0.999)')
    argument_group.add_argument(
        '--numerical-stability-term',
        metavar='EPSILON',
        type=float,
        default=1e-6,
        help='a value that is used to prevent instability when dividing by '
        'very small numbers (default 1e-6)')
    argument_group.add_argument(
        '--gradient-normalization',
        metavar='THRESHOLD',
        type=float,
        default=5,
        help='scale down the gradients if necessary to make sure their norm '
        '(normalized by mini-batch size) will not exceed THRESHOLD '
        '(default 5)')
    argument_group.add_argument(
        '--cost',
        metavar='NAME',
        type=str,
        default='cross-entropy',
        help='cost function, one of "cross-entropy" (default), "nce" '
        '(noise-contrastive estimation), or "blackout"')
    argument_group.add_argument(
        '--num-noise-samples',
        metavar='K',
        type=int,
        default=5,
        help='sampling based costs sample K noise words per one training word '
        '(default 5)')
    argument_group.add_argument(
        '--noise-sharing',
        metavar='SHARING',
        type=str,
        default=None,
        help='can be "seq" for sharing noise samples between mini-batch '
        'sequences, or "batch" for sharing noise samples across the entire '
        'mini-batch for improved speed (default is no sharing, which is '
        'very slow)')
    argument_group.add_argument(
        '--noise-dampening',
        metavar='ALPHA',
        type=float,
        default=0.5,
        help='the empirical unigram distribution is raised to the power ALPHA '
        'before sampling noise words; 0.0 corresponds to the uniform '
        'distribution and 1.0 corresponds to the unigram distribution '
        '(default 0.5)')
    argument_group.add_argument(
        '--exclude-unk',
        action="store_true",
        help="exclude <unk> tokens from cost and perplexity computations")
    argument_group.add_argument(
        '--weights',
        metavar='LAMBDA',
        type=float,
        nargs='*',
        default=[],
        help='scale a mini-batch update by LAMBDA if the data is from the '
        'corresponding training file (list the weights in the same order '
        'as the training files)')

    argument_group = parser.add_argument_group("early stopping")
    argument_group.add_argument(
        '--stopping-criterion',
        metavar='NAME',
        type=str,
        default='annealing-count',
        help='selects a criterion for early-stopping, one of "epoch-count" '
        '(fixed number of epochs), "no-improvement" (no improvement since '
        'learning rate was decreased), "annealing-count" (default, '
        'learning rate is decreased a fixed number of times)')
    argument_group.add_argument(
        '--min-epochs',
        metavar='N',
        type=int,
        default=1,
        help='perform at least N training epochs (default 1)')
    argument_group.add_argument(
        '--max-epochs',
        metavar='N',
        type=int,
        default=100,
        help='perform at most N training epochs (default 100)')
    argument_group.add_argument(
        '--max-annealing-count',
        metavar='N',
        type=int,
        default=0,
        help='when using annealing-count stopping criterion, continue training '
        'after decreasing learning rate at most N times (default 0)')

    argument_group = parser.add_argument_group("configuration")
    argument_group.add_argument(
        '--default-device',
        metavar='DEVICE',
        type=str,
        default=None,
        help='when multiple GPUs are present, use DEVICE as default')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file',
        metavar='FILE',
        type=str,
        default='-',
        help='path where to write the log file (default is standard output)')
    argument_group.add_argument(
        '--log-level',
        metavar='LEVEL',
        type=str,
        default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
        '(default "info")')
    argument_group.add_argument(
        '--log-interval',
        metavar='N',
        type=int,
        default=1000,
        help='print statistics of every Nth mini-batch update; quiet if less '
        'than one (default 1000)')
    argument_group.add_argument(
        '--debug',
        action="store_true",
        help='use test values to get better error messages from Theano')
    argument_group.add_argument('--print-graph',
                                action="store_true",
                                help='print Theano computation graph')
    argument_group.add_argument('--profile',
                                action="store_true",
                                help='enable profiling Theano functions')
Example #5
def add_arguments(parser):
    """Specifies the command line arguments supported by the "theanolm score"
    command.

    :type parser: argparse.ArgumentParser
    :param parser: a command line argument parser
    """

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        'model_path',
        metavar='MODEL-FILE',
        type=str,
        help='the model file that will be used to score text')
    argument_group.add_argument(
        'input_file',
        metavar='TEXT-FILE',
        type=TextFileType('r'),
        help='text file containing text to be scored (UTF-8, one sentence per '
        'line, assumed to be compressed if the name ends in ".gz")')
    argument_group.add_argument(
        '--output-file',
        metavar='FILE',
        type=TextFileType('w'),
        default='-',
        help='where to write the statistics (default stdout, will be '
        'compressed if the name ends in ".gz")')

    argument_group = parser.add_argument_group("scoring")
    argument_group.add_argument(
        '--output',
        metavar='DETAIL',
        type=str,
        default='perplexity',
        help='what to output, one of "perplexity", "utterance-scores", '
        '"word-scores" (default "perplexity")')
    argument_group.add_argument(
        '--log-base',
        metavar='B',
        type=int,
        default=None,
        help='convert output log probabilities to base B (default is the '
        'natural logarithm)')
    argument_group.add_argument(
        '--exclude-unk',
        action="store_true",
        help="exclude <unk> tokens from perplexity computation")
    argument_group.add_argument(
        '--subwords',
        metavar='MARKING',
        type=str,
        default=None,
        help='the subword vocabulary uses MARKING to indicate how words are '
        'formed from subwords; one of "word-boundary" (<w> token '
        'separates words), "prefix-affix" (subwords that can be '
        'concatenated are prefixed or affixed with +, e.g. "cat+ +s")')
    argument_group.add_argument(
        '--shortlist',
        action="store_true",
        help='distribute <unk> token probability among the out-of-shortlist '
        'words according to their unigram frequencies in the training '
        'data')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file',
        metavar='FILE',
        type=str,
        default='-',
        help='path where to write the log file (default is standard output)')
    argument_group.add_argument(
        '--log-level',
        metavar='LEVEL',
        type=str,
        default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
        '(default "info")')
    argument_group.add_argument(
        '--debug',
        action="store_true",
        help='use test values to get better error messages from Theano')
    argument_group.add_argument('--profile',
                                action="store_true",
                                help='enable profiling Theano functions')
Example #6
def decode(args):
    """A function that performs the "theanolm decode" command.

    :type args: argparse.Namespace
    :param args: a collection of command line arguments
    """

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        print("Invalid logging level requested:", args.log_level)
        sys.exit(1)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout,
                            format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file,
                            format=log_format,
                            level=log_level)

    if args.debug:
        theano.config.compute_test_value = 'warn'
    else:
        theano.config.compute_test_value = 'off'
    theano.config.profile = args.profile
    theano.config.profile_memory = args.profile

    network = Network.from_file(args.model_path,
                                mode=Network.Mode(minibatch=False))

    log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base)

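    # Convert the word insertion penalty from --log-base to the natural
    # logarithm used internally, and interpret --unk-penalty: zero means <unk>
    # tokens are skipped entirely, any other value is used as a constant <unk>
    # score, and None (the default) lets the network predict <unk>.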
    if args.wi_penalty is None:
        wi_penalty = None
    else:
        wi_penalty = args.wi_penalty * log_scale
    if args.unk_penalty is None:
        ignore_unk = False
        unk_penalty = None
    elif args.unk_penalty == 0:
        ignore_unk = True
        unk_penalty = None
    else:
        ignore_unk = False
        unk_penalty = args.unk_penalty
    decoding_options = {
        'nnlm_weight': args.nnlm_weight,
        'lm_scale': args.lm_scale,
        'wi_penalty': wi_penalty,
        'ignore_unk': ignore_unk,
        'unk_penalty': unk_penalty,
        'linear_interpolation': args.linear_interpolation,
        'max_tokens_per_node': args.max_tokens_per_node,
        'beam': args.beam,
        'recombination_order': args.recombination_order
    }
    logging.debug("DECODING OPTIONS")
    for option_name, option_value in decoding_options.items():
        logging.debug("%s: %s", option_name, str(option_value))

    print("Building word lattice decoder.")
    sys.stdout.flush()
    decoder = LatticeDecoder(network, decoding_options)

    # Combine paths from command line and lattice list.
    lattices = args.lattices
    if args.lattice_list is not None:
        lattices.extend(args.lattice_list.readlines())
    lattices = [path.strip() for path in lattices]
    # Ignore empty lines in the lattice list.
    lattices = [x for x in lattices if x]
    # Pick every Ith lattice, if --num-jobs is specified and > 1.
    if args.num_jobs < 1:
        print("Invalid number of jobs specified:", args.num_jobs)
        sys.exit(1)
    if (args.job < 0) or (args.job > args.num_jobs - 1):
        print("Invalid job specified:", args.job)
        sys.exit(1)
    lattices = lattices[args.job::args.num_jobs]

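    # Decode each lattice assigned to this job and write its N best paths in
    # the requested output format.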
    file_type = TextFileType('r')
    for index, path in enumerate(lattices):
        logging.info("Reading word lattice: %s", path)
        lattice_file = file_type(path)
        lattice = SLFLattice(lattice_file)

        if lattice.utterance_id is not None:
            utterance_id = lattice.utterance_id
        else:
            utterance_id = os.path.basename(lattice_file.name)
        logging.info("Utterance `%s' -- %d/%d of job %d", utterance_id,
                     index + 1, len(lattices), args.job)
        tokens = decoder.decode(lattice)

        for index in range(min(args.n_best, len(tokens))):
            line = format_token(tokens[index], utterance_id,
                                network.vocabulary, log_scale, args.output)
            args.output_file.write(line + "\n")
Example #7
def add_arguments(parser):
    """Specifies the command line arguments supported by the "theanolm decode"
    command.

    :type parser: argparse.ArgumentParser
    :param parser: a command line argument parser
    """

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        'model_path',
        metavar='MODEL-FILE',
        type=str,
        help='the model file that will be used to decode the lattice')
    argument_group.add_argument(
        '--lattices',
        metavar='FILE',
        type=str,
        nargs='*',
        default=[],
        help='word lattices to be decoded (SLF, assumed to be compressed if '
        'the name ends in ".gz")')
    argument_group.add_argument(
        '--lattice-list',
        metavar='FILE',
        type=TextFileType('r'),
        help='text file containing a list of word lattices to be decoded (one '
        'path to an SLF file per line, the list and the SLF files are '
        'assumed to be compressed if the name ends in ".gz")')
    argument_group.add_argument(
        '--output-file',
        metavar='FILE',
        type=TextFileType('w'),
        default='-',
        help='where to write the best paths through the lattices (default '
        'stdout, will be compressed if the name ends in ".gz")')
    argument_group.add_argument(
        '--num-jobs',
        metavar='J',
        type=int,
        default=1,
        help='divide the set of lattice files into J distinct batches, and '
        'process only batch I')
    argument_group.add_argument(
        '--job',
        metavar='I',
        type=int,
        default=0,
        help='the index of the batch that this job should process, between 0 '
        'and J-1')

    argument_group = parser.add_argument_group("decoding")
    argument_group.add_argument(
        '--output',
        metavar='FORMAT',
        type=str,
        default='ref',
        help='format of the output, one of "ref" (default, utterance ID '
        'followed by words), "trn" (words followed by utterance ID in '
        'parentheses), "full" (utterance ID, acoustic score, language '
        'score, and number of words, followed by words)')
    argument_group.add_argument(
        '--n-best',
        metavar='N',
        type=int,
        default=1,
        help='print N best paths of each lattice (default 1)')
    argument_group.add_argument(
        '--nnlm-weight',
        metavar='LAMBDA',
        type=float,
        default=1.0,
        help="language model probabilities given by the model read from "
        "MODEL-FILE will be weighted by LAMBDA, when interpolating with "
        "the language model probabilities in the lattice (default is 1.0, "
        "meaning that the LM probabilities in the lattice will be "
        "ignored)")
    argument_group.add_argument(
        '--lm-scale',
        metavar='LMSCALE',
        type=float,
        default=None,
        help="scale language model log probabilities by LMSCALE when computing "
        "the total probability of a path (default is to use the LM scale "
        "specified in the lattice file, or 1.0 if not specified)")
    argument_group.add_argument(
        '--wi-penalty',
        metavar='WIP',
        type=float,
        default=None,
        help="penalize word insertion by adding WIP to the total log "
        "probability as many times as there are words in the path "
        "(without scaling WIP by LMSCALE)")
    argument_group.add_argument(
        '--log-base',
        metavar='B',
        type=int,
        default=None,
        help="convert output log probabilities to base B and WIP from base B "
        "(default is natural logarithm; this does not affect reading "
        "lattices, since they specify their internal log base)")
    argument_group.add_argument(
        '--unk-penalty',
        metavar='LOGPROB',
        type=float,
        default=None,
        help="if LOGPROB is zero, do not include <unk> tokens in perplexity "
        "computation; otherwise use constant LOGPROB as <unk> token score "
        "(default is to use the network to predict <unk> probability)")
    argument_group.add_argument(
        '--linear-interpolation',
        action="store_true",
        help="use linear interpolation of language model probabilities, "
        "instead of (pseudo) log-linear")

    argument_group = parser.add_argument_group("pruning")
    argument_group.add_argument(
        '--max-tokens-per-node',
        metavar='T',
        type=int,
        default=None,
        help="keep only at most T tokens at each node when decoding a lattice "
        "(default is no limit)")
    argument_group.add_argument(
        '--beam',
        metavar='B',
        type=float,
        default=None,
        help="prune tokens whose log probability is at least B smaller than "
        "the log probability of the best token at any given time (default "
        "is no beam pruning)")
    argument_group.add_argument(
        '--recombination-order',
        metavar='O',
        type=int,
        default=None,
        help="keep only the best token, when at least O previous words are "
        "identical (default is to recombine tokens only if the entire "
        "word history matches)")

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file',
        metavar='FILE',
        type=str,
        default='-',
        help='path where to write the log file (default is standard output)')
    argument_group.add_argument(
        '--log-level',
        metavar='LEVEL',
        type=str,
        default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
        '(default "info")')
    argument_group.add_argument('--debug',
                                action="store_true",
                                help='enables debugging Theano errors')
    argument_group.add_argument('--profile',
                                action="store_true",
                                help='enables profiling Theano functions')
Example #8
def add_arguments(parser):
    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        'model_path', metavar='MODEL-FILE', type=str,
        help='path where the best model state will be saved in HDF5 binary '
             'data format')
    argument_group.add_argument(
        'validation_file', metavar='VALID-FILE', type=TextFileType('r'),
        help='text file containing validation data for early stopping (UTF-8, '
             'one sentence per line, assumed to be compressed if the name ends '
             'in ".gz")')
    argument_group.add_argument(
        '--training-set', metavar='FILE', type=TextFileType('r'), nargs='+',
        required=True,
        help='text files containing training data (UTF-8, one sentence per '
             'line, assumed to be compressed if the name ends in ".gz")')
    argument_group.add_argument(
        '--vocabulary', metavar='FILE', type=str, default=None,
        help='word or class vocabulary to be used in the neural network input '
             'and output, in the format specified by the --vocabulary-format '
             'argument (UTF-8 text, default is to use all the words from the '
             'training data)')
    argument_group.add_argument(
        '--vocabulary-format', metavar='FORMAT', type=str, default='words',
        help='format of the file specified with --vocabulary argument, one of '
             '"words" (one word per line, default), "classes" (word and class '
             'ID per line), "srilm-classes" (class name, membership '
             'probability, and word per line)')
    argument_group = parser.add_argument_group("network architecture")
    argument_group.add_argument(
        '--architecture', metavar='FILE', type=str, default='lstm300',
        help='path to neural network architecture description, or a standard '
             'architecture name, "lstm300" or "lstm1500" (default "lstm300")')
    argument_group.add_argument(
        '--num-classes', metavar='N', type=int, default=None,
        help='generate N classes using a simple word frequency based algorithm '
             'when --vocabulary argument is not given (default is to not use '
             'word classes)')

    argument_group = parser.add_argument_group("training process")
    argument_group.add_argument(
        '--sampling', metavar='FRACTION', type=float, nargs='*', default=[],
        help='randomly sample only FRACTION of each training file on each '
             'epoch (list the fractions in the same order as the training '
             'files)')
    argument_group.add_argument(
        '--training-strategy', metavar='NAME', type=str, default='local-mean',
        help='selects a training and validation strategy, one of "basic", '
            '"local-mean", "local-median", "validation-average" (default '
            '"local-mean")')
    argument_group.add_argument(
        '--sequence-length', metavar='N', type=int, default=100,
        help='ignore sentences longer than N words (default 100)')
    argument_group.add_argument(
        '--batch-size', metavar='N', type=int, default=16,
        help='each mini-batch will contain N sentences (default 16)')
    argument_group.add_argument(
        '--validation-frequency', metavar='N', type=int, default=5,
        help='cross-validate for reducing learning rate or early stopping N '
             'times per training epoch (default 5)')
    argument_group.add_argument(
        '--patience', metavar='N', type=int, default=4,
        help='allow perplexity to increase for N consecutive cross-validations '
             'before decreasing learning rate; if less than zero, never '
             'decrease learning rate (default 4)')
    argument_group.add_argument(
        '--random-seed', metavar='N', type=int, default=None,
        help='seed to initialize the random state (default is to seed from a '
             'random source provided by the operating system)')
    
    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--optimization-method', metavar='NAME', type=str, default='adagrad',
        help='optimization method, one of "sgd", "nesterov", "adagrad", '
             '"adadelta", "rmsprop-sgd", "rmsprop-nesterov", "adam" '
             '(default "adagrad")')
    argument_group.add_argument(
        '--learning-rate', metavar='ALPHA', type=float, default=0.1,
        help='initial learning rate (default 0.1)')
    argument_group.add_argument(
        '--momentum', metavar='BETA', type=float, default=0.9,
        help='momentum coefficient for momentum optimization methods (default '
             '0.9)')
    argument_group.add_argument(
        '--gradient-decay-rate', metavar='GAMMA', type=float, default=0.9,
        help='geometric rate for averaging gradients (default 0.9)')
    argument_group.add_argument(
        '--sqr-gradient-decay-rate', metavar='GAMMA', type=float, default=0.999,
        help='geometric rate for averaging squared gradients in Adam optimizer '
             '(default 0.999)')
    argument_group.add_argument(
        '--numerical-stability-term', metavar='EPSILON', type=float,
        default=1e-6,
        help='a value that is used to prevent instability when dividing by '
             'very small numbers (default 1e-6)')
    argument_group.add_argument(
        '--gradient-normalization', metavar='THRESHOLD', type=float,
        default=5,
        help='scale down the gradients if necessary to make sure their norm '
             '(normalized by mini-batch size) will not exceed THRESHOLD '
             '(default 5)')
    argument_group.add_argument(
        '--unk-penalty', metavar='LOGPROB', type=float, default=None,
        help="if LOGPROB is zero, do not include <unk> tokens in perplexity "
             "computation; otherwise use constant LOGPROB as <unk> token score "
             "(default is to use the network to predict <unk> probability)")
    argument_group.add_argument(
        '--weights', metavar='LAMBDA', type=float, nargs='*', default=[],
        help='scale a mini-batch update by LAMBDA if the data is from the '
             'corresponding training file (list the weights in the same order '
             'as the training files)')

    argument_group = parser.add_argument_group("early stopping")
    argument_group.add_argument(
        '--stopping-criterion', metavar='NAME', type=str,
        default='annealing-count',
        help='selects a criterion for early-stopping, one of "epoch-count" '
             '(fixed number of epochs), "no-improvement" (no improvement since '
             'learning rate was decreased), "annealing-count" (default, '
             'learning rate is decreased a fixed number of times)')
    argument_group.add_argument(
        '--min-epochs', metavar='N', type=int, default=1,
        help='perform at least N training epochs (default 1)')
    argument_group.add_argument(
        '--max-epochs', metavar='N', type=int, default=100,
        help='perform at most N training epochs (default 100)')
    argument_group.add_argument(
        '--max-annealing-count', metavar='N', type=int, default=0,
        help='when using annealing-count stopping criterion, continue training '
             'after decreasing learning rate at most N times (default 0)')
    
    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file', metavar='FILE', type=str, default='-',
        help='path where to write the log file (default is standard output)')
    argument_group.add_argument(
        '--log-level', metavar='LEVEL', type=str, default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
             '(default "info")')
    argument_group.add_argument(
        '--log-interval', metavar='N', type=int, default=1000,
        help='print statistics of every Nth mini-batch update; quiet if less '
             'than one (default 1000)')
    argument_group.add_argument(
        '--debug', action="store_true",
        help='enables debugging Theano errors')
    argument_group.add_argument(
        '--profile', action="store_true",
        help='enables profiling Theano functions')