def add_arguments(parser):
    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        'model_path', metavar='MODEL-FILE', type=str,
        help='the model file that will be used to score text')
    argument_group.add_argument(
        'input_file', metavar='TEXT-FILE', type=TextFileType('r'),
        help='text file containing text to be scored (UTF-8, one sentence per '
             'line, assumed to be compressed if the name ends in ".gz")')
    argument_group.add_argument(
        '--output-file', metavar='FILE', type=TextFileType('w'), default='-',
        help='where to write the statistics (default stdout, will be '
             'compressed if the name ends in ".gz")')

    argument_group = parser.add_argument_group("scoring")
    argument_group.add_argument(
        '--output', metavar='DETAIL', type=str, default='perplexity',
        help='what to output, one of "perplexity", "utterance-scores", '
             '"word-scores" (default "perplexity")')
    argument_group.add_argument(
        '--log-base', metavar='B', type=int, default=None,
        help='convert output log probabilities to base B (default is the '
             'natural logarithm)')
    argument_group.add_argument(
        '--unk-penalty', metavar='LOGPROB', type=float, default=None,
        help="if LOGPROB is zero, do not include <unk> tokens in perplexity "
             "computation; otherwise use constant LOGPROB as <unk> token score "
             "(default is to use the network to predict <unk> probability)")
def add_arguments(parser): """Specifies the command line arguments supported by the "theanolm sample" command. :type parser: argparse.ArgumentParser :param parser: a command line argument parser """ argument_group = parser.add_argument_group("files") argument_group.add_argument( 'model_path', metavar='MODEL-FILE', type=str, help='the model file that will be used to generate text') argument_group.add_argument( '--output-file', metavar='FILE', type=TextFileType('w'), default='-', help='where to write the generated sentences (default stdout, will be ' 'compressed if the name ends in ".gz")') argument_group = parser.add_argument_group("sampling") argument_group.add_argument( '--num-sentences', metavar='N', type=int, default=10, help='generate N sentences') argument_group.add_argument( '--random-seed', metavar='N', type=int, default=None, help='seed to initialize the random state (default is to seed from a ' 'random source provided by the oprating system)') argument_group = parser.add_argument_group("debugging") argument_group.add_argument( '--debug', action="store_true", help='enables debugging Theano errors')
def main():
    parser = argparse.ArgumentParser(prog='wctool')

    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        '--training-set', metavar='FILE', type=TextFileType('r'), nargs='+',
        required=True,
        help='text or .gz files containing training data (one sentence per '
             'line)')
    argument_group.add_argument(
        '--vocabulary', metavar='FILE', type=TextFileType('r'), default=None,
        help='text or .gz file containing a list of words to include in class '
             'forming, and possibly their initial classes')
    argument_group.add_argument(
        '--vocabulary-format', metavar='FORMAT', type=str, default='words',
        help='vocabulary format, one of "words" (one word per line, default), '
             '"classes" (word and class ID per line), "srilm-classes" (class '
             'name, membership probability, and word per line)')
    argument_group.add_argument(
        '--output-file', metavar='FILE', type=TextFileType('w'), default='-',
        help='where to write the word classes (default stdout)')
    argument_group.add_argument(
        '--output-format', metavar='FORMAT', type=str, default='srilm-classes',
        help='format of the output file, one of "classes" (word and class ID '
             'per line), "srilm-classes" (default; class name, membership '
             'probability, and word per line)')
    argument_group.add_argument(
        '--output-frequency', metavar='N', type=int, default=1,
        help='save classes N times per optimization iteration (default 1)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--num-classes', metavar='N', type=int, default=2000,
        help='number of classes to form, if vocabulary is not specified '
             '(default 2000)')
    argument_group.add_argument(
        '--method', metavar='NAME', type=str, default='bigram-theano',
        help='method for creating word classes, one of "bigram-theano", '
             '"bigram-numpy" (default "bigram-theano")')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file', metavar='FILE', type=str, default='-',
        help='path where to write log file (default is standard output)')
    argument_group.add_argument(
        '--log-level', metavar='LEVEL', type=str, default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
             '(default "info")')
    argument_group.add_argument(
        '--log-interval', metavar='N', type=int, default=1000,
        help='print statistics after every Nth word; quiet if less than one '
             '(default 1000)')

    args = parser.parse_args()

    log_file = args.log_file
    log_level = getattr(logging, args.log_level.upper(), None)
    if not isinstance(log_level, int):
        raise ValueError("Invalid logging level requested: " + args.log_level)
    log_format = '%(asctime)s %(funcName)s: %(message)s'
    if args.log_file == '-':
        logging.basicConfig(stream=sys.stdout, format=log_format,
                            level=log_level)
    else:
        logging.basicConfig(filename=log_file, format=log_format,
                            level=log_level)

    if args.vocabulary is None:
        vocabulary = Vocabulary.from_corpus(args.training_set,
                                            args.num_classes)
        for subset_file in args.training_set:
            subset_file.seek(0)
    else:
        vocabulary = Vocabulary.from_file(args.vocabulary,
                                          args.vocabulary_format)

    print("Number of words in vocabulary:", vocabulary.num_words())
    print("Number of word classes:", vocabulary.num_classes())
    print("Number of normal word classes:", vocabulary.num_normal_classes)

    logging.info("Reading word unigram and bigram statistics.")
    statistics = WordStatistics(args.training_set, vocabulary)

    if args.method == 'bigram-theano':
        optimizer = TheanoBigramOptimizer(statistics, vocabulary)
    elif args.method == 'bigram-numpy':
        optimizer = NumpyBigramOptimizer(statistics, vocabulary)
    else:
        raise ValueError("Invalid method requested: " + args.method)

    iteration = 1
    while True:
        logging.info("Starting iteration %d.", iteration)
        num_words = 0
        num_moves = 0
        for word in vocabulary.words():
            start_time = time()
            num_words += 1
            if optimizer.move_to_best_class(word):
                num_moves += 1
            duration = time() - start_time
            if (args.log_interval >= 1) and \
               (num_words % args.log_interval == 0):
                logging.info(
                    "[%d] (%.1f %%) of iteration %d -- moves = %d, "
                    "cost = %.2f, duration = %.1f ms",
                    num_words,
                    num_words / vocabulary.num_words() * 100,
                    iteration,
                    num_moves,
                    optimizer.log_likelihood(),
                    duration * 1000)  # convert seconds to milliseconds
            if is_scheduled(num_words, args.output_frequency,
                            vocabulary.num_words()):
                save(optimizer, args.output_file, args.output_format)
        if num_moves == 0:
            break
        iteration += 1

    logging.info("Optimization finished.")
    save(optimizer, args.output_file, args.output_format)
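# The is_scheduled() and save() helpers used by main() are not defined in
# this excerpt. Below is a minimal sketch of what is_scheduled() could look
# like, assuming it fires N (= frequency) times per iteration at evenly
# spaced word counts. This is an illustrative guess, not the actual
# implementation.
def is_scheduled(num_words, frequency, words_per_iteration):
    """Returns True if ``num_words`` has just crossed one of ``frequency``
    evenly spaced checkpoints within an iteration of ``words_per_iteration``
    words.
    """
    if frequency < 1:
        return False
    interval = words_per_iteration / frequency
    # A checkpoint is crossed when the integer number of elapsed intervals
    # increases between the previous word and this one.
    return int(num_words / interval) > int((num_words - 1) / interval)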
def add_arguments(parser): """Specifies the command line arguments supported by the "theanolm train" command. :type parser: argparse.ArgumentParser :param parser: a command line argument parser """ argument_group = parser.add_argument_group("data") argument_group.add_argument( 'model_path', metavar='MODEL-FILE', type=str, help='path where the best model state will be saved in HDF5 binary ' 'data format') argument_group.add_argument( '--training-set', metavar='FILE', type=TextFileType('r'), nargs='+', required=True, help='text files containing training data (UTF-8, one sentence per ' 'line, assumed to be compressed if the name ends in ".gz")') argument_group.add_argument( '--validation-file', metavar='VALID-FILE', type=TextFileType('r'), default=None, help='text file containing validation data for early stopping (UTF-8, ' 'one sentence per line, assumed to be compressed if the name ends ' 'in ".gz")') argument_group = parser.add_argument_group("vocabulary") argument_group.add_argument( '--vocabulary', metavar='FILE', type=str, default=None, help='word or class vocabulary to be used in the neural network input ' 'and output, in the format specified by the --vocabulary-format ' 'argument (UTF-8 text, default is to use all the words from the ' 'training data)') argument_group.add_argument( '--vocabulary-format', metavar='FORMAT', type=str, default='words', help='format of the file specified with --vocabulary argument, one of ' '"words" (one word per line, default), "classes" (word and class ' 'ID per line), "srilm-classes" (class name, membership ' 'probability, and word per line)') argument_group.add_argument( '--num-classes', metavar='N', type=int, default=None, help='generate N classes using a simple word frequency based algorithm ' 'when --vocabulary argument is not given (default is to not use ' 'word classes)') argument_group = parser.add_argument_group("network architecture") argument_group.add_argument( '--architecture', metavar='FILE', type=str, default='lstm300', help='path to neural network architecture description, or a standard ' 'architecture name, "lstm300" or "lstm1500" (default "lstm300")') argument_group = parser.add_argument_group("training process") argument_group.add_argument( '--sampling', metavar='FRACTION', type=float, nargs='*', default=[], help='randomly sample only FRACTION of each training file on each ' 'epoch (list the fractions in the same order as the training ' 'files)') argument_group.add_argument( '--sequence-length', metavar='N', type=int, default=100, help='ignore sentences longer than N words (default 100)') argument_group.add_argument( '--batch-size', metavar='N', type=int, default=16, help='each mini-batch will contain N sentences (default 16)') argument_group.add_argument( '--validation-frequency', metavar='N', type=int, default='5', help='cross-validate for reducing learning rate or early stopping N ' 'times per training epoch (default 5)') argument_group.add_argument( '--patience', metavar='N', type=int, default=4, help='allow perplexity to increase N consecutive cross-validations, ' 'before decreasing learning rate; if less than zero, never ' 'decrease learning rate (default 4)') argument_group.add_argument( '--random-seed', metavar='N', type=int, default=None, help='seed to initialize the random state (default is to seed from a ' 'random source provided by the oprating system)') argument_group = parser.add_argument_group("optimization") argument_group.add_argument( '--optimization-method', metavar='NAME', type=str, default='adagrad', 
help='optimization method, one of "sgd", "nesterov", "adagrad", ' '"adadelta", "rmsprop-sgd", "rmsprop-nesterov", "adam" ' '(default "adagrad")') argument_group.add_argument('--learning-rate', metavar='ALPHA', type=float, default=0.1, help='initial learning rate (default 0.1)') argument_group.add_argument( '--momentum', metavar='BETA', type=float, default=0.9, help='momentum coefficient for momentum optimization methods (default ' '0.9)') argument_group.add_argument( '--gradient-decay-rate', metavar='GAMMA', type=float, default=0.9, help='geometric rate for averaging gradients (default 0.9)') argument_group.add_argument( '--sqr-gradient-decay-rate', metavar='GAMMA', type=float, default=0.999, help='geometric rate for averaging squared gradients in Adam optimizer ' '(default 0.999)') argument_group.add_argument( '--numerical-stability-term', metavar='EPSILON', type=float, default=1e-6, help='a value that is used to prevent instability when dividing by ' 'very small numbers (default 1e-6)') argument_group.add_argument( '--gradient-normalization', metavar='THRESHOLD', type=float, default=5, help='scale down the gradients if necessary to make sure their norm ' '(normalized by mini-batch size) will not exceed THRESHOLD ' '(default 5)') argument_group.add_argument( '--cost', metavar='NAME', type=str, default='cross-entropy', help='cost function, one of "cross-entropy" (default), "nce" ' '(noise-contrastive estimation), or "blackout"') argument_group.add_argument( '--num-noise-samples', metavar='K', type=int, default=5, help='sampling based costs sample K noise words per one training word ' '(default 5)') argument_group.add_argument( '--noise-sharing', metavar='SHARING', type=str, default=None, help='can be "seq" for sharing noise samples between mini-batch ' 'sequences, or "batch" for sharing noise samples across einter ' 'mini-batch for improved speed (default is no sharing, which is ' 'very slow)') argument_group.add_argument( '--noise-dampening', metavar='ALPHA', type=float, default=0.5, help='the empirical unigram distribution is raised to the power ALPHA ' 'before sampling noise words; 0.0 corresponds to the uniform ' 'distribution and 1.0 corresponds to the unigram distribution ' '(default 0.5)') argument_group.add_argument( '--exclude-unk', action="store_true", help="exclude <unk> tokens from cost and perplexity computations") argument_group.add_argument( '--weights', metavar='LAMBDA', type=float, nargs='*', default=[], help='scale a mini-batch update by LAMBDA if the data is from the ' 'corresponding training file (list the weights in the same order ' 'as the training files)') argument_group = parser.add_argument_group("early stopping") argument_group.add_argument( '--stopping-criterion', metavar='NAME', type=str, default='annealing-count', help='selects a criterion for early-stopping, one of "epoch-count" ' '(fixed number of epochs), "no-improvement" (no improvement since ' 'learning rate was decreased), "annealing-count" (default, ' 'learning rate is decreased a fixed number of times)') argument_group.add_argument( '--min-epochs', metavar='N', type=int, default=1, help='perform at least N training epochs (default 1)') argument_group.add_argument( '--max-epochs', metavar='N', type=int, default=100, help='perform at most N training epochs (default 100)') argument_group.add_argument( '--max-annealing-count', metavar='N', type=int, default=0, help='when using annealing-count stopping criterion, continue training ' 'after decreasing learning rate at most N times (default 0)') argument_group 
= parser.add_argument_group("configuration") argument_group.add_argument( '--default-device', metavar='DEVICE', type=str, default=None, help='when multiple GPUs are present, use DEVICE as default') argument_group = parser.add_argument_group("logging and debugging") argument_group.add_argument( '--log-file', metavar='FILE', type=str, default='-', help='path where to write log file (default is standard output)') argument_group.add_argument( '--log-level', metavar='LEVEL', type=str, default='info', help='minimum level of events to log, one of "debug", "info", "warn" ' '(default "info")') argument_group.add_argument( '--log-interval', metavar='N', type=int, default=1000, help='print statistics of every Nth mini-batch update; quiet if less ' 'than one (default 1000)') argument_group.add_argument( '--debug', action="store_true", help='use test values to get better error messages from Theano') argument_group.add_argument('--print-graph', action="store_true", help='print Theano computation graph') argument_group.add_argument('--profile', action="store_true", help='enable profiling Theano functions')
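# An illustrative sketch of the dampening described in the --noise-dampening
# help text above: the empirical unigram counts are raised to the power ALPHA
# and renormalized before noise words are sampled. The function name and the
# use of NumPy here are assumptions for illustration; this is not the actual
# sampler implementation.
import numpy

def dampened_unigram_distribution(word_counts, alpha):
    """Returns noise sampling probabilities computed from raw unigram counts.

    With alpha = 0.0 every word gets the same probability (uniform
    distribution); with alpha = 1.0 the probabilities are proportional to
    the counts (unigram distribution).
    """
    counts = numpy.asarray(word_counts, dtype='float64')
    dampened = counts ** alpha
    return dampened / dampened.sum()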
def add_arguments(parser): """Specifies the command line arguments supported by the "theanolm score" command. :type parser: argparse.ArgumentParser :param parser: a command line argument parser """ argument_group = parser.add_argument_group("files") argument_group.add_argument( 'model_path', metavar='MODEL-FILE', type=str, help='the model file that will be used to score text') argument_group.add_argument( 'input_file', metavar='TEXT-FILE', type=TextFileType('r'), help='text file containing text to be scored (UTF-8, one sentence per ' 'line, assumed to be compressed if the name ends in ".gz")') argument_group.add_argument( '--output-file', metavar='FILE', type=TextFileType('w'), default='-', help='where to write the statistics (default stdout, will be ' 'compressed if the name ends in ".gz")') argument_group = parser.add_argument_group("scoring") argument_group.add_argument( '--output', metavar='DETAIL', type=str, default='perplexity', help='what to output, one of "perplexity", "utterance-scores", ' '"word-scores" (default "perplexity")') argument_group.add_argument( '--log-base', metavar='B', type=int, default=None, help='convert output log probabilities to base B (default is the ' 'natural logarithm)') argument_group.add_argument( '--exclude-unk', action="store_true", help="exclude <unk> tokens from perplexity computation") argument_group.add_argument( '--subwords', metavar='MARKING', type=str, default=None, help='the subword vocabulary uses MARKING to indicate how words are ' 'formed from subwords; one of "word-boundary" (<w> token ' 'separates words), "prefix-affix" (subwords that can be ' 'concatenated are prefixed or affixed with +, e.g. "cat+ +s")') argument_group.add_argument( '--shortlist', action="store_true", help='distribute <unk> token probability among the out-of-shortlist ' 'words according to their unigram frequencies in the training ' 'data') argument_group = parser.add_argument_group("logging and debugging") argument_group.add_argument( '--log-file', metavar='FILE', type=str, default='-', help='path where to write log file (default is standard output)') argument_group.add_argument( '--log-level', metavar='LEVEL', type=str, default='info', help='minimum level of events to log, one of "debug", "info", "warn" ' '(default "info")') argument_group.add_argument( '--debug', action="store_true", help='use test values to get better error messages from Theano') argument_group.add_argument('--profile', action="store_true", help='enable profiling Theano functions')
def decode(args): """A function that performs the "theanolm decode" command. :type args: argparse.Namespace :param args: a collection of command line arguments """ log_file = args.log_file log_level = getattr(logging, args.log_level.upper(), None) if not isinstance(log_level, int): print("Invalid logging level requested:", args.log_level) sys.exit(1) log_format = '%(asctime)s %(funcName)s: %(message)s' if args.log_file == '-': logging.basicConfig(stream=sys.stdout, format=log_format, level=log_level) else: logging.basicConfig(filename=log_file, format=log_format, level=log_level) if args.debug: theano.config.compute_test_value = 'warn' else: theano.config.compute_test_value = 'off' theano.config.profile = args.profile theano.config.profile_memory = args.profile network = Network.from_file(args.model_path, mode=Network.Mode(minibatch=False)) log_scale = 1.0 if args.log_base is None else numpy.log(args.log_base) if args.wi_penalty is None: wi_penalty = None else: wi_penalty = args.wi_penalty * log_scale if args.unk_penalty is None: ignore_unk = False unk_penalty = None elif args.unk_penalty == 0: ignore_unk = True unk_penalty = None else: ignore_unk = False unk_penalty = args.unk_penalty decoding_options = { 'nnlm_weight': args.nnlm_weight, 'lm_scale': args.lm_scale, 'wi_penalty': wi_penalty, 'ignore_unk': ignore_unk, 'unk_penalty': unk_penalty, 'linear_interpolation': args.linear_interpolation, 'max_tokens_per_node': args.max_tokens_per_node, 'beam': args.beam, 'recombination_order': args.recombination_order } logging.debug("DECODING OPTIONS") for option_name, option_value in decoding_options.items(): logging.debug("%s: %s", option_name, str(option_value)) print("Building word lattice decoder.") sys.stdout.flush() decoder = LatticeDecoder(network, decoding_options) # Combine paths from command line and lattice list. lattices = args.lattices if args.lattice_list is not None: lattices.extend(args.lattice_list.readlines()) lattices = [path.strip() for path in lattices] # Ignore empty lines in the lattice list. lattices = [x for x in lattices if x] # Pick every Ith lattice, if --num-jobs is specified and > 1. if args.num_jobs < 1: print("Invalid number of jobs specified:", args.num_jobs) sys.exit(1) if (args.job < 0) or (args.job > args.num_jobs - 1): print("Invalid job specified:", args.job) sys.exit(1) lattices = lattices[args.job::args.num_jobs] file_type = TextFileType('r') for index, path in enumerate(lattices): logging.info("Reading word lattice: %s", path) lattice_file = file_type(path) lattice = SLFLattice(lattice_file) if lattice.utterance_id is not None: utterance_id = lattice.utterance_id else: utterance_id = os.path.basename(lattice_file.name) logging.info("Utterance `%s' -- %d/%d of job %d", utterance_id, index + 1, len(lattices), args.job) tokens = decoder.decode(lattice) for index in range(min(args.n_best, len(tokens))): line = format_token(tokens[index], utterance_id, network.vocabulary, log_scale, args.output) args.output_file.write(line + "\n")
def add_arguments(parser): """Specifies the command line arguments supported by the "theanolm decode" command. :type parser: argparse.ArgumentParser :param parser: a command line argument parser """ argument_group = parser.add_argument_group("files") argument_group.add_argument( 'model_path', metavar='MODEL-FILE', type=str, help='the model file that will be used to decode the lattice') argument_group.add_argument( '--lattices', metavar='FILE', type=str, nargs='*', default=[], help='word lattices to be decoded (SLF, assumed to be compressed if ' 'the name ends in ".gz")') argument_group.add_argument( '--lattice-list', metavar='FILE', type=TextFileType('r'), help='text file containing a list of word lattices to be decoded (one ' 'path to an SLF file per line, the list and the SLF files are ' 'assumed to be compressed if the name ends in ".gz")') argument_group.add_argument( '--output-file', metavar='FILE', type=TextFileType('w'), default='-', help='where to write the best paths through the lattices (default ' 'stdout, will be compressed if the name ends in ".gz")') argument_group.add_argument( '--num-jobs', metavar='J', type=int, default=1, help='divide the set of lattice files into J distinct batches, and ' 'process only batch I') argument_group.add_argument( '--job', metavar='I', type=int, default=0, help='the index of the batch that this job should process, between 0 ' 'and J-1') argument_group = parser.add_argument_group("decoding") argument_group.add_argument( '--output', metavar='FORMAT', type=str, default='ref', help='format of the output, one of "ref" (default, utterance ID ' 'followed by words), "trn" (words followed by utterance ID in ' 'parentheses), "full" (utterance ID, acoustic score, language ' 'score, and number of words, followed by words)') argument_group.add_argument( '--n-best', metavar='N', type=int, default=1, help='print N best paths of each lattice (default 1)') argument_group.add_argument( '--nnlm-weight', metavar='LAMBDA', type=float, default=1.0, help="language model probabilities given by the model read from " "MODEL-FILE will be weighted by LAMBDA, when interpolating with " "the language model probabilities in the lattice (default is 1.0, " "meaning that the LM probabilities in the lattice will be " "ignored)") argument_group.add_argument( '--lm-scale', metavar='LMSCALE', type=float, default=None, help="scale language model log probabilities by LMSCALE when computing " "the total probability of a path (default is to use the LM scale " "specified in the lattice file, or 1.0 if not specified)") argument_group.add_argument( '--wi-penalty', metavar='WIP', type=float, default=None, help="penalize word insertion by adding WIP to the total log " "probability as many times as there are words in the path " "(without scaling WIP by LMSCALE)") argument_group.add_argument( '--log-base', metavar='B', type=int, default=None, help="convert output log probabilities to base B and WIP from base B " "(default is natural logarithm; this does not affect reading " "lattices, since they specify their internal log base)") argument_group.add_argument( '--unk-penalty', metavar='LOGPROB', type=float, default=None, help="if LOGPROB is zero, do not include <unk> tokens in perplexity " "computation; otherwise use constant LOGPROB as <unk> token score " "(default is to use the network to predict <unk> probability)") argument_group.add_argument( '--linear-interpolation', action="store_true", help="use linear interpolation of language model probabilities, " "instead of (pseudo) log-linear") 
argument_group = parser.add_argument_group("pruning") argument_group.add_argument( '--max-tokens-per-node', metavar='T', type=int, default=None, help="keep only at most T tokens at each node when decoding a lattice " "(default is no limit)") argument_group.add_argument( '--beam', metavar='B', type=float, default=None, help="prune tokens whose log probability is at least B smaller than " "the log probability of the best token at any given time (default " "is no beam pruning)") argument_group.add_argument( '--recombination-order', metavar='O', type=int, default=None, help="keep only the best token, when at least O previous words are " "identical (default is to recombine tokens only if the entire " "word history matches)") argument_group = parser.add_argument_group("logging and debugging") argument_group.add_argument( '--log-file', metavar='FILE', type=str, default='-', help='path where to write log file (default is standard output)') argument_group.add_argument( '--log-level', metavar='LEVEL', type=str, default='info', help='minimum level of events to log, one of "debug", "info", "warn" ' '(default "info")') argument_group.add_argument('--debug', action="store_true", help='enables debugging Theano errors') argument_group.add_argument('--profile', action="store_true", help='enables profiling Theano functions')
def add_arguments(parser):
    argument_group = parser.add_argument_group("files")
    argument_group.add_argument(
        'model_path', metavar='MODEL-FILE', type=str,
        help='path where the best model state will be saved in HDF5 binary '
             'data format')
    argument_group.add_argument(
        'validation_file', metavar='VALID-FILE', type=TextFileType('r'),
        help='text file containing validation data for early stopping (UTF-8, '
             'one sentence per line, assumed to be compressed if the name ends '
             'in ".gz")')
    argument_group.add_argument(
        '--training-set', metavar='FILE', type=TextFileType('r'), nargs='+',
        required=True,
        help='text files containing training data (UTF-8, one sentence per '
             'line, assumed to be compressed if the name ends in ".gz")')
    argument_group.add_argument(
        '--vocabulary', metavar='FILE', type=str, default=None,
        help='word or class vocabulary to be used in the neural network input '
             'and output, in the format specified by the --vocabulary-format '
             'argument (UTF-8 text, default is to use all the words from the '
             'training data)')
    argument_group.add_argument(
        '--vocabulary-format', metavar='FORMAT', type=str, default='words',
        help='format of the file specified with --vocabulary argument, one of '
             '"words" (one word per line, default), "classes" (word and class '
             'ID per line), "srilm-classes" (class name, membership '
             'probability, and word per line)')

    argument_group = parser.add_argument_group("network architecture")
    argument_group.add_argument(
        '--architecture', metavar='FILE', type=str, default='lstm300',
        help='path to neural network architecture description, or a standard '
             'architecture name, "lstm300" or "lstm1500" (default "lstm300")')
    argument_group.add_argument(
        '--num-classes', metavar='N', type=int, default=None,
        help='generate N classes using a simple word frequency based algorithm '
             'when --vocabulary argument is not given (default is to not use '
             'word classes)')

    argument_group = parser.add_argument_group("training process")
    argument_group.add_argument(
        '--sampling', metavar='FRACTION', type=float, nargs='*', default=[],
        help='randomly sample only FRACTION of each training file on each '
             'epoch (list the fractions in the same order as the training '
             'files)')
    argument_group.add_argument(
        '--training-strategy', metavar='NAME', type=str, default='local-mean',
        help='selects a training and validation strategy, one of "basic", '
             '"local-mean", "local-median", "validation-average" (default '
             '"local-mean")')
    argument_group.add_argument(
        '--sequence-length', metavar='N', type=int, default=100,
        help='ignore sentences longer than N words (default 100)')
    argument_group.add_argument(
        '--batch-size', metavar='N', type=int, default=16,
        help='each mini-batch will contain N sentences (default 16)')
    argument_group.add_argument(
        '--validation-frequency', metavar='N', type=int, default=5,
        help='cross-validate for reducing learning rate or early stopping N '
             'times per training epoch (default 5)')
    argument_group.add_argument(
        '--patience', metavar='N', type=int, default=4,
        help='allow perplexity to increase N consecutive cross-validations, '
             'before decreasing learning rate; if less than zero, never '
             'decrease learning rate (default 4)')
    argument_group.add_argument(
        '--random-seed', metavar='N', type=int, default=None,
        help='seed to initialize the random state (default is to seed from a '
             'random source provided by the operating system)')

    argument_group = parser.add_argument_group("optimization")
    argument_group.add_argument(
        '--optimization-method', metavar='NAME', type=str, default='adagrad',
        help='optimization method, one of "sgd", "nesterov", "adagrad", '
             '"adadelta", "rmsprop-sgd", "rmsprop-nesterov", "adam" '
             '(default "adagrad")')
    argument_group.add_argument(
        '--learning-rate', metavar='ALPHA', type=float, default=0.1,
        help='initial learning rate (default 0.1)')
    argument_group.add_argument(
        '--momentum', metavar='BETA', type=float, default=0.9,
        help='momentum coefficient for momentum optimization methods (default '
             '0.9)')
    argument_group.add_argument(
        '--gradient-decay-rate', metavar='GAMMA', type=float, default=0.9,
        help='geometric rate for averaging gradients (default 0.9)')
    argument_group.add_argument(
        '--sqr-gradient-decay-rate', metavar='GAMMA', type=float,
        default=0.999,
        help='geometric rate for averaging squared gradients in Adam optimizer '
             '(default 0.999)')
    argument_group.add_argument(
        '--numerical-stability-term', metavar='EPSILON', type=float,
        default=1e-6,
        help='a value that is used to prevent instability when dividing by '
             'very small numbers (default 1e-6)')
    argument_group.add_argument(
        '--gradient-normalization', metavar='THRESHOLD', type=float,
        default=5,
        help='scale down the gradients if necessary to make sure their norm '
             '(normalized by mini-batch size) will not exceed THRESHOLD '
             '(default 5)')
    argument_group.add_argument(
        '--unk-penalty', metavar='LOGPROB', type=float, default=None,
        help="if LOGPROB is zero, do not include <unk> tokens in perplexity "
             "computation; otherwise use constant LOGPROB as <unk> token score "
             "(default is to use the network to predict <unk> probability)")
    argument_group.add_argument(
        '--weights', metavar='LAMBDA', type=float, nargs='*', default=[],
        help='scale a mini-batch update by LAMBDA if the data is from the '
             'corresponding training file (list the weights in the same order '
             'as the training files)')

    argument_group = parser.add_argument_group("early stopping")
    argument_group.add_argument(
        '--stopping-criterion', metavar='NAME', type=str,
        default='annealing-count',
        help='selects a criterion for early-stopping, one of "epoch-count" '
             '(fixed number of epochs), "no-improvement" (no improvement since '
             'learning rate was decreased), "annealing-count" (default, '
             'learning rate is decreased a fixed number of times)')
    argument_group.add_argument(
        '--min-epochs', metavar='N', type=int, default=1,
        help='perform at least N training epochs (default 1)')
    argument_group.add_argument(
        '--max-epochs', metavar='N', type=int, default=100,
        help='perform at most N training epochs (default 100)')
    argument_group.add_argument(
        '--max-annealing-count', metavar='N', type=int, default=0,
        help='when using annealing-count stopping criterion, continue training '
             'after decreasing learning rate at most N times (default 0)')

    argument_group = parser.add_argument_group("logging and debugging")
    argument_group.add_argument(
        '--log-file', metavar='FILE', type=str, default='-',
        help='path where to write log file (default is standard output)')
    argument_group.add_argument(
        '--log-level', metavar='LEVEL', type=str, default='info',
        help='minimum level of events to log, one of "debug", "info", "warn" '
             '(default "info")')
    argument_group.add_argument(
        '--log-interval', metavar='N', type=int, default=1000,
        help='print statistics of every Nth mini-batch update; quiet if less '
             'than one (default 1000)')
    argument_group.add_argument(
        '--debug', action="store_true",
        help='enables debugging Theano errors')
    argument_group.add_argument(
        '--profile', action="store_true",
        help='enables profiling Theano functions')