def get_args():
    """Construct the argument parser for word-embedding evaluation.

    Builds an ``argparse`` parser covering embedding selection, computation
    and logging options, appends the shared evaluation options, parses
    ``sys.argv`` and validates the result.

    Returns
    -------
    argparse.Namespace
        The parsed and validated command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Word embedding evaluation with Gluon.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Embeddings arguments
    group = parser.add_argument_group('Embedding arguments')
    group.add_argument('--embedding-path', type=str,
                       help='Path to a .vec in Word2Vec text format or '
                       '.bin binary fastText model file. ')
    group.add_argument('--embedding-name', type=str,
                       help=('Name of embedding type to load. '
                             'Valid entries: {}'.format(
                                 ', '.join(
                                     nlp.embedding.list_sources().keys()))))
    group.add_argument('--embedding-source', type=str,
                       help=('Source from which to initialize the embedding. '
                             'Pass --list-embedding-sources to get a list of '
                             'valid sources for a given --embedding-name.'))
    group.add_argument(
        '--fasttext-load-ngrams', action='store_true',
        help=('Specify load_ngrams=True '
              'when loading pretrained fastText embedding.'))
    group.add_argument(
        '--max-vocab-size', type=int, default=None,
        help=('Only retain the X first tokens from the pre-trained embedding. '
              'The tokens are ordered by decreasing frequency. '
              'As the analogy task takes the whole vocabulary into account, '
              'removing very infrequent words improves performance.'))
    group.add_argument('--list-embedding-sources', action='store_true')

    # Computation options
    group = parser.add_argument_group('Computation arguments')
    group.add_argument('--batch-size', type=int, default=1024,
                       help='Batch size to use on analogy task. '
                       'Decrease batch size if evaluation crashes.')
    group.add_argument('--gpu', type=int,
                       help=('Number (index) of GPU to run on, e.g. 0. '
                             'If not specified, uses CPU.'))
    group.add_argument('--no-hybridize', action='store_true',
                       help='Disable hybridization of gluon HybridBlocks.')

    # Logging
    group = parser.add_argument_group('Logging arguments')
    group.add_argument('--logdir', type=str, default='logs',
                       help='Directory to store logs.')

    # Evaluation options (shared across the embedding scripts)
    evaluation.add_parameters(parser)

    args = parser.parse_args()
    validate_args(args)
    evaluation.validate_args(args)

    return args
def get_args():
    """Construct the argument parser for word-embedding evaluation.

    Builds an ``argparse`` parser covering embedding selection, computation
    and logging options, appends the shared evaluation options, parses
    ``sys.argv`` and validates the result.

    Returns
    -------
    argparse.Namespace
        The parsed and validated command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Word embedding evaluation with Gluon.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Embeddings arguments
    group = parser.add_argument_group('Embedding arguments')
    group.add_argument('--embedding-path', type=str,
                       help='Path to a .vec in Word2Vec text format or '
                       '.bin binary fastText model file. ')
    group.add_argument('--embedding-name', type=str,
                       help=('Name of embedding type to load. '
                             'Valid entries: {}'.format(
                                 ', '.join(
                                     nlp.embedding.list_sources().keys()))))
    group.add_argument('--embedding-source', type=str,
                       help=('Source from which to initialize the embedding. '
                             'Pass --list-embedding-sources to get a list of '
                             'valid sources for a given --embedding-name.'))
    group.add_argument(
        '--fasttext-load-ngrams', action='store_true',
        help=('Specify load_ngrams=True '
              'when loading pretrained fastText embedding.'))
    group.add_argument(
        '--analogy-max-vocab-size', type=int, default=None,
        help=('Only retain the X first tokens from the pre-trained embedding. '
              'The tokens are ordered by decreasing frequency. '
              'As the analogy task takes the whole vocabulary into account, '
              'removing very infrequent words improves performance.'))
    group.add_argument('--list-embedding-sources', action='store_true')

    # Computation options
    group = parser.add_argument_group('Computation arguments')
    group.add_argument('--batch-size', type=int, default=1024,
                       help='Batch size to use on analogy task. '
                       'Decrease batch size if evaluation crashes.')
    group.add_argument('--gpu', type=int,
                       help=('Number (index) of GPU to run on, e.g. 0. '
                             'If not specified, uses CPU.'))
    group.add_argument('--no-hybridize', action='store_true',
                       help='Disable hybridization of gluon HybridBlocks.')

    # Logging
    group = parser.add_argument_group('Logging arguments')
    group.add_argument('--logdir', type=str, default='logs',
                       help='Directory to store logs.')

    # Evaluation options (shared across the embedding scripts)
    evaluation.add_parameters(parser)

    args = parser.parse_args()
    validate_args(args)
    evaluation.validate_args(args)

    return args
def get_args():
    """Construct the argument parser for word-embedding evaluation.

    This variant takes a positional ``path`` to a pretrained
    ``TokenEmbedding`` file. The shared evaluation options are appended
    before parsing and validating ``sys.argv``.

    Returns
    -------
    argparse.Namespace
        The parsed and validated command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Word embedding evaluation with Gluon.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Embeddings arguments
    group = parser.add_argument_group('Embedding arguments')
    group.add_argument('path', type=str,
                       help='Path to pretrained TokenEmbedding file.')
    group.add_argument(
        '--max-vocab-size', type=int, default=None,
        help=('Only retain the X first tokens from the pretrained embedding. '
              'The tokens are ordered by decreasing frequency. '
              'As the analogy task takes the whole vocabulary into account, '
              'removing very infrequent words improves performance.'))
    group.add_argument('--list-embedding-sources', action='store_true')

    # Computation options
    group = parser.add_argument_group('Computation arguments')
    group.add_argument('--batch-size', type=int, default=32,
                       help='Batch size to use on analogy task. '
                       'Decrease batch size if evaluation crashes.')
    group.add_argument('--gpu', type=int, nargs='+',
                       help=('Number (index) of GPU to run on, e.g. 0. '
                             'If not specified, uses CPU.'))
    group.add_argument('--no-hybridize', action='store_true',
                       help='Disable hybridization of gluon HybridBlocks '
                       'used for evaluation.')

    # Logging
    group = parser.add_argument_group('Logging arguments')
    group.add_argument('--logdir', type=str, default='logs',
                       help='Directory to store logs.')

    # Evaluation options (shared across the embedding scripts)
    evaluation.add_parameters(parser)

    args = parser.parse_args()
    evaluation.validate_args(args)

    return args
def parse_args():
    """Parse command line arguments for fastText text classification.

    Builds an ``argparse`` parser for the data, model and optimization
    options, appends the shared evaluation options, parses ``sys.argv``
    and validates the result.

    Returns
    -------
    argparse.Namespace
        The parsed and validated command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Text Classification with FastText',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Computation options
    group = parser.add_argument_group('Computation arguments')
    group.add_argument('--input', type=str, help='Input file location')
    group.add_argument('--validation', type=str,
                       help='Validation file Location ')
    group.add_argument('--output', type=str,
                       help='Location to save trained model')
    group.add_argument('--ngrams', type=int, default=1,
                       help='NGrams used for training')
    group.add_argument('--batch-size', type=int, default=16,
                       help='Batch size for training.')
    group.add_argument('--epochs', type=int, default=10, help='Epoch limit')
    group.add_argument('--gpu', type=int,
                       help=('Number (index) of GPU to run on, e.g. 0. '
                             'If not specified, uses CPU.'))
    group.add_argument('--no-hybridize', action='store_true',
                       help='Disable hybridization of gluon HybridBlocks.')

    # Model
    group = parser.add_argument_group('Model arguments')
    group.add_argument('--emsize', type=int, default=100,
                       help='Size of embedding vectors.')

    # Optimization options
    group = parser.add_argument_group('Optimization arguments')
    group.add_argument('--optimizer', type=str, default='adam')
    group.add_argument('--lr', type=float, default=0.05)
    # BUG FIX: the original also registered '--batch_size' (type=float)
    # here. '--batch-size' and '--batch_size' both resolve to the dest
    # 'batch_size', so the later float-typed duplicate silently shadowed
    # the int-typed '--batch-size' declared above. The duplicate has been
    # removed; '--batch-size' (Computation group) is the single source of
    # truth for the batch size.

    # Evaluation options (shared across the embedding scripts)
    evaluation.add_parameters(parser)

    args = parser.parse_args()
    evaluation.validate_args(args)
    return args
def parse_args():
    """Build the training CLI, parse ``sys.argv`` and seed all RNGs.

    Returns
    -------
    argparse.Namespace
        The parsed and validated command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Word embedding training with Gluon.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # --- Data arguments ---------------------------------------------------
    data = parser.add_argument_group('Data arguments')
    data.add_argument('--data', type=str, default='text8',
                      help='Training dataset.')
    data.add_argument('--wiki-root', type=str, default='text8',
                      help='Root under which preprocessed wiki dump.')
    data.add_argument('--wiki-language', type=str, default='text8',
                      help='Language of wiki dump.')
    data.add_argument('--wiki-date', help='Date of wiki dump.')

    # --- Computation arguments --------------------------------------------
    comp = parser.add_argument_group('Computation arguments')
    comp.add_argument('--batch-size', type=int, default=1024,
                      help='Batch size for training.')
    comp.add_argument('--epochs', type=int, default=5, help='Epoch limit')
    comp.add_argument('--gpu', type=int, nargs='+',
                      help='Number (index) of GPU to run on, e.g. 0. '
                           'If not specified, uses CPU.')
    comp.add_argument('--no-hybridize', action='store_true',
                      help='Disable hybridization of gluon HybridBlocks.')
    comp.add_argument('--no-static-alloc', action='store_true',
                      help='Disable static memory allocation for HybridBlocks.')
    comp.add_argument('--no-sparse-grad', action='store_true',
                      help='Disable sparse gradient support.')

    # --- Model arguments ---------------------------------------------------
    model = parser.add_argument_group('Model arguments')
    model.add_argument('--emsize', type=int, default=300,
                       help='Size of embedding vectors.')
    model.add_argument('--ngrams', type=int, nargs='+', default=[3, 4, 5, 6])
    model.add_argument('--ngram-buckets', type=int, default=2000000,
                       help='Size of word_context set of the ngram hash '
                            'function. Set this to 0 for Word2Vec style '
                            'training.')
    model.add_argument('--model', type=str, default='skipgram',
                       help='SkipGram or CBOW.')
    model.add_argument('--window', type=int, default=5,
                       help='Context window size.')
    model.add_argument('--negative', type=int, default=5,
                       help='Number of negative samples '
                            'per source-context word pair.')
    model.add_argument('--frequent-token-subsampling', type=float,
                       default=1E-4,
                       help='Frequent token subsampling constant.')

    # --- Optimization arguments --------------------------------------------
    optim = parser.add_argument_group('Optimization arguments')
    optim.add_argument('--optimizer', type=str, default='adagrad')
    optim.add_argument('--lr', type=float, default=0.1)
    optim.add_argument('--seed', type=int, default=1, help='random seed')

    # --- Logging arguments ---------------------------------------------------
    log = parser.add_argument_group('Logging arguments')
    log.add_argument('--logdir', type=str, default='logs',
                     help='Directory to store logs.')
    log.add_argument('--log-interval', type=int, default=100)
    log.add_argument('--eval-interval', type=int,
                     help='Evaluate every --eval-interval iterations '
                          'in addition to at the end of every epoch.')
    log.add_argument('--no-eval-analogy', action='store_true',
                     help="Don't evaluate on the analogy task.")

    # Shared evaluation options
    evaluation.add_parameters(parser)

    args = parser.parse_args()
    evaluation.validate_args(args)

    # Seed every RNG involved so runs are reproducible.
    random.seed(args.seed)
    mx.random.seed(args.seed)
    np.random.seed(args.seed)

    return args
def parse_args():
    """Build the GloVe training CLI, parse ``sys.argv`` and seed all RNGs.

    Returns
    -------
    argparse.Namespace
        The parsed and validated command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='GloVe with GluonNLP',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # --- Data arguments ---------------------------------------------------
    data = parser.add_argument_group('Data arguments')
    data.add_argument('cooccurrences', type=str,
                      help='Path to cooccurrences.npz containing a sparse '
                           '(COO) representation of the co-occurrence matrix '
                           'in numpy archive format. Output of ./cooccur')
    data.add_argument('vocab', type=str,
                      help='Vocabulary indices. Output of vocab_count tool.')

    # --- Computation arguments --------------------------------------------
    comp = parser.add_argument_group('Computation arguments')
    comp.add_argument('--batch-size', type=int, default=512,
                      help='Batch size for training.')
    comp.add_argument('--epochs', type=int, default=50, help='Epoch limit')
    comp.add_argument('--gpu', type=int, nargs='+',
                      help='Number (index) of GPU to run on, e.g. 0. '
                           'If not specified, uses CPU.')
    comp.add_argument('--no-hybridize', action='store_true',
                      help='Disable hybridization of gluon HybridBlocks.')
    comp.add_argument('--no-static-alloc', action='store_true',
                      help='Disable static memory allocation for HybridBlocks.')

    # --- Model arguments ---------------------------------------------------
    model = parser.add_argument_group('Model arguments')
    model.add_argument('--emsize', type=int, default=300,
                       help='Size of embedding vectors.')
    model.add_argument('--x-max', type=int, default=100)
    model.add_argument('--alpha', type=float, default=0.75)

    # --- Optimization arguments --------------------------------------------
    optim = parser.add_argument_group('Optimization arguments')
    optim.add_argument('--adagrad-eps', type=float, default=1,
                       help='Initial AdaGrad state value.')
    optim.add_argument('--lr', type=float, default=0.1, help='Learning rate')
    optim.add_argument('--seed', type=int, default=1, help='Random seed')
    optim.add_argument('--dropout', type=float, default=0.15)

    # --- Logging arguments ---------------------------------------------------
    log = parser.add_argument_group('Logging arguments')
    log.add_argument('--logdir', type=str, default='logs',
                     help='Directory to store logs.')
    log.add_argument('--log-interval', type=int, default=100)
    log.add_argument('--eval-interval', type=int,
                     help='Evaluate every --eval-interval iterations '
                          'in addition to at the end of every epoch.')
    log.add_argument('--no-eval-analogy', action='store_true',
                     help="Don't evaluate on the analogy task.")

    # Shared evaluation options
    evaluation.add_parameters(parser)

    args = parser.parse_args()
    evaluation.validate_args(args)

    # Seed every RNG involved so runs are reproducible.
    random.seed(args.seed)
    mx.random.seed(args.seed)
    np.random.seed(args.seed)

    return args
def parse_args():
    """Build the training CLI, parse ``sys.argv`` and seed all RNGs.

    Returns
    -------
    argparse.Namespace
        The parsed and validated command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Word embedding training with Gluon.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # --- Data arguments ---------------------------------------------------
    data = parser.add_argument_group('Data arguments')
    data.add_argument('--data', type=str, default='text8',
                      help='Training dataset.')
    data.add_argument('--wiki-root', type=str, default='text8',
                      help='Root under which preprocessed wiki dump.')
    data.add_argument('--wiki-language', type=str, default='text8',
                      help='Language of wiki dump.')
    data.add_argument('--wiki-date', help='Date of wiki dump.')

    # --- Computation arguments --------------------------------------------
    comp = parser.add_argument_group('Computation arguments')
    comp.add_argument('--batch-size', type=int, default=1024,
                      help='Batch size for training.')
    comp.add_argument('--epochs', type=int, default=5, help='Epoch limit')
    comp.add_argument('--gpu', type=int, nargs='+',
                      help='Number (index) of GPU to run on, e.g. 0. '
                           'If not specified, uses CPU.')
    comp.add_argument('--no-prefetch-batch', action='store_true',
                      help='Disable multi-threaded nogil batch prefetching.')
    comp.add_argument('--num-prefetch-epoch', type=int, default=3,
                      help='Start data pipeline for next N epochs when '
                           'beginning current epoch.')
    comp.add_argument('--no-hybridize', action='store_true',
                      help='Disable hybridization of gluon HybridBlocks.')

    # --- Model arguments ---------------------------------------------------
    model = parser.add_argument_group('Model arguments')
    model.add_argument('--emsize', type=int, default=300,
                       help='Size of embedding vectors.')
    model.add_argument('--ngrams', type=int, nargs='+', default=[3, 4, 5, 6])
    model.add_argument('--ngram-buckets', type=int, default=2000000,
                       help='Size of word_context set of the ngram hash '
                            'function. Set this to 0 for Word2Vec style '
                            'training.')
    model.add_argument('--model', type=str, default='skipgram',
                       help='SkipGram or CBOW.')
    model.add_argument('--window', type=int, default=5,
                       help='Context window size.')
    model.add_argument('--negative', type=int, default=5,
                       help='Number of negative samples '
                            'per source-context word pair.')
    model.add_argument('--frequent-token-subsampling', type=float,
                       default=1E-4,
                       help='Frequent token subsampling constant.')
    model.add_argument('--max-vocab-size', type=int,
                       help='Limit the number of words considered. '
                            'OOV words will be ignored.')

    # --- Optimization arguments --------------------------------------------
    optim = parser.add_argument_group('Optimization arguments')
    optim.add_argument('--optimizer', type=str, default='groupadagrad')
    optim.add_argument('--lr', type=float, default=0.1)
    optim.add_argument('--seed', type=int, default=1, help='random seed')

    # --- Logging arguments ---------------------------------------------------
    log = parser.add_argument_group('Logging arguments')
    log.add_argument('--logdir', type=str, default='logs',
                     help='Directory to store logs.')
    log.add_argument('--log-interval', type=int, default=100)
    log.add_argument('--eval-interval', type=int,
                     help='Evaluate every --eval-interval iterations '
                          'in addition to at the end of every epoch.')
    log.add_argument('--no-eval-analogy', action='store_true',
                     help="Don't evaluate on the analogy task.")

    # Shared evaluation options
    evaluation.add_parameters(parser)

    args = parser.parse_args()
    evaluation.validate_args(args)

    # Seed every RNG involved so runs are reproducible.
    random.seed(args.seed)
    mx.random.seed(args.seed)
    np.random.seed(args.seed)

    return args
def parse_args():
    """Build the subword-embedding training CLI and parse ``sys.argv``.

    Returns
    -------
    argparse.Namespace
        The parsed and validated command line arguments.
    """
    parser = argparse.ArgumentParser(
        description='Word embedding training with Gluon.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # --- Computation arguments --------------------------------------------
    comp = parser.add_argument_group('Computation arguments')
    comp.add_argument('--batch-size', type=int, default=1024,
                      help='Batch size for training.')
    comp.add_argument('--epochs', type=int, default=5, help='Epoch limit')
    comp.add_argument('--gpu', type=int, nargs='+',
                      help='Number (index) of GPU to run on, e.g. 0. '
                           'If not specified, uses CPU.')
    comp.add_argument('--no-hybridize', action='store_true',
                      help='Disable hybridization of gluon HybridBlocks.')
    comp.add_argument('--no-static-alloc', action='store_true',
                      help='Disable static memory allocation for HybridBlocks.')
    comp.add_argument('--no-sparse-grad', action='store_true',
                      help='Disable sparse gradient support.')

    # --- Model arguments ---------------------------------------------------
    model = parser.add_argument_group('Model arguments')
    model.add_argument('--emsize', type=int, default=300,
                       help='Size of embedding vectors.')
    model.add_argument('--ngrams', type=int, nargs='+', default=[3, 4, 5, 6])
    model.add_argument('--ngram-buckets', type=int, default=500000,
                       help='Size of word_context set of the ngram hash '
                            'function.')
    model.add_argument('--model', type=str, default='skipgram',
                       help='SkipGram or CBOW.')
    model.add_argument('--window', type=int, default=5,
                       help='Context window size.')
    model.add_argument('--negative', type=int, default=5,
                       help='Number of negative samples.')

    # --- Optimization arguments --------------------------------------------
    # Word-level and subword-level parameters get separate optimizers.
    optim = parser.add_argument_group('Optimization arguments')
    optim.add_argument('--optimizer', type=str, default='adagrad')
    optim.add_argument('--lr', type=float, default=0.05)
    optim.add_argument('--optimizer-subwords', type=str, default='adagrad')
    optim.add_argument('--lr-subwords', type=float, default=0.01)

    # --- Logging arguments ---------------------------------------------------
    log = parser.add_argument_group('Logging arguments')
    log.add_argument('--logdir', type=str, default='logs',
                     help='Directory to store logs.')
    log.add_argument('--eval-interval', type=int, default=50000,
                     help='Evaluate every --eval-interval iterations '
                          'in addition to at the end of every epoch.')
    log.add_argument('--no-eval-analogy', action='store_true',
                     help="Don't evaluate on the analogy task.")

    # Shared evaluation options
    evaluation.add_parameters(parser)

    args = parser.parse_args()
    evaluation.validate_args(args)
    return args