from dictlearn.config_registry import ConfigRegistry

lm_config_registry = ConfigRegistry()
lm_config_registry.set_root_config({
    # data
    'data_path': 'onebillionword/',
    'dict_path': "",
    'vocab_path': "",
    'dict_vocab_path': "",
    'layout': 'standard',
    'num_input_words': 10000,
    'def_num_input_words': 0,  # 0 => num_input_words
    'num_output_words': 10000,
    'max_length': 100,
    'batch_size': 64,
    'batch_size_valid': 64,
    'max_def_length': 100,
    'max_def_per_word': 1000,
    'exclude_top_k': 0,

    # model
    'emb_dim': 500,
    'emb_def_dim': 500,
    'dim': 500,
    'compose_type': 'sum',
    'disregard_word_embeddings': False,
    'learning_rate': 0.001,
    'momentum': 0.9,
    'grad_clip_threshold': 5.0,

    # embeddings
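# Named experiment configs are typically derived from this root by copying it
# and overriding a handful of keys. A minimal sketch of that pattern follows,
# assuming ConfigRegistry supports dict-style access (a lookup of 'root'
# returning a copy); the variant name and overrides are purely illustrative:

c = lm_config_registry['root']
c['num_input_words'] = 30000   # hypothetical override: larger input vocabulary
c['learning_rate'] = 0.0003
lm_config_registry['larger_vocab'] = c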
from dictlearn.config_registry import ConfigRegistry

nli_esim_config_registry = ConfigRegistry()

# Each epoch has ~500k examples
# Params copied from https://github.com/NYU-MLL/multiNLI/blob/master/python/util/parameters.py
nli_esim_config_registry.set_root_config({
    'data_path': 'snli',
    'layout': 'snli',

    # Lookup params
    'max_def_per_word': 100000,
    'emb_dim': 300,
    'bn': 0,
    'dim': 300,
    'dict_path': '',
    'vocab': '',
    'vocab_text': '',  # Defaults to vocab. Use when the original vocab cannot be used for frequencies in dict
    'encoder': 'bilstm',  # Also used in NYU-MLL's multiNLI code
from dictlearn.config_registry import ConfigRegistry

configs_ae = ConfigRegistry()
configs_ae.set_root_config({
    # data_path: not useful to set here; it's better to use FUEL_DATA_PATH
    # so that we can keep identical configs for different dictionaries
    'data_path': '',
    # the following param was useful to run a baseline without an encoder,
    # which would be similar to word2vec with only one target word (the
    # defined word). This is NOT the baseline in the paper; it is weaker
    # than word2vec.
    'vocab_keys_path': '',
    'layout': 'dict',  # don't change. TODO: remove this option
    # num_input_words can be set lower than the number of lines in vocab.txt;
    # this allows rare words to be replaced with UNK (for example, if it is
    # set to 10000, all the words from line 10000 on will be replaced by the
    # UNK token)
    'num_input_words': 10000,
    # same for num_output_words: the loss will ignore words that are ranked
    # above this value
    'num_output_words': 10000,
    # max definition length
    'max_length': 100,
    'batch_size': 32,
    'batch_size_valid': 32,

    # model
    'encoder': 'lstm',  # experimental code with bilstm variants (see seq2seq.py)
    'decoder': 'skip-gram',  # do not change?
    # You should use emb_dim = dim unless you're playing with more
    # experimental code.
    'emb_dim': 300,
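# The num_input_words cutoff above works by rank: the vocabulary file is
# sorted by frequency, so any word whose line number is at or beyond the
# cutoff maps to UNK before lookup. A minimal sketch of that remapping, with
# hypothetical names (remap_rare_words, word_ids, unk_id) not taken from the
# dictlearn code:

import numpy as np

def remap_rare_words(word_ids, num_input_words, unk_id):
    """Replace ids of words ranked beyond the cutoff with the UNK id."""
    word_ids = np.asarray(word_ids)
    return np.where(word_ids < num_input_words, word_ids, unk_id)

# e.g. with num_input_words=10000, ids 10000 and above collapse to UNK:
# remap_rare_words([5, 9999, 12345], 10000, unk_id=0) -> [5, 9999, 0]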
from dictlearn.config_registry import ConfigRegistry

qa_config_registry = ConfigRegistry()
qa_config_registry.set_root_config({
    # data
    'data_path': "",
    'dict_path': "",
    'vocab_path': "",
    'dict_vocab_path': "",
    'embedding_path': "",
    'layout': 'standard',
    'num_input_words': 10000,
    'def_num_input_words': 0,
    'max_length': 100,
    'batch_size': 32,
    'batch_size_valid': 32,

    # retrieval hacks
    'max_def_length': 1000,
    'with_too_long_defs': 'drop',
    'max_def_per_word': 1000,
    'with_too_many_defs': 'random',
    'exclude_top_k': 0,

    # model
    'def_reader': 'LSTMReadDefinitions',
    'dim': 128,
    'emb_dim': 0,
    'readout_dims': [],
    'coattention': True,
    'learning_rate': 0.001,
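# The "retrieval hacks" above bound the definitions fed to the reader:
# with_too_long_defs='drop' discards definitions longer than max_def_length,
# and with_too_many_defs='random' keeps a random subset of at most
# max_def_per_word definitions. A sketch of that policy with hypothetical
# names (filter_definitions, defs as a list of token lists); the actual
# retrieval code may differ:

import random

def filter_definitions(defs, max_def_length, max_def_per_word, rng=random):
    defs = [d for d in defs if len(d) <= max_def_length]   # 'drop'
    if len(defs) > max_def_per_word:
        defs = rng.sample(defs, max_def_per_word)          # 'random'
    return defs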
from dictlearn.config_registry import ConfigRegistry

snli_config_registry = ConfigRegistry()
snli_config_registry.set_root_config({
    'data_path': 'snli/',
    'layout': 'snli',

    # Lookup params
    'translate_dim': 300,
    'max_def_per_word': 100000,
    'bn': True,
    'mlp_dim': 600,
    'emb_dim': 300,  # Used for def and word lookup
    'dict_path': '',
    # Remove embeddings by default. Our goal ATM is to beat random init.
    'embedding_path': '',  # /data/lisa/exp/jastrzes/dict_based_learning/data/snli/glove.840B.300d.npy
    'vocab_def': '',
    'vocab_text': '',
    # If passed, will be used for exclude_top_k in Retrieval only
    'vocab': '',
    'def_dim': 300,  # LSTM reader hidden state size, or translate dim in MeanPool
    'def_emb_dim': -1,  # Dimensionality of vectors used in definitions
    'compose_type': '',
    'disregard_word_embeddings': False,
    'exclude_top_k': -1,
    'max_def_length': 50,
    'with_too_long_defs': 'drop',
    'train_emb':
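# exclude_top_k controls which words get dictionary definitions at all: the
# idea is to retrieve definitions only for rarer words, whose ranks in the
# frequency-sorted vocab fall beyond the cutoff. A sketch of that rule with a
# hypothetical helper name; reading -1 (and 0) as "no exclusion" is an
# assumption, not taken from the dictlearn code:

def should_retrieve_definition(word_rank, exclude_top_k):
    # word_rank: 0-based position in the frequency-sorted vocabulary
    if exclude_top_k <= 0:
        return True  # assumed: 0 / -1 disable the exclusion
    return word_rank >= exclude_top_k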
from dictlearn.config_registry import ConfigRegistry

lm_config_registry = ConfigRegistry()
lm_config_registry.set_root_config({
    # data
    'data_path': "",
    'dict_path': "",
    'layout': 'standard',
    'num_input_words': 10000,
    'num_output_words': 10000,
    'max_length': 100,
    'batch_size': 32,
    'batch_size_valid': 32,
    'max_def_length': 1000,
    'exclude_top_k': -1,

    # model
    'dim': 128,
    'compose_type': 'sum',
    'standalone_def_rnn': True,
    'disregard_word_embeddings': False,
    'learning_rate': 0.001,
    'momentum': 0.9,
    'grad_clip_threshold': 5.0,

    # monitoring and checkpointing
    'mon_freq_train': 10,
    'mon_freq_valid': 1000,
    'save_freq_batches': 1000,
    'n_batches': 0,
    'monitor_parameters': False
})
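# grad_clip_threshold above implements global-norm gradient clipping: when the
# L2 norm over all gradients exceeds the threshold, they are rescaled so the
# norm equals it. A numpy sketch of the rule; the actual training loop would
# rely on the framework's own clipping step (e.g. Blocks' StepClipping), and
# clip_gradients is a hypothetical name:

import numpy as np

def clip_gradients(grads, threshold):
    """Rescale a list of gradient arrays if their global L2 norm exceeds threshold."""
    norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    if norm > threshold:
        grads = [g * (threshold / norm) for g in grads]
    return grads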