def test_save_load_cache_models():
    cache_language_models = ['awd_lstm_lm_1150', 'awd_lstm_lm_600', 'standard_lstm_lm_200',
                             'standard_lstm_lm_650', 'standard_lstm_lm_1500']
    datasets = ['wikitext-2']
    for name in cache_language_models:
        for dataset_name in datasets:
            cache_cell = nlp.model.train.get_cache_model(name, dataset_name, window=1,
                                                         theta=0.6, lambdas=0.2)
            print(cache_cell)
            cache_cell.save_parameters(
                os.path.join(get_home_dir(), 'models', name + '-' + dataset_name + '.params'))
            cache_cell.load_parameters(
                os.path.join(get_home_dir(), 'models', name + '-' + dataset_name + '.params'))

def test_get_cache_model_noncache_models():
    language_models_params = {
        'awd_lstm_lm_1150': 'awd_lstm_lm_1150_wikitext-2-f9562ed0.params',
        'awd_lstm_lm_600': 'awd_lstm_lm_600_wikitext-2-e952becc.params',
        'standard_lstm_lm_200': 'standard_lstm_lm_200_wikitext-2-b233c700.params',
        'standard_lstm_lm_650': 'standard_lstm_lm_650_wikitext-2-631f3904.params',
        'standard_lstm_lm_1500': 'standard_lstm_lm_1500_wikitext-2-a4163513.params'}
    datasets = ['wikitext-2']
    for name in language_models_params.keys():
        for dataset_name in datasets:
            _, vocab = nlp.model.get_model(name=name, dataset_name=dataset_name,
                                           pretrained=True)
            ntokens = len(vocab)

            cache_cell_0 = nlp.model.train.get_cache_model(name, dataset_name, window=1,
                                                           theta=0.6, lambdas=0.2)
            print(cache_cell_0)

            model, _ = nlp.model.get_model(name=name, dataset_name=dataset_name,
                                           pretrained=True)
            cache_cell_1 = nlp.model.train.CacheCell(model, ntokens, window=1,
                                                     theta=0.6, lambdas=0.2)
            cache_cell_1.load_parameters(
                os.path.join(get_home_dir(), 'models', language_models_params.get(name)))
            print(cache_cell_1)

            outs0, word_history0, cache_history0, hidden0 = cache_cell_0(
                mx.nd.arange(10).reshape(10, 1), mx.nd.arange(10).reshape(10, 1), None, None)
            outs1, word_history1, cache_history1, hidden1 = cache_cell_1(
                mx.nd.arange(10).reshape(10, 1), mx.nd.arange(10).reshape(10, 1), None, None)

            assert outs0.shape == outs1.shape, outs0.shape
            assert len(word_history0) == len(word_history1), len(word_history0)
            assert len(cache_history0) == len(cache_history1), len(cache_history0)
            assert len(hidden0) == len(hidden1), len(hidden0)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'glue_sst'),
             return_all_fields=False):
    self._data_file = {'train': ('train', 'bcde781bed5caa30d5e9a9d24e5c826965ed02a2',
                                 'ffbb67a55e27525e925b79fee110ca19585d70ca'),
                       'dev': ('dev', '85698e465ff6573fb80d0b34229c76df84cd766b',
                               'e166f986cec68fd4cca0ae5ce5869b917f88a2fa'),
                       'test': ('test', 'efac1c275553ed78500e9b8d8629408f5f867b20',
                                '3ce8041182bf82dbbbbfe13738b39d3c69722744')}
    data_file = self._data_file[segment]
    if segment in ['train', 'dev']:
        A_IDX, LABEL_IDX = 0, 1
        field_indices = [A_IDX, LABEL_IDX] if not return_all_fields else None
        num_discard_samples = 1
    elif segment == 'test':
        A_IDX = 1
        field_indices = [A_IDX] if not return_all_fields else None
        num_discard_samples = 1

    super(GlueSST2, self).__init__(root, data_file,
                                   num_discard_samples=num_discard_samples,
                                   field_indices=field_indices)

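# Usage sketch (added for illustration; not part of the original source). The
# constructor above belongs to GlueSST2 (see its super() call); train/dev
# samples should come back as [sentence, label] pairs after the default field
# selection:
def _example_glue_sst2():
    sst_dev = GlueSST2(segment='dev')  # fetched under ~/.mxnet/datasets/glue_sst
    sentence, label = sst_dev[0]
    print(sentence, label)
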
def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
             dataset_name='book_corpus_wiki_en_uncased', params_path=None,
             max_seq_length=25, batch_size=256,
             root=os.path.join(get_home_dir(), 'models')):
    self.ctx = ctx
    self.dtype = dtype
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.dataset_name = dataset_name
    # Don't download the pretrained models if we have a parameter path
    self.bert, self.vocab = gluonnlp.model.get_model(model,
                                                     dataset_name=self.dataset_name,
                                                     pretrained=params_path is None,
                                                     ctx=self.ctx,
                                                     use_pooler=False,
                                                     use_decoder=False,
                                                     use_classifier=False,
                                                     root=root)
    self.bert.cast(self.dtype)
    if params_path:
        logger.info('Loading params from %s', params_path)
        self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)
    lower = 'uncased' in self.dataset_name
    self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
    self.transform = BERTSentenceTransform(tokenizer=self.tokenizer,
                                           max_seq_length=self.max_seq_length,
                                           pair=False)

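# Sketch (added; not in the original source) of the gluonnlp.model.get_model
# call the constructor above wraps: load the BERT encoder alone, with the
# pooler, decoder and classifier heads switched off.
def _example_bert_encoder_only():
    bert, vocab = gluonnlp.model.get_model('bert_12_768_12',
                                           dataset_name='book_corpus_wiki_en_uncased',
                                           pretrained=True, ctx=mx.cpu(),
                                           use_pooler=False, use_decoder=False,
                                           use_classifier=False)
    return bert, vocab
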
def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'glue_stsb'),
             return_all_fields=False):
    self._data_file = {'train': ('train', '9378bd341576810730a5c666ed03122e4c5ecc9f',
                                 '501e55248c6db2a3f416c75932a63693000a82bc'),
                       'dev': ('dev', '529c3e7c36d0807d88d0b2a5d4b954809ddd4228',
                               'f8bcc33b01dfa2e9ba85601d0140020735b8eff3'),
                       'test': ('test', '6284872d6992d8ec6d96320af89c2f46ac076d18',
                                '36553e5e2107b817257232350e95ff0f3271d844')}
    data_file = self._data_file[segment]
    if segment in ['train', 'dev']:
        A_IDX, B_IDX, LABEL_IDX = 7, 8, 9
        field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None
        num_discard_samples = 1
    elif segment == 'test':
        A_IDX, B_IDX = 7, 8
        field_indices = [A_IDX, B_IDX] if not return_all_fields else None
        num_discard_samples = 1

    super(GlueSTSB, self).__init__(root, data_file,
                                   num_discard_samples=num_discard_samples,
                                   field_indices=field_indices)

def __init__(self, segment='train', src_lang='en', tgt_lang='de',
             root=os.path.join(get_home_dir(), 'datasets', 'translation_test')):
    self._supported_segments = ['train', 'val', 'test']
    self._archive_file = {_get_pair_key('en', 'de'):
                          ('translation_test.zip',
                           '14f6c8e31ac6ec84ce469b4c196d60b4c86a179d')}
    # The toy archive ships a single split; val and test deliberately reuse
    # the train files.
    self._data_file = {_get_pair_key('en', 'de'): {
        'train_en': ('train.en', 'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
        'train_de': ('train.de', 'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
        'val_en': ('train.en', 'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
        'val_de': ('train.de', 'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
        'test_en': ('train.en', 'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
        'test_de': ('train.de', 'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
        'vocab_en': ('vocab.en.json', 'c7c6af4603ea70f0a4af2460a622333fbd014050'),
        'vocab_de': ('vocab.de.json', '5b6f1be36a3e3cb9946b86e5d0fc73d164fda99f')}}
    super(TOY, self).__init__('translation_test', segment=segment, src_lang=src_lang,
                              tgt_lang=tgt_lang, root=root)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'glue_qqp'),
             return_all_fields=False):
    self._data_file = {'train': ('train', '494f280d651f168ad96d6cd05f8d4ddc6be73ce9',
                                 '95c01e711ac8dbbda8f67f3a4291e583a72b6988'),
                       'dev': ('dev', '9957b60c4c62f9b98ec91b26a9d43529d2ee285d',
                               '755e0bf2899b8ad315d4bd7d4c85ec51beee5ad0'),
                       'test': ('test', '1e325cc5dbeeb358f9429c619ebe974fc2d1a8ca',
                                '0f50d1a62dd51fe932ba91be08238e47c3e2504a')}
    data_file = self._data_file[segment]
    if segment in ['train', 'dev']:
        A_IDX, B_IDX, LABEL_IDX = 3, 4, 5
        field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None
        num_discard_samples = 1
    elif segment == 'test':
        A_IDX, B_IDX = 1, 2
        field_indices = [A_IDX, B_IDX] if not return_all_fields else None
        num_discard_samples = 1

    # QQP may include broken samples
    super(GlueQQP, self).__init__(root, data_file,
                                  num_discard_samples=num_discard_samples,
                                  field_indices=field_indices, allow_missing=True)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'glue_rte'),
             return_all_fields=False):
    self._data_file = {'train': ('train', 'a23b0633f4f4dfa866c672af2e94f7e07344888f',
                                 'ec2b246745bb5c9d92aee0800684c08902742730'),
                       'dev': ('dev', 'a6cde090d12a10744716304008cf33dd3f0dbfcb',
                               'ade75e0673862dcac9c653efb9f59f51be2749aa'),
                       'test': ('test', '7e4e58a6fa80b1f05e603b4e220524be7976b488',
                                'ddda5c967fb5a4934b429bb52aaa144e70900000')}
    data_file = self._data_file[segment]
    if segment in ['train', 'dev']:
        A_IDX, B_IDX, LABEL_IDX = 1, 2, 3
        field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None
        num_discard_samples = 1
    elif segment == 'test':
        A_IDX, B_IDX = 1, 2
        field_indices = [A_IDX, B_IDX] if not return_all_fields else None
        num_discard_samples = 1

    super(GlueRTE, self).__init__(root, data_file,
                                  num_discard_samples=num_discard_samples,
                                  field_indices=field_indices)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'glue_qnli'),
             return_all_fields=False):
    self._data_file = {'train': ('train', '95fae96fb1ffa6a2804192c9036d3435e63b48e8',
                                 'd90a84eb40c6ba32bc2b34284ceaa962c46f8753'),
                       'dev': ('dev', '5652b9d4d5c8d115c080bcf64101927ea2b3a1e0',
                               'd14a61290301c2a9d26459c4cd036742e8591428'),
                       'test': ('test', '23dfb2f38adb14d3e792dbaecb7f5fd5dfa8db7e',
                                'f3da1a2e471ebfee81d91574b42e0f5d39153c59')}
    data_file = self._data_file[segment]
    if segment in ['train', 'dev']:
        A_IDX, B_IDX, LABEL_IDX = 1, 2, 3
        field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None
        num_discard_samples = 1
    elif segment == 'test':
        A_IDX, B_IDX = 1, 2
        field_indices = [A_IDX, B_IDX] if not return_all_fields else None
        num_discard_samples = 1

    super(GlueQNLI, self).__init__(root, data_file,
                                   num_discard_samples=num_discard_samples,
                                   field_indices=field_indices)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'superglue_wsc')):
    self._segment = segment
    self._data_file = {'train': ('train', 'ed0fe96914cfe1ae8eb9978877273f6baed621cf',
                                 'fa978f6ad4b014b5f5282dee4b6fdfdaeeb0d0df'),
                       'dev': ('dev', 'cebec2f5f00baa686573ae901bb4d919ca5d3483',
                               'ea2413e4e6f628f2bb011c44e1d8bae301375211'),
                       'test': ('test', '3313896f315e0cb2bb1f24f3baecec7fc93124de',
                                'a47024aa81a5e7c9bc6e957b36c97f1d1b5da2fd')}
    data_file = self._data_file[segment]

    if segment in ['train', 'dev']:
        field_keys = ["target", "text",
                      [["span1_index", "span1_text"], ["span2_index", "span2_text"]],
                      "label"]
    elif segment == 'test':
        field_keys = ["target", "text",
                      [["span1_index", "span1_text"], ["span2_index", "span2_text"]]]

    super(SuperGlueWSC, self).__init__(root, data_file, field_keys, task="WSC")

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'glue_wnli'),
             return_all_fields=False):
    self._data_file = {'train': ('train', '8db0004d0e58640751a9f2875dd66c8000504ddb',
                                 'b497281c1d848b619ea8fe427b3a6e4dc8e7fa92'),
                       'dev': ('dev', 'd54834960555073fb497cf2766edb77fb62c3646',
                               '6bbdb866d0cccaac57c3a2505cf53103789b69a9'),
                       'test': ('test', '431e596a1c6627fb168e7741b3e32ef681da3c7b',
                                '6ba8fcf3e5b451c101a3902fb4ba3fc1dea42e50')}
    data_file = self._data_file[segment]
    if segment in ['train', 'dev']:
        A_IDX, B_IDX, LABEL_IDX = 1, 2, 3
        field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None
        num_discard_samples = 1
    elif segment == 'test':
        A_IDX, B_IDX = 1, 2
        field_indices = [A_IDX, B_IDX] if not return_all_fields else None
        num_discard_samples = 1

    super(GlueWNLI, self).__init__(root, data_file,
                                   num_discard_samples=num_discard_samples,
                                   field_indices=field_indices)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'glue_mrpc')):
    self._root = root
    assert segment in ['train', 'dev', 'test'], 'Unsupported segment: %s' % segment
    # dev is derived from the same raw train file by _generate(), hence the
    # shared source hash but distinct final hash.
    self._data_file = {'train': ('msr_paraphrase_train.txt',
                                 '716e0f67af962f08220b7e97d229b293077ef41f',
                                 '131675ffd3d2f04f286049d31cca506c8acba69e'),
                       'dev': ('msr_paraphrase_train.txt',
                               '716e0f67af962f08220b7e97d229b293077ef41f',
                               'e4486577c4cb2e5c2a3fd961eb24f03c623ea02d'),
                       'test': ('msr_paraphrase_test.txt',
                                '4265196c15cf75620b0b592b8b921f543bda7e6c',
                                '3602b2ca26cf574e84183c14d6c0901669ee2d0a')}
    self._generate(segment)
    path = os.path.join(root, '%s.tsv' % segment)
    A_IDX, B_IDX, LABEL_IDX = 3, 4, 0
    if segment == 'test':
        fields = [A_IDX, B_IDX]
    else:
        fields = [A_IDX, B_IDX, LABEL_IDX]
    super(GlueMRPC, self).__init__(path, num_discard_samples=1, field_indices=fields)

def gpt2_345m(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(),
              root=os.path.join(get_home_dir(), 'models'), **kwargs):
    """Generic GPT-2 model.

    The number of layers (L) is 24, the number of units (H) is 1024, and the
    number of self-attention heads (A) is 16.

    Parameters
    ----------
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        Options include 'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    return _get_gpt2_model('gpt2_345m', dataset_name=dataset_name, vocab=vocab,
                           pretrained=pretrained, ctx=ctx, root=root, **kwargs)

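# Usage sketch (added; not in the original source): load the 345M-parameter
# GPT-2 together with its vocabulary; 'openai_webtext' is the dataset the
# _get_gpt2_model helper names as supported.
def _example_gpt2_345m():
    model, vocab = gpt2_345m(dataset_name='openai_webtext', pretrained=True)
    return model, vocab
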
def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'glue_cola'),
             return_all_fields=False):
    self._data_file = {'train': ('train', '662227ed4d98bb96b3495234b650e37826a5ef72',
                                 '7760a9c4b1fb05f6d003475cc7bb0d0118875190'),
                       'dev': ('dev', '6f3f5252b004eab187bf22ab5b0af31e739d3a3f',
                               '30ece4de38e1929545c4154d4c71ad297c7f54b4'),
                       'test': ('test', 'b88180515ad041935793e74e3a76470b0c1b2c50',
                                'f38b43d31bb06accf82a3d5b2fe434a752a74c9f')}
    data_file = self._data_file[segment]
    if segment in ['train', 'dev']:
        A_IDX, LABEL_IDX = 3, 1
        field_indices = [A_IDX, LABEL_IDX] if not return_all_fields else None
        # the CoLA train/dev tsv files carry no header row
        num_discard_samples = 0
    elif segment == 'test':
        A_IDX = 1
        field_indices = [A_IDX] if not return_all_fields else None
        num_discard_samples = 1

    super(GlueCoLA, self).__init__(root, data_file,
                                   num_discard_samples=num_discard_samples,
                                   field_indices=field_indices)

def bert_12_768_12(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(),
                   root=os.path.join(get_home_dir(), 'models'), use_pooler=True,
                   use_decoder=True, use_classifier=True, input_size=None,
                   seq_length=None, **kwargs):
    """Hybrid BERT BASE model.

    The number of layers (L) is 12, the number of units (H) is 768, and the
    number of self-attention heads (A) is 12.

    Parameters
    ----------
    dataset_name : str or None, default None
        Options include 'book_corpus_wiki_en_cased', 'book_corpus_wiki_en_uncased',
        'wiki_cn_cased', 'wiki_multilingual_uncased' and 'wiki_multilingual_cased'.
    vocab : gluonnlp.vocab.BERTVocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'.
    use_pooler : bool, default True
        Whether to include the pooler which converts the encoded sequence tensor of shape
        (batch_size, seq_length, units) to a tensor of shape (batch_size, units)
        for segment level classification task.
    use_decoder : bool, default True
        Whether to include the decoder for masked language model prediction.
    use_classifier : bool, default True
        Whether to include the classifier for next sentence classification.
    input_size : int, default None
        Represents the embedding size of the input.
    seq_length : int, default None
        Stands for the sequence length of the input.

    Returns
    -------
    HybridBERTModel, gluonnlp.vocab.BERTVocab
    """
    return get_hybrid_bert_model(model_name='bert_12_768_12', vocab=vocab,
                                 dataset_name=dataset_name, pretrained=pretrained,
                                 ctx=ctx, use_pooler=use_pooler,
                                 use_decoder=use_decoder,
                                 use_classifier=use_classifier, root=root,
                                 input_size=input_size, seq_length=seq_length,
                                 **kwargs)

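# Usage sketch (added; not in the original source): load the hybrid BERT BASE
# encoder alone by switching off the three task-specific heads.
def _example_bert_12_768_12():
    model, vocab = bert_12_768_12(dataset_name='book_corpus_wiki_en_uncased',
                                  pretrained=True, use_pooler=False,
                                  use_decoder=False, use_classifier=False)
    return model, vocab
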
def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
             dataset_name='book_corpus_wiki_en_uncased', params_path=None,
             max_seq_length=25, batch_size=256, sentencepiece=None,
             root=os.path.join(get_home_dir(), 'models')):
    self.ctx = ctx
    self.dtype = dtype
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.dataset_name = dataset_name

    # use sentencepiece vocab and a checkpoint
    # we need to set dataset_name to None, otherwise it uses the downloaded vocab
    if params_path and sentencepiece:
        dataset_name = None
    else:
        dataset_name = self.dataset_name
    if sentencepiece:
        vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sentencepiece)
    else:
        vocab = None

    self.bert, self.vocab = gluonnlp.model.get_model(model,
                                                     dataset_name=dataset_name,
                                                     pretrained=params_path is None,
                                                     ctx=self.ctx,
                                                     use_pooler=False,
                                                     use_decoder=False,
                                                     use_classifier=False,
                                                     root=root, vocab=vocab)
    self.bert.cast(self.dtype)
    if params_path:
        logger.info('Loading params from %s', params_path)
        self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True,
                                  cast_dtype=True)
    lower = 'uncased' in self.dataset_name
    if sentencepiece:
        self.tokenizer = BERTSPTokenizer(sentencepiece, self.vocab, lower=lower)
    else:
        self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
    self.transform = BERTSentenceTransform(tokenizer=self.tokenizer,
                                           max_seq_length=self.max_seq_length,
                                           pair=False)

def xlnet_cased_l24_h1024_a16(dataset_name: Optional[str] = None,
                              vocab: Optional[nlp.Vocab] = None,
                              tokenizer: Optional[XLNetTokenizer] = None,
                              pretrained: bool = True,
                              ctx: mx.Context = mx.cpu(),
                              root=os.path.join(get_home_dir(), 'models'),
                              do_lower_case=False, **kwargs):
    """XLNet model.

    References:
    Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
    (2019). XLNet: Generalized Autoregressive Pretraining for Language
    Understanding. arXiv preprint arXiv:1906.08237.

    Parameters
    ----------
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        Options include 'books_enwiki_giga5_clueweb2012b_commoncrawl'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    tokenizer : XLNetTokenizer or None, default None
        XLNetTokenizer for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    XLNet, gluonnlp.Vocab, XLNetTokenizer
    """
    kwargs.update(**{
        'hidden_size': 4096,
        'units': 1024,
        'activation': 'approx_gelu',
        'num_heads': 16,
        'num_layers': 24,
    })
    if vocab is None or dataset_name is not None:
        vocab = _load_vocab('xlnet_' + dataset_name, vocab, root)
    net = XLNet(vocab_size=len(vocab), **kwargs)
    if pretrained:
        _load_pretrained_params(net=net, model_name='xlnet_cased_l24_h1024_a16',
                                dataset_name=dataset_name, root=root, ctx=ctx,
                                ignore_extra=not kwargs.get('use_decoder', True))
    if tokenizer is None or dataset_name is not None:
        tokenizer = _get_xlnet_tokenizer(dataset_name, root, do_lower_case)
    return net, vocab, tokenizer

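# Usage sketch (added; not in the original source): load the pretrained XLNet
# together with its vocabulary and sentencepiece tokenizer, using the dataset
# name the docstring above lists.
def _example_xlnet():
    net, vocab, tokenizer = xlnet_cased_l24_h1024_a16(
        dataset_name='books_enwiki_giga5_clueweb2012b_commoncrawl')
    return net, vocab, tokenizer
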
def _get_gpt2_model(model_name=None, dataset_name=None, vocab=None, pretrained=True,
                    ctx=mx.cpu(), root=os.path.join(get_home_dir(), 'models'),
                    **kwargs):
    """Any predefined GPT-2 model.

    Parameters
    ----------
    model_name : str or None, default None
        Options include 'gpt2_117m' and 'gpt2_345m'.
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        The supported dataset for both 'gpt2_117m' and 'gpt2_345m' is
        'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    # Copy so user overrides below do not mutate the shared hparams table.
    predefined_args = gpt2_hparams[model_name].copy()
    mutable_args = frozenset(['dropout'])
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    vocab = _load_vocab(dataset_name, vocab, root)
    # GPT-2. Forward only the kwargs that are not hyper-parameters already
    # taken from predefined_args; passing **kwargs unfiltered would hand a
    # permitted 'dropout' override to GPT2Model twice.
    extra_kwargs = {k: v for k, v in kwargs.items()
                    if k not in ('units', 'max_length', 'num_layers',
                                 'num_heads', 'dropout')}
    net = GPT2Model(units=predefined_args['units'],
                    vocab_size=len(vocab),
                    max_length=predefined_args['max_length'],
                    num_layers=predefined_args['num_layers'],
                    num_heads=predefined_args['num_heads'],
                    dropout=predefined_args['dropout'],
                    **extra_kwargs)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx)
    for i in range(net._num_layers):
        net._ffn_layers[i]._act._support_erf = False
    return net, vocab

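# Usage sketch (added; not in the original source): the plain pretrained path
# through the helper above, as taken by the gpt2_* wrappers.
def _example_get_gpt2_model():
    net, vocab = _get_gpt2_model('gpt2_117m', dataset_name='openai_webtext',
                                 pretrained=True)
    return net, vocab
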
def get_bort_model(model_name=None, dataset_name=None, vocab=None, pretrained=True,
                   ctx=mx.cpu(), use_decoder=True, output_attention=False,
                   output_all_encodings=False,
                   root=os.path.join(get_home_dir(), 'models'), **kwargs):
    # Copy so user overrides below do not mutate the shared hparams table.
    predefined_args = predefined_borts[model_name].copy()
    logging.info('get_bort_model: %s, predefined_args: %s', model_name, predefined_args)
    mutable_args = frozenset(['use_residual', 'dropout', 'embed_dropout', 'word_embed'])
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)

    # encoder
    encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
                          num_layers=predefined_args['num_layers'],
                          units=predefined_args['units'],
                          hidden_size=predefined_args['hidden_size'],
                          max_length=predefined_args['max_length'],
                          num_heads=predefined_args['num_heads'],
                          scaled=predefined_args['scaled'],
                          dropout=predefined_args['dropout'],
                          output_attention=output_attention,
                          output_all_encodings=output_all_encodings,
                          use_residual=predefined_args['use_residual'],
                          activation=predefined_args.get('activation', 'gelu'),
                          layer_norm_eps=predefined_args.get('layer_norm_eps', None))

    from gluonnlp.vocab import Vocab
    bort_vocab = _load_vocab(dataset_name, vocab, root, cls=Vocab)

    net = BortModel(encoder, len(bort_vocab),
                    units=predefined_args['units'],
                    embed_size=predefined_args['embed_size'],
                    embed_dropout=predefined_args['embed_dropout'],
                    word_embed=predefined_args['word_embed'],
                    use_decoder=use_decoder)
    if pretrained:
        ignore_extra = not use_decoder
        _load_pretrained_params(net, model_name, dataset_name, root, ctx,
                                ignore_extra=ignore_extra, allow_missing=False)
    return net, bort_vocab

def get_dataset(self, segment='train',
                root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data')):
    """Get the corresponding dataset for ChnSentiCorp.

    Parameters
    ----------
    segment : str, default 'train'
        Dataset segments. Options are 'dev', 'test', 'train'.
    root : str, default $BAIDU_ERNIE_DATA_DIR/
        Path to the folder which stores the dataset.
    """
    return BaiduErnieChnSentiCorp(segment, root=root)

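# Usage sketch (added; not in the original source): the helper above forwards
# to BaiduErnieChnSentiCorp, so the dev split can equivalently be fetched as:
def _example_chnsenticorp():
    dev_data = BaiduErnieChnSentiCorp(
        'dev', root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data'))
    return dev_data
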
def _get_gpt2_model(model_name=None, dataset_name=None, vocab=None, pretrained=True,
                    ctx=mx.cpu(), root=os.path.join(get_home_dir(), 'models'),
                    hparam_allow_override=False, **kwargs):
    """Any predefined GPT-2 model.

    Parameters
    ----------
    model_name : str or None, default None
        Options include 'gpt2_117m' and 'gpt2_345m'.
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        The supported dataset for both 'gpt2_117m' and 'gpt2_345m' is
        'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters. MXNET_HOME defaults to '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model (e.g. the
        number of layers, hidden units) can be overridden.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    predefined_args = gpt2_hparams[model_name].copy()
    if not hparam_allow_override:
        mutable_args = frozenset(['dropout'])
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    vocab = _load_vocab(dataset_name, vocab, root)
    # GPT2
    net = GPT2Model(vocab_size=len(vocab), **predefined_args)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx)
    return net, vocab

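# Usage sketch (added; not in the original source): overriding a predefined
# hyper-parameter requires hparam_allow_override=True, and pretrained must
# then be False since the published weights no longer fit the architecture
# (num_layers=6 is purely illustrative).
def _example_gpt2_hparam_override():
    net, vocab = _get_gpt2_model('gpt2_117m', dataset_name='openai_webtext',
                                 pretrained=False, hparam_allow_override=True,
                                 num_layers=6)
    return net, vocab
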
def RACEHash(dataset_location, task_name, segment='test'):
    """Because of the way we've set up RACE-H/RACE-M, we need a way to
    translate predictions back into viable answers.
    """
    if dataset_location is None:
        dataset_location = os.path.join(get_home_dir(), 'datasets', 'race')
    task = "high" if task_name[-1] == "H" else "middle"
    test_dataset_location = os.path.join(dataset_location, segment, task)
    filenames = [os.path.expanduser(f) for f in os.listdir(test_dataset_location)
                 if fnmatch.fnmatch(f, '*.txt')]
    filenames.sort()
    dataset = []
    for fname in filenames:
        # Each file holds one JSON object per line; keep key order stable.
        with open(os.path.join(test_dataset_location, fname), 'r') as f:
            dataset += [json.loads(l, object_pairs_hook=OrderedDict) for l in f]
    return dataset

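# Usage sketch (added; not in the original source, and the task_name value is
# illustrative): rebuild the ordered test references for RACE-H from the
# default dataset location.
def _example_race_hash():
    return RACEHash(None, 'RACE-H', segment='test')
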
def __init__(self, segment='test',
             root=os.path.join(get_home_dir(), 'datasets', 'superglue_ax_b')):
    if segment in ['train', 'dev']:
        raise ValueError('Only "test" is supported for AX-b')
    elif segment == 'test':
        field_keys = ["sentence1", "sentence2"]

    self._segment = segment
    self._data_file = {'test': ('AX-b', '398c5a376eb436f790723cd217ac040334140000',
                                '50fd8ac409897b652daa4b246917097c3c394bc8')}
    data_file = self._data_file[segment]

    super(SuperGlueAXb, self).__init__(root, data_file, field_keys)

def __init__(self, segment='test',
             root=os.path.join(get_home_dir(), 'datasets', 'superglue_ax_g')):
    if segment in ['train', 'dev']:
        raise ValueError('Only "test" is supported for AX-g')
    elif segment == 'test':
        field_keys = ["premise", "hypothesis"]

    self._segment = segment
    self._data_file = {'test': ('AX-g', 'd8c92498496854807dfeacd344eddf466d7f468a',
                                '8a8cbfe00fd88776a2a2f20b477e5b0c6cc8ebae')}
    data_file = self._data_file[segment]

    super(SuperGlueAXg, self).__init__(root, data_file, field_keys)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'superglue_record')):
    self._segment = segment
    self._data_file = {'train': ('train', '047282c912535c9a3bcea519935fde882feb619d',
                                 '65592074cefde2ecd1b27ce7b35eb8beb86c691a'),
                       'dev': ('dev', '442d8470bff2c9295231cd10262a7abf401edc64',
                               '9d1850e4dfe2eca3b71bfea191d5f4b412c65309'),
                       'test': ('test', 'fc639a18fa87befdc52f14c1092fb40475bf50d0',
                                'b79b22f54b5a49f98fecd05751b122ccc6947c81')}
    data_file = self._data_file[segment]
    field_keys = []

    super(SuperGlueReCoRD, self).__init__(root, data_file, field_keys, task="ReCoRD")

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'superglue_multirc')):
    self._segment = segment
    # This implementation needs the actual SuperGLUE data, available at:
    # https://github.com/nyu-mll/jiant/blob/master/scripts/download_superglue_data.py
    self._data_file = {'train': ('train', '', ''),
                       'dev': ('dev', '', ''),
                       'test': ('test', '', '')}
    data_file = self._data_file[segment]
    field_keys = []

    super(SuperGlueMultiRC, self).__init__(root, data_file, field_keys, task="MultiRC")

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data'),
             return_all_fields=False):
    A_IDX, B_IDX, LABEL_IDX = 0, 1, 2
    if segment in ['train', 'dev']:
        field_indices = [A_IDX, B_IDX, LABEL_IDX] if not return_all_fields else None
        num_discard_samples = 1
    elif segment == 'test':
        field_indices = [A_IDX, B_IDX] if not return_all_fields else None
        num_discard_samples = 1

    super(BaiduErnieXNLI, self).__init__(root, 'xnli', segment,
                                         num_discard_samples=num_discard_samples,
                                         field_indices=field_indices)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'superglue_wic')):
    self._segment = segment
    self._data_file = {'train': ('train', 'ec1e265bbdcde1d8da0b56948ed30d86874b1f12',
                                 '831a58c553def448e1b1d0a8a36e2b987c81bc9c'),
                       'dev': ('dev', '2046c43e614d98d538a03924335daae7881f77cf',
                               '73b71136a2dc2eeb3be7ab455a08f20b8dbe7526'),
                       'test': ('test', '77af78a49aac602b7bbf080a03b644167b781ba9',
                                '1be93932d46c8f8dc665eb7af6703c56ca1b1e08')}
    data_file = self._data_file[segment]

    # We'll hope the hypernymy is clear from the sentence
    if segment in ['train', 'dev']:
        field_keys = ["sentence1", "sentence2", "label"]
    elif segment == 'test':
        field_keys = ["sentence1", "sentence2"]

    super(SuperGlueWiC, self).__init__(root, data_file, field_keys)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'superglue_boolq')):
    self._segment = segment
    self._data_file = {'train': ('train', '89507ff3015c3212b72318fb932cfb6d4e8417ef',
                                 'd5be523290f49fc0f21f4375900451fb803817c0'),
                       'dev': ('dev', 'fd39562fc2c9d0b2b8289d02a8cf82aa151d0ad4',
                               '9b09ece2b1974e4da20f0173454ba82ff8ee1710'),
                       'test': ('test', 'a805d4bd03112366d548473a6848601c042667d3',
                                '98c308620c6d6c0768ba093858c92e5a5550ce9b')}
    data_file = self._data_file[segment]

    if segment in ['train', 'dev']:
        field_keys = ["passage", "question", "label"]
    elif segment == 'test':
        field_keys = ["passage", "question"]

    super(SuperGlueBoolQ, self).__init__(root, data_file, field_keys)

def __init__(self, segment='train',
             root=os.path.join(get_home_dir(), 'datasets', 'superglue_rte')):
    self._segment = segment
    self._data_file = {'train': ('train', 'a4471b47b23f6d8bc2e89b2ccdcf9a3a987c69a1',
                                 '01ebec38ff3d2fdd849d3b33c2a83154d1476690'),
                       'dev': ('dev', '17f23360f77f04d03aee6c42a27a61a6378f1fd9',
                               '410f8607d9fc46572c03f5488387327b33589069'),
                       'test': ('test', 'ef2de5f8351ef80036c4aeff9f3b46106b4f2835',
                                '69f9d9b4089d0db5f0605eeaebc1c7abc044336b')}
    data_file = self._data_file[segment]

    if segment in ['train', 'dev']:
        field_keys = ["premise", "hypothesis", "label"]
    elif segment == 'test':
        field_keys = ["premise", "hypothesis"]

    super(SuperGlueRTE, self).__init__(root, data_file, field_keys)