Example #1
def test_save_load_cache_models():
    cache_language_models = ['awd_lstm_lm_1150', 'awd_lstm_lm_600', 'standard_lstm_lm_200',
                             'standard_lstm_lm_650', 'standard_lstm_lm_1500']
    datasets = ['wikitext-2']
    for name in cache_language_models:
        for dataset_name in datasets:
            cache_cell = nlp.model.train.get_cache_model(name, dataset_name, window=1, theta=0.6,
                                                         lambdas=0.2)
            print(cache_cell)
            cache_cell.save_parameters(
                os.path.join(get_home_dir(), 'models', name + '-' + dataset_name + '.params'))
            cache_cell.load_parameters(
                os.path.join(get_home_dir(), 'models', name + '-' + dataset_name + '.params'))
Example #2
def test_get_cache_model_noncache_models():
    language_models_params = {
        'awd_lstm_lm_1150': 'awd_lstm_lm_1150_wikitext-2-f9562ed0.params',
        'awd_lstm_lm_600': 'awd_lstm_lm_600_wikitext-2-e952becc.params',
        'standard_lstm_lm_200': 'standard_lstm_lm_200_wikitext-2-b233c700.params',
        'standard_lstm_lm_650': 'standard_lstm_lm_650_wikitext-2-631f3904.params',
        'standard_lstm_lm_1500': 'standard_lstm_lm_1500_wikitext-2-a4163513.params'}
    datasets = ['wikitext-2']
    for name in language_models_params.keys():
        for dataset_name in datasets:
            _, vocab = nlp.model.get_model(name=name, dataset_name=dataset_name, pretrained=True)
            ntokens = len(vocab)

            cache_cell_0 = nlp.model.train.get_cache_model(name, dataset_name, window=1, theta=0.6,
                                                           lambdas=0.2)
            print(cache_cell_0)

            model, _ = nlp.model.get_model(name=name, dataset_name=dataset_name, pretrained=True)
            cache_cell_1 = nlp.model.train.CacheCell(
                model, ntokens, window=1, theta=0.6, lambdas=0.2)
            cache_cell_1.load_parameters(
                os.path.join(get_home_dir(), 'models', language_models_params.get(name)))
            print(cache_cell_1)

            outs0, word_history0, cache_history0, hidden0 = cache_cell_0(
                mx.nd.arange(10).reshape(10, 1), mx.nd.arange(10).reshape(10, 1), None, None)
            outs1, word_history1, cache_history1, hidden1 = cache_cell_1(
                mx.nd.arange(10).reshape(10, 1), mx.nd.arange(10).reshape(10, 1), None, None)

            assert outs0.shape == outs1.shape, outs0.shape
            assert len(word_history0) == len(word_history1), len(word_history0)
            assert len(cache_history0) == len(cache_history1), len(cache_history0)
            assert len(hidden0) == len(hidden1), len(hidden0)
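A minimal sketch of the cache-cell API exercised by the two tests above, using the same imports (import mxnet as mx, import gluonnlp as nlp); the window/theta/lambdas values are the ones from the tests and are illustrative only:

import mxnet as mx
import gluonnlp as nlp

# Build a cache-augmented language model on top of a pretrained AWD-LSTM.
cache_cell = nlp.model.train.get_cache_model('awd_lstm_lm_600', 'wikitext-2',
                                             window=1, theta=0.6, lambdas=0.2)

# Token-id inputs and targets (the tests use shape (10, 1): sequence length 10,
# batch size 1). The first call starts with empty word/cache histories and no
# hidden state, so the last two arguments are None.
inputs = mx.nd.arange(10).reshape(10, 1)
targets = mx.nd.arange(10).reshape(10, 1)
outs, word_history, cache_history, hidden = cache_cell(inputs, targets, None, None)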
Example #3
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'glue_sst'),
                 return_all_fields=False):
        self._data_file = {
            'train': ('train', 'bcde781bed5caa30d5e9a9d24e5c826965ed02a2',
                      'ffbb67a55e27525e925b79fee110ca19585d70ca'),
            'dev': ('dev', '85698e465ff6573fb80d0b34229c76df84cd766b',
                    'e166f986cec68fd4cca0ae5ce5869b917f88a2fa'),
            'test': ('test', 'efac1c275553ed78500e9b8d8629408f5f867b20',
                     '3ce8041182bf82dbbbbfe13738b39d3c69722744')
        }
        data_file = self._data_file[segment]
        if segment in ['train', 'dev']:
            A_IDX, LABEL_IDX = 0, 1
            field_indices = [A_IDX, LABEL_IDX
                             ] if not return_all_fields else None
            num_discard_samples = 1
        elif segment == 'test':
            A_IDX = 1
            field_indices = [A_IDX] if not return_all_fields else None
            num_discard_samples = 1

        super(GlueSST2, self).__init__(root,
                                       data_file,
                                       num_discard_samples=num_discard_samples,
                                       field_indices=field_indices)
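For context, a small usage sketch of the dataset class whose constructor is shown above, assuming it is exposed as gluonnlp.data.GlueSST2 (as in GluonNLP 0.x) and that SST-2 can be downloaded to the default root under get_home_dir():

import gluonnlp as nlp

# Downloads and caches SST-2 under ~/.mxnet/datasets/glue_sst by default.
dev = nlp.data.GlueSST2(segment='dev')
sentence, label = dev[0]   # train/dev samples keep the [sentence, label] fields
print(len(dev), sentence, label)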
Example #4
    def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased', params_path=None,
                 max_seq_length=25, batch_size=256,
                 root=os.path.join(get_home_dir(), 'models')):
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name

        # Don't download the pretrained models if we have a parameter path
        self.bert, self.vocab = gluonnlp.model.get_model(model,
                                                         dataset_name=self.dataset_name,
                                                         pretrained=params_path is None,
                                                         ctx=self.ctx,
                                                         use_pooler=False,
                                                         use_decoder=False,
                                                         use_classifier=False,
                                                         root=root)
        self.bert.cast(self.dtype)

        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)

        lower = 'uncased' in self.dataset_name
        self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(tokenizer=self.tokenizer,
                                               max_seq_length=self.max_seq_length,
                                               pair=False)
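The same building blocks can be used directly outside this wrapper; a hedged sketch with the model and dataset names from the constructor above, assuming BERTTokenizer and BERTSentenceTransform are the gluonnlp.data classes of the same names:

import mxnet as mx
import gluonnlp
from gluonnlp.data import BERTTokenizer, BERTSentenceTransform

bert, vocab = gluonnlp.model.get_model('bert_12_768_12',
                                       dataset_name='book_corpus_wiki_en_uncased',
                                       pretrained=True, ctx=mx.cpu(),
                                       use_pooler=False, use_decoder=False,
                                       use_classifier=False)
tokenizer = BERTTokenizer(vocab, lower=True)
transform = BERTSentenceTransform(tokenizer, max_seq_length=25, pair=False)

# The transform yields (token_ids, valid_length, segment_ids) for one sentence.
token_ids, valid_length, segment_ids = transform(('GluonNLP makes BERT easy',))
encoding = bert(mx.nd.array([token_ids]), mx.nd.array([segment_ids]),
                mx.nd.array([valid_length]))
print(encoding.shape)   # (batch_size, seq_length, 768)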
Example #5
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'glue_stsb'),
                 return_all_fields=False):
        self._data_file = {
            'train': ('train', '9378bd341576810730a5c666ed03122e4c5ecc9f',
                      '501e55248c6db2a3f416c75932a63693000a82bc'),
            'dev': ('dev', '529c3e7c36d0807d88d0b2a5d4b954809ddd4228',
                    'f8bcc33b01dfa2e9ba85601d0140020735b8eff3'),
            'test': ('test', '6284872d6992d8ec6d96320af89c2f46ac076d18',
                     '36553e5e2107b817257232350e95ff0f3271d844')
        }
        data_file = self._data_file[segment]
        if segment in ['train', 'dev']:
            A_IDX, B_IDX, LABEL_IDX = 7, 8, 9
            field_indices = [A_IDX, B_IDX, LABEL_IDX
                             ] if not return_all_fields else None
            num_discard_samples = 1
        elif segment == 'test':
            A_IDX, B_IDX = 7, 8
            field_indices = [A_IDX, B_IDX] if not return_all_fields else None
            num_discard_samples = 1

        super(GlueSTSB, self).__init__(root,
                                       data_file,
                                       num_discard_samples=num_discard_samples,
                                       field_indices=field_indices)
Example #6
 def __init__(self, segment='train', src_lang='en', tgt_lang='de',
              root=os.path.join(get_home_dir(), 'datasets', 'translation_test')):
     self._supported_segments = ['train', 'val', 'test']
     self._archive_file = {_get_pair_key('en', 'de'):
                               ('translation_test.zip',
                                '14f6c8e31ac6ec84ce469b4c196d60b4c86a179d')}
     self._data_file = {_get_pair_key('en', 'de'):
                            {'train_en': ('train.en',
                                          'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
                             'train_de': ('train.de',
                                          'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
                             'val_en': ('train.en',
                                        'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
                             'val_de': ('train.de',
                                        'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
                             'test_en': ('train.en',
                                         'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
                             'test_de': ('train.de',
                                         'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
                             'vocab_en': ('vocab.en.json',
                                          'c7c6af4603ea70f0a4af2460a622333fbd014050'),
                             'vocab_de' : ('vocab.de.json',
                                           '5b6f1be36a3e3cb9946b86e5d0fc73d164fda99f')}}
     super(TOY, self).__init__('translation_test', segment=segment, src_lang=src_lang,
                               tgt_lang=tgt_lang, root=root)
Example #7
 def __init__(self,
              segment='train',
              root=os.path.join(get_home_dir(), 'datasets', 'glue_qqp'),
              return_all_fields=False):
     self._data_file = {
         'train': ('train', '494f280d651f168ad96d6cd05f8d4ddc6be73ce9',
                   '95c01e711ac8dbbda8f67f3a4291e583a72b6988'),
         'dev': ('dev', '9957b60c4c62f9b98ec91b26a9d43529d2ee285d',
                 '755e0bf2899b8ad315d4bd7d4c85ec51beee5ad0'),
         'test': ('test', '1e325cc5dbeeb358f9429c619ebe974fc2d1a8ca',
                  '0f50d1a62dd51fe932ba91be08238e47c3e2504a')
     }
     data_file = self._data_file[segment]
     if segment in ['train', 'dev']:
         A_IDX, B_IDX, LABEL_IDX = 3, 4, 5
         field_indices = [A_IDX, B_IDX, LABEL_IDX
                          ] if not return_all_fields else None
         num_discard_samples = 1
     elif segment == 'test':
          A_IDX, B_IDX = 1, 2
         field_indices = [A_IDX, B_IDX] if not return_all_fields else None
         num_discard_samples = 1
     # QQP may include broken samples
     super(GlueQQP, self).__init__(root,
                                   data_file,
                                   num_discard_samples=num_discard_samples,
                                   field_indices=field_indices,
                                   allow_missing=True)
Example #8
 def __init__(self,
              segment='train',
              root=os.path.join(get_home_dir(), 'datasets', 'glue_rte'),
              return_all_fields=False):
     self._data_file = {
         'train': ('train', 'a23b0633f4f4dfa866c672af2e94f7e07344888f',
                   'ec2b246745bb5c9d92aee0800684c08902742730'),
         'dev': ('dev', 'a6cde090d12a10744716304008cf33dd3f0dbfcb',
                 'ade75e0673862dcac9c653efb9f59f51be2749aa'),
         'test': ('test', '7e4e58a6fa80b1f05e603b4e220524be7976b488',
                  'ddda5c967fb5a4934b429bb52aaa144e70900000')
     }
     data_file = self._data_file[segment]
     if segment in ['train', 'dev']:
         A_IDX, B_IDX, LABEL_IDX = 1, 2, 3
         field_indices = [A_IDX, B_IDX, LABEL_IDX
                          ] if not return_all_fields else None
         num_discard_samples = 1
     elif segment == 'test':
          A_IDX, B_IDX = 1, 2
         field_indices = [A_IDX, B_IDX] if not return_all_fields else None
         num_discard_samples = 1
     super(GlueRTE, self).__init__(root,
                                   data_file,
                                   num_discard_samples=num_discard_samples,
                                   field_indices=field_indices)
Example #9
 def __init__(self,
              segment='train',
              root=os.path.join(get_home_dir(), 'datasets', 'glue_qnli'),
              return_all_fields=False):
     self._data_file = {
         'train': ('train', '95fae96fb1ffa6a2804192c9036d3435e63b48e8',
                   'd90a84eb40c6ba32bc2b34284ceaa962c46f8753'),
         'dev': ('dev', '5652b9d4d5c8d115c080bcf64101927ea2b3a1e0',
                 'd14a61290301c2a9d26459c4cd036742e8591428'),
         'test': ('test', '23dfb2f38adb14d3e792dbaecb7f5fd5dfa8db7e',
                  'f3da1a2e471ebfee81d91574b42e0f5d39153c59')
     }
     data_file = self._data_file[segment]
     if segment in ['train', 'dev']:
         A_IDX, B_IDX, LABEL_IDX = 1, 2, 3
         field_indices = [A_IDX, B_IDX, LABEL_IDX
                          ] if not return_all_fields else None
         num_discard_samples = 1
     elif segment == 'test':
          A_IDX, B_IDX = 1, 2
         field_indices = [A_IDX, B_IDX] if not return_all_fields else None
         num_discard_samples = 1
     super(GlueQNLI, self).__init__(root,
                                    data_file,
                                    num_discard_samples=num_discard_samples,
                                    field_indices=field_indices)
Example #10
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets',
                                   'superglue_wsc')):
        self._segment = segment
        self._data_file = {
            'train': ('train', 'ed0fe96914cfe1ae8eb9978877273f6baed621cf',
                      'fa978f6ad4b014b5f5282dee4b6fdfdaeeb0d0df'),
            'dev': ('dev', 'cebec2f5f00baa686573ae901bb4d919ca5d3483',
                    'ea2413e4e6f628f2bb011c44e1d8bae301375211'),
            'test': ('test', '3313896f315e0cb2bb1f24f3baecec7fc93124de',
                     'a47024aa81a5e7c9bc6e957b36c97f1d1b5da2fd')
        }
        data_file = self._data_file[segment]

        if segment in ['train', 'dev']:
            field_keys = [
                "target", "text",
                [["span1_index", "span1_text"], ["span2_index", "span2_text"]],
                "label"
            ]
        elif segment == 'test':
            field_keys = [
                "target", "text",
                [["span1_index", "span1_text"], ["span2_index", "span2_text"]]
            ]

        super(SuperGlueWSC, self).__init__(root,
                                           data_file,
                                           field_keys,
                                           task="WSC")
Example #11
 def __init__(self,
              segment='train',
              root=os.path.join(get_home_dir(), 'datasets', 'glue_wnli'),
              return_all_fields=False):
     self._data_file = {
         'train': ('train', '8db0004d0e58640751a9f2875dd66c8000504ddb',
                   'b497281c1d848b619ea8fe427b3a6e4dc8e7fa92'),
         'dev': ('dev', 'd54834960555073fb497cf2766edb77fb62c3646',
                 '6bbdb866d0cccaac57c3a2505cf53103789b69a9'),
         'test': ('test', '431e596a1c6627fb168e7741b3e32ef681da3c7b',
                  '6ba8fcf3e5b451c101a3902fb4ba3fc1dea42e50')
     }
     data_file = self._data_file[segment]
     if segment in ['train', 'dev']:
         A_IDX, B_IDX, LABEL_IDX = 1, 2, 3
         field_indices = [A_IDX, B_IDX, LABEL_IDX
                          ] if not return_all_fields else None
         num_discard_samples = 1
     elif segment == 'test':
          A_IDX, B_IDX = 1, 2
         field_indices = [A_IDX, B_IDX] if not return_all_fields else None
         num_discard_samples = 1
     super(GlueWNLI, self).__init__(root,
                                    data_file,
                                    num_discard_samples=num_discard_samples,
                                    field_indices=field_indices)
Example #12
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'glue_mrpc')):
        self._root = root
        assert segment in ['train', 'dev',
                           'test'], 'Unsupported segment: %s' % segment
        self._data_file = {
            'train': ('msr_paraphrase_train.txt',
                      '716e0f67af962f08220b7e97d229b293077ef41f',
                      '131675ffd3d2f04f286049d31cca506c8acba69e'),
            'dev': ('msr_paraphrase_train.txt',
                    '716e0f67af962f08220b7e97d229b293077ef41f',
                    'e4486577c4cb2e5c2a3fd961eb24f03c623ea02d'),
            'test': ('msr_paraphrase_test.txt',
                     '4265196c15cf75620b0b592b8b921f543bda7e6c',
                     '3602b2ca26cf574e84183c14d6c0901669ee2d0a')
        }

        self._generate(segment)
        path = os.path.join(root, '%s.tsv' % segment)
        A_IDX, B_IDX, LABEL_IDX = 3, 4, 0
        if segment == 'test':
            fields = [A_IDX, B_IDX]
        else:
            fields = [A_IDX, B_IDX, LABEL_IDX]
        super(GlueMRPC, self).__init__(path,
                                       num_discard_samples=1,
                                       field_indices=fields)
Example #13
def gpt2_345m(dataset_name=None, vocab=None, pretrained=True, ctx=mx.cpu(),
              root=os.path.join(get_home_dir(), 'models'), **kwargs):
    """Generic GPT-2 model.

    The number of layers (L) is 24, number of units (H) is 1024, and the
    number of self-attention heads (A) is 16.

    Parameters
    ----------
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        Options include 'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    return _get_gpt2_model('gpt2_345m', dataset_name=dataset_name, vocab=vocab,
                           pretrained=pretrained, ctx=ctx, root=root,
                           **kwargs)
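A hedged usage sketch for the function above; the import path is an assumption, since depending on the GluonNLP version gpt2_345m lives in the text-generation scripts rather than in gluonnlp.model:

import mxnet as mx

# Assumes gpt2_345m from the module above is importable in the current scope.
model, vocab = gpt2_345m(dataset_name='openai_webtext', pretrained=True, ctx=mx.cpu())
print(model)        # GPT2Model with 24 layers, 1024 units and 16 attention heads
print(len(vocab))   # size of the openai_webtext BPE vocabulary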
Example #14
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets', 'glue_cola'),
                 return_all_fields=False):
        self._data_file = {
            'train': ('train', '662227ed4d98bb96b3495234b650e37826a5ef72',
                      '7760a9c4b1fb05f6d003475cc7bb0d0118875190'),
            'dev': ('dev', '6f3f5252b004eab187bf22ab5b0af31e739d3a3f',
                    '30ece4de38e1929545c4154d4c71ad297c7f54b4'),
            'test': ('test', 'b88180515ad041935793e74e3a76470b0c1b2c50',
                     'f38b43d31bb06accf82a3d5b2fe434a752a74c9f')
        }
        data_file = self._data_file[segment]
        if segment in ['train', 'dev']:
            A_IDX, LABEL_IDX = 3, 1
            field_indices = [A_IDX, LABEL_IDX
                             ] if not return_all_fields else None
            num_discard_samples = 0
        elif segment == 'test':
            A_IDX = 1
            field_indices = [A_IDX] if not return_all_fields else None
            num_discard_samples = 1

        super(GlueCoLA, self).__init__(root,
                                       data_file,
                                       num_discard_samples=num_discard_samples,
                                       field_indices=field_indices)
Example #15
def bert_12_768_12(dataset_name=None,
                   vocab=None,
                   pretrained=True,
                   ctx=mx.cpu(),
                   root=os.path.join(get_home_dir(), 'models'),
                   use_pooler=True,
                   use_decoder=True,
                   use_classifier=True,
                   input_size=None,
                   seq_length=None,
                   **kwargs):
    """Hybrid BERT BASE model.

    The number of layers (L) is 12, number of units (H) is 768, and the
    number of self-attention heads (A) is 12.

    Parameters
    ----------
    dataset_name : str or None, default None
        Options include 'book_corpus_wiki_en_cased', 'book_corpus_wiki_en_uncased',
        'wiki_cn_cased', 'wiki_multilingual_uncased' and 'wiki_multilingual_cased'.
    vocab : gluonnlp.vocab.BERTVocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    use_pooler : bool, default True
        Whether to include the pooler which converts the encoded sequence tensor of shape
        (batch_size, seq_length, units) to a tensor of shape (batch_size, units)
        for segment-level classification tasks.
    use_decoder : bool, default True
        Whether to include the decoder for masked language model prediction.
    use_classifier : bool, default True
        Whether to include the classifier for next sentence classification.
    input_size : int, default None
        Represents the embedding size of the input.
    seq_length : int, default None
        Stands for the sequence length of the input.

    Returns
    -------
    HybridBERTModel, gluonnlp.vocab.BERTVocab
    """
    return get_hybrid_bert_model(model_name='bert_12_768_12',
                                 vocab=vocab,
                                 dataset_name=dataset_name,
                                 pretrained=pretrained,
                                 ctx=ctx,
                                 use_pooler=use_pooler,
                                 use_decoder=use_decoder,
                                 use_classifier=use_classifier,
                                 root=root,
                                 input_size=input_size,
                                 seq_length=seq_length,
                                 **kwargs)
Example #16
    def __init__(self,
                 ctx=mx.cpu(),
                 dtype='float32',
                 model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased',
                 params_path=None,
                 max_seq_length=25,
                 batch_size=256,
                 sentencepiece=None,
                 root=os.path.join(get_home_dir(), 'models')):
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name

        # use sentencepiece vocab and a checkpoint
        # we need to set dataset_name to None, otherwise it uses the downloaded vocab
        if params_path and sentencepiece:
            dataset_name = None
        else:
            dataset_name = self.dataset_name
        if sentencepiece:
            vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sentencepiece)
        else:
            vocab = None

        self.bert, self.vocab = gluonnlp.model.get_model(
            model,
            dataset_name=dataset_name,
            pretrained=params_path is None,
            ctx=self.ctx,
            use_pooler=False,
            use_decoder=False,
            use_classifier=False,
            root=root,
            vocab=vocab)

        self.bert.cast(self.dtype)
        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path,
                                      ctx=ctx,
                                      ignore_extra=True,
                                      cast_dtype=True)

        lower = 'uncased' in self.dataset_name
        if sentencepiece:
            self.tokenizer = BERTSPTokenizer(sentencepiece,
                                             self.vocab,
                                             lower=lower)
        else:
            self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            pair=False)
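When a custom checkpoint and SentencePiece model are supplied, the vocabulary is built from the SentencePiece file instead of being downloaded; a minimal sketch with hypothetical file paths:

import gluonnlp

# 'my_tokenizer.model' is a placeholder path to a trained SentencePiece model.
vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece('my_tokenizer.model')
tokenizer = gluonnlp.data.BERTSPTokenizer('my_tokenizer.model', vocab, lower=True)
print(tokenizer('a sample sentence'))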
Example #17
def xlnet_cased_l24_h1024_a16(dataset_name: Optional[str] = None, vocab: Optional[nlp.Vocab] = None,
                              tokenizer: Optional[XLNetTokenizer] = None, pretrained: bool = True,
                              ctx: mx.Context = mx.cpu(),
                              root=os.path.join(get_home_dir(), 'models'),
                              do_lower_case=False, **kwargs):
    """XLNet model.

    References:
    Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
    (2019). XLNet: Generalized Autoregressive Pretraining for Language
    Understanding. arXiv preprint arXiv:1906.08237.


    Parameters
    ----------
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        Options include 'books_enwiki_giga5_clueweb2012b_commoncrawl'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    tokenizer : XLNetTokenizer or None, default None
        XLNetTokenizer for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    XLNet, gluonnlp.Vocab, XLNetTokenizer

    """
    kwargs.update(**{
        'hidden_size': 4096,
        'units': 1024,
        'activation': 'approx_gelu',
        'num_heads': 16,
        'num_layers': 24,
    })
    if vocab is None or dataset_name is not None:
        vocab = _load_vocab('xlnet_' + dataset_name, vocab, root)
    net = XLNet(vocab_size=len(vocab), **kwargs)
    if pretrained:
        _load_pretrained_params(net=net, model_name='xlnet_cased_l24_h1024_a16',
                                dataset_name=dataset_name, root=root, ctx=ctx,
                                ignore_extra=not kwargs.get('use_decoder', True))
    if tokenizer is None or dataset_name is not None:
        tokenizer = _get_xlnet_tokenizer(dataset_name, root, do_lower_case)
    return net, vocab, tokenizer
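A hedged usage sketch for the XLNet helper above; the function ships with the GluonNLP language-model scripts rather than the core package, so the import path is an assumption:

import mxnet as mx

# Assumes xlnet_cased_l24_h1024_a16 from the module above is importable.
net, vocab, tokenizer = xlnet_cased_l24_h1024_a16(
    dataset_name='books_enwiki_giga5_clueweb2012b_commoncrawl',
    pretrained=True, ctx=mx.cpu())
print(len(vocab))   # vocabulary size of the pretrained SentencePiece model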
Example #18
def _get_gpt2_model(model_name=None,
                    dataset_name=None,
                    vocab=None,
                    pretrained=True,
                    ctx=mx.cpu(),
                    root=os.path.join(get_home_dir(), 'models'),
                    **kwargs):
    """Any predefined GPT-2 model.

    Parameters
    ----------
    model_name : str or None, default None
        Options include 'gpt2_117m' and 'gpt2_345m'.
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        The supported dataset for model_name of either gpt2_117m or
        gpt2_345m is 'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    predefined_args = gpt2_hparams[model_name]
    mutable_args = ['dropout']
    mutable_args = frozenset(mutable_args)
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    vocab = _load_vocab(dataset_name, vocab, root)
    # GPT-2
    net = GPT2Model(units=predefined_args['units'],
                    vocab_size=len(vocab),
                    max_length=predefined_args['max_length'],
                    num_layers=predefined_args['num_layers'],
                    num_heads=predefined_args['num_heads'],
                    dropout=predefined_args['dropout'],
                    **kwargs)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx)
    for i in range(net._num_layers):
        net._ffn_layers[i]._act._support_erf = False
    return net, vocab
Example #19
def get_bort_model(model_name=None,
                   dataset_name=None,
                   vocab=None,
                   pretrained=True,
                   ctx=mx.cpu(),
                   use_decoder=True,
                   output_attention=False,
                   output_all_encodings=False,
                   root=os.path.join(get_home_dir(), 'models'),
                   **kwargs):
    predefined_args = predefined_borts[model_name]
    logging.info(f"get_bort_model: {model_name}")
    mutable_args = ['use_residual', 'dropout', 'embed_dropout', 'word_embed']
    mutable_args = frozenset(mutable_args)
    print("model_name: ", model_name, ", predefined_args: ", predefined_args)
    assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
        'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    # encoder
    encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
                          num_layers=predefined_args['num_layers'],
                          units=predefined_args['units'],
                          hidden_size=predefined_args['hidden_size'],
                          max_length=predefined_args['max_length'],
                          num_heads=predefined_args['num_heads'],
                          scaled=predefined_args['scaled'],
                          dropout=predefined_args['dropout'],
                          output_attention=output_attention,
                          output_all_encodings=output_all_encodings,
                          use_residual=predefined_args['use_residual'],
                          activation=predefined_args.get('activation', 'gelu'),
                          layer_norm_eps=predefined_args.get(
                              'layer_norm_eps', None))

    from gluonnlp.vocab import Vocab
    bort_vocab = _load_vocab(dataset_name, vocab, root, cls=Vocab)

    net = BortModel(encoder,
                    len(bort_vocab),
                    units=predefined_args['units'],
                    embed_size=predefined_args['embed_size'],
                    embed_dropout=predefined_args['embed_dropout'],
                    word_embed=predefined_args['word_embed'],
                    use_decoder=use_decoder)
    if pretrained:
        ignore_extra = not use_decoder
        _load_pretrained_params(net,
                                model_name,
                                dataset_name,
                                root,
                                ctx,
                                ignore_extra=ignore_extra,
                                allow_missing=False)
    return net, bort_vocab
Example #20
    def get_dataset(self, segment='train',
                    root=os.path.join(get_home_dir(), 'datasets', 'baidu_ernie_data')):
        """Get the corresponding dataset for ChnSentiCorp.

        Parameters
        ----------
        segment : str, default 'train'
            Dataset segments. Options are 'dev', 'test', 'train'
        root : str, default $BAIDU_ERNIE_DATA_DIR/
            Path to the folder which stores the dataset.
        """
        return BaiduErnieChnSentiCorp(segment, root=root)
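A hedged sketch for the helper above; 'task' is a placeholder for whatever object defines get_dataset, and the ChnSentiCorp files are assumed to download into the default baidu_ernie_data root:

# Hypothetical: 'task' is an instance of the class that defines get_dataset above.
dev = task.get_dataset(segment='dev')
print(len(dev))
print(dev[0])   # expected to be a [text, label] record from ChnSentiCorp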
Example #21
def _get_gpt2_model(model_name=None,
                    dataset_name=None,
                    vocab=None,
                    pretrained=True,
                    ctx=mx.cpu(),
                    root=os.path.join(get_home_dir(), 'models'),
                    hparam_allow_override=False,
                    **kwargs):
    """Any predefined GPT-2 model.

    Parameters
    ----------
    model_name : str or None, default None
        Options include 'gpt2_117m' and 'gpt2_345m'.
    dataset_name : str or None, default None
        If not None, the dataset name is used to load a vocabulary for the
        dataset. If the `pretrained` argument is set to True, the dataset name
        is further used to select the pretrained parameters to load.
        The supported dataset for model_name of either gpt2_117m or
        gpt2_345m is 'openai_webtext'.
    vocab : gluonnlp.vocab.Vocab or None, default None
        Vocabulary for the dataset. Must be provided if dataset_name is not
        specified. Ignored if dataset_name is specified.
    pretrained : bool, default True
        Whether to load the pretrained weights for model.
    ctx : Context, default CPU
        The context in which to load the pretrained weights.
    root : str, default '$MXNET_HOME/models'
        Location for keeping the model parameters.
        MXNET_HOME defaults to '~/.mxnet'.
    hparam_allow_override : bool, default False
        If set to True, pre-defined hyper-parameters of the model
        (e.g. the number of layers, hidden units) can be overridden.

    Returns
    -------
    GPT2Model, gluonnlp.vocab.Vocab
    """
    predefined_args = gpt2_hparams[model_name].copy()
    if not hparam_allow_override:
        mutable_args = ['dropout']
        mutable_args = frozenset(mutable_args)
        assert all((k not in kwargs or k in mutable_args) for k in predefined_args), \
            'Cannot override predefined model settings.'
    predefined_args.update(kwargs)
    vocab = _load_vocab(dataset_name, vocab, root)
    # GPT2
    net = GPT2Model(vocab_size=len(vocab), **predefined_args)
    if pretrained:
        _load_pretrained_params(net, model_name, dataset_name, root, ctx)
    return net, vocab
Example #22
def RACEHash(dataset_location, task_name, segment='test'):
    """ Because of the way we've setup RACE-H/RACE-M, we need to figure out a way to
        translate it back into a viable answer. 
    """
    if dataset_location is None:
        dataset_location = os.path.join(get_home_dir(), 'datasets', 'race')

    task = "high" if task_name[-1] == "H" else "middle"
    test_dataset_location = os.path.join(dataset_location, segment, task)
    filenames = [os.path.expanduser(f) for f in os.listdir(
        test_dataset_location) if fnmatch.fnmatch(f, '*.txt')]
    filenames.sort()
    dataset = []
    for f in filenames:
        dataset += [json.loads(l, object_pairs_hook=OrderedDict)
                    for l in open(os.path.join(test_dataset_location, f), 'r').readlines()]
    return dataset
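A short usage sketch, assuming the RACE data has already been extracted under the default ~/.mxnet/datasets/race directory with the segment/task sub-folders the function expects:

# Passing None for dataset_location falls back to the default under get_home_dir().
test_examples = RACEHash(None, 'RACE-H', segment='test')
print(len(test_examples))        # OrderedDicts parsed from the test/high *.txt files
print(test_examples[0].keys())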
Example #23
    def __init__(self,
                 segment='test',
                 root=os.path.join(get_home_dir(), 'datasets',
                                   'superglue_ax_b')):

        if segment in ['train', 'dev']:
            raise ValueError("Only \"test\" is supported for AX-b")
        elif segment == 'test':
            field_keys = ["sentence1", "sentence2"]

        self._segment = segment
        self._data_file = {
            'test': ('AX-b', '398c5a376eb436f790723cd217ac040334140000',
                     '50fd8ac409897b652daa4b246917097c3c394bc8')
        }
        data_file = self._data_file[segment]

        super(SuperGlueAXb, self).__init__(root, data_file, field_keys)
Example #24
    def __init__(self,
                 segment='test',
                 root=os.path.join(get_home_dir(), 'datasets',
                                   'superglue_ax_g')):

        if segment in ['train', 'dev']:
            raise ValueError("Only \"test\" is supported for AX-g")
        elif segment == 'test':
            field_keys = ["premise", "hypothesis"]

        self._segment = segment
        self._data_file = {
            "test": ('AX-g', 'd8c92498496854807dfeacd344eddf466d7f468a',
                     '8a8cbfe00fd88776a2a2f20b477e5b0c6cc8ebae')
        }
        data_file = self._data_file[segment]

        super(SuperGlueAXg, self).__init__(root, data_file, field_keys)
Example #25
 def __init__(self,
              segment='train',
              root=os.path.join(get_home_dir(), 'datasets',
                                'superglue_record')):
     self._segment = segment
     self._data_file = {
         'train': ('train', '047282c912535c9a3bcea519935fde882feb619d',
                   '65592074cefde2ecd1b27ce7b35eb8beb86c691a'),
         'dev': ('dev', '442d8470bff2c9295231cd10262a7abf401edc64',
                 '9d1850e4dfe2eca3b71bfea191d5f4b412c65309'),
         'test': ('test', 'fc639a18fa87befdc52f14c1092fb40475bf50d0',
                  'b79b22f54b5a49f98fecd05751b122ccc6947c81')
     }
     data_file = self._data_file[segment]
     field_keys = []
     super(SuperGlueReCoRD, self).__init__(root,
                                           data_file,
                                           field_keys,
                                           task="ReCoRD")
Example #26
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets',
                                   'superglue_multirc')):
        self._segment = segment

        # This implementation needs the actual SuperGLUE
        # data, available at:
        # https://github.com/nyu-mll/jiant/blob/master/scripts/download_superglue_data.py
        self._data_file = {
            'train': ('train', '', ''),
            'dev': ('dev', '', ''),
            'test': ('test', '', '')
        }
        data_file = self._data_file[segment]
        field_keys = []
        super(SuperGlueMultiRC, self).__init__(root,
                                               data_file,
                                               field_keys,
                                               task="MultiRC")
Example #27
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets',
                                   'baidu_ernie_data'),
                 return_all_fields=False):
        A_IDX, B_IDX, LABEL_IDX = 0, 1, 2
        if segment in ['train', 'dev']:
            field_indices = [A_IDX, B_IDX, LABEL_IDX
                             ] if not return_all_fields else None
            num_discard_samples = 1
        elif segment == 'test':
            field_indices = [A_IDX, B_IDX] if not return_all_fields else None
            num_discard_samples = 1

        super(BaiduErnieXNLI,
              self).__init__(root,
                             'xnli',
                             segment,
                             num_discard_samples=num_discard_samples,
                             field_indices=field_indices)
Example #28
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets',
                                   'superglue_wic')):
        self._segment = segment
        self._data_file = {
            'train': ('train', 'ec1e265bbdcde1d8da0b56948ed30d86874b1f12',
                      '831a58c553def448e1b1d0a8a36e2b987c81bc9c'),
            'dev': ('dev', '2046c43e614d98d538a03924335daae7881f77cf',
                    '73b71136a2dc2eeb3be7ab455a08f20b8dbe7526'),
            'test': ('test', '77af78a49aac602b7bbf080a03b644167b781ba9',
                     '1be93932d46c8f8dc665eb7af6703c56ca1b1e08')
        }
        data_file = self._data_file[segment]
        # We'll hope the hypernymy is clear from the sentence
        if segment in ['train', 'dev']:
            field_keys = ["sentence1", "sentence2", "label"]
        elif segment == 'test':
            field_keys = ["sentence1", "sentence2"]

        super(SuperGlueWiC, self).__init__(root, data_file, field_keys)
Example #29
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets',
                                   'superglue_boolq')):
        self._segment = segment
        self._data_file = {
            'train': ('train', '89507ff3015c3212b72318fb932cfb6d4e8417ef',
                      'd5be523290f49fc0f21f4375900451fb803817c0'),
            'dev': ('dev', 'fd39562fc2c9d0b2b8289d02a8cf82aa151d0ad4',
                    '9b09ece2b1974e4da20f0173454ba82ff8ee1710'),
            'test': ('test', 'a805d4bd03112366d548473a6848601c042667d3',
                     '98c308620c6d6c0768ba093858c92e5a5550ce9b')
        }
        data_file = self._data_file[segment]

        if segment in ['train', 'dev']:
            field_keys = ["passage", "question", "label"]
        elif segment == 'test':
            field_keys = ["passage", "question"]

        super(SuperGlueBoolQ, self).__init__(root, data_file, field_keys)
Example #30
    def __init__(self,
                 segment='train',
                 root=os.path.join(get_home_dir(), 'datasets',
                                   'superglue_rte')):
        self._segment = segment
        self._data_file = {
            'train': ('train', 'a4471b47b23f6d8bc2e89b2ccdcf9a3a987c69a1',
                      '01ebec38ff3d2fdd849d3b33c2a83154d1476690'),
            'dev': ('dev', '17f23360f77f04d03aee6c42a27a61a6378f1fd9',
                    '410f8607d9fc46572c03f5488387327b33589069'),
            'test': ('test', 'ef2de5f8351ef80036c4aeff9f3b46106b4f2835',
                     '69f9d9b4089d0db5f0605eeaebc1c7abc044336b')
        }
        data_file = self._data_file[segment]

        if segment in ['train', 'dev']:
            field_keys = ["premise", "hypothesis", "label"]
        elif segment == 'test':
            field_keys = ["premise", "hypothesis"]

        super(SuperGlueRTE, self).__init__(root, data_file, field_keys)
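Finally, a hedged sketch for the SuperGLUE readers above; these classes are not part of the core gluonnlp.data package, so SuperGlueRTE is assumed to be importable from the module that defines them:

# Hypothetical import location for the SuperGLUE wrappers shown above.
dev = SuperGlueRTE(segment='dev')
print(len(dev))
print(dev[0])   # expected to expose the premise/hypothesis/label fields selected above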