Example #1
0
    def __init__(self, root=os.path.join(get_home_dir(), 'models')):
        """Initialize the tokenizer, fetching the BPE rank file if needed.

        Parameters
        ----------
        root : str, default ``os.path.join(get_home_dir(), 'models')``
            Directory where the BPE rank file is cached.

        Raises
        ------
        ImportError
            If the optional ``regex`` dependency is not installed.
        ValueError
            If the downloaded rank file does not match its expected hash.
        """
        try:
            import regex  # pylint: disable=import-outside-toplevel
            self._regex = regex
        except ImportError:
            raise ImportError('GPT2BPETokenizer requires regex. '
                              'To install regex, use pip install -U regex')
        super(GPT2BPETokenizer, self).__init__()
        root = os.path.expanduser(root)
        file_name, sha1_hash = self.bpe_ranks_file_hash
        file_path = os.path.join(root, file_name)
        # (Re-)download when the cached file is absent or its content is
        # corrupted (hash mismatch).
        if not os.path.exists(file_path) or not check_sha1(
                file_path, sha1_hash):
            if os.path.exists(file_path):
                print(
                    'Detected mismatch in the content of BPE rank file. Downloading again.'
                )
            else:
                print('BPE rank file is not found. Downloading.')
            # exist_ok=True avoids the racy exists()/makedirs() pattern when
            # several processes initialize the tokenizer concurrently.
            os.makedirs(root, exist_ok=True)

            # Unique archive name so concurrent downloads do not clobber
            # each other.
            prefix = str(time.time())
            zip_file_path = os.path.join(root, prefix + file_name)
            repo_url = _get_repo_url()
            if not repo_url.endswith('/'):
                repo_url = repo_url + '/'
            archive_name, archive_hash = self.bpe_ranks_archive_hash
            _url_format = '{repo_url}gluon/dataset/vocab/{file_name}'
            download(_url_format.format(repo_url=repo_url,
                                        file_name=archive_name),
                     path=zip_file_path,
                     sha1_hash=archive_hash,
                     overwrite=True)
            with zipfile.ZipFile(zip_file_path) as zf:
                # Another process may have extracted the file meanwhile.
                if not os.path.exists(file_path):
                    zf.extractall(root)
            try:
                os.remove(zip_file_path)
            except FileNotFoundError:
                # Archive already removed (e.g. by a concurrent process).
                pass

            if not check_sha1(file_path, sha1_hash):
                raise ValueError(
                    'Downloaded file has different hash. Please try again.')
        self._read_bpe_ranks(file_path)
        self._cache = {}
        # Byte-level BPE pre-tokenization pattern (contractions, letter runs,
        # digit runs, punctuation runs, whitespace).
        self._token_pattern = self._regex.compile(
            r'\'s|\'t|\'re|\'ve|\'m|\'ll|\'d| ?\p{L}+'
            r'| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+')
Example #2
0
def _load_pretrained_vocab(name, root, cls=None):
    """Load the accompanying vocabulary object for pre-trained model.

    Parameters
    ----------
    name : str
        Name of the vocabulary, usually the name of the dataset.
    root : str
        Location for keeping the model vocabulary.
    cls : nlp.Vocab or nlp.vocab.BERTVocab, default nlp.Vocab

    Returns
    -------
    Vocab or nlp.vocab.BERTVocab
        Loaded vocabulary object for the pre-trained model.

    Raises
    ------
    ValueError
        If the downloaded vocabulary file does not match the expected hash.
    """
    file_name = '{name}-{short_hash}'.format(name=name,
                                             short_hash=short_hash(name))
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name + '.vocab')
    sha1_hash = _vocab_sha1[name]
    # Reuse a cached copy when its content matches the expected hash.
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            return _load_vocab_file(file_path, cls)
        print(
            'Detected mismatch in the content of model vocab file. Downloading again.'
        )
    else:
        print('Vocab file is not found. Downloading.')

    # exist_ok=True avoids the racy exists()/makedirs()/EEXIST pattern when
    # several processes download the vocabulary concurrently.
    os.makedirs(root, exist_ok=True)

    zip_file_path = os.path.join(root, file_name + '.zip')
    repo_url = _get_repo_url()
    if not repo_url.endswith('/'):
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=zip_file_path,
             overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(root)
    os.remove(zip_file_path)

    if check_sha1(file_path, sha1_hash):
        return _load_vocab_file(file_path, cls)
    raise ValueError('Downloaded file has different hash. Please try again.')
Example #3
0
def _load_pretrained_vocab(name, root, cls=None):
    """Load the accompanying vocabulary object for pre-trained model.

    Parameters
    ----------
    name : str
        Name of the vocabulary, usually the name of the dataset.
    root : str
        Location for keeping the model vocabulary.
    cls : nlp.Vocab or nlp.vocab.BERTVocab, default nlp.Vocab

    Returns
    -------
    Vocab or nlp.vocab.BERTVocab
        Loaded vocabulary object for the pre-trained model.

    Raises
    ------
    ValueError
        If the downloaded vocabulary file does not match the expected hash.
    """
    file_name = '{name}-{short_hash}'.format(name=name,
                                             short_hash=short_hash(name))
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name + '.vocab')
    sha1_hash = _vocab_sha1[name]

    # Per-call unique temp locations so concurrent downloads never collide.
    temp_num = str(random.Random().randint(1, sys.maxsize))
    temp_root = os.path.join(root, temp_num)
    temp_file_path = os.path.join(temp_root, file_name + '.vocab')
    temp_zip_file_path = os.path.join(root, temp_num + file_name + '.zip')
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            return _load_vocab_file(file_path, cls)
        print('Detected mismatch in the content of model vocab file. Downloading again.')
    else:
        print('Vocab file is not found. Downloading.')

    utils.mkdir(root)

    repo_url = _get_repo_url()
    if repo_url[-1] != '/':
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=temp_zip_file_path, overwrite=True)
    with zipfile.ZipFile(temp_zip_file_path) as zf:
        # Extract when the target is missing OR corrupted. Guarding on
        # existence alone would skip re-extraction after a hash mismatch
        # (the stale file still exists) and make the final check below
        # spuriously raise even though a fresh archive was just fetched.
        if not os.path.exists(file_path) or not check_sha1(file_path,
                                                           sha1_hash):
            utils.mkdir(temp_root)
            zf.extractall(temp_root)
            # os.replace is atomic, so readers never observe a partial file.
            os.replace(temp_file_path, file_path)
            shutil.rmtree(temp_root)
    # Remove the downloaded archive; the original version leaked it.
    try:
        os.remove(temp_zip_file_path)
    except FileNotFoundError:
        pass

    if check_sha1(file_path, sha1_hash):
        return _load_vocab_file(file_path, cls)
    raise ValueError('Downloaded file has different hash. Please try again.')
Example #4
0
def _load_pretrained_vocab(name, root=os.path.join('~', '.mxnet', 'models')):
    """Load the accompanying vocabulary object for pretrained model.

    Parameters
    ----------
    name : str
        Name of the vocabulary, usually the name of the dataset.
    root : str, default '~/.mxnet/models'
        Location for keeping the model parameters.

    Returns
    -------
    file_path
        Path to the requested vocabulary object file.

    Raises
    ------
    ValueError
        If the downloaded vocabulary file does not match the expected hash.
    """
    file_name = '{name}-{short_hash}'.format(name=name,
                                             short_hash=short_hash(name))
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name + '.vocab')
    sha1_hash = _vocab_sha1[name]
    # Reuse a cached copy when its content matches the expected hash.
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            return _load_vocab_file(file_path)
        print(
            'Detected mismatch in the content of model vocab file. Downloading again.'
        )
    else:
        print('Vocab file is not found. Downloading.')

    # exist_ok=True: a bare makedirs() races when several processes reach
    # this point at the same time (one of them would crash with EEXIST).
    os.makedirs(root, exist_ok=True)

    zip_file_path = os.path.join(root, file_name + '.zip')
    repo_url = _get_repo_url()
    if not repo_url.endswith('/'):
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=zip_file_path,
             overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(root)
    os.remove(zip_file_path)

    if check_sha1(file_path, sha1_hash):
        return _load_vocab_file(file_path)
    raise ValueError('Downloaded file has different hash. Please try again.')
Example #5
0
def _get_xlnet_tokenizer(dataset_name, root):
    """Return an ``XLNetTokenizer`` for the given dataset, downloading the
    sentencepiece model file into ``root`` if it is missing or corrupted.

    Parameters
    ----------
    dataset_name : str
        Must be '126gb' (case-insensitive); the only supported dataset.
    root : str
        Directory where the tokenizer model file is cached.

    Returns
    -------
    XLNetTokenizer

    Raises
    ------
    ValueError
        If the downloaded tokenizer file does not match the expected hash.
    """
    assert dataset_name.lower() == '126gb'
    root = os.path.expanduser(root)
    file_path = os.path.join(root, 'xlnet_126gb-871f0b3c.spiece')
    sha1_hash = '871f0b3c13b92fc5aea8fba054a214c420e302fd'
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            # Valid cached copy: skip the download entirely. (The original
            # fetched the archive even on a hash match and only skipped the
            # extraction, wasting bandwidth for the same end result.)
            return XLNetTokenizer(file_path)
        print(
            'Detected mismatch in the content of model tokenizer. Downloading again.'
        )
    else:
        print('Tokenizer file is not found. Downloading.')

    # exist_ok=True avoids the racy exists()/makedirs()/EEXIST pattern.
    os.makedirs(root, exist_ok=True)

    repo_url = _get_repo_url()
    # Unique archive name so concurrent downloads do not clobber each other.
    prefix = str(time.time())
    zip_file_path = os.path.join(root, prefix + 'xlnet_126gb-871f0b3c.zip')
    if not repo_url.endswith('/'):
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url,
                                file_name='xlnet_126gb-871f0b3c'),
             path=zip_file_path,
             overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        # Another process may have extracted the file meanwhile.
        if not os.path.exists(file_path):
            zf.extractall(root)
    try:
        os.remove(zip_file_path)
    except FileNotFoundError:
        # Archive already removed (e.g. by a concurrent process).
        pass

    if not check_sha1(file_path, sha1_hash):
        raise ValueError(
            'Downloaded file has different hash. Please try again.')

    return XLNetTokenizer(file_path)
Example #6
0
def _load_pretrained_vocab(name, root=os.path.join('~', '.mxnet', 'models')):
    """Load the accompanying vocabulary object for pre-trained model.

    Parameters
    ----------
    name : str
        Name of the vocabulary, usually the name of the dataset.
    root : str, default '~/.mxnet/models'
        Location for keeping the model parameters.

    Returns
    -------
    Vocab
        Loaded vocabulary object for the pre-trained model.

    Raises
    ------
    ValueError
        If the downloaded vocabulary file does not match the expected hash.
    """
    file_name = '{name}-{short_hash}'.format(name=name,
                                             short_hash=short_hash(name))
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name + '.vocab')
    sha1_hash = _vocab_sha1[name]
    # Reuse a cached copy when its content matches the expected hash.
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            return _load_vocab_file(file_path)
        print('Detected mismatch in the content of model vocab file. Downloading again.')
    else:
        print('Vocab file is not found. Downloading.')

    # exist_ok=True: a bare makedirs() races when several processes reach
    # this point at the same time (one of them would crash with EEXIST).
    os.makedirs(root, exist_ok=True)

    zip_file_path = os.path.join(root, file_name + '.zip')
    repo_url = _get_repo_url()
    if not repo_url.endswith('/'):
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=zip_file_path,
             overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(root)
    os.remove(zip_file_path)

    if check_sha1(file_path, sha1_hash):
        return _load_vocab_file(file_path)
    raise ValueError('Downloaded file has different hash. Please try again.')
Example #7
0
def test_pretrained_gpt2(model_name, tmp_path):
    """Check pretrained GPT-2 logits against downloaded reference values."""
    sentence = ' natural language processing tools such as gluonnlp and torchtext'
    reference_hashes = {
        'gpt2_117m': '29526682508d03a7c54c598e889f77f7b4608df0',
        'gpt2_345m': '6680fd2a3d7b737855536f480bc19d166f15a3ad',
    }
    model, vocab = get_model(model_name, dataset_name='openai_webtext')
    tokenizer = GPT2BPETokenizer()
    # Instantiated only to verify construction succeeds; otherwise unused.
    detokenizer = GPT2BPEDetokenizer()

    # Fetch the ground-truth logits produced by the reference implementation.
    data_hash = reference_hashes[model_name]
    gt_file_name = '{model_name}_gt_logits-{short_hash}.npy'.format(
        model_name=model_name, short_hash=data_hash[:8])
    gt_url = '{repo_url}gluon/dataset/test/{file_name}'.format(
        repo_url=_get_repo_url(), file_name=gt_file_name)
    gt_path = os.path.join(str(tmp_path), gt_file_name)
    download(gt_url, path=gt_path, sha1_hash=data_hash)
    expected_logits = np.load(gt_path)

    model.hybridize()
    token_ids = vocab[tokenizer(sentence)]
    batch = mx.nd.expand_dims(mx.nd.array(token_ids), axis=0)
    logits, _ = model(batch, None)
    npt.assert_allclose(logits.asnumpy(), expected_logits, 1E-5, 1E-5)
Example #8
0
def _download_vocab_tokenizer(root, file_name, file_ext, file_path):
    """Download ``file_name``'s zip archive and install the extracted
    ``file_name + file_ext`` member at ``file_path``.

    Parameters
    ----------
    root : str
        Directory that holds the temporary download artifacts.
    file_name : str
        Base name of the remote archive (without extension).
    file_ext : str
        Extension of the file expected inside the archive.
    file_path : str
        Final destination for the extracted file.
    """
    utils.mkdir(root)

    # Per-call unique names so concurrent downloads never collide.
    temp_num = str(random.Random().randint(1, sys.maxsize))
    temp_root = os.path.join(root, temp_num)
    temp_file_path = os.path.join(temp_root, file_name + file_ext)
    temp_zip_file_path = os.path.join(temp_root,
                                      temp_num + '_' + file_name + '.zip')

    repo_url = _get_repo_url()
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=temp_zip_file_path,
             overwrite=True)
    with zipfile.ZipFile(temp_zip_file_path) as zf:
        assert file_name + file_ext in zf.namelist(
        ), '{} not part of {}. Only have: {}'.format(file_name + file_ext,
                                                     file_name + '.zip',
                                                     zf.namelist())
        utils.mkdir(temp_root)
        zf.extractall(temp_root)
    # Install and clean up only AFTER the archive handle is closed: the zip
    # lives inside temp_root, and the original rmtree'd it while still open,
    # which fails on Windows. os.replace is atomic on the same filesystem.
    os.replace(temp_file_path, file_path)
    shutil.rmtree(temp_root)