def __init__(self,
              tokenizer,
              max_seq_length,
              class_labels=None,
              label_alias=None,
              pad=True,
              pair=True,
              has_label=True):
     self.tokenizer = tokenizer
     self.max_seq_length = max_seq_length
     self.class_labels = class_labels
     self.label_alias = label_alias
     self.has_label = has_label
     self.pair = pair
     self.pad = pad
     self._label_dtype = 'int32' if class_labels else 'float32'
     if has_label and class_labels:
         self._label_map = {}
         for (i, label) in enumerate(class_labels):
             self._label_map[label] = i
         if label_alias:
             for key in label_alias:
                 self._label_map[key] = self._label_map[label_alias[key]]
     self._bert_xform = BERTSentenceTransform(
         tokenizer, max_seq_length, pad=pad, pair=pair)
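For context, an __init__ like this is typically paired with a __call__ that runs the text fields through the wrapped BERTSentenceTransform and maps the label through _label_map. A minimal sketch of such a method follows (assuming numpy is imported as np; the real implementation may differ):

import numpy as np  # needed for the label array below

def __call__(self, line):
    if self.has_label:
        input_ids, valid_length, segment_ids = self._bert_xform(line[:-1])
        label = line[-1]
        # map string labels to integer ids when class_labels were given
        if self.class_labels:
            label = self._label_map[label]
        label = np.array([label], dtype=self._label_dtype)
        return input_ids, valid_length, segment_ids, label
    return self._bert_xform(line)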
Example #2
    def data_loader(self, sentences, shuffle=False):
        # tokenizer = BERTTokenizer(self.vocab)
        # transform = BERTSentenceTransform(tokenizer=tokenizer,
        #                                   max_seq_length=self.max_seq_length,
        #                                               pair=False)

        class Listtolist(object):
            """Pass-through tokenizer: the input sentences are assumed to be
            pre-tokenized lists, so __call__ returns each sample unchanged and
            only the id conversion goes through the vocab."""
            def __init__(self,
                         vocab,
                         lower=False,
                         max_input_chars_per_word=200):
                self.vocab = vocab
                self.max_input_chars_per_word = max_input_chars_per_word

            def __call__(self, sample):
                return sample

            def convert_tokens_to_ids(self, tokens):
                """Converts a sequence of tokens into ids using the vocab."""
                return self.vocab.to_indices(tokens)

        tokenizer = Listtolist(self.vocab)
        transform = BERTSentenceTransform(tokenizer=tokenizer,
                                          max_seq_length=self.max_seq_length,
                                          pair=False)

        dataset = BertEmbeddingDataset(sentences, transform)
        # for line in dataset: print(line)
        # print(dataset)
        return DataLoader(dataset=dataset,
                          batch_size=self.batch_size,
                          shuffle=shuffle)
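Not shown in the snippet: how the returned loader is consumed. A rough usage sketch, assuming embedder is an instance of the surrounding class and the input sentences are pre-tokenized lists (which is what the pass-through Listtolist tokenizer expects):

loader = embedder.data_loader([['this', 'sentence', 'is', 'already', 'tokenized']])
for token_ids, valid_length, segment_ids in loader:
    # each batch holds the BERTSentenceTransform outputs, ready for the BERT encoder
    print(token_ids.shape, valid_length, segment_ids.shape)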
Example #3
 def __init__(self,
              tokenizer,
              max_seq_length,
              class_labels=None,
              label_alias=None,
              pad=True,
              pair=True,
              has_label=True,
              vectorizer=None,
              bert_vocab_size=0):
     self.class_labels = class_labels
     self.has_label = has_label
     self.use_bert_bow = bert_vocab_size > 0
     self.bert_vocab_size = bert_vocab_size
     self._label_dtype = 'int32' if class_labels else 'float32'
     if has_label and class_labels:
         self._label_map = {}
         for (i, label) in enumerate(class_labels):
             self._label_map[label] = i
         if label_alias:
             for key in label_alias:
                 self._label_map[key] = self._label_map[label_alias[key]]
     self._bert_xform = BERTSentenceTransform(
         tokenizer, max_seq_length, pad=pad, pair=pair)
     self.vectorizer = vectorizer
Example #4
    def __init__(self):
        self.token_max_len = 100

        self.kobert_model, vocab = get_pytorch_kobert_model()
        tok_path = get_tokenizer()
        tokenizer = BERTSPTokenizer(tok_path, vocab)
        self.transformer = BERTSentenceTransform(tokenizer, self.token_max_len)
Example #5
    def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased', params_path=None,
                 max_seq_length=25, batch_size=256,
                 root=os.path.join(get_home_dir(), 'models')):
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name

        # Don't download the pretrained models if we have a parameter path
        self.bert, self.vocab = gluonnlp.model.get_model(model,
                                                         dataset_name=self.dataset_name,
                                                         pretrained=params_path is None,
                                                         ctx=self.ctx,
                                                         use_pooler=False,
                                                         use_decoder=False,
                                                         use_classifier=False,
                                                         root=root)
        self.bert.cast(self.dtype)

        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)

        lower = 'uncased' in self.dataset_name
        self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(tokenizer=self.tokenizer,
                                               max_seq_length=self.max_seq_length,
                                               pair=False)
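A rough sketch (not part of the example) of how an instance built this way encodes a single sentence; the attribute names follow the __init__ above, the rest is an assumption:

ids, valid_length, segment_ids = self.transform(('an example sentence',))
ids = mx.nd.array([ids], ctx=self.ctx)
segment_ids = mx.nd.array([segment_ids], ctx=self.ctx)
valid_length = mx.nd.array([valid_length], ctx=self.ctx).astype(self.dtype)
# with use_pooler=False the model returns only the per-token encodings
sequence_output = self.bert(ids, segment_ids, valid_length)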
Example #6
 def data_loader(self, sentences, shuffle=False):
     tokenizer = BERTTokenizer(self.vocab)
     transform = BERTSentenceTransform(tokenizer=tokenizer,
                                       max_seq_length=self.max_seq_length,
                                       pair=False)
     dataset = BertEmbeddingDataset(sentences, transform)
     return DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=shuffle)
Example #7
    def __init__(self,
                 ctx=mx.cpu(),
                 dtype='float32',
                 model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased',
                 params_path=None,
                 max_seq_length=25,
                 batch_size=256):
        """
        Encoding from BERT model.

        Parameters
        ----------
        ctx : Context
            the device (CPU or GPU) on which BertEmbedding runs.
        dtype : str
            data type to use for the model.
        model : str, default bert_12_768_12.
            pre-trained BERT model
        dataset_name : str, default book_corpus_wiki_en_uncased.
            pre-trained model dataset
        params_path: str, default None
            path to a parameters file to load instead of the pretrained model.
        max_seq_length : int, default 25
            max length of each sequence
        batch_size : int, default 256
            batch size
        """
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name
        if params_path is not None:
            # Don't download the pretrained models if we have a parameter path
            pretrained = False
        else:
            pretrained = True
        self.bert, self.vocab = gluonnlp.model.get_model(
            model,
            dataset_name=self.dataset_name,
            pretrained=pretrained,
            ctx=self.ctx,
            use_pooler=False,
            use_decoder=False,
            use_classifier=False)
        self.bert.cast(self.dtype)

        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)

        lower = 'uncased' in self.dataset_name

        self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            pair=False)
Example #8
 def __init__(self, tokenizer, labels, max_seq_length, pad=True):
     self._label_map = {}
     for (i, label) in enumerate(labels):
         self._label_map[label] = i
     self._bert_xform = BERTSentenceTransform(tokenizer,
                                              max_seq_length,
                                              pad=pad,
                                              pair=False)
Example #9
    def __init__(self,
                 ctx=mx.cpu(),
                 dtype='float32',
                 model='bert_12_768_12',
                 dataset_name='book_corpus_wiki_en_uncased',
                 params_path=None,
                 max_seq_length=25,
                 batch_size=256,
                 sentencepiece=None,
                 root=os.path.join(get_home_dir(), 'models')):
        self.ctx = ctx
        self.dtype = dtype
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.dataset_name = dataset_name

        # use sentencepiece vocab and a checkpoint
        # we need to set dataset_name to None, otherwise it uses the downloaded vocab
        if params_path and sentencepiece:
            dataset_name = None
        else:
            dataset_name = self.dataset_name
        if sentencepiece:
            vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sentencepiece)
        else:
            vocab = None

        self.bert, self.vocab = gluonnlp.model.get_model(
            model,
            dataset_name=dataset_name,
            pretrained=params_path is None,
            ctx=self.ctx,
            use_pooler=False,
            use_decoder=False,
            use_classifier=False,
            root=root,
            vocab=vocab)

        self.bert.cast(self.dtype)
        if params_path:
            logger.info('Loading params from %s', params_path)
            self.bert.load_parameters(params_path,
                                      ctx=ctx,
                                      ignore_extra=True,
                                      cast_dtype=True)

        lower = 'uncased' in self.dataset_name
        if sentencepiece:
            self.tokenizer = BERTSPTokenizer(sentencepiece,
                                             self.vocab,
                                             lower=lower)
        else:
            self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
        self.transform = BERTSentenceTransform(
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            pair=False)
Example #10
 def re_init(self):
     self._label_dtype = 'int32' if self.class_labels else 'float32'
     if self.has_label and self.class_labels:
         self._label_map = {}
         for (i, label) in enumerate(self.class_labels):
             self._label_map[label] = i
         if self.label_alias:
             for key in self.label_alias:
                 self._label_map[key] = self._label_map[self.label_alias[key]]
     self._bert_xform = BERTSentenceTransform(
         self.tokenizer, self.max_seq_length, pad=self.pad, pair=self.pair)
Example #11
def load_dataset_bert(json_file,
                      voc_size,
                      json_text_key="text",
                      json_sp_key="sp_vec",
                      max_len=64,
                      ctx=mx.cpu()):
    indices = []
    values = []
    indptrs = [0]
    cumulative = 0
    total_num_words = 0
    ndocs = 0
    bert_model = 'bert_12_768_12'
    dname = 'book_corpus_wiki_en_uncased'
    bert_base, vocab = nlp.model.get_model(bert_model,
                                           dataset_name=dname,
                                           pretrained=True,
                                           ctx=ctx,
                                           use_pooler=True,
                                           use_decoder=False,
                                           use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer, max_len, pair=False)
    x_ids = []
    x_val_lens = []
    x_segs = []
    with io.open(json_file, 'r', encoding='utf-8') as fp:
        for line in fp:
            if json_text_key:
                js = json.loads(line)
                line = js[json_text_key]
            if len(line.split(' ')) > 4:
                ids, lens, segs = transform(
                    (line, ))  # create BERT-ready inputs
                x_ids.append(ids)
                x_val_lens.append(lens)
                x_segs.append(segs)
            ## Now, get the sparse vector
            ndocs += 1
            sp_vec_els = js[json_sp_key]
            n_pairs, inds, vs = get_single_vec(sp_vec_els)
            cumulative += n_pairs
            total_num_words += sum(vs)
            indptrs.append(cumulative)
            values.extend(vs)
            indices.extend(inds)
    csr_mat = mx.nd.sparse.csr_matrix((values, indices, indptrs),
                                      shape=(ndocs, voc_size))
    data_train = gluon.data.ArrayDataset(
        mx.nd.array(x_ids, dtype='int32'),
        mx.nd.array(x_val_lens, dtype='int32'),
        mx.nd.array(x_segs, dtype='int32'), csr_mat.tostype('default'))
    return data_train, bert_base, vocab, csr_mat
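The ArrayDataset returned above can be batched with a standard Gluon DataLoader; the file name, vocabulary size, and batch size below are placeholders:

data_train, bert_base, vocab, csr_mat = load_dataset_bert('docs.json', voc_size=2000)
train_loader = gluon.data.DataLoader(data_train, batch_size=32, shuffle=True)
for ids, valid_lens, segments, bow_rows in train_loader:
    pass  # ids/segments/valid_lens feed the BERT encoder, bow_rows the bag-of-words side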
Example #12
def _load_dataset_bert(line_gen, voc_size, max_len=64, ctx=mx.cpu()):
    indices = []
    values = []
    indptrs = [0]
    cumulative = 0
    total_num_words = 0
    ndocs = 0
    bert_model = 'bert_12_768_12'
    dname = 'book_corpus_wiki_en_uncased'
    ## This is really only needed here to get the vocab
    ## GluonNLP API doesn't enable that
    bert_base, vocab = nlp.model.get_model(bert_model,
                                           dataset_name=dname,
                                           pretrained=True,
                                           ctx=ctx,
                                           use_pooler=True,
                                           use_decoder=False,
                                           use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer, max_len, pair=False)
    x_ids = []
    x_val_lens = []
    x_segs = []
    for t in line_gen:
        if isinstance(t, tuple):
            line = t[0]
            sp_vec_els = t[1]
        else:
            line = t
            sp_vec_els = None
        ids, lens, segs = transform((line, ))  # create BERT-ready inputs
        x_ids.append(ids)
        x_val_lens.append(lens)
        x_segs.append(segs)
        ## Now, get the sparse vector
        ndocs += 1
        if sp_vec_els:
            pairs, inds, vs = get_single_vec(sp_vec_els)
            cumulative += len(pairs)
            total_num_words += sum(vs)
            indptrs.append(cumulative)
            values.extend(vs)
            indices.extend(inds)
    if len(indices) > 0:
        csr_mat = mx.nd.sparse.csr_matrix(
            (values, indices, indptrs),
            shape=(ndocs, voc_size)).tostype('default')
    else:
        csr_mat = None
    return x_ids, x_val_lens, x_segs, bert_base, vocab, csr_mat
Example #13
 def __init__(self,
              model,
              bert_vocab,
              max_length,
              bow_vocab=None,
              ctx=mx.cpu()):
     super().__init__(ctx)
     self.model = model
     self.bert_base = model.bert
     self.tokenizer = BERTTokenizer(bert_vocab)
     self.transform = BERTSentenceTransform(self.tokenizer,
                                            max_length,
                                            pair=False)
     self.bow_vocab = bow_vocab
Example #14
 def __init__(self,
              tokenizer,
              max_seq_length,
              labels=None,
              pad=True,
              pair=True,
              label_dtype='float32'):
     self.label_dtype = label_dtype
     self.labels = labels
     if self.labels:
         self._label_map = {}
         for (i, label) in enumerate(labels):
             self._label_map[label] = i
     self._bert_xform = BERTSentenceTransform(
         tokenizer, max_seq_length, pad=pad, pair=pair)
Example #15
 def __init__(self,
              tokenizer,
              max_seq_length,
              class_labels=None,
              pad=True,
              pair=True,
              has_label=True):
     self.class_labels = class_labels
     self.has_label = has_label
     self._label_dtype = 'int32' if class_labels else 'float32'
     if has_label and class_labels:
         self._label_map = {}
         for (i, label) in enumerate(class_labels):
             self._label_map[label] = i
     self._bert_xform = BERTSentenceTransform(
         tokenizer, max_seq_length, pad=pad, pair=pair)
Example #16
 def __init__(self,
              model,
              bert_vocab,
              max_length,
              bow_vocab=None,
              pre_vectorizer=None,
              ctx=mx.cpu()):
     super().__init__(ctx)
     self.model = model
     self.bert_base = model.bert
     self.tokenizer = BERTTokenizer(bert_vocab)
     self.transform = BERTSentenceTransform(self.tokenizer,
                                            max_length,
                                            pair=False)
     self.bow_vocab = bow_vocab
     self.vectorizer = pre_vectorizer or TMNTVectorizer(
         initial_vocabulary=bow_vocab)
Example #17
def word_piece_tokenizer(sentences):
    ctx = ghp.ctx
    model = 'bert_12_768_12'
    dataset_name = 'book_corpus_wiki_en_uncased'
    max_seq_length = ghp.max_seq_len
    batch_size = 256
    _, vocab = gluonnlp.model.get_model(model,
                                        dataset_name=dataset_name,
                                        pretrained=True,
                                        ctx=ctx,
                                        use_pooler=False,
                                        use_decoder=False,
                                        use_classifier=False)
    tokenizer = BERTTokenizer(vocab)

    transform = BERTSentenceTransform(tokenizer=tokenizer,
                                      max_seq_length=max_seq_length,
                                      pair=False)
    dataset = BertEmbeddingDataset(sentences, transform)
    data_loader = DataLoader(dataset=dataset,
                             batch_size=batch_size,
                             shuffle=False)
    batches = []
    for token_ids, _, _ in data_loader:
        token_ids = token_ids.as_in_context(ctx)

        for token_id in token_ids.asnumpy():
            batches.append(token_id)

    cut_results = []
    for token_ids in batches:
        tokens = []
        for token_id in token_ids:
            if token_id == 1:  # padding id: the rest of the row is padding
                break
            if token_id in (2, 3):  # skip the special [CLS]/[SEP] ids
                continue
            token = vocab.idx_to_token[token_id]
            if token.startswith('##'):
                # word-piece continuation: merge it back onto the previous token
                tokens[-1] += token[2:]
            else:
                tokens.append(token)
        cut_results.append(tokens)
    return cut_results
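A hypothetical call, just to show the shape of the result; ghp.ctx and ghp.max_seq_len must already be defined for word_piece_tokenizer to run:

pieces = word_piece_tokenizer(['gluonnlp splits rare words into word pieces'])
print(pieces)  # e.g. [['gluonnlp', 'splits', 'rare', 'words', 'into', 'word', 'pieces']]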
Example #18
 def __init__(self,
              param_file=None,
              config_file=None,
              vocab_file=None,
              model_dir=None,
              ctx=mx.cpu()):
     super().__init__(ctx)
     if model_dir is not None:
         param_file = os.path.join(model_dir, 'model.params')
         vocab_file = os.path.join(model_dir, 'vocab.json')
         config_file = os.path.join(model_dir, 'model.config')
     with open(config_file) as f:
         config = json.loads(f.read())
     with open(vocab_file) as f:
         voc_js = f.read()
     self.bow_vocab = nlp.Vocab.from_json(voc_js)
     self.ctx = ctx
     self.bert_base, self.vocab = nlp.model.get_model(
         'bert_12_768_12',
         dataset_name='book_corpus_wiki_en_uncased',
         pretrained=True,
         ctx=ctx,
         use_pooler=True,
         use_decoder=False,
         use_classifier=False)  #, output_attention=True)
     self.latent_dist = config['latent_distribution']['dist_type']
     self.n_latent = config['n_latent']
     self.kappa = config['latent_distribution']['kappa']
     self.pad_id = self.vocab[self.vocab.padding_token]
     self.max_sent_len = config['sent_size']
     self.model = BertBowVED(self.bert_base,
                             self.bow_vocab,
                             latent_distrib=self.latent_dist,
                             n_latent=self.n_latent,
                             kappa=self.kappa,
                             batch_size=1)
     self.tokenizer = BERTTokenizer(self.vocab)
     self.transform = BERTSentenceTransform(self.tokenizer,
                                            self.max_sent_len,
                                            pair=False)
     self.model.load_parameters(str(param_file),
                                allow_missing=False,
                                ignore_extra=True)
Example #19
from gluonnlp.data import SentencepieceTokenizer, BERTSPTokenizer, BERTSentenceTransform
from kobert.utils import get_tokenizer

from kobert.pytorch_kobert import get_pytorch_kobert_model

bertmodel, vocab = get_pytorch_kobert_model()
tokenizer = get_tokenizer()
# sample sentences for the KoBERT tokenizer (the second assignment overrides the first)
sampleText = "[순천시청] 코로나19 감염이 인근(목포, 광주)에서 지속 발생하고 있습니다. 개개인이 방역주체가 되어 마스크 착용 등 방역수칙을 반드시 준수 바랍니다. "
sampleText = "마치 미국애니에서 튀어나온듯한 창의력없는 로봇디자인부터가,고개를 젖게한다"
print(vocab)

tok = BERTSPTokenizer(tokenizer, vocab, lower=False)
print(tok)
print(tok(sampleText))
transform = BERTSentenceTransform(tok, max_seq_length=32, pad=True, pair=False)
print(transform((sampleText,)))  # BERTSentenceTransform expects a tuple of sentences

sp = SentencepieceTokenizer(tokenizer)
print(sp)
print(sp(sampleText))

#
# transform = BERTSentenceTransform(tok, max_seq_length=32, pad=True, pair=False)
# transform2 = BERTSentenceTransform(sp, max_seq_length=32, vocab=None, pad=True, pair=False)
# print(transform("한국어 모델을 공유합니다."))
# print(transform2("한국어 모델을 공유합니다."))