def __init__(self, tokenizer, max_seq_length, class_labels=None, label_alias=None,
             pad=True, pair=True, has_label=True):
    self.tokenizer = tokenizer
    self.max_seq_length = max_seq_length
    self.class_labels = class_labels
    self.label_alias = label_alias
    self.has_label = has_label
    self.pair = pair
    self.pad = pad
    self._label_dtype = 'int32' if class_labels else 'float32'
    if has_label and class_labels:
        # Map each class label (and any alias) to an integer id.
        self._label_map = {}
        for (i, label) in enumerate(class_labels):
            self._label_map[label] = i
        if label_alias:
            for key in label_alias:
                self._label_map[key] = self._label_map[label_alias[key]]
    self._bert_xform = BERTSentenceTransform(
        tokenizer, max_seq_length, pad=pad, pair=pair)
def data_loader(self, sentences, shuffle=False):
    class Listtolist(object):
        """Pass-through 'tokenizer' for input that is already a list of tokens."""

        def __init__(self, vocab, lower=False, max_input_chars_per_word=200):
            self.vocab = vocab
            self.max_input_chars_per_word = max_input_chars_per_word

        def __call__(self, sample):
            # The sample is already tokenized; return it unchanged.
            return sample

        def convert_tokens_to_ids(self, tokens):
            """Converts a sequence of tokens into ids using the vocab."""
            return self.vocab.to_indices(tokens)

    tokenizer = Listtolist(self.vocab)
    transform = BERTSentenceTransform(tokenizer=tokenizer,
                                      max_seq_length=self.max_seq_length,
                                      pair=False)
    dataset = BertEmbeddingDataset(sentences, transform)
    return DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=shuffle)
def __init__(self, tokenizer, max_seq_length, class_labels=None, label_alias=None,
             pad=True, pair=True, has_label=True, vectorizer=None, bert_vocab_size=0):
    self.class_labels = class_labels
    self.has_label = has_label
    self.use_bert_bow = bert_vocab_size > 0
    self.bert_vocab_size = bert_vocab_size
    self._label_dtype = 'int32' if class_labels else 'float32'
    if has_label and class_labels:
        self._label_map = {}
        for (i, label) in enumerate(class_labels):
            self._label_map[label] = i
        if label_alias:
            for key in label_alias:
                self._label_map[key] = self._label_map[label_alias[key]]
    self._bert_xform = BERTSentenceTransform(
        tokenizer, max_seq_length, pad=pad, pair=pair)
    self.vectorizer = vectorizer
def __init__(self):
    self.token_max_len = 100
    self.kobert_model, vocab = get_pytorch_kobert_model()
    tok_path = get_tokenizer()
    tokenizer = BERTSPTokenizer(tok_path, vocab)
    self.transformer = BERTSentenceTransform(tokenizer, self.token_max_len)
def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
             dataset_name='book_corpus_wiki_en_uncased', params_path=None,
             max_seq_length=25, batch_size=256,
             root=os.path.join(get_home_dir(), 'models')):
    self.ctx = ctx
    self.dtype = dtype
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.dataset_name = dataset_name
    # Don't download the pretrained models if we have a parameter path
    self.bert, self.vocab = gluonnlp.model.get_model(model,
                                                     dataset_name=self.dataset_name,
                                                     pretrained=params_path is None,
                                                     ctx=self.ctx,
                                                     use_pooler=False,
                                                     use_decoder=False,
                                                     use_classifier=False,
                                                     root=root)
    self.bert.cast(self.dtype)
    if params_path:
        logger.info('Loading params from %s', params_path)
        self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)
    lower = 'uncased' in self.dataset_name
    self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
    self.transform = BERTSentenceTransform(tokenizer=self.tokenizer,
                                           max_seq_length=self.max_seq_length,
                                           pair=False)
def data_loader(self, sentences, shuffle=False):
    tokenizer = BERTTokenizer(self.vocab)
    transform = BERTSentenceTransform(tokenizer=tokenizer,
                                      max_seq_length=self.max_seq_length,
                                      pair=False)
    dataset = BertEmbeddingDataset(sentences, transform)
    return DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=shuffle)
def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
             dataset_name='book_corpus_wiki_en_uncased', params_path=None,
             max_seq_length=25, batch_size=256):
    """
    Encoding from BERT model.

    Parameters
    ----------
    ctx : Context
        context (CPU or GPU device) on which to run BertEmbedding.
    dtype : str
        data type to use for the model.
    model : str, default 'bert_12_768_12'
        name of the pre-trained BERT model.
    dataset_name : str, default 'book_corpus_wiki_en_uncased'
        dataset the pre-trained model was trained on.
    params_path : str, default None
        path to a parameters file to load instead of the pretrained model.
    max_seq_length : int, default 25
        maximum length of each sequence.
    batch_size : int, default 256
        batch size.
    """
    self.ctx = ctx
    self.dtype = dtype
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.dataset_name = dataset_name
    if params_path is not None:
        # Don't download the pretrained models if we have a parameter path
        pretrained = False
    else:
        pretrained = True
    self.bert, self.vocab = gluonnlp.model.get_model(
        model, dataset_name=self.dataset_name, pretrained=pretrained,
        ctx=self.ctx, use_pooler=False, use_decoder=False, use_classifier=False)
    self.bert.cast(self.dtype)
    if params_path:
        logger.info('Loading params from %s', params_path)
        self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True)
    lower = 'uncased' in self.dataset_name
    self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
    self.transform = BERTSentenceTransform(
        tokenizer=self.tokenizer, max_seq_length=self.max_seq_length, pair=False)
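# A minimal standalone sketch (not from the original source) showing how the pieces built
# in the constructor above -- BERTTokenizer, BERTSentenceTransform and the gluonnlp BERT
# encoder -- fit together to embed one sentence. The sentence text and max_seq_length
# value are arbitrary illustration choices.
import mxnet as mx
import gluonnlp
from gluonnlp.data import BERTTokenizer, BERTSentenceTransform

ctx = mx.cpu()
bert, vocab = gluonnlp.model.get_model('bert_12_768_12',
                                       dataset_name='book_corpus_wiki_en_uncased',
                                       pretrained=True, ctx=ctx,
                                       use_pooler=False, use_decoder=False,
                                       use_classifier=False)
tokenizer = BERTTokenizer(vocab, lower=True)
transform = BERTSentenceTransform(tokenizer, max_seq_length=25, pair=False)

# With pair=False the transform expects a tuple holding a single sentence.
token_ids, valid_length, segment_ids = transform(('BERT sentence embeddings in GluonNLP.',))

# Add a batch dimension of 1 and run the encoder; output shape is (batch, seq_len, 768).
ids = mx.nd.array([token_ids], ctx=ctx)
segs = mx.nd.array([segment_ids], ctx=ctx)
lens = mx.nd.array([valid_length], ctx=ctx)
seq_encoding = bert(ids, segs, lens.astype('float32'))
print(seq_encoding.shape)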
def __init__(self, tokenizer, labels, max_seq_length, pad=True):
    self._label_map = {}
    for (i, label) in enumerate(labels):
        self._label_map[label] = i
    self._bert_xform = BERTSentenceTransform(tokenizer, max_seq_length,
                                             pad=pad, pair=False)
def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
             dataset_name='book_corpus_wiki_en_uncased', params_path=None,
             max_seq_length=25, batch_size=256, sentencepiece=None,
             root=os.path.join(get_home_dir(), 'models')):
    self.ctx = ctx
    self.dtype = dtype
    self.max_seq_length = max_seq_length
    self.batch_size = batch_size
    self.dataset_name = dataset_name
    # Use the sentencepiece vocab with a checkpoint:
    # dataset_name must be None, otherwise the downloaded vocab is used.
    if params_path and sentencepiece:
        dataset_name = None
    else:
        dataset_name = self.dataset_name
    if sentencepiece:
        vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sentencepiece)
    else:
        vocab = None
    self.bert, self.vocab = gluonnlp.model.get_model(
        model, dataset_name=dataset_name, pretrained=params_path is None,
        ctx=self.ctx, use_pooler=False, use_decoder=False, use_classifier=False,
        root=root, vocab=vocab)
    self.bert.cast(self.dtype)
    if params_path:
        logger.info('Loading params from %s', params_path)
        self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True,
                                  cast_dtype=True)
    lower = 'uncased' in self.dataset_name
    if sentencepiece:
        self.tokenizer = BERTSPTokenizer(sentencepiece, self.vocab, lower=lower)
    else:
        self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
    self.transform = BERTSentenceTransform(
        tokenizer=self.tokenizer, max_seq_length=self.max_seq_length, pair=False)
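# Hedged sketch of the sentencepiece branch above: when a sentencepiece model file is
# supplied, the vocabulary is built from that model instead of the downloaded dataset
# vocab, and tokenization goes through BERTSPTokenizer. The file path below is a
# placeholder, not a path from the original code.
import gluonnlp
from gluonnlp.data import BERTSPTokenizer

sp_model = 'my_tokenizer.model'  # hypothetical sentencepiece model file
vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sp_model)
tokenizer = BERTSPTokenizer(sp_model, vocab, lower=True)
print(tokenizer('A sample sentence to split into word pieces.'))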
def re_init(self):
    self._label_dtype = 'int32' if self.class_labels else 'float32'
    if self.has_label and self.class_labels:
        self._label_map = {}
        for (i, label) in enumerate(self.class_labels):
            self._label_map[label] = i
        if self.label_alias:
            for key in self.label_alias:
                self._label_map[key] = self._label_map[self.label_alias[key]]
    self._bert_xform = BERTSentenceTransform(
        self.tokenizer, self.max_seq_length, pad=self.pad, pair=self.pair)
def load_dataset_bert(json_file, voc_size, json_text_key="text", json_sp_key="sp_vec",
                      max_len=64, ctx=mx.cpu()):
    indices = []
    values = []
    indptrs = [0]
    cumulative = 0
    total_num_words = 0
    ndocs = 0
    bert_model = 'bert_12_768_12'
    dname = 'book_corpus_wiki_en_uncased'
    bert_base, vocab = nlp.model.get_model(bert_model,
                                           dataset_name=dname,
                                           pretrained=True, ctx=ctx,
                                           use_pooler=True,
                                           use_decoder=False,
                                           use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer, max_len, pair=False)
    x_ids = []
    x_val_lens = []
    x_segs = []
    with io.open(json_file, 'r', encoding='utf-8') as fp:
        for line in fp:
            js = json.loads(line)
            if json_text_key:
                line = js[json_text_key]
            if len(line.split(' ')) > 4:
                ids, lens, segs = transform((line,))  # create BERT-ready inputs
                x_ids.append(ids)
                x_val_lens.append(lens)
                x_segs.append(segs)
                # Now, get the sparse bag-of-words vector
                ndocs += 1
                sp_vec_els = js[json_sp_key]
                n_pairs, inds, vs = get_single_vec(sp_vec_els)
                cumulative += n_pairs
                total_num_words += sum(vs)
                indptrs.append(cumulative)
                values.extend(vs)
                indices.extend(inds)
    csr_mat = mx.nd.sparse.csr_matrix((values, indices, indptrs),
                                      shape=(ndocs, voc_size))
    data_train = gluon.data.ArrayDataset(
        mx.nd.array(x_ids, dtype='int32'),
        mx.nd.array(x_val_lens, dtype='int32'),
        mx.nd.array(x_segs, dtype='int32'),
        csr_mat.tostype('default'))
    return data_train, bert_base, vocab, csr_mat
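# A minimal usage sketch (an assumption, not from the original source): read a JSON-lines
# file whose records carry a 'text' field and a sparse bag-of-words field 'sp_vec', then
# wrap the resulting ArrayDataset in a Gluon DataLoader. The file name 'docs.jsonl' and
# the vocabulary size are placeholder values.
from mxnet import gluon

data_train, bert_base, vocab, csr_mat = load_dataset_bert(
    'docs.jsonl', voc_size=2000, json_text_key='text', json_sp_key='sp_vec', max_len=64)
train_loader = gluon.data.DataLoader(data_train, batch_size=32, shuffle=True)
for ids, lens, segs, bow in train_loader:
    # ids/lens/segs feed the BERT encoder; bow is the dense bag-of-words target.
    break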
def _load_dataset_bert(line_gen, voc_size, max_len=64, ctx=mx.cpu()):
    indices = []
    values = []
    indptrs = [0]
    cumulative = 0
    total_num_words = 0
    ndocs = 0
    bert_model = 'bert_12_768_12'
    dname = 'book_corpus_wiki_en_uncased'
    # This is really only needed here to get the vocab;
    # the GluonNLP API doesn't enable that directly.
    bert_base, vocab = nlp.model.get_model(bert_model,
                                           dataset_name=dname,
                                           pretrained=True, ctx=ctx,
                                           use_pooler=True,
                                           use_decoder=False,
                                           use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer, max_len, pair=False)
    x_ids = []
    x_val_lens = []
    x_segs = []
    for t in line_gen:
        if isinstance(t, tuple):
            line = t[0]
            sp_vec_els = t[1]
        else:
            line = t
            sp_vec_els = None
        ids, lens, segs = transform((line,))  # create BERT-ready inputs
        x_ids.append(ids)
        x_val_lens.append(lens)
        x_segs.append(segs)
        # Now, get the sparse vector
        ndocs += 1
        if sp_vec_els:
            pairs, inds, vs = get_single_vec(sp_vec_els)
            cumulative += len(pairs)
            total_num_words += sum(vs)
            indptrs.append(cumulative)
            values.extend(vs)
            indices.extend(inds)
    if len(indices) > 0:
        csr_mat = mx.nd.sparse.csr_matrix(
            (values, indices, indptrs),
            shape=(ndocs, voc_size)).tostype('default')
    else:
        csr_mat = None
    return x_ids, x_val_lens, x_segs, bert_base, vocab, csr_mat
def __init__(self, model, bert_vocab, max_length, bow_vocab=None, ctx=mx.cpu()):
    super().__init__(ctx)
    self.model = model
    self.bert_base = model.bert
    self.tokenizer = BERTTokenizer(bert_vocab)
    self.transform = BERTSentenceTransform(self.tokenizer, max_length, pair=False)
    self.bow_vocab = bow_vocab
def __init__(self, tokenizer, max_seq_length, labels=None, pad=True, pair=True,
             label_dtype='float32'):
    self.label_dtype = label_dtype
    self.labels = labels
    if self.labels:
        self._label_map = {}
        for (i, label) in enumerate(labels):
            self._label_map[label] = i
    self._bert_xform = BERTSentenceTransform(
        tokenizer, max_seq_length, pad=pad, pair=pair)
def __init__(self, tokenizer, max_seq_length, class_labels=None, pad=True, pair=True,
             has_label=True):
    self.class_labels = class_labels
    self.has_label = has_label
    self._label_dtype = 'int32' if class_labels else 'float32'
    if has_label and class_labels:
        self._label_map = {}
        for (i, label) in enumerate(class_labels):
            self._label_map[label] = i
    self._bert_xform = BERTSentenceTransform(
        tokenizer, max_seq_length, pad=pad, pair=pair)
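# For context, a hedged sketch of the __call__ that conventionally accompanies this kind
# of constructor in GluonNLP-style dataset transforms; the actual __call__ is not shown in
# the snippet above, so treat this as an assumption about the pattern, not the author's
# code. A labelled line is split into its text fields plus a trailing label, the text goes
# through the wrapped BERTSentenceTransform, and class labels are mapped to integer ids.
import numpy as np

def __call__(self, line):
    if self.has_label:
        input_ids, valid_length, segment_ids = self._bert_xform(line[:-1])
        label = line[-1]
        if self.class_labels:
            label = self._label_map[label]
        label = np.array([label], dtype=self._label_dtype)
        return input_ids, valid_length, segment_ids, label
    return self._bert_xform(line)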
def __init__(self, model, bert_vocab, max_length, bow_vocab=None, pre_vectorizer=None,
             ctx=mx.cpu()):
    super().__init__(ctx)
    self.model = model
    self.bert_base = model.bert
    self.tokenizer = BERTTokenizer(bert_vocab)
    self.transform = BERTSentenceTransform(self.tokenizer, max_length, pair=False)
    self.bow_vocab = bow_vocab
    self.vectorizer = pre_vectorizer or TMNTVectorizer(initial_vocabulary=bow_vocab)
def word_piece_tokenizer(sentences):
    ctx = ghp.ctx
    model = 'bert_12_768_12'
    dataset_name = 'book_corpus_wiki_en_uncased'
    max_seq_length = ghp.max_seq_len
    batch_size = 256
    _, vocab = gluonnlp.model.get_model(model,
                                        dataset_name=dataset_name,
                                        pretrained=True, ctx=ctx,
                                        use_pooler=False,
                                        use_decoder=False,
                                        use_classifier=False)
    tokenizer = BERTTokenizer(vocab)
    transform = BERTSentenceTransform(tokenizer=tokenizer,
                                      max_seq_length=max_seq_length,
                                      pair=False)
    dataset = BertEmbeddingDataset(sentences, transform)
    data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False)
    batches = []
    for token_ids, _, _ in data_loader:
        token_ids = token_ids.as_in_context(ctx)
        for token_id in token_ids.asnumpy():
            batches.append(token_id)
    cut_results = []
    for token_ids in batches:
        tokens = []
        for token_id in token_ids:
            if token_id == 1:
                # [PAD]: the rest of the sequence is padding.
                break
            if token_id in (2, 3):
                # [CLS] / [SEP]: skip the special tokens.
                continue
            token = vocab.idx_to_token[token_id]
            if token.startswith('##'):
                # Merge word pieces back onto the preceding token.
                token = token[2:]
                tokens[-1] += token
            else:
                tokens.append(token)
        cut_results.append(tokens)
    return cut_results
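# Hedged usage sketch for word_piece_tokenizer above. It assumes the module-level `ghp`
# object referenced in the function provides `ctx` (an MXNet context) and `max_seq_len`;
# the sentence below is an arbitrary example, not from the original source.
sentences = ['GluonNLP rejoins word pieces into whole tokens.']
print(word_piece_tokenizer(sentences))
# The result contains one list of lower-cased tokens per input sentence, with
# '##'-prefixed word pieces merged back onto the preceding token.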
def __init__(self, param_file=None, config_file=None, vocab_file=None, model_dir=None,
             ctx=mx.cpu()):
    super().__init__(ctx)
    if model_dir is not None:
        param_file = os.path.join(model_dir, 'model.params')
        vocab_file = os.path.join(model_dir, 'vocab.json')
        config_file = os.path.join(model_dir, 'model.config')
    with open(config_file) as f:
        config = json.loads(f.read())
    with open(vocab_file) as f:
        voc_js = f.read()
    self.bow_vocab = nlp.Vocab.from_json(voc_js)
    self.ctx = ctx
    self.bert_base, self.vocab = nlp.model.get_model(
        'bert_12_768_12',
        dataset_name='book_corpus_wiki_en_uncased',
        pretrained=True, ctx=ctx,
        use_pooler=True, use_decoder=False,
        use_classifier=False)  # , output_attention=True
    self.latent_dist = config['latent_distribution']['dist_type']
    self.n_latent = config['n_latent']
    self.kappa = config['latent_distribution']['kappa']
    self.pad_id = self.vocab[self.vocab.padding_token]
    self.max_sent_len = config['sent_size']
    self.model = BertBowVED(self.bert_base, self.bow_vocab,
                            latent_distrib=self.latent_dist,
                            n_latent=self.n_latent,
                            kappa=self.kappa,
                            batch_size=1)
    self.tokenizer = BERTTokenizer(self.vocab)
    self.transform = BERTSentenceTransform(self.tokenizer, self.max_sent_len,
                                           pair=False)
    self.model.load_parameters(str(param_file), allow_missing=False, ignore_extra=True)
from gluonnlp.data import SentencepieceTokenizer, BERTSPTokenizer, BERTSentenceTransform
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

bertmodel, vocab = get_pytorch_kobert_model()
tokenizer = get_tokenizer()

# "[Suncheon City Hall] COVID-19 infections keep occurring nearby (Mokpo, Gwangju).
#  Please take personal responsibility for prevention and strictly follow quarantine
#  rules such as wearing a mask."
sampleText = "[순천시청] 코로나19 감염이 인근(목포, 광주)에서 지속 발생하고 있습니다. 개개인이 방역주체가 되어 마스크 착용 등 방역수칙을 반드시 준수 바랍니다. "
# Overrides the text above:
# "Starting with the uninspired robot design that looks straight out of an American
#  cartoon, it makes you shake your head."
sampleText = "마치 미국애니에서 튀어나온듯한 창의력없는 로봇디자인부터가,고개를 젖게한다"

print(vocab)

tok = BERTSPTokenizer(tokenizer, vocab, lower=False)
print(tok)
print(tok(sampleText))

# With pair=False, BERTSentenceTransform expects a tuple holding a single sentence.
transform = BERTSentenceTransform(tok, max_seq_length=32, pad=True, pair=False)
print(transform((sampleText,)))

sp = SentencepieceTokenizer(tokenizer)
print(sp)
print(sp(sampleText))

# transform = BERTSentenceTransform(tok, max_seq_length=32, pad=True, pair=False)
# transform2 = BERTSentenceTransform(sp, max_seq_length=32, vocab=None, pad=True, pair=False)
# print(transform(("한국어 모델을 공유합니다.",)))   # "Sharing the Korean-language model."
# print(transform2(("한국어 모델을 공유합니다.",)))