class BertXNLIDataset(Dataset):
    def __init__(self, directory, prefix, vocab_file, max_length: int = 512):
        super().__init__()
        self.max_length = max_length
        with open(os.path.join(directory, 'xnli_' + prefix), 'r') as f:
            lines = f.readlines()
        self.lines = lines
        self.tokenizer = BertWordPieceTokenizer(vocab_file)
        self.label_map = {
            "entailment": 0,
            "neutral": 1,
            "contradiction": 2,
            "contradictory": 2
        }

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        line = self.lines[idx]
        first, second, third = line.strip().split('\t', 2)
        first_input_ids = self.tokenizer.encode(first, add_special_tokens=False).ids
        second_input_ids = self.tokenizer.encode(second, add_special_tokens=False).ids
        label = self.label_map[third]
        input_ids = first_input_ids + [103] + second_input_ids
        if len(input_ids) > self.max_length - 2:
            input_ids = input_ids[:self.max_length - 2]
        # convert list to tensor
        input_ids = torch.LongTensor([101] + input_ids + [102])
        label = torch.LongTensor([int(label)])
        return input_ids, label
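# NOTE (added usage sketch, not from the original source): one way to batch the
# variable-length examples above is a DataLoader with a padding collate_fn.
# The paths, prefix and batch size below are assumptions, not taken from the source.
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def collate_xnli(batch):
    input_ids, labels = zip(*batch)
    # Pad variable-length sequences with 0, the [PAD] id in the BERT vocab.
    return pad_sequence(input_ids, batch_first=True, padding_value=0), torch.cat(labels)

xnli_dataset = BertXNLIDataset(directory="data/xnli", prefix="train", vocab_file="vocab.txt")
xnli_loader = DataLoader(xnli_dataset, batch_size=16, collate_fn=collate_xnli)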
def __init__(self, data_path: str,
             tokenizer: BertWordPieceTokenizer,
             max_seq_length: int,
             vocab_file_name: str):
    # For caching
    data_dirname = os.path.dirname(os.path.abspath(data_path))
    split = os.path.basename(os.path.abspath(data_path))

    # Process data
    self.tokenizer = tokenizer
    cached_path = os.path.join(data_dirname,
                               "{}_{}_response_selection_cached".format(split, vocab_file_name))
    if not os.path.exists(cached_path):
        self.examples = []
        data = json.load(open(data_path))
        for example in tqdm(data, desc="Preprocessing"):
            context = ' <turn> '.join([e['utterance'] for e in example['messages-so-far']])
            context = ' '.join(context.split()[-max_seq_length:])
            encoded_context = tokenizer.encode(context)

            response = example['options-for-correct-answers'][0]['utterance']
            response = ' '.join(response.split()[-max_seq_length:])
            encoded_response = tokenizer.encode(response)

            candidates = [
                ' '.join(e['utterance'].split()[-max_seq_length:])
                for e in example['options-for-next']
            ]
            encoded_candidates = [tokenizer.encode(cand) for cand in candidates]

            correct_id = example['options-for-correct-answers'][0]['candidate-id']
            correct_ind = [
                i for i, e in enumerate(example['options-for-next'])
                if e['candidate-id'] == correct_id
            ]

            candidate_inputs = [{
                "input_ids": np.array(cand.ids),
                "attention_mask": np.array(cand.attention_mask),
                "token_type_ids": np.array(cand.type_ids)
            } for cand in encoded_candidates]

            self.examples.append({
                "ctx_input_ids": np.array(encoded_context.ids),
                "ctx_attention_mask": np.array(encoded_context.attention_mask),
                "ctx_token_type_ids": np.array(encoded_context.type_ids),
                "rsp_input_ids": np.array(encoded_response.ids),
                "rsp_attention_mask": np.array(encoded_response.attention_mask),
                "rsp_token_type_ids": np.array(encoded_response.type_ids),
                "candidates": candidate_inputs,
                "correct_candidate": correct_ind
            })
        with open(cached_path, "wb") as f:
            pickle.dump(self.examples, f)
    else:
        LOGGER.info("Loading from cached path: {}".format(cached_path))
        with open(cached_path, "rb") as f:
            self.examples = pickle.load(f)
class CheckerDecoder:
    def __init__(self, model_dir):
        self.detector = DetectorModel(os.path.join(model_dir, 'detector'))
        self.corrector = CorrectorModel(os.path.join(model_dir, 'corrector'))
        self.tokenizer = BertWordPieceTokenizer(os.path.join(model_dir, 'vocab.txt'))
        mask_id = self.tokenizer.encode('[MASK]').ids[1:-1]
        assert len(mask_id) == 1
        self.mask_id = mask_id[0]

    def predict(self, text, suggest=False, k=5, max_k=200):
        tokenized = self.tokenizer.encode(text)
        if len(tokenized.tokens) > MAX_LEN:
            raise ValueError('The text is too long (>512) to process')
        token_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        mapping = rematch(tokenized.offsets)
        token_ids, segment_ids = np.array([token_ids]), np.array([segment_ids])
        probas = self.detector.predict(token_ids, segment_ids)[0][0]
        incorrect_ids = np.where(probas > 0.5)[0]
        token_ids[0, incorrect_ids] = self.mask_id
        if not suggest:
            ret = []
            for i in incorrect_ids:
                ret.append((i - 1, tokenized.tokens[i]))
            return ret
        probas = self.corrector.predict(token_ids, segment_ids)[0][0]
        sorted_probas, sort_indexs = topK(probas, max_k)
        ret = {}
        for i in incorrect_ids:
            if i == 0 or i == len(tokenized.tokens) - 1:
                continue
            current_token = text[mapping[i][0]:mapping[i][-1] + 1]
            current_pinyin = ' '.join(xmnlp.pinyin(current_token))
            cands = []
            for proba, token in zip(
                    sorted_probas[i],
                    self.tokenizer.decode(sort_indexs[i]).split()):
                pinyin = ' '.join(xmnlp.pinyin(token))
                score = 0
                if current_pinyin == pinyin:
                    score = 1
                cands.append((token, proba + score))
            cands.sort(key=lambda x: x[1], reverse=True)
            ret[(i - 1, current_token)] = cands[:k]
        return dict(ret)
def test_train_from_iterator(self):
    text = ["A first sentence", "Another sentence", "And a last one"]
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train_from_iterator(text, show_progress=False)
    output = tokenizer.encode("A sentence")
    assert output.tokens == ["a", "sentence"]
def _get_word_tokens_len(word: str, tokenizer: BertWordPieceTokenizer) -> int:
    return sum(
        map(
            lambda token: 1 if token not in SPECIAL_TOKENS else 0,
            tokenizer.encode(word).tokens,
        ))
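# NOTE (added check, not from the original source): counts the WordPiece pieces of a
# single word, with special tokens excluded. SPECIAL_TOKENS is assumed to contain
# [CLS]/[SEP]; the vocab file name below is an assumption.
_tok = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
print(_get_word_tokens_len("tokenization", _tok))  # number of word pieces for the word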
def inf(text, model):
    class2names = {
        "DESC": "DESCRIPTION",
        "ENTY": "ENTITY",
        "ABBR": "ABBREVIATION",
        "HUM": "HUMAN",
        "NUM": "NUMERIC",
        "LOC": "LOCATION"
    }
    class2names = load_pickle('class2names.pkl')
    subclass2names = load_pickle('subclass2names.pkl')
    idx2class = load_pickle('idx2class.pkl')
    idx2subclass = load_pickle('idx2subclass.pkl')
    tokenizer = BertWordPieceTokenizer('bert-word-piece-custom-wikitext-vocab-10k-vocab.txt',
                                       lowercase=True,
                                       strip_accents=True)
    tokens = torch.FloatTensor(tokenizer.encode(text).ids).unsqueeze(0).to('cpu')
    cls_, subcls = model(tokens)
    clsIdx = cls_.max(1)[-1].item()
    subclsIdx = subcls.max(1)[-1].item()
    return {
        "class": class2names[idx2class[clsIdx]],
        "subclass": subclass2names[idx2subclass[subclsIdx]]
    }
def _build_bert_inputs(self):
    vocab_path = os.path.join(os.environ["GOOGLE_BERT_PATH"],
                              "uncased_L-6_H-768_A-12", "vocab.txt")
    tokenizer = BertWordPieceTokenizer(vocab_path)
    encoding = tokenizer.encode("我爱NLP")
    input_ids = tf.constant([encoding.ids], dtype=tf.int32, shape=(1, len(encoding.ids)))
    segment_ids = tf.constant([encoding.type_ids], dtype=tf.int32, shape=(1, len(encoding.type_ids)))
    attention_mask = tf.constant([encoding.attention_mask], dtype=tf.int32,
                                 shape=(1, len(encoding.attention_mask)))
    return input_ids, segment_ids, attention_mask
def tokenize(sentence):
    # Instantiate a BERT tokenizer
    WordPiece = BertWordPieceTokenizer(bertLargeUncased)
    WordPieceEncoder = WordPiece.encode(sentence)
    # Print the ids, tokens and offsets
    print(WordPieceEncoder.ids)
    print(WordPieceEncoder.tokens)
    print(WordPieceEncoder.offsets)
def __init__(self, tokenizer: AutoTokenizer, file_path: str, args):
    print(file_path)
    assert os.path.isfile(file_path)
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory, args.bert_model_type + "_cached_mlm_" + filename)

    if os.path.exists(cached_features_file):
        print("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.samples = torch.load(handle)
    else:
        print("Creating features from dataset file at %s", directory)

        # Get the faster tokenizer from tokenizers package
        tokenizer.save_vocabulary(vocab_path='.')
        fast_tokenizer = BertWordPieceTokenizer("vocab.txt", lowercase=args.lowercase)
        fast_tokenizer.enable_truncation(tokenizer.max_len)
        fast_tokenizer.enable_padding(max_length=tokenizer.max_len,
                                      pad_token=tokenizer.pad_token)
        self.samples = []

        # Load data over here
        df = pd.read_json(file_path)
        print('SQUAD data: ')
        for _, row in tqdm(df.iterrows(), total=df.shape[0]):
            for paragraph in row['data']['paragraphs']:
                context = paragraph['context']
                for qa_pair in paragraph['qas']:
                    question = qa_pair['question']
                    batch = fast_tokenizer.encode(question, context)
                    self.samples.append({
                        'input_ids': batch.ids,
                        'attention_mask': batch.attention_mask
                    })
                    for encoding in batch.overflowing:
                        self.samples.append({
                            'input_ids': encoding.ids,
                            'attention_mask': encoding.attention_mask
                        })
        df = None

        print("Saving features into cached file: ", cached_features_file)
        with open(cached_features_file, "wb") as handle:
            torch.save(self.samples, handle, pickle_protocol=pickle.HIGHEST_PROTOCOL)
def main(args):
    print(args)

    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##"
        )
        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000
        )
        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))

    elif args['test']:
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str.ids)
        decoded_str = tokenizer.decode(encoded_str.ids)
        print(decoded_str)

        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)

        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)
def generate_custom_vocab(self):
    try:
        tokenizer = None
        # root dir path check and generate
        if not os.path.isdir(self.vocab_root_dir):
            os.makedirs(self.vocab_root_dir, exist_ok=True)

        # generate models directory
        self.vocab_dir = '/BERT_TRAINING_VOCAB_' + self.getCurrent_time()[2] + '/'
        os.makedirs(self.vocab_root_dir + self.vocab_dir, exist_ok=True)

        user_defined_symbols = ['[BOS]', '[EOS]', '[UNK]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]',
                                '[UNK5]', '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]']
        unused_token_num = 200
        unused_list = ['[unused{}]'.format(n) for n in range(unused_token_num)]
        user_defined_symbols = user_defined_symbols + unused_list

        if self.tokenizer_type == 'word':
            # if lowercase is False, strip_accents must also be set to False
            tokenizer = BertWordPieceTokenizer(strip_accents=False,
                                               lowercase=True,
                                               clean_text=True,
                                               handle_chinese_chars=True,
                                               wordpieces_prefix="##")
        # when 'base' is selected, the bert-base-uncased tokenizer is used instead and this function returns

        # start training the vocab
        corpus_file = [self.corpus_path]
        vocab_size = 32000
        limit_alphabet = 6000
        min_frequency = 3

        tokenizer.train(files=corpus_file,
                        vocab_size=vocab_size,
                        special_tokens=user_defined_symbols,
                        min_frequency=min_frequency,  # minimum word frequency: 3
                        limit_alphabet=limit_alphabet,  # must be commented out when training a ByteLevelBPETokenizer
                        show_progress=True)

        self.setPrint('Custom Tokenizer Training is completed')

        sentence = '전화 통화가 정상적으로 안됨.'
        output = tokenizer.encode(sentence)
        self.setPrint('Tokenizer test sentence: {}'.format(sentence))
        self.setPrint('Tokenizer analysis result\n=>idx: {}\n=>tokens: {}\n=>offset: {}\n=>decode: {}\n'.
                      format(output.ids, output.tokens, output.offsets, tokenizer.decode(output.ids)))

        # save tokenizer
        tokenizer.save_model(self.vocab_root_dir + self.vocab_dir)
    except:
        self.setPrint('Error: {}. {}, line: {}'.format(sys.exc_info()[0], sys.exc_info()[1],
                                                       sys.exc_info()[2].tb_lineno))
class BERTTokenizer(Tokenizer):
    def __init__(self):
        super(BERTTokenizer, self).__init__()
        self.tokenizer = BertWordPieceTokenizer(DATA_DIR / "vocab" / "bert.txt", lowercase=True)

    def tokenize(self, text):
        seg_result = self.tokenizer.encode(text).tokens
        return seg_result
def _build_model_inputs(self):
    vocab_path = os.path.join(os.environ["GOOGLE_BERT_PATH"],
                              "uncased_L-6_H-768_A-12", "vocab.txt")
    tokenizer = BertWordPieceTokenizer(vocab_path)
    encoding = tokenizer.encode("I love NLP, Neural [MASK] Processing is amazing!")
    logging.info(" ids: %s", encoding.ids)
    logging.info("tokens: %s", encoding.tokens)
    input_ids = tf.constant([encoding.ids], dtype=tf.int32, shape=(1, len(encoding.ids)))
    segment_ids = tf.constant([encoding.type_ids], dtype=tf.int32, shape=(1, len(encoding.type_ids)))
    attention_mask = tf.constant([encoding.attention_mask], dtype=tf.int32,
                                 shape=(1, len(encoding.attention_mask)))
    return input_ids, segment_ids, attention_mask
def test_basic_encode(self, bert_files):
    tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

    # Encode with special tokens by default
    output = tokenizer.encode("My name is John", "pair")
    assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
    assert output.tokens == [
        "[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"
    ]
    assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
    assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]

    # Can encode without the special tokens
    output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
    assert output.ids == [2026, 2171, 2003, 2198, 3940]
    assert output.tokens == ["my", "name", "is", "john", "pair"]
    assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
    assert output.type_ids == [0, 0, 0, 0, 1]
def __init__(self, data_path: str,
             tokenizer: BertWordPieceTokenizer,
             max_seq_length: int,
             vocab_file_name: str):
    # For caching
    data_dirname = os.path.dirname(os.path.abspath(data_path))
    split = os.path.basename(os.path.abspath(data_path))

    # Slot categories
    slot_vocab_path = os.path.join(os.path.dirname(data_path), "vocab.slot")
    slot_names = [e.strip() for e in open(slot_vocab_path).readlines()]
    slot_names.insert(0, "[PAD]")
    self.slot_label_to_idx = dict((label, idx) for idx, label in enumerate(slot_names))
    self.slot_idx_to_label = {idx: label for label, idx in self.slot_label_to_idx.items()}

    # Intent categories
    intent_vocab_path = os.path.join(data_dirname, "vocab.intent")
    intent_names = [e.strip() for e in open(intent_vocab_path).readlines()]
    self.intent_label_to_idx = dict((label, idx) for idx, label in enumerate(intent_names))
    self.intent_idx_to_label = {idx: label for label, idx in self.intent_label_to_idx.items()}

    # Process data
    self.tokenizer = tokenizer
    cached_path = os.path.join(data_dirname, "{}_{}_top_cached".format(split, vocab_file_name))
    if not os.path.exists(cached_path):
        self.examples = []
        data = [e.strip() for e in open(data_path).readlines()]
        for example in tqdm(data):
            example, intent = example.split(" <=> ")
            text = " ".join([e.split(":")[0] for e in example.split()])
            slots = " ".join([e.split(":")[1] for e in example.split()])
            encoded = tokenizer.encode(text)
            encoded_slot_labels = self.encode_token_labels([text], [slots],
                                                           len(encoded.ids), tokenizer,
                                                           self.slot_label_to_idx,
                                                           max_seq_length)
            self.examples.append({
                "input_ids": np.array(encoded.ids)[-max_seq_length:],
                "attention_mask": np.array(encoded.attention_mask)[-max_seq_length:],
                "token_type_ids": np.array(encoded.type_ids)[-max_seq_length:],
                "slot_labels": encoded_slot_labels[-max_seq_length:],
                "intent_label": self.intent_label_to_idx[intent],
                "ind": len(self.examples),
            })
        with open(cached_path, "wb") as f:
            pickle.dump(self.examples, f)
    else:
        LOGGER.info("Loading from cached path: {}".format(cached_path))
        with open(cached_path, "rb") as f:
            self.examples = pickle.load(f)
def tokenizer(file_name, model, is_file=False, remove_punc=False):  # Kurdish tokenizer
    """Given a list of sentences in Kurdish, return the tokenized one
       as text with spaces between tokens
    """
    if is_file:
        with open(file_name, "r") as f:
            text = f.read().split("\n")
    else:
        text = file_name

    models = {
        "wordpiece": 'tokenization_models/ckb-wordpiece_all_False_50000-vocab.txt',
        "bpe": 'tokenization_models/ckb_bpe_50k.model',
        "unigram": 'tokenization_models/ckb_unigram_50k.model',
        "WordPunct": ""
    }

    tokenized_text = list()
    if model == "wordpiece":
        WordPiece = BertWordPieceTokenizer(models[model],
                                           strip_accents=False,
                                           clean_text=False,
                                           lowercase=False)
        for sentence in text:
            WordPieceEncoder = WordPiece.encode(sentence)
            sentence_tokenized = " ".join(WordPieceEncoder.tokens)
            for token in ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "##"]:
                sentence_tokenized = sentence_tokenized.replace(token, " ")
            tokenized_text.append(" ".join(sentence_tokenized.split()))
    elif model == "WordPunct":
        for sentence in text:
            tokenized_text.append(" ".join(WordPunctTokenizer().tokenize(sentence)))
    else:
        sp = spm.SentencePieceProcessor()
        sp.Load(models[model])
        for sentence in text:
            # print(" ".join(sp.EncodeAsPieces(sentence)).replace("▁", ""))
            tokenized_text.append(" ".join(sp.EncodeAsPieces(sentence)).replace("▁", ""))

    if remove_punc:
        return remove_punctuation("\n".join(tokenized_text))
    else:
        return "\n".join(tokenized_text)
def __init__(
    self,
    data_path: str,
    tokenizer: BertWordPieceTokenizer,
    max_seq_length: int,
    vocab_file_name: str,
):
    # For caching
    data_dirname = os.path.dirname(os.path.abspath(data_path))
    split = os.path.basename(os.path.abspath(data_path))

    # Intent categories
    intent_vocab_path = os.path.join(data_dirname, "categories.json")
    intent_names = json.load(open(intent_vocab_path))
    self.intent_label_to_idx = dict(
        (label, idx) for idx, label in enumerate(intent_names))
    self.intent_idx_to_label = {
        idx: label for label, idx in self.intent_label_to_idx.items()
    }

    # Process data
    self.tokenizer = tokenizer
    cached_path = os.path.join(
        data_dirname, "{}_{}_intent_cached".format(split, vocab_file_name))
    if not os.path.exists(cached_path):
        self.examples = []
        reader = csv.reader(open(data_path))
        next(reader, None)
        out = []
        for utt, intent in tqdm(reader):
            encoded = tokenizer.encode(utt)
            self.examples.append({
                "input_ids": np.array(encoded.ids)[-max_seq_length:],
                "attention_mask": np.array(encoded.attention_mask)[-max_seq_length:],
                "token_type_ids": np.array(encoded.type_ids)[-max_seq_length:],
                "intent_label": self.intent_label_to_idx[intent],
                "ind": len(self.examples),
            })
        with open(cached_path, "wb") as f:
            pickle.dump(self.examples, f)
    else:
        LOGGER.info("Loading from cached path: {}".format(cached_path))
        with open(cached_path, "rb") as f:
            self.examples = pickle.load(f)
def preprocess(self, max_len):
    utterance = self.utterance
    context = self.context
    label = self.label
    max_len = int(max_len)
    tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True)

    # Tokenize context
    tokenized_context = tokenizer.encode(context)
    # Tokenize utterance
    tokenized_utterance = tokenizer.encode(utterance)

    # Create inputs
    input_ids = tokenized_context.ids + tokenized_utterance.ids[1:]
    token_type_ids = [0] * len(tokenized_context.ids) + [1] * len(tokenized_utterance.ids[1:])
    attention_mask = [1] * len(input_ids)

    # Pad and create attention masks.
    # Skip if truncation is needed
    padding_length = max_len - len(input_ids)
    if padding_length > 0:  # pad
        input_ids = input_ids + ([0] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
    elif padding_length < 0:  # skip
        input_ids = input_ids[:max_len]
        attention_mask = attention_mask[:max_len]
        token_type_ids = token_type_ids[:max_len]

    self.input_ids = input_ids
    self.token_type_ids = token_type_ids
    self.attention_mask = attention_mask
    self.label = label
    self.context_token_to_char = tokenized_context.offsets
def __init__(self, json_data_file, labels, vocab_dict, n_tokens):
    """
    labels is an object of class Labels()
    """
    WordPiece = BertWordPieceTokenizer(
        "bert-base-uncased-vocab.txt",
        lowercase=True,
        add_special_tokens=False,
        sep_token="",
        cls_token="",
    )
    self.x = []
    self.y = []
    self.similarity = torch.zeros((labels.n_labels, labels.n_labels))
    vocab = set()
    vocab.update(["UNK"])
    for l in tqdm(open(json_data_file)):
        d = json.loads(l)
        WordPieceEncoder = WordPiece.encode(d["text"])
        tokens = WordPieceEncoder.tokens
        self.x.append(tokens)
        vocab.update(tokens)
        self.y.append(labels.multihot(d["label"]))
        li = [labels.stoi[l] for l in d["label"]]
        for i in li:
            for j in li:
                self.similarity[i, j] += 1
                self.similarity[j, i] += 1
    self.similarity /= len(self.x)
    self.vocab = {tok: i for i, tok in enumerate(vocab)}
    if vocab_dict is not None:
        self.vocab = vocab_dict
    for idx in tqdm(range(len(self.x))):
        self.x[idx] = [
            self.vocab[i] if i in self.vocab else self.vocab["UNK"]
            for i in self.x[idx]
        ][:n_tokens]
        if len(self.x[idx]) < n_tokens:
            self.x[idx] += [self.vocab["UNK"]] * n_tokens
            self.x[idx] = self.x[idx][:n_tokens]
    self.len = len(self.x)
def _build_bert_inputs(self):
    vocab_path = os.path.join(BASE_DIR, 'bert_uncased_L-6_H-768_A-12', 'vocab.txt')
    tokenizer = BertWordPieceTokenizer(vocab_path)
    encoding = tokenizer.encode('我爱NLP')
    input_ids = tf.constant([encoding.ids], dtype=tf.int32, shape=(1, len(encoding.ids)))
    segment_ids = tf.constant([encoding.type_ids], dtype=tf.int32, shape=(1, len(encoding.type_ids)))
    attention_mask = tf.constant([encoding.attention_mask], dtype=tf.int32,
                                 shape=(1, len(encoding.attention_mask)))
    return input_ids, segment_ids, attention_mask
def test_tokenizer(text, model):
    #
    # Encode and decode
    #
    # WordPiece = BertWordPieceTokenizer(model, lowercase=False)
    WordPiece = BertWordPieceTokenizer(model, strip_accents=True, clean_text=False, lowercase=False)
    WordPieceEncoder = WordPiece.encode(text)
    # print(WordPieceEncoder)
    # print(WordPieceEncoder.ids)
    # print(WordPieceEncoder.tokens)
    # print(WordPieceEncoder.offsets)
    return " ".join(WordPieceEncoder.tokens)
class BertTokenizer:
    def __init__(self, pretrained_name: str = "bert-base-cased-vocab.txt"):
        self.tokenizer = BertWordPieceTokenizer(pretrained_name, lowercase=False)

    def tokenize(self, s: str, offset: int = 0) -> List[Token]:
        output = self.tokenizer.encode(s)
        result = []
        n = len(output.tokens)
        for i, (bpe, pos, token_id) in enumerate(
                zip(output.tokens, output.offsets, output.ids)):
            if i == 0 or i == n - 1:
                continue
            result.append(Token(bpe, idx=offset + pos[0], text_id=token_id))
        return result
def __init__(self, data_path: str,
             tokenizer: BertWordPieceTokenizer,
             max_seq_length: int,
             vocab_file_name: str):
    # For caching
    data_dirname = os.path.dirname(os.path.abspath(data_path))
    split = os.path.basename(os.path.abspath(data_path))

    # Slot categories
    slot_vocab_path = os.path.join(os.path.dirname(data_path), "slots.json")
    slot_names = json.load(open(slot_vocab_path))
    slot_names.insert(0, "[PAD]")
    self.slot_label_to_idx = dict((label, idx) for idx, label in enumerate(slot_names))
    self.slot_idx_to_label = {idx: label for label, idx in self.slot_label_to_idx.items()}

    # Process data
    self.tokenizer = tokenizer
    cached_path = os.path.join(data_dirname, "{}_{}_slots_cached".format(split, vocab_file_name))
    texts = []
    slotss = []
    if not os.path.exists(cached_path):
        self.examples = []
        data = json.load(open(data_path))
        for example in tqdm(data):
            for text, slots in self.parse_example(example, max_seq_length):
                encoded = tokenizer.encode(text)
                encoded_slot_labels = self.encode_token_labels([text], [slots],
                                                               len(encoded.ids), tokenizer,
                                                               self.slot_label_to_idx,
                                                               max_seq_length)
                self.examples.append({
                    "input_ids": np.array(encoded.ids)[-max_seq_length:],
                    "attention_mask": np.array(encoded.attention_mask)[-max_seq_length:],
                    "token_type_ids": np.array(encoded.type_ids)[-max_seq_length:],
                    "slot_labels": encoded_slot_labels[-max_seq_length:]
                })
                texts.append(text)
                slotss.append(slots)
        with open(cached_path, "wb") as f:
            pickle.dump(self.examples, f)
    else:
        LOGGER.info("Loading from cached path: {}".format(cached_path))
        with open(cached_path, "rb") as f:
            self.examples = pickle.load(f)
class Tokenizer:
    def __init__(self, bert_model="bert-base-uncased"):
        self.bert_model = bert_model
        self.vocabulary_path = "{}-vocab.txt".format(bert_model)
        print("Vocabulary for BERT model {}: {}".format(bert_model, self.vocabulary_path))
        self.tokenizer = BertWordPieceTokenizer(self.vocabulary_path)

    def encode(self, plain_text: list, max_length=100):
        """Use encode_plus instead?
        """
        token_ids = np.zeros(shape=(len(plain_text), max_length), dtype=np.int32)
        for i, text in enumerate(plain_text):
            encoded = self.tokenizer.encode(text)
            token_ids[i, 0:len(encoded)] = encoded.ids
        attention_masks = (token_ids != 0).astype(np.int32)
        return {"input_ids": token_ids, "attention_masks": attention_masks}
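# NOTE (added usage sketch, not from the original source): the vocab file name below is
# an assumption ("bert-base-uncased-vocab.txt" must exist on disk), and texts longer
# than max_length tokens would raise a broadcast error in encode() as written.
tok = Tokenizer("bert-base-uncased")
batch = tok.encode(["Hello world", "Another example"], max_length=16)
print(batch["input_ids"].shape)        # (2, 16)
print(batch["attention_masks"].shape)  # (2, 16)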
class BERT16SDataset(Dataset):
    """ A torch dataset class designed to load 16S data found in a tsv file and encode it for BERT.

    :param vocab_path: str, path to the pre-trained bert tokenizer vocab file.
    :param data_path: str, path to the 16S data file.
    :param block_size: int, maximal BERT input (an encoded sample will be padded to this length if too short)
    :param max_word_length: int, the maximal word length the tokenizer can encode.
    """

    def __init__(self, vocab_path: str, data_path: str, block_size=512, max_word_length=100):
        assert os.path.isfile(data_path)
        assert os.path.isfile(vocab_path)

        _logger.info(f"Loading BERT tokenizer using vocab file {vocab_path}")
        self.tokenizer = BertWordPieceTokenizer(vocab_path, handle_chinese_chars=False, lowercase=False)
        self.tokenizer.enable_truncation(block_size)
        self.tokenizer.enable_padding(max_length=block_size)

        _logger.info(f"Loading 16S dataset file at {data_path}...")
        self._16s_corpus_df = pd.read_csv(data_path, sep='\t')
        _logger.info(f"16S corpus is of shape {self._16s_corpus_df.shape}")
        self.samples = self._16s_corpus_df.seq.values.tolist()
        self.max_word_length = max_word_length

    def __len__(self):
        return len(self._16s_corpus_df)

    def __getitem__(self, i):
        sample = self._split_sequence_by_max_word_length(self.samples[i])
        tokens = self.tokenizer.encode(sample)
        return torch.tensor(tokens.ids, dtype=torch.long)

    def _split_sequence_by_max_word_length(self, seq):
        """ split a 16S sequence (~1K long usually) into white-space separated chunks
        that the tokenizer can encode.

        :param seq: str, 16S sequence
        :return: str
        """
        chunks = [seq[i: i + self.max_word_length] for i in range(0, len(seq), self.max_word_length)]
        return ' '.join(chunks)
class BertTokenizer(Transformer):

    @timer
    def setup(self, stopwords=None, punct=None, lower=True, strip=True):
        self.tokenizer = BertWordPieceTokenizer(VOCAB_FILE, lowercase=lower)
        self.punct = punct or set(string.punctuation)
        self.stopwords = stopwords or set(sw.words("english"))

    def process_single(self, document):
        tokenized_text = self.tokenizer.encode(document)
        for token in tokenized_text.tokens:
            # If stopword, ignore token and continue
            if token in self.stopwords or token == "[CLS]" or token == "[SEP]":
                continue
            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue
            yield token
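# NOTE (added usage sketch, not from the original source): VOCAB_FILE, the Transformer
# base class and the @timer decorator are assumed to be defined in the surrounding
# module; this just filters stopwords, punctuation and special tokens from one document.
bt = BertTokenizer()
bt.setup()
print(list(bt.process_single("The quick brown fox jumps over the lazy dog.")))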
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer
        Using the Hugging Face tokenization library for the same
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()
        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences
        """
        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # Hugging Face only accepts a temp file with sentences for training the tokenizer
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                [f.write(i + "\n") for i in sentences]
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))
            # Removing the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
class SentimentModel:
    def __init__(self, model_dir):
        # load session and graph
        self.sess = tf.Session(graph=tf.Graph())
        tf.saved_model.loader.load(self.sess, ['serve'], export_dir=model_dir)
        self.tokenizer = BertWordPieceTokenizer(os.path.join(model_dir, 'vocab.txt'))
        self.tokenizer.enable_truncation(max_length=MAX_LEN)

    def predict(self, text):
        tokenized = self.tokenizer.encode(text)
        token_ids = tokenized.ids
        segment_ids = tokenized.type_ids
        token_ids, segment_ids = np.array([token_ids]), np.array([segment_ids])
        # placeholder
        input_token = self.sess.graph.get_tensor_by_name('Input-Token:0')
        input_segment = self.sess.graph.get_tensor_by_name('Input-Segment:0')
        output = self.sess.graph.get_tensor_by_name('label/Softmax:0')
        probas = self.sess.run([output], feed_dict={input_token: token_ids,
                                                    input_segment: segment_ids})
        return tuple(probas[0][0].tolist())
def sentences_to_paragraphs(sentences: List[str],
                            max_paragraph_len: int,
                            tokenizer: BertWordPieceTokenizer) -> List[Paragraph]:
    parag_start, parag_end, paragraphs = 0, 0, []
    hold_parag, hold_parag_len = '', 0
    for sentence in sentences:
        encoding = tokenizer.encode(sentence)
        # [1:-1] to exclude [CLS] and [SEP]
        clipped_offsets = encoding.offsets[1:-1][:max_paragraph_len]
        (sent_start, _), (_, sent_end) = clipped_offsets[0], clipped_offsets[-1]
        clipped_sentence = sentence[sent_start:sent_end]
        n_tokens = len(clipped_offsets)
        if (hold_parag_len + n_tokens) > max_paragraph_len:
            paragraphs.append(Paragraph(parag_start, parag_end, hold_parag))
            hold_parag, hold_parag_len = '', 0
            parag_start = parag_end
        hold_parag, hold_parag_len = hold_parag + clipped_sentence, hold_parag_len + n_tokens
        parag_end += len(sentence)
    if hold_parag:
        paragraphs.append(Paragraph(parag_start, parag_end, hold_parag))
    return paragraphs
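# NOTE (added usage sketch, not from the original source): the vocab file name is an
# assumption and Paragraph is taken from the surrounding module; this simply groups a
# few short sentences under a budget of 16 WordPiece tokens per paragraph.
tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
sents = ["First sentence here. ", "A second, somewhat longer sentence follows. ", "And a third one. "]
for p in sentences_to_paragraphs(sents, max_paragraph_len=16, tokenizer=tokenizer):
    print(p)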
class BertChnSentCorpDataset(Dataset):
    def __init__(self, directory, prefix, vocab_file, max_length: int = 512):
        super().__init__()
        self.max_length = max_length
        with open(os.path.join(directory, prefix + '.tsv'), 'r') as f:
            lines = f.readlines()
        self.lines = lines[1:]
        self.tokenizer = BertWordPieceTokenizer(vocab_file)

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        line = self.lines[idx]
        label, sentence = line.split('\t', 1)
        input_ids = self.tokenizer.encode(sentence, add_special_tokens=False).ids
        if len(input_ids) > self.max_length - 2:
            input_ids = input_ids[:self.max_length - 2]
        # convert list to tensor
        input_ids = torch.LongTensor([101] + input_ids + [102])
        label = torch.LongTensor([int(label)])
        return input_ids, label