def main(args):
    logger = TensorBoardLogger(save_dir="./experiment_logs")

    # Load the dataset and tokenize it.
    train_dataset = (load_dataset("sentiment140", split="train")
                     .shuffle().select(range(45000)))
    test_dataset = load_dataset("sentiment140", split="test").shuffle()
    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
    train_dataset = train_dataset.map(lambda e: preprocess(e, tokenizer), num_proc=4)
    test_dataset = test_dataset.map(lambda e: preprocess(e, tokenizer), num_proc=4)
    train_dataset.set_format("torch", columns=["text", "sentiment"])
    test_dataset.set_format("torch", columns=["text", "sentiment"])
    train_dataset, val_dataset = train_dataset.train_test_split(
        train_size=args.train_frac).values()

    # Wrap the splits in dataloaders.
    train_dataloader = DataLoader(train_dataset, batch_size=args.batch_size,
                                  num_workers=20, collate_fn=collate_fn)
    val_dataloader = DataLoader(val_dataset, batch_size=args.batch_size,
                                num_workers=20, collate_fn=collate_fn)
    test_dataloader = DataLoader(test_dataset, batch_size=args.batch_size,
                                 num_workers=20, collate_fn=collate_fn)

    trainer = pl.Trainer.from_argparse_args(
        args,
        logger=logger,
        # log_every_n_steps=10,
    )

    if args.action.lower() == "train":
        print("VOCAB SIZE")
        print(tokenizer.get_vocab_size())
        model = SentimentLSTM(
            embedding_dim=args.embedding_dim,
            hidden_dim=args.hidden_dim,
            output_dim=NUM_CLASSES,
            vocab_size=tokenizer.get_vocab_size(),
        )
        trainer.fit(model, train_dataloader, val_dataloader)
    elif args.action.lower() == "eval":
        model = SentimentLSTM.load_from_checkpoint(args.model_path)
        model.eval()
        trainer.test(model, test_dataloaders=test_dataloader)
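# A hedged sketch (not part of the original snippet) of the argparse wiring that
# main() above appears to expect; the flag names and defaults are assumptions, and
# `pl` (pytorch_lightning) is presumed to be imported at module level.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--action", default="train", choices=["train", "eval"])
    parser.add_argument("--batch_size", type=int, default=64)
    parser.add_argument("--train_frac", type=float, default=0.9)
    parser.add_argument("--embedding_dim", type=int, default=128)
    parser.add_argument("--hidden_dim", type=int, default=256)
    parser.add_argument("--model_path", type=str, default=None)
    parser = pl.Trainer.add_argparse_args(parser)  # Trainer flags, e.g. --max_epochs
    main(parser.parse_args())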
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer,
        using the Hugging Face tokenizers library.
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()
        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train the tokenizer on a list of sentences.
        """
        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # Hugging Face only accepts a temp file with sentences for training the tokenizer.
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                for sentence in sentences:
                    f.write(sentence + "\n")
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(
                self.tokenizer.get_vocab_size()))
            # Remove the temp file.
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
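# A minimal usage sketch for the Tokenizer wrapper above (an illustration, not part
# of the original snippet); it assumes MAX_LENGTH is defined at module level.
tok = Tokenizer("en")  # caches its vocab under data/en/vocab.txt once trained
tok.train_tokenizer(["the quick brown fox", "jumps over the lazy dog"])
enc = tok.encode("the quick fox")
print(enc.tokens)             # word-piece tokens, padded/truncated to MAX_LENGTH
print(tok.decode([enc.ids]))  # decode() wraps decode_batch(), so pass a list of id lists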
def load_model():
    tokenizer = BertWordPieceTokenizer(
        'bert-word-piece-custom-wikitext-vocab-10k-vocab.txt',
        lowercase=True,
        strip_accents=True)

    vocab_size = tokenizer.get_vocab_size()
    pad_id = 0
    CLS_label_id = 2
    num_class_heads = 2
    lst_num_cat_in_classes = [6, 47]
    seq_len = 100
    batch_size = 256
    num_workers = 3

    model = TwoClassHeadClassificationTransformer(
        vocab_size=vocab_size,
        pad_id=pad_id,
        CLS_label_id=CLS_label_id,
        num_class_heads=num_class_heads,
        lst_num_cat_in_classes=lst_num_cat_in_classes,
        num_pos=seq_len)
    # The freshly constructed model is immediately replaced by the fully
    # serialized checkpoint loaded below.
    model = torch.load('classification_model_best.pt', map_location='cpu')
    model = model.to('cpu')
    model = model.eval()
    return model
tokenizer = BertWordPieceTokenizer(
    r'C:\Users\David\Documents\Machine_learning\NLP\CardioExplorer\vocab.txt',
    lowercase=True)

pretrain = True
sentence_block_length = 32
max_sentence_blocks = 48
hidden_size = 256
batch_size = 4
shuffle = True
drop_last = True

sentence_block_vector = torch.normal(mean=0.0, std=1.0, size=[hidden_size])

# Configuration of the sentence-level encoder.
sentence_config = BertConfig()
sentence_config.vocab_size = tokenizer.get_vocab_size()
sentence_config.num_hidden_layers = 6
sentence_config.hidden_size = 256
sentence_config.num_attention_heads = 4
sentence_config.max_position_embeddings = sentence_block_length

# Configuration of the document-level encoder.
document_config = BertConfig()
document_config.vocab_size = tokenizer.get_vocab_size()
document_config.num_hidden_layers = 3
document_config.hidden_size = 256
document_config.num_attention_heads = 4
document_config.max_position_embeddings = max_sentence_blocks

dataset = Dataset(file_path, tokenizer, sentence_block_length,
import os
import csv

from tokenizers import BertWordPieceTokenizer

# Files with commands.
data_path = "/home/tkornuta/data/local-leonardo-sierra5k"
processed_path = os.path.join(data_path, "processed")
command_templates = os.path.join(processed_path, "command_templates.csv")
command = os.path.join(processed_path, "command.csv")

# Initialize a new tokenizer
tokenizer = BertWordPieceTokenizer()

# Then train it!
tokenizer.train([command_templates, command], vocab_size=100)

print("Vocabulary size: ", tokenizer.get_vocab_size())
for k, v in tokenizer.get_vocab().items():
    print(k, ": ", v)

# Samples from 5k - human labels.
# data_00050000_00052798.gif,"Disjoint the given stacks to form a new stack with blue, red blocks.","Make a new stack with blue, red blocks."
# data_00150000_00150539.gif,Place all the blocks individually on the surface.,Disjoint the given stack of blocks.
# data_00110000_00110725.gif,"Separate the given stack to form yellow, red blocks stack.",Remove 2nd and 4th blocks from the given stack.
# data_00120000_00120478.gif,Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block

# Now, let's use it:
#input = "I can feel the magic, can you?"
#input = "Disjoint the given stacks to form a new stack with blue, red blocks."
#input = "Make a new stack with blue, red blocks."
input = "Remove 1st and 2nd block from the given stack and form stack with blue on top of yellow block.,Do not touch green and red block and form another stack with blue and yellow block"
print(input)
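# The original snippet breaks off after printing the input; a plausible continuation
# (an assumption, not from the source) is to encode it with the trained tokenizer:
encoding = tokenizer.encode(input)
print(encoding.tokens)
print(encoding.ids)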
torch.cuda.manual_seed(SEED)


def load_pickle(filepath):
    with open(filepath, 'rb') as fp:
        return pickle.load(fp)


tokenizer = BertWordPieceTokenizer(
    '../data/bert-word-piece-custom-wikitext-vocab-10k-vocab.txt',
    lowercase=True,
    strip_accents=True)

data = load_pickle('../data/tokenized_questions_classes_subclasses_dict.pkl')

vocab_size = tokenizer.get_vocab_size()
pad_id = 0
CLS_label_id = 2
num_class_heads = 2
lst_num_cat_in_classes = [6, 47]
seq_len = 100
batch_size = 256
num_workers = 3

model = TwoClassHeadClassificationTransformer(
    vocab_size=vocab_size,
    pad_id=pad_id,
    CLS_label_id=CLS_label_id,
    num_class_heads=num_class_heads,
    lst_num_cat_in_classes=lst_num_cat_in_classes,
    num_pos=seq_len)
class Reader(object):
    def __init__(self,
                 bert_model: str,
                 tokenizer: BaseTokenizer = None,
                 cls: str = "[CLS]",
                 sep: str = "[SEP]",
                 threshold=6):
        self.tokenizer: BaseTokenizer = tokenizer
        self.cls = cls
        self.sep = sep
        if self.tokenizer is None:
            vocab_path: str = "tokenization/" + bert_model + ".txt"
            self.tokenizer = BertWordPieceTokenizer(
                vocab_path, lowercase="-cased" not in bert_model)
        self.threshold = threshold
        self.subword_alphabet: Optional[Alphabet] = None
        self.label_alphabet: Optional[Alphabet] = None

        self.train: Optional[List[SentInst]] = None
        self.dev: Optional[List[SentInst]] = None
        self.test: Optional[List[SentInst]] = None

    def _read_file(self, filename: str, mode: str = 'train') -> List[SentInst]:
        sent_list = []
        max_len = 0
        num_thresh = 0
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line == "":  # last few blank lines
                    break

                raw_tokens = line.split(' ')
                tokens = raw_tokens
                chars = [list(t) for t in raw_tokens]

                entities = next(f).strip()
                if entities == "":  # no entities
                    sent_inst = SentInst(tokens, chars, [])
                else:
                    entity_list = []
                    entities = entities.split("|")
                    for item in entities:
                        pointers, label = item.split()
                        pointers = pointers.split(",")
                        if int(pointers[1]) > len(tokens):
                            pdb.set_trace()
                        span_len = int(pointers[1]) - int(pointers[0])
                        if span_len < 0:
                            print("Warning! span_len < 0")
                            continue
                        if span_len > max_len:
                            max_len = span_len
                        if span_len > self.threshold:
                            num_thresh += 1

                        new_entity = (int(pointers[0]), int(pointers[1]), label)
                        # may be duplicate entities in some datasets
                        if (mode == 'train' and new_entity not in entity_list) or (mode != 'train'):
                            entity_list.append(new_entity)

                    # assert len(entity_list) == len(set(entity_list))  # check duplicate
                    sent_inst = SentInst(tokens, chars, entity_list)
                assert next(f).strip() == ""  # separating line

                sent_list.append(sent_inst)
        print("Max length: {}".format(max_len))
        print("Threshold {}: {}".format(self.threshold, num_thresh))
        return sent_list

    def _gen_dic(self) -> None:
        label_set = set()

        for sent_list in [self.train, self.dev, self.test]:
            num_mention = 0
            for sentInst in sent_list:
                for entity in sentInst.entities:
                    label_set.add(entity[2])
                num_mention += len(sentInst.entities)
            print("# mentions: {}".format(num_mention))

        vocab = [
            self.tokenizer.id_to_token(idx)
            for idx in range(self.tokenizer.get_vocab_size())
        ]
        self.subword_alphabet = Alphabet(vocab, 0)
        self.label_alphabet = Alphabet(label_set, 0)

    @staticmethod
    def _pad_batches(input_ids_batches: List[List[List[int]]],
                     first_subtokens_batches: List[List[List[int]]]) \
            -> Tuple[List[List[List[int]]],
                     List[List[List[int]]],
                     List[List[List[bool]]]]:
        padded_input_ids_batches = []
        input_mask_batches = []
        mask_batches = []

        all_batches = list(zip(input_ids_batches, first_subtokens_batches))
        for input_ids_batch, first_subtokens_batch in all_batches:

            batch_len = len(input_ids_batch)
            max_subtokens_num = max(
                [len(input_ids) for input_ids in input_ids_batch])
            max_sent_len = max([
                len(first_subtokens)
                for first_subtokens in first_subtokens_batch
            ])

            padded_input_ids_batch = []
            input_mask_batch = []
            mask_batch = []

            for i in range(batch_len):
                subtokens_num = len(input_ids_batch[i])
                sent_len = len(first_subtokens_batch[i])

                padded_subtoken_vec = input_ids_batch[i].copy()
                padded_subtoken_vec.extend([0] * (max_subtokens_num - subtokens_num))
                input_mask = [1] * subtokens_num + [0] * (max_subtokens_num - subtokens_num)
                mask = [True] * sent_len + [False] * (max_sent_len - sent_len)

                padded_input_ids_batch.append(padded_subtoken_vec)
                input_mask_batch.append(input_mask)
                mask_batch.append(mask)

            padded_input_ids_batches.append(padded_input_ids_batch)
            input_mask_batches.append(input_mask_batch)
            mask_batches.append(mask_batch)

        return padded_input_ids_batches, input_mask_batches, mask_batches

    def get_batches(self, sentences: List[SentInst], batch_size: int) -> Tuple:
        subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        first_subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        last_subtoken_dic_dic = defaultdict(lambda: defaultdict(list))
        label_dic_dic = defaultdict(lambda: defaultdict(list))

        this_input_ids_batches = []
        this_first_subtokens_batches = []
        this_last_subtokens_batches = []
        this_label_batches = []

        for sentInst in sentences:
            subtoken_vec = []
            first_subtoken_vec = []
            last_subtoken_vec = []
            subtoken_vec.append(self.tokenizer.token_to_id(self.cls))
            for t in sentInst.tokens:
                encoding = self.tokenizer.encode(t)
                # keep only non-special word pieces for this token
                ids = [
                    v for v, mask in zip(encoding.ids, encoding.special_tokens_mask)
                    if mask == 0
                ]
                first_subtoken_vec.append(len(subtoken_vec))
                subtoken_vec.extend(ids)
                last_subtoken_vec.append(len(subtoken_vec))
            subtoken_vec.append(self.tokenizer.token_to_id(self.sep))

            label_list = [(u[0], u[1], self.label_alphabet.get_index(u[2]))
                          for u in sentInst.entities]

            # bucket sentences by (token count, subtoken count)
            subtoken_dic_dic[len(sentInst.tokens)][len(subtoken_vec)].append(subtoken_vec)
            first_subtoken_dic_dic[len(sentInst.tokens)][len(subtoken_vec)].append(first_subtoken_vec)
            last_subtoken_dic_dic[len(sentInst.tokens)][len(subtoken_vec)].append(last_subtoken_vec)
            label_dic_dic[len(sentInst.tokens)][len(subtoken_vec)].append(label_list)

        input_ids_batches = []
        first_subtokens_batches = []
        last_subtokens_batches = []
        label_batches = []
        for length1 in sorted(subtoken_dic_dic.keys(), reverse=True):
            for length2 in sorted(subtoken_dic_dic[length1].keys(), reverse=True):
                input_ids_batches.extend(subtoken_dic_dic[length1][length2])
                first_subtokens_batches.extend(first_subtoken_dic_dic[length1][length2])
                last_subtokens_batches.extend(last_subtoken_dic_dic[length1][length2])
                label_batches.extend(label_dic_dic[length1][length2])

        # chunk the sorted lists into batches of size batch_size
        for i in range(0, len(input_ids_batches), batch_size):
            this_input_ids_batches.append(input_ids_batches[i:i + batch_size])
        for i in range(0, len(first_subtokens_batches), batch_size):
            this_first_subtokens_batches.append(first_subtokens_batches[i:i + batch_size])
        for i in range(0, len(last_subtokens_batches), batch_size):
            this_last_subtokens_batches.append(last_subtokens_batches[i:i + batch_size])
        for i in range(0, len(label_batches), batch_size):
            this_label_batches.append(label_batches[i:i + batch_size])

        this_input_ids_batches, this_input_mask_batches, this_mask_batches \
            = self._pad_batches(this_input_ids_batches, this_first_subtokens_batches)

        return (this_input_ids_batches, this_input_mask_batches,
                this_first_subtokens_batches, this_last_subtokens_batches,
                this_label_batches, this_mask_batches)

    def to_batch(self, batch_size: int) -> Tuple:
        ret_list = []

        for sent_list in [self.train, self.dev, self.test]:
            ret_list.append(self.get_batches(sent_list, batch_size))

        return tuple(ret_list)

    def read_all_data(self, file_path: str, train_file: str, dev_file: str,
                      test_file: str) -> None:
        self.train = self._read_file(file_path + train_file)
        self.dev = self._read_file(file_path + dev_file, mode='dev')
        self.test = self._read_file(file_path + test_file, mode='test')
        self._gen_dic()

    def debug_single_sample(self, subtoken: List[int],
                            label_list: List[Tuple[int, int, int]]) -> None:
        print(" ".join(
            [self.subword_alphabet.get_instance(t) for t in subtoken]))
        for label in label_list:
            print(label[0], label[1],
                  self.label_alphabet.get_instance(label[2]))
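# A hypothetical usage sketch for the Reader above (not from the original code):
# the directory and file names are placeholders, and each input file is expected to
# follow _read_file()'s layout: a token line, an entity line, then a blank separator.
reader = Reader(bert_model="bert-base-uncased")
reader.read_all_data("data/nested_ner/", "train.txt", "dev.txt", "test.txt")
train_batches, dev_batches, test_batches = reader.to_batch(batch_size=32)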
import torch
from tokenizers import BertWordPieceTokenizer

from amadeus_model import Amadeus

tokenizer = BertWordPieceTokenizer('data/bert-base-uncased-vocab.txt',
                                   lowercase=True)

model = Amadeus(num_tokens=tokenizer.get_vocab_size(),
                enc_seq_len=4096,
                dec_seq_len=1024)
model.load_state_dict(
    torch.load('models/amadeus-performer-2020-11-03-16.54.13.pt'))
model.eval(fix_proj_matrices=True)

in_seq = torch.randint(0, tokenizer.get_vocab_size(), (1, model.in_seq_len))
out_seq = torch.randint(0, tokenizer.get_vocab_size(), (1, model.out_seq_len))

traced_script_model = torch.jit.trace(model, (in_seq, out_seq),
                                      check_trace=False)
traced_script_model.save('traced.pt')
def numerize(vocab_path, input_path, bin_path):
    tokenizer = BertWordPieceTokenizer(vocab_path,
                                       unk_token=UNK_TOKEN,
                                       sep_token=SEP_TOKEN,
                                       cls_token=CLS_TOKEN,
                                       pad_token=PAD_TOKEN,
                                       mask_token=MASK_TOKEN,
                                       lowercase=False,
                                       strip_accents=False)
    sentences = []
    with open(input_path, 'r') as f:
        batch_stream = []
        for i, line in enumerate(f):
            batch_stream.append(line)
            if i % 1000 == 0:
                res = tokenizer.encode_batch(batch_stream)
                batch_stream = []
                # flatten the list
                for s in res:
                    sentences.extend(s.ids[1:])
            if i % 100000 == 0:
                print(f'processed {i} lines')

    print('convert the data to numpy')
    # convert data to numpy format in uint16 if the vocab fits, otherwise int32
    if tokenizer.get_vocab_size() < 1 << 16:
        sentences = np.uint16(sentences)
    else:
        assert tokenizer.get_vocab_size() < 1 << 31
        sentences = np.int32(sentences)

    # save special tokens for later processing
    sep_index = tokenizer.token_to_id(SEP_TOKEN)
    cls_index = tokenizer.token_to_id(CLS_TOKEN)
    unk_index = tokenizer.token_to_id(UNK_TOKEN)
    mask_index = tokenizer.token_to_id(MASK_TOKEN)
    pad_index = tokenizer.token_to_id(PAD_TOKEN)

    # sanity check
    assert sep_index == SEP_INDEX
    assert cls_index == CLS_INDEX
    assert unk_index == UNK_INDEX
    assert pad_index == PAD_INDEX
    assert mask_index == MASK_INDEX

    print('collect statistics')
    # collect some statistics of the dataset
    n_unks = (sentences == unk_index).sum()
    n_toks = len(sentences)
    p_unks = n_unks * 100. / n_toks
    n_seqs = (sentences == sep_index).sum()
    print(
        f'| {n_seqs} sentences - {n_toks} tokens - {p_unks:.2f}% unknown words'
    )

    # bundle the token stream and special-token indices, then serialize
    data = {
        'sentences': sentences,
        'sep_index': sep_index,
        'cls_index': cls_index,
        'unk_index': unk_index,
        'pad_index': pad_index,
        'mask_index': mask_index
    }
    torch.save(data, bin_path, pickle_protocol=4)
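# A hedged example of invoking numerize() above (the paths are placeholders, not
# from the source); the *_TOKEN and *_INDEX constants must already be defined at
# module level so the sanity-check asserts hold for the given vocabulary file.
numerize(vocab_path="data/vocab.txt",
         input_path="data/corpus.txt",
         bin_path="data/corpus.bin")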
import torch
from tokenizers import BertWordPieceTokenizer, Encoding

from amadeus_model import Amadeus

tokenizer = BertWordPieceTokenizer('data/bert-base-uncased-vocab.txt',
                                   lowercase=True)

model = Amadeus(num_tokens=tokenizer.get_vocab_size(),
                enc_seq_len=1024,
                dec_seq_len=512)
checkpoint = torch.load(
    'checkpoints/amadeus-performer-2020-11-25-00.20.57-300.pt')
model.eval(True)
# model.load_state_dict(torch.load('models/amadeus-performer-2020-11-06-12.47.52.pt'))
model.load_state_dict(checkpoint['model_state_dict'])
model.cuda()

run = True
sentences = []
while run:
    try:
        sentence = input('> ')
        if sentence in ['quit', 'exit']:
            run = False
            continue

        sentences.append(tokenizer.encode(sentence))
        if len(sentences) > 3:
            sentences = sentences[-3:]
parser.add_argument('--txtfolder',
                    type=str,
                    help='the folder containing the txt files')
args = parser.parse_args()

paths = [str(x) for x in Path(str(args.txtfolder)).glob("**/*.txt")]

# Initialize a lm_model tokenizer
tokenizer = BertWordPieceTokenizer()
#trainer = BpeTrainer(vocab_size=VOCAB_SIZE, show_progress=True, initial_alphabet=ByteLevel.alphabet())
#tokenizer.train(trainer, paths)

# Customize training
'''
tokenizer._tokenizer.post_processor = BertProcessing(
    ("[CLS]", tokenizer.token_to_id("[CLS]")),
    ("[SEP]", tokenizer.token_to_id("[SEP]")),
)
'''
tokenizer.train(files=paths,
                vocab_size=VOCAB_SIZE,
                min_frequency=2,
                special_tokens=[
                    "[PAD]",
                    "[UNK]",
                    "[CLS]",
                    "[SEP]",
                    "[MASK]",
                ])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.save_model('./lm_model')
print('tokenizer saved; results are vocab.json and merges.txt')
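# A hedged follow-up (not in the original script): reloading the trained vocabulary.
# For a word-piece model, save_model() writes a vocab file into the target directory;
# adjust the filename below to whatever actually appears under ./lm_model.
reloaded = BertWordPieceTokenizer('./lm_model/vocab.txt')
print(reloaded.encode("a quick sanity check").tokens)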