def test_data(self):
    path_dir_name = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(path_dir_name, "sample.txt")
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor = TextProcessor()
        processor.train_tokenizer([data_path], vocab_size=1000, to_save_dir=tmpdirname,
                                  languages={"<mzn>": 0, "<glk>": 1})
        create_batches.write(text_processor=processor, cache_dir=tmpdirname, seq_len=512,
                             txt_file=data_path, sen_block_size=10)
        dataset = TextDataset(save_cache_dir=tmpdirname, max_cache_size=3)
        assert dataset.line_num == 70

        # Accessing items should never keep more than max_cache_size blocks in memory.
        dataset.__getitem__(3)
        assert len(dataset.current_cache) == 3
        dataset.__getitem__(9)
        assert len(dataset.current_cache) == 3
        dataset.__getitem__(69)
        assert len(dataset.current_cache) == 2
def test_albert_init(self):
    path_dir_name = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(path_dir_name, "sample.txt")
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor = TextProcessor()
        processor.train_tokenizer([data_path], vocab_size=1000, to_save_dir=tmpdirname,
                                  languages={"<en>": 0})
        lm = LM(text_processor=processor)
        assert lm.encoder.base_model.embeddings.word_embeddings.num_embeddings == 1000

        lm.save(tmpdirname)
        new_lm = LM.load(tmpdirname)
        assert new_lm.config == lm.config
def get_tokenizer(tokenizer_path: Optional[str] = None, train_path: Optional[str] = None,
                  model_path: Optional[str] = None, vocab_size: Optional[int] = None) -> TextProcessor:
    if tokenizer_path is None:
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        print("Training Tokenizer...")
        text_processor = TextProcessor()
        print("Writing raw text...")
        languages = set()
        with open(train_path + ".tmp", "w") as wf:
            with open(train_path, "r") as rf:
                for i, line in enumerate(rf):
                    spl = [sen.strip() for sen in line.split("</s>") if len(sen.strip()) > 0]
                    if len(spl) == 0:
                        continue
                    if spl[0].startswith("<"):
                        # The first token of the first sentence is a language tag such as <en>.
                        sen_split = spl[0].strip().split(" ")
                        spl[0] = " ".join(sen_split[1:])
                        languages.add(sen_split[0])
                    wf.write("\n".join(spl))
                    wf.write("\n")
                    if (i + 1) % 1000 == 0:
                        print(i + 1, "\r", end="")
        print("Writing raw text done!")
        text_processor.train_tokenizer(
            paths=[train_path + ".tmp"], vocab_size=vocab_size, to_save_dir=model_path,
            languages={l: i for i, l in enumerate(sorted(languages))})
        print("Removing temporary file!")
        os.system("rm " + train_path + ".tmp &")
        print("done!")
    else:
        text_processor = TextProcessor(tokenizer_path)
    return text_processor
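# Minimal usage sketch for get_tokenizer (illustrative only; the paths below are hypothetical
# and assume get_tokenizer is importable from this module):
#
#   # Training mode: build a tokenizer from a raw corpus and save it under model_path.
#   tok = get_tokenizer(train_path="data/train.txt", model_path="models/tok", vocab_size=1000)
#
#   # Loading mode: reuse a previously saved tokenizer directory.
#   tok = get_tokenizer(tokenizer_path="models/tok")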
def test_train_tokenizer(self):
    path_dir_name = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(path_dir_name, "sample.txt")
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor = TextProcessor()
        processor.train_tokenizer([data_path], vocab_size=1000, to_save_dir=tmpdirname,
                                  languages={"<en>": 0})
        assert processor.tokenizer.get_vocab_size() == 1000

        sen1 = "Obama signed many landmark bills into law during his first two years in office."
        assert processor._tokenize(sen1) is not None
        many_sens = "\n".join([sen1] * 10)
        assert len(processor.tokenize(many_sens)) == 10

        # A processor reloaded from the saved directory should behave the same.
        new_processor = TextProcessor(tok_model_path=tmpdirname)
        assert new_processor.tokenizer.get_vocab_size() == 1000
        sen2 = "Obama signed many landmark bills into law during his first two years in office."
        assert new_processor._tokenize(sen2) is not None
def test_albert_seq2seq_init(self):
    path_dir_name = os.path.dirname(os.path.realpath(__file__))
    data_path = os.path.join(path_dir_name, "sample.txt")
    with tempfile.TemporaryDirectory() as tmpdirname:
        processor = TextProcessor()
        processor.train_tokenizer([data_path], vocab_size=1000, to_save_dir=tmpdirname,
                                  languages={"<en>": 0, "<fa>": 1})
        seq2seq = Seq2Seq(text_processor=processor)

        src_inputs = torch.tensor([[1, 2, 3, 4, 5, processor.pad_token_id(), processor.pad_token_id()],
                                   [1, 2, 3, 4, 5, 6, processor.pad_token_id()]])
        tgt_inputs = torch.tensor([[6, 8, 7, processor.pad_token_id(), processor.pad_token_id()],
                                   [6, 8, 7, 8, processor.pad_token_id()]])
        src_mask = (src_inputs != processor.pad_token_id())
        tgt_mask = (tgt_inputs != processor.pad_token_id())
        src_langs = torch.tensor([[0], [0]]).squeeze()
        tgt_langs = torch.tensor([[1], [1]]).squeeze()

        seq_output = seq2seq(src_inputs, tgt_inputs, src_mask, tgt_mask, src_langs, tgt_langs,
                             log_softmax=True)
        assert list(seq_output.size()) == [5, processor.vocab_size()]

        seq_output = seq2seq(src_inputs, tgt_inputs, src_mask, tgt_mask, src_langs, tgt_langs)
        assert list(seq_output.size()) == [5, processor.vocab_size()]