def __init__(self, en_file, ru_file, en_tokenizer_file='en_tokenizer.model',
             ru_tokenizer_file='ru_tokenizer.model', en_vocab_size=5000,
             ru_vocab_size=5000):
    if not (os.path.exists(en_tokenizer_file) and os.path.exists(ru_tokenizer_file)):
        self.en_tokenizer, self.ru_tokenizer = init_tokenizers(
            en_file, ru_file, en_vocab_size, ru_vocab_size)
    else:
        self.en_tokenizer = yttm.BPE(model=en_tokenizer_file)
        self.ru_tokenizer = yttm.BPE(model=ru_tokenizer_file)
    self.en_corpus, self.ru_corpus = read_corupuses(en_file, ru_file)

def construct(self, booster: "xgboost.core.Booster", params: Mapping[str, int],
              bpe_model_path: str):
    self._booster = booster
    self._params = params
    self._bpe_model_path = bpe_model_path
    self._bpe_model = youtokentome.BPE(bpe_model_path)
    return self

def init_tokenizers(
        en_file, ru_file, en_vocab_size=5000, ru_vocab_size=5000
) -> Tuple[yttm.youtokentome.BPE, yttm.youtokentome.BPE]:
    if not (os.path.exists(en_file) and os.path.exists(ru_file)):
        raise FileNotFoundError('Couldn\'t find corpus files')
    yttm.BPE.train(data=en_file, vocab_size=en_vocab_size, model='en_tokenizer.model')
    yttm.BPE.train(data=ru_file, vocab_size=ru_vocab_size, model='ru_tokenizer.model')
    return yttm.BPE(model='en_tokenizer.model'), yttm.BPE(model='ru_tokenizer.model')

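A minimal usage sketch for init_tokenizers, assuming hypothetical parallel corpus files corpus.en and corpus.ru (one sentence per line); encode() takes a list of strings and returns a list of id lists.

import youtokentome as yttm

# 'corpus.en' and 'corpus.ru' are placeholder file names, one sentence per line.
en_tokenizer, ru_tokenizer = init_tokenizers('corpus.en', 'corpus.ru',
                                             en_vocab_size=5000, ru_vocab_size=5000)
en_ids = en_tokenizer.encode(["hello world"], output_type=yttm.OutputType.ID)
ru_ids = ru_tokenizer.encode(["привет, мир"], output_type=yttm.OutputType.ID)
print(en_ids, ru_ids)
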
def __init__(self, api_url):
    self.path = os.path.dirname(os.path.abspath(__file__))
    self.api_url = api_url
    self.bpe = yttm.BPE(model=os.path.join(self.path, 'yttm.model'))
    self.vocab_size = self.bpe.vocab_size()
    self.sequence_length = 20
    self.newline_token = 88

def __init__(self, vocab_file, errors='replace', bos_token="<s>", eos_token="</s>",
             sep_token="</s>", cls_token="<s>", unk_token="<unk>", pad_token='<pad>',
             mask_token='<mask>', **kwargs):
    super(RubertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
                                           unk_token=unk_token, **kwargs)
    self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
    self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
    self.vocab_file = vocab_file
    self.bpe = yttm.BPE(model=vocab_file)
    self.pat = re.compile(
        r""" ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
    self.encoder = {
        self.bpe.id_to_subword(i): i for i in range(self.bpe.vocab_size())
    }
    self.encoder['<|endoftext|>'] = self.bpe.vocab_size()
    self.decoder = {v: k for k, v in self.encoder.items()}

def __init__(self, bpe_path=None):
    bpe_path = Path(bpe_path)
    assert bpe_path.exists(), f'BPE json path {str(bpe_path)} does not exist'
    tokenizer = yttm.BPE(model=str(bpe_path))
    self.tokenizer = tokenizer
    self.vocab_size = tokenizer.vocab_size()

def test_encode_decode():
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)
    yttm.BPE.train(
        data=TRAIN_FILE,
        vocab_size=16000,
        model=BASE_MODEL_FILE,
        bos_id=BOS_ID,
        eos_id=EOS_ID,
    )
    bpe = yttm.BPE(BASE_MODEL_FILE)
    text_in = [
        " ".join("".join([random.choice("abcd ") for _ in range(50)]).split())
    ]
    ids = bpe.encode(text_in, yttm.OutputType.ID)
    # It is necessary to add a first empty line, since everything in BPE starts from a new line
    text_in[0] = "\n" + text_in[0]
    assert text_in == bpe.decode(ids)
    ids_bos_eos = bpe.encode(text_in, yttm.OutputType.ID, bos=True, eos=True)
    assert text_in == bpe.decode(ids_bos_eos, ignore_ids=[BOS_ID, EOS_ID])
    assert bpe.decode(ids, ignore_ids=[]) == bpe.decode(ids_bos_eos, ignore_ids=[BOS_ID, EOS_ID])

def _load_model(self):
    path_saved_model = os.path.join(self.path_cache, self.config_hash)
    if not os.path.isdir(path_saved_model):
        if not globals_vars.TRAINING:
            raise Exception(
                f"Embedding: While running in TEST mode: Model is not trained with this config yet \n({path_saved_model})"
            )
        else:
            print(
                f"Embedding: While running in TRAINING mode: Model is not trained with this config yet -> Now training the model with this config \n({path_saved_model})"
            )
            train_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)),
                "train.py",
            )
            training_return = os.system(f"python {train_path}")
            if training_return != 0:
                raise Exception(
                    f"EMBEDDING TRAINING HAS FAILED OUCH\npython {train_path} failed"
                )
    else:
        print(
            f"Embedding: loading model already trained with config: \n({path_saved_model})\n"
        )
    self.model = yttm.BPE(
        model=os.path.join(self.path_cache, self.config_hash, "model.bin")
    )
    # already contains '<PAD>', '<UNK>', '<EOS>', '<BOS>' (no +1 necessary)
    self.vocab_size = self.model.vocab_size()
    print(
        "\nvocab_size from lang {}: \n{}".format(self.lang, self.model.vocab_size())
    )
    print("\nvocab from lang {}: \n{}".format(self.lang, self.model.vocab()))

def save_vocab(self):
    bpe = yttm.BPE(model=self.bpe_model_path)
    vocab = bpe.vocab()
    with open(os.path.join(self.config.data_dir, 'vocab.txt'), mode='w') as file_object:
        file_object.write('\n'.join(vocab))

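Since vocab() returns subwords in id order (the vocabulary-consistency test near the end of this page checks exactly that), the line number in the dumped vocab.txt doubles as the token id; a small follow-up sketch:

# Rebuild a subword -> id mapping from the file written by save_vocab above;
# the path is assumed relative to the same data_dir.
with open('vocab.txt', encoding='utf-8') as f:
    subword_to_id = {line.rstrip('\n'): i for i, line in enumerate(f)}
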
def train_tokenizer(
    self,
    corpus_file=CORPUS_FILE,
    model_file=TOKENIZER_MODEL_FILE,
    vocab_sz=50000,
    dump_labels=True,
):
    assert self.built
    import youtokentome as yttm

    # first we need to dump labels
    if dump_labels:
        self.dump_labels(corpus_file)
    # train model
    print("Training yttm model...")
    yttm.BPE.train(data=corpus_file, vocab_size=vocab_sz, model=model_file)
    print("Done.")
    # load model (for testing)
    print("Testing yttm model...")
    bpe = yttm.BPE(model=model_file)
    # Two types of tokenization
    test_text = "Are you freakin' crazy?"
    encoded1 = bpe.encode([test_text], output_type=yttm.OutputType.ID)
    encoded2 = bpe.encode([test_text], output_type=yttm.OutputType.SUBWORD)
    decoded = bpe.decode(encoded1)
    print(encoded1)
    print(encoded2)
    print(decoded)

def create_tokenizer(tokenizer_path, datasets, vocab_size, tokens,
                     temp_file_path='tokenizer_text.temp'):
    # Load tokenizer
    if os.path.exists(tokenizer_path):
        print('Loading pretrained tokenizer...')
        tokenizer = yttm.BPE(model=tokenizer_path)
    else:
        print('Creating new tokenizer...')
        # Create the corresponding folder (if needed)
        os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)
        # Create temp file with data to train tokenizer.
        with open(temp_file_path, 'w', encoding='utf8') as out_file:
            for data in datasets:
                out_file.write('\n'.join(map(str.lower, data)))
        # Train tokenizer.
        tokenizer = yttm.BPE.train(data=temp_file_path, vocab_size=vocab_size,
                                   model=tokenizer_path, n_threads=-1, **tokens)
        # Delete temp file.
        os.remove(temp_file_path)
    return tokenizer

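A hedged call example for create_tokenizer; train_sentences is a placeholder corpus loaded elsewhere, the output path is hypothetical, and tokens is assumed to carry the special-token ids accepted by yttm.BPE.train (pad_id, unk_id, bos_id, eos_id):

import youtokentome as yttm

# train_sentences: an iterable of raw text strings, loaded elsewhere (placeholder)
tokenizer = create_tokenizer(
    tokenizer_path='models/bpe.model',   # hypothetical output location
    datasets=[train_sentences],
    vocab_size=8000,
    tokens={'pad_id': 0, 'unk_id': 1, 'bos_id': 2, 'eos_id': 3},
)
ids = tokenizer.encode(['a sample sentence'], output_type=yttm.OutputType.ID)
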
def __init__(self, model_path, bpe_dropout=0.0):
    model_path = Path(model_path).expanduser()
    self.tokenizer = yttm.BPE(model=str(model_path))
    self.vocab_size = len(self.tokenizer.vocab())
    self.special_tokens = self.tokens_to_ids(
        ["<PAD>", "<UNK>", "<BOS>", "<EOS>"])
    self.bpe_dropout = bpe_dropout

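A small sketch of how the stored bpe_dropout is typically applied at encode time; dropout_prob is a real argument of yttm.BPE.encode, while tok here stands for an instance of the class above (its tokens_to_ids helper is not shown in this excerpt):

import youtokentome as yttm

# BPE-dropout: each merge is skipped with probability tok.bpe_dropout during encoding.
ids = tok.tokenizer.encode(["a sample sentence"],
                           output_type=yttm.OutputType.ID,
                           dropout_prob=tok.bpe_dropout)
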
def test(self, mode, bpe_model_path=None):
    while True:
        file_path = input("File path: ").strip()
        if file_path == "q":
            break
        try:
            with open(file_path, "r", encoding="utf-8") as r:
                article = r.read().strip().split("\n")
                article = " ".join(article)
                if mode in ["lemm", "stem", "gram", "base"]:
                    article = article.lower()
                    article = word_tokenize(article)
                    article = " ".join(article)
                print(f"real_text : {article}")
                if mode == "lemm":
                    lemmatizer = mystem.Mystem()
                    article = preprocess_lemm(article, lemmatizer)
                elif mode == "stem":
                    stemmer = RussianStemmer(False)
                    article = preprocess_stemm(article, stemmer)
                elif mode == "gram":
                    token_model = youtokentome.BPE(model=bpe_model_path)
                    article = preprocess_gramm(article, token_model)
                self.test_calc(article)
        except Exception as e:
            print(e)
            print("File not found")

def __init__(self, path_to_bpe: str, path_to_model: str,
             model_params: Dict[str, int]):
    self.bpe_model = yttm.BPE(path_to_bpe)
    self.categories: List[str] = [
        "Алкоголь",
        "Бытовая техника",
        "Воды, соки, напитки",
        "Дача и гриль",
        "Другое",
        "Замороженные продукты",
        "Зоотовары",
        "Красота, гигиена, бытовая химия",
        "Макароны, крупы, специи",
        "Молоко, сыр, яйца",
        "Овощи, фрукты, ягоды",
        "Подборки и готовые блюда",
        "Постные продукты",
        "Посуда",
        "Птица, мясо, деликатесы",
        "Рыба, икра",
        "Соусы, орехи, консервы",
        "Товары для дома и дачи",
        "Товары для мам и детей",
        "Хлеб, сладости, снеки",
        "Чай, кофе, сахар",
    ]
    self.device = torch.device("cpu")
    self.model = CategoryClassifier(**model_params)
    self.model.load_state_dict(
        torch.load(path_to_model, map_location=self.device))
    self.model.eval()

def __init__(self, vocab_size: int = 10000, train_fname: str = 'train_texts.txt',
             bpe_path: str = ''):
    self.bpe_path = bpe_path if len(bpe_path) else 'yttm_bpe.bin'
    self.bpe_model = yttm.BPE(bpe_path) if len(bpe_path) else None
    self.train_fname = train_fname
    self.vocab_size = vocab_size

def compute_features(data, model_path="vocab.model", max_len=20):
    bpe = yttm.BPE(model=model_path)
    features_ids = bpe.encode(data.feature_string.values.tolist(),
                              output_type=yttm.OutputType.ID)
    features_ids = [
        f[:max_len] + [0] * (max_len - len(f)) for f in features_ids
    ]
    return np.array(features_ids)

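An illustrative call for compute_features, assuming vocab.model is an already trained YouTokenToMe model and that data is a pandas DataFrame with a feature_string column:

import pandas as pd

df = pd.DataFrame({"feature_string": ["red cotton shirt", "wireless mouse"]})
features = compute_features(df, model_path="vocab.model", max_len=20)
print(features.shape)  # (2, 20): each row is truncated or zero-padded to max_len ids
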
def main(args):
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=args.epoch,
        per_device_train_batch_size=args.batch_size,
        save_steps=args.save_steps,
        save_total_limit=10,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        dataloader_num_workers=args.n_workers,
    )
    print("Loading tokenizer...")
    tokenizer = yttm.BPE(args.tokenizer)
    print("Loading model...")
    model = get_model(vocab_size=tokenizer.vocab_size())
    print("List training files...")
    train_paths = get_training_files(args.train_dir)
    if args.check:
        train_paths = train_paths[:10000]
    print("Loading train texts...")
    train_data = []
    for p in tqdm(train_paths):
        (keys, notes) = read_abc(p)
        if keys is None:
            continue
        keys_tokens = tokenizer.encode(keys)
        bars = notes.split(" | ")
        notes_tokens = [tokenizer.encode(i + " | ") for i in bars]
        # To avoid OOM
        sequence_len = sum(len(i) for i in notes_tokens)
        if not (args.min_sequence_lenght < sequence_len < args.max_sequence_lenght):
            print("Skip", p)
            continue
        train_data.append((keys_tokens, notes_tokens))
    print("Making dataset...")
    train_dataset = ABCDataset(train_data)
    if args.checkpoint:
        state_dict = torch.load(args.checkpoint)
        model.load_state_dict(state_dict)
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=collate_function,
                      train_dataset=train_dataset)
    print("Start training...")
    trainer.train()

def test_encode_decode():
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)
    yttm.BPE.train(data=TRAIN_FILE, vocab_size=16000, model=BASE_MODEL_FILE)
    bpe = yttm.BPE(BASE_MODEL_FILE)
    text_in = [" ".join("".join([random.choice("abcd ") for _ in range(50)]).split())]
    ids = bpe.encode(text_in, yttm.OutputType.ID)
    assert text_in == bpe.decode(ids)

def get_bpe_tokenizer(train_texts, train_txt_path, bpe_model_name, vocab_size):
    _save_text(train_texts, train_txt_path)
    yttm.BPE.train(data=train_txt_path, vocab_size=vocab_size, model=bpe_model_name)
    tokenizer = yttm.BPE(bpe_model_name)
    return tokenizer

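A hedged usage sketch for get_bpe_tokenizer; train_texts is a placeholder corpus and _save_text (defined elsewhere) is assumed to write one text per line to train_txt_path:

import youtokentome as yttm

train_texts = load_texts()   # placeholder: any reasonably sized list of strings
tokenizer = get_bpe_tokenizer(train_texts, "train.txt", "bpe.model", vocab_size=8000)
print(tokenizer.encode([train_texts[0]], output_type=yttm.OutputType.SUBWORD))
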
def main():
    parser = argparse.ArgumentParser(description="Compute BLEU.")
    parser.add_argument('ckpt', type=str, help="Checkpoint to restore.")
    parser.add_argument('--dir', type=str, default="./wmt14",
                        help="Directory of dataset.")
    parser.add_argument('--split', default='test', type=str,
                        help="Specify which split of data to evaluate.")
    parser.add_argument(
        '--gpu_id', default=0, type=int,
        help="CUDA visible GPU ID. Currently only support single GPU.")
    parser.add_argument('--beams', default=1, type=int, help="Beam Search width.")
    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
    assert torch.cuda.is_available()
    import data
    import build_model

    # Restore checkpoint
    info = torch.load(args.ckpt)
    cfg = info['cfg']

    # Build model
    bpe_model = yttm.BPE(model=cfg['bpe'])
    model = build_model.Seq2Seq(bpe_model.vocab_size(),
                                bpe_model.vocab_size(),
                                hidden_size=cfg['model']['hidden_size'],
                                encoder_layers=cfg['model']['encoder_layers'],
                                decoder_layers=cfg['model']['decoder_layers'],
                                use_bn=cfg['model']['use_bn'])
    model.load_state_dict(info['weights'])
    model.eval()
    model = model.cuda()

    # Create dataset
    if args.beams == 1:
        batch_size = cfg['train']['batch_size']
    else:
        batch_size = 1
    loader = data.load(args.dir, split=args.split, batch_size=batch_size,
                       bpe_model=bpe_model)

    # Evaluate
    _, bleu = utils.eval_dataset(loader, model, bpe_model, args.beams)
    print("BLEU on %s set = %.4f" % (args.split, bleu))

def youtoken():
    bpe = yttm.BPE(model="statementPrediction.model")
    with open("../results/all_projects.json") as jsonf:
        logs = json.load(jsonf)
    with open("tokenizedExample.txt", "w") as output:
        for log in logs:
            msg = log["msg"]
            tokenizedMsg = bpe.encode(
                [msg], output_type=yttm.OutputType.SUBWORD)[0]
            output.write(f"{msg} ----> {str(tokenizedMsg)}\n")

def __init__(self, filename, *inputs, **kwargs):
    super().__init__(*inputs, **kwargs)
    if os.path.isdir(filename):
        filename = os.path.join(filename, self.def_name)
    self.bpe = yttm.BPE(filename)
    self.hash = hashlib.sha512(open(filename, 'rb').read()).hexdigest()[:10]
    self.filename = filename

def train(self, train_fname: str = '', vocab_size: int = 10000):
    if len(train_fname):
        self.train_fname = train_fname
    if vocab_size > 0:
        self.vocab_size = vocab_size
    yttm.BPE.train(data=self.train_fname, vocab_size=self.vocab_size,
                   model=self.bpe_path)
    self.bpe_model = yttm.BPE(self.bpe_path)

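Combining this train method with the constructor shown earlier (vocab_size / train_fname / bpe_path), a plausible end-to-end sketch; the wrapper class name is hypothetical and train_texts.txt is assumed to already exist:

tok = BpeWrapper(vocab_size=10000, train_fname='train_texts.txt')  # hypothetical class name
tok.train()                       # trains yttm_bpe.bin from train_texts.txt and loads it
print(tok.bpe_model.vocab_size())
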
def __init__(self, data_folder, source_suffix, target_suffix, split, tokens_in_batch):
    """
    :param data_folder: folder containing the source and target language data files
    :param source_suffix: the filename suffix for the source language files
    :param target_suffix: the filename suffix for the target language files
    :param split: train, or val, or test?
    :param tokens_in_batch: the number of target language tokens in each batch
    """
    self.tokens_in_batch = tokens_in_batch
    self.source_suffix = source_suffix
    self.target_suffix = target_suffix
    assert split.lower() in {
        "train", "val", "test"
    }, "'split' must be one of 'train', 'val', 'test'! (case-insensitive)"
    self.split = split.lower()

    # Is this for training?
    self.for_training = self.split == "train"

    # Load BPE model
    self.bpe_model = youtokentome.BPE(
        model=os.path.join(data_folder, "bpe.model"))

    # Load data
    with codecs.open(os.path.join(data_folder, ".".join([split, source_suffix])),
                     "r", encoding="utf-8") as f:
        source_data = f.read().split("\n")[:-1]
    with codecs.open(os.path.join(data_folder, ".".join([split, target_suffix])),
                     "r", encoding="utf-8") as f:
        target_data = f.read().split("\n")[:-1]
    assert len(source_data) == len(
        target_data), "There are a different number of source or target sequences!"
    source_lengths = [
        len(s) for s in self.bpe_model.encode(source_data, bos=False, eos=False)
    ]
    target_lengths = [
        len(t) for t in self.bpe_model.encode(target_data, bos=True, eos=True)
    ]  # target language sequences have <BOS> and <EOS> tokens
    self.data = list(
        zip(source_data, target_data, source_lengths, target_lengths))

    # If for training, pre-sort by target lengths - required for itertools.groupby() later
    if self.for_training:
        self.data.sort(key=lambda x: x[3])

    # Create batches
    self.create_batches()

def __init__(self, path):
    super().__init__()
    data = pickle.loads(Path(path).read_bytes())
    self.samples = data['samples']
    self.seq_length = data['seq_length']
    self.bpe = yttm.BPE(data['bpe_path'])
    self.vocab = self.bpe.vocab()
    self.vocab_size = len(self.vocab)
    self.pad_idx, self.unk_idx, self.bos_idx, self.eos_idx = list(range(4))

def bpe_tokenize(self):
    self.split_files(self.file_path)
    if self.force or not os.path.isfile(
            self.src_bpe_file) or not os.path.isfile(self.trg_bpe_file):
        self._remove_file(self.src_bpe_file)
        self._remove_file(self.trg_bpe_file)
        yttm.BPE.train(data=self.src_file, vocab_size=self.src_vocab_size,
                       model=self.src_bpe_file)
        yttm.BPE.train(data=self.trg_file, vocab_size=self.trg_vocab_size,
                       model=self.trg_bpe_file)
    # Loading models
    self.src_bpe: yttm.BPE = yttm.BPE(model=self.src_bpe_file)
    self.trg_bpe: yttm.BPE = yttm.BPE(model=self.trg_bpe_file)
    return

def make_yttm_tokenizer(train_conll: List[Instance], vocab_size=400):
    tokens = []
    for instance in train_conll:
        tokens += [token.text for token in instance['tokens']]
    text = ' '.join(tokens)
    with open('train_chunks.txt', 'w') as fobj:
        fobj.write(text)
    yttm.BPE.train(data='train_chunks.txt', vocab_size=vocab_size,
                   model='conll_model.yttm')
    return yttm.BPE('conll_model.yttm')

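A short follow-up sketch for make_yttm_tokenizer, assuming train_conll holds instances whose instance['tokens'] entries expose a .text attribute, as the function above relies on:

import youtokentome as yttm

bpe = make_yttm_tokenizer(train_conll, vocab_size=400)   # train_conll loaded elsewhere
print(bpe.encode(["European Union officials"], output_type=yttm.OutputType.SUBWORD))
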
def test_vocabulary_consistency():
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)
    yttm.BPE.train(data=TRAIN_FILE, vocab_size=16000, model=BASE_MODEL_FILE)
    bpe = yttm.BPE(BASE_MODEL_FILE)
    assert bpe.vocab_size() == len(bpe.vocab())
    assert bpe.vocab_size() == len(set(bpe.vocab()))
    vc = bpe.vocab()
    for i, subword in enumerate(vc):
        assert i == bpe.subword_to_id(subword)
        assert subword == bpe.id_to_subword(i)

def __init__(self, filename, *inputs, **kwargs):
    super().__init__(*inputs, **kwargs)
    # self.max_len_single_sentence = 1024  # no default special tokens - you can update this value if you add special tokens
    # self.max_len_sentences_pair = 1024  # no default special tokens - you can update this value if you add special tokens
    if os.path.isdir(filename):
        filename = os.path.join(filename, self.def_name)
    self.bpe = yttm.BPE(filename)
    self.hash = hashlib.sha512(open(filename, 'rb').read()).hexdigest()[:10]
    self.filename = filename

def __init__(self, bpe_path=None):
    bpe_path = Path(bpe_path)
    print(f"************ bpe_path : {bpe_path}")
    assert bpe_path.exists(), f'BPE json path {str(bpe_path)} does not exist'
    tokenizer = yttm.BPE(model=str(bpe_path))
    self.tokenizer = tokenizer
    self.vocab_size = tokenizer.vocab_size()
    print(f"************ self.tokenizer : {self.tokenizer}")
    print(f"************ self.vocab_size : {self.vocab_size}")