Example #1
 def __init__(self,
              en_file,
              ru_file,
              en_tokenizer_file='en_tokenizer.model',
              ru_tokenizer_file='ru_tokenizer.model',
              en_vocab_size=5000,
              ru_vocab_size=5000):
     if not (os.path.exists(en_tokenizer_file)
             and os.path.exists(ru_tokenizer_file)):
         self.en_tokenizer, self.ru_tokenizer = init_tokenizers(
             en_file, ru_file, en_vocab_size, ru_vocab_size)
     else:
         self.en_tokenizer = yttm.BPE(model=en_tokenizer_file)
         self.ru_tokenizer = yttm.BPE(model=ru_tokenizer_file)
     self.en_corpus, self.ru_corpus = read_corupuses(en_file, ru_file)
Example #2
 def construct(self, booster: "xgboost.core.Booster",
               params: Mapping[str, int], bpe_model_path: str):
     self._booster = booster
     self._params = params
     self._bpe_model_path = bpe_model_path
     self._bpe_model = youtokentome.BPE(bpe_model_path)
     return self
Example #3
def init_tokenizers(
        en_file,
        ru_file,
        en_vocab_size=5000,
        ru_vocab_size=5000
) -> Tuple[yttm.youtokentome.BPE, yttm.youtokentome.BPE]:
    if not (os.path.exists(en_file) and os.path.exists(ru_file)):
        raise FileNotFoundError('Couldn\'t find corpus files')
    yttm.BPE.train(data=en_file,
                   vocab_size=en_vocab_size,
                   model='en_tokenizer.model')
    yttm.BPE.train(data=ru_file,
                   vocab_size=ru_vocab_size,
                   model='ru_tokenizer.model')
    return yttm.BPE(model='en_tokenizer.model'), yttm.BPE(
        model='ru_tokenizer.model')
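A usage note (not part of the original example): a minimal sketch of how the returned pair of tokenizers might be used, assuming the corpus files exist; the file names and sample sentence are illustrative.

import youtokentome as yttm

en_tokenizer, ru_tokenizer = init_tokenizers('corpus.en', 'corpus.ru')
# Encode to subword ids and decode back; simple lowercase text should round-trip.
ids = en_tokenizer.encode(['how are you'], output_type=yttm.OutputType.ID)
print(en_tokenizer.decode(ids))  # expected: ['how are you']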
Example #4
 def __init__(self, api_url):
     self.path = os.path.dirname(os.path.abspath(__file__))
     self.api_url = api_url
     self.bpe = yttm.BPE(model=os.path.join(self.path, 'yttm.model'))
     self.vocab_size = self.bpe.vocab_size()
     self.sequence_length = 20
      self.newline_token = 88  # presumably the id of the newline subword in this particular model
Example #5
 def __init__(self,
              vocab_file,
              errors='replace',
              bos_token="<s>",
              eos_token="</s>",
              sep_token="</s>",
              cls_token="<s>",
              unk_token="<unk>",
              pad_token='<pad>',
              mask_token='<mask>',
              **kwargs):
     super(RubertaTokenizer, self).__init__(bos_token=bos_token,
                                            eos_token=eos_token,
                                            sep_token=sep_token,
                                            cls_token=cls_token,
                                            unk_token=unk_token,
                                            pad_token=pad_token,
                                            mask_token=mask_token,
                                            **kwargs)
     self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
     self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
     self.vocab_file = vocab_file
     self.bpe = yttm.BPE(model=vocab_file)
     # NOTE: the \p{...} classes require the third-party `regex` module
     # (imported as `re` in the original source); the stdlib re does not support them.
     self.pat = re.compile(
         r""" ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
     self.encoder = {
         self.bpe.id_to_subword(i): i
         for i in range(self.bpe.vocab_size())
     }
     self.encoder['<|endoftext|>'] = self.bpe.vocab_size()
     self.decoder = {v: k for k, v in self.encoder.items()}
Example #6
    def __init__(self, bpe_path=None):
        bpe_path = Path(bpe_path)
        assert bpe_path.exists(), f'BPE model path {str(bpe_path)} does not exist'

        tokenizer = yttm.BPE(model = str(bpe_path))
        self.tokenizer = tokenizer
        self.vocab_size = tokenizer.vocab_size()
Example #7
def test_encode_decode():
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)

    yttm.BPE.train(
        data=TRAIN_FILE,
        vocab_size=16000,
        model=BASE_MODEL_FILE,
        bos_id=BOS_ID,
        eos_id=EOS_ID,
    )

    bpe = yttm.BPE(BASE_MODEL_FILE)
    text_in = [
        " ".join("".join([random.choice("abcd ") for _ in range(50)]).split())
    ]
    ids = bpe.encode(text_in, yttm.OutputType.ID)
    # A leading newline must be prepended, since BPE treats everything as starting on a new line
    text_in[0] = "\n" + text_in[0]
    assert text_in == bpe.decode(ids)
    ids_bos_eos = bpe.encode(text_in, yttm.OutputType.ID, bos=True, eos=True)
    assert text_in == bpe.decode(ids_bos_eos, ignore_ids=[BOS_ID, EOS_ID])
    assert bpe.decode(ids,
                      ignore_ids=[]) == bpe.decode(ids_bos_eos,
                                                   ignore_ids=[BOS_ID, EOS_ID])
Example #8
    def _load_model(self):
        path_saved_model = os.path.join(self.path_cache, self.config_hash)
        if not os.path.isdir(path_saved_model):
            if not globals_vars.TRAINING:
                raise Exception(
                    f"Embedding: While running in TEST mode: Model is not trained with this config yet \n({path_saved_model})"
                )
            else:
                print(
                    f"Embedding: While running in TRAINING mode: Model is not trained with this config yet -> Now training the model with this config \n({path_saved_model})"
                )

                train_path = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)), "train.py",
                )
                training_return = os.system(f"python {train_path}")
                if training_return != 0:
                    raise Exception(
                        f"EMBEDDING TRAINING HAS FAILED OUCH\npython {train_path} failed"
                    )
        else:
            print(
                f"Embedding: loading model already trained with config: \n({path_saved_model})\n"
            )

        self.model = yttm.BPE(
            model=os.path.join(self.path_cache, self.config_hash, "model.bin")
        )

        # the vocab size already counts '<PAD>', '<UNK>', '<EOS>', '<BOS>' (no +1 necessary)
        self.vocab_size = self.model.vocab_size()
        print(
            "\nvocab_size from lang {}: \n{}".format(self.lang, self.model.vocab_size())
        )
        print("\nvocab from lang {}: \n{}".format(self.lang, self.model.vocab()))
Example #9
    def save_vocab(self):
        bpe = yttm.BPE(model=self.bpe_model_path)
        vocab = bpe.vocab()

        with open(os.path.join(self.config.data_dir, 'vocab.txt'),
                  mode='w') as file_object:
            file_object.write('\n'.join(vocab))
Example #10
    def train_tokenizer(
        self,
        corpus_file=CORPUS_FILE,
        model_file=TOKENIZER_MODEL_FILE,
        vocab_sz=50000,
        dump_labels=True,
    ):
        assert self.built
        import youtokentome as yttm

        # first we need to dump labels
        if dump_labels:
            self.dump_labels(corpus_file)

        # train model
        print("Training yttm model...")
        yttm.BPE.train(data=corpus_file, vocab_size=vocab_sz, model=model_file)
        print("Done.")

        # load model (for testing)
        print("Testing yttm model...")
        bpe = yttm.BPE(model=model_file)
        # Two types of tokenization
        test_text = "Are you freakin' crazy?"
        encoded1 = bpe.encode([test_text], output_type=yttm.OutputType.ID)
        encoded2 = bpe.encode([test_text], output_type=yttm.OutputType.SUBWORD)
        decoded = bpe.decode(encoded1)
        print(encoded1)
        print(encoded2)
        print(decoded)
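For reference (values are illustrative, not from a real run): encoded1 is a list of id lists, encoded2 a list of subword lists using yttm's '▁' space marker, and decoded a list of strings.

# encoded1 -> [[57, 912, 3407, 88, 25]]                            OutputType.ID
# encoded2 -> [['▁Are', '▁you', '▁freak', "in'", '▁crazy', '?']]   OutputType.SUBWORD
# decoded  -> ["Are you freakin' crazy?"]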
Example #11
def create_tokenizer(tokenizer_path,
                     datasets,
                     vocab_size,
                     tokens,
                     temp_file_path='tokenizer_text.temp'):
    # Load tokenizer
    if os.path.exists(tokenizer_path):
        print('Loading pretrained tokenizer...')
        tokenizer = yttm.BPE(model=tokenizer_path)
    else:
        print('Creating new tokenizer...')
        # Create the corresponding folder (if needed)
        os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)
        # Create temp file with data to train tokenizer.
        with open(temp_file_path, 'w', encoding='utf8') as out_file:
            for data in datasets:
                out_file.write('\n'.join(map(str.lower, data)))
                out_file.write('\n')  # keep consecutive datasets on separate lines
        # Train tokenizer (BPE.train writes the model file and returns the trained BPE).
        tokenizer = yttm.BPE.train(data=temp_file_path,
                                   vocab_size=vocab_size,
                                   model=tokenizer_path,
                                   n_threads=-1,
                                   **tokens)
        # Delete temp file.
        os.remove(temp_file_path)
    return tokenizer
Example #12
 def __init__(self, model_path, bpe_dropout=0.0):
     model_path = Path(model_path).expanduser()
     self.tokenizer = yttm.BPE(model=str(model_path))
     self.vocab_size = len(self.tokenizer.vocab())
     self.special_tokens = self.tokens_to_ids(
         ["<PAD>", "<UNK>", "<BOS>", "<EOS>"])
     self.bpe_dropout = bpe_dropout
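The tokens_to_ids helper is not shown in this excerpt; a minimal sketch of what it plausibly does, built on youtokentome's subword_to_id (the implementation below is an assumption):

 def tokens_to_ids(self, tokens):
     # hypothetical helper: map each special-token string to its id in the BPE vocab
     return [self.tokenizer.subword_to_id(t) for t in tokens]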
Example #13
    def test(self, mode, bpe_model_path=None):
        while True:
            file_path = input("File path: ").strip()
            # file_path = r"C:\Users\lezgy\OneDrive\Рабочий стол\Data_summ\data.txt"  # debug override, left commented out so "q" can exit
            if file_path == "q":
                break
            try:
                with open(file_path, "r", encoding="utf-8") as r:
                    article = r.read().strip().split("\n")
                    article = " ".join(article)
                    if mode in ["lemm", "stem", "gram", "base"]:
                        article = article.lower()
                        article = word_tokenize(article)
                        article = " ".join(article)
                    print(f"real_text : {article}")

                if mode == "lemm":
                    lemmatizer = mystem.Mystem()
                    article = preprocess_lemm(article, lemmatizer)
                elif mode == "stem":
                    stemmer = RussianStemmer(False)
                    article = preprocess_stemm(article, stemmer)
                elif mode == "gram":
                    token_model = youtokentome.BPE(model=bpe_model_path)
                    article = preprocess_gramm(article, token_model)
                self.test_calc(article)
            except Exception as e:
                print(e)
                print("File not found")
Example #14
 def __init__(self, path_to_bpe: str, path_to_model: str,
              model_params: Dict[str, int]):
     self.bpe_model = yttm.BPE(path_to_bpe)
     self.categories: List[str] = [
         "Алкоголь",
         "Бытовая техника",
         "Воды, соки, напитки",
         "Дача и гриль",
         "Другое",
         "Замороженные продукты",
         "Зоотовары",
         "Красота, гигиена, бытовая химия",
         "Макароны, крупы, специи",
         "Молоко, сыр, яйца",
         "Овощи, фрукты, ягоды",
         "Подборки и готовые блюда",
         "Постные продукты",
         "Посуда",
         "Птица, мясо, деликатесы",
         "Рыба, икра",
         "Соусы, орехи, консервы",
         "Товары для дома и дачи",
         "Товары для мам и детей",
         "Хлеб, сладости, снеки",
         "Чай, кофе, сахар",
     ]
     self.device = torch.device("cpu")
     self.model = CategoryClassifier(**model_params)
     self.model.load_state_dict(
         torch.load(path_to_model, map_location=self.device))
     self.model.eval()
Example #15
    def __init__(self, vocab_size: int = 10000,
                 train_fname: str = 'train_texts.txt',
                 bpe_path: str = ''):

        self.bpe_path = bpe_path if len(bpe_path) else 'yttm_bpe.bin'
        self.bpe_model = yttm.BPE(bpe_path) if len(bpe_path) else None
        self.train_fname = train_fname
        self.vocab_size = vocab_size
Example #16
def compute_features(data, model_path="vocab.model", max_len=20):
    bpe = yttm.BPE(model=model_path)
    features_ids = bpe.encode(data.feature_string.values.tolist(),
                              output_type=yttm.OutputType.ID)
    features_ids = [
        f[:max_len] + [0] * (max_len - len(f)) for f in features_ids
    ]
    return np.array(features_ids)
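A hedged usage sketch: compute_features expects a pandas-style object whose feature_string column holds the raw texts, plus a trained vocab.model on disk. The data below is illustrative.

import pandas as pd

data = pd.DataFrame({'feature_string': ['red apple', 'green pear']})
features = compute_features(data, model_path='vocab.model', max_len=20)
print(features.shape)  # (2, 20): each row is truncated or zero-padded to max_len ids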
Example #17
def main(args):
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=args.epoch,
        per_device_train_batch_size=args.batch_size,
        save_steps=args.save_steps,
        save_total_limit=10,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        dataloader_num_workers=args.n_workers,
    )

    print("Loading tokenizer...")
    tokenizer = yttm.BPE(args.tokenizer)

    print("Loading model...")
    model = get_model(vocab_size=tokenizer.vocab_size())

    print("List training files...")
    train_paths = get_training_files(args.train_dir)

    if args.check:
        train_paths = train_paths[:10000]

    print("Loading train texts...")
    train_data = []
    for p in tqdm(train_paths):
        (keys, notes) = read_abc(p)
        if keys is None:
            continue

        keys_tokens = tokenizer.encode(keys)
        bars = notes.split(" | ")
        notes_tokens = [tokenizer.encode(i + " | ") for i in bars]

        ## To avoid OOM
        sequence_len = sum(len(i) for i in notes_tokens)
        if not (args.min_sequence_lenght < sequence_len <
                args.max_sequence_lenght):
            print("Skip", p)
            continue

        train_data.append((keys_tokens, notes_tokens))

    print("Making dataset...")
    train_dataset = ABCDataset(train_data)

    if args.checkpoint:
        state_dict = torch.load(args.checkpoint)
        model.load_state_dict(state_dict)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=collate_function,
                      train_dataset=train_dataset)

    print("Start training...")
    trainer.train()
Example #18
def test_encode_decode():
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)
    yttm.BPE.train(data=TRAIN_FILE, vocab_size=16000, model=BASE_MODEL_FILE)

    bpe = yttm.BPE(BASE_MODEL_FILE)
    text_in = [" ".join("".join([random.choice("abcd ") for _ in range(50)]).split())]
    ids = bpe.encode(text_in, yttm.OutputType.ID)
    assert text_in == bpe.decode(ids)
Example #19
def get_bpe_tokenizer(train_texts, train_txt_path, bpe_model_name, vocab_size):
    _save_text(train_texts, train_txt_path)
    yttm.BPE.train(data=train_txt_path,
                   vocab_size=vocab_size,
                   model=bpe_model_name)

    tokenizer = yttm.BPE(bpe_model_name)

    return tokenizer
Example #20
def main():
    parser = argparse.ArgumentParser(description="Compute BLEU.")
    parser.add_argument('ckpt', type=str, help="Checkpoint to restore.")
    parser.add_argument('--dir',
                        type=str,
                        default="./wmt14",
                        help="Directory of dataset.")
    parser.add_argument('--split',
                        default='test',
                        type=str,
                        help="Specify which split of data to evaluate.")
    parser.add_argument(
        '--gpu_id',
        default=0,
        type=int,
        help="CUDA visible GPU ID. Currently only support single GPU.")
    parser.add_argument('--beams',
                        default=1,
                        type=int,
                        help="Beam Search width.")
    args = parser.parse_args()

    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
    assert torch.cuda.is_available()
    import data
    import build_model

    # Restore checkpoint
    info = torch.load(args.ckpt)
    cfg = info['cfg']

    # Build model
    bpe_model = yttm.BPE(model=cfg['bpe'])
    model = build_model.Seq2Seq(bpe_model.vocab_size(),
                                bpe_model.vocab_size(),
                                hidden_size=cfg['model']['hidden_size'],
                                encoder_layers=cfg['model']['encoder_layers'],
                                decoder_layers=cfg['model']['decoder_layers'],
                                use_bn=cfg['model']['use_bn'])
    model.load_state_dict(info['weights'])
    model.eval()
    model = model.cuda()

    # Create dataset
    if args.beams == 1:
        batch_size = cfg['train']['batch_size']
    else:
        batch_size = 1
    loader = data.load(args.dir,
                       split=args.split,
                       batch_size=batch_size,
                       bpe_model=bpe_model)

    # Evaluate
    _, bleu = utils.eval_dataset(loader, model, bpe_model, args.beams)
    print("BLEU on %s set = %.4f" % (args.split, error))
Example #21
def youtoken():
    bpe = yttm.BPE(model="statementPrediction.model")
    with open("../results/all_projects.json") as jsonf:
        logs = json.load(jsonf)
        with open("tokenizedExample.txt", "w") as output:
            for log in logs:
                msg = log["msg"]
                tokenizedMsg = bpe.encode(
                    [msg], output_type=yttm.OutputType.SUBWORD)[0]
                output.write(f"{msg} ----> {str(tokenizedMsg)}\n")
Example #22
    def __init__(self, filename, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)

        if os.path.isdir(filename):
            filename = os.path.join(filename, self.def_name)

        self.bpe = yttm.BPE(filename)
        with open(filename, 'rb') as f:
            self.hash = hashlib.sha512(f.read()).hexdigest()[:10]
        self.filename = filename
Example #23
    def train(self, train_fname: str = '', vocab_size: int = 10000):

        if len(train_fname):
            self.train_fname = train_fname

        if vocab_size > 0:
            self.vocab_size = vocab_size

        yttm.BPE.train(data=self.train_fname,
                       vocab_size=self.vocab_size,
                       model=self.bpe_path)
        self.bpe_model = yttm.BPE(self.bpe_path)
Example #24
    def __init__(self, data_folder, source_suffix, target_suffix, split,
                 tokens_in_batch):
        """
        :param data_folder: folder containing the source and target language data files
        :param source_suffix: the filename suffix for the source language files
        :param target_suffix: the filename suffix for the target language files
        :param split: train, or val, or test?
        :param tokens_in_batch: the number of target language tokens in each batch
        """
        self.tokens_in_batch = tokens_in_batch
        self.source_suffix = source_suffix
        self.target_suffix = target_suffix
        assert split.lower() in {
            "train", "val", "test"
        }, "'split' must be one of 'train', 'val', 'test'! (case-insensitive)"
        self.split = split.lower()

        # Is this for training?
        self.for_training = self.split == "train"

        # Load BPE model
        self.bpe_model = youtokentome.BPE(
            model=os.path.join(data_folder, "bpe.model"))

        # Load data
        with codecs.open(os.path.join(data_folder,
                                      ".".join([split, source_suffix])),
                         "r",
                         encoding="utf-8") as f:
            source_data = f.read().split("\n")[:-1]
        with codecs.open(os.path.join(data_folder,
                                      ".".join([split, target_suffix])),
                         "r",
                         encoding="utf-8") as f:
            target_data = f.read().split("\n")[:-1]
        assert len(source_data) == len(
            target_data
        ), "There are a different number of source or target sequences!"
        source_lengths = [
            len(s)
            for s in self.bpe_model.encode(source_data, bos=False, eos=False)
        ]
        target_lengths = [
            len(t)
            for t in self.bpe_model.encode(target_data, bos=True, eos=True)
        ]  # target language sequences have <BOS> and <EOS> tokens
        self.data = list(
            zip(source_data, target_data, source_lengths, target_lengths))

        # If for training, pre-sort by target lengths - required for itertools.groupby() later
        if self.for_training:
            self.data.sort(key=lambda x: x[3])

        # Create batches
        self.create_batches()
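create_batches is not shown in this excerpt; a minimal sketch of what it might do, grouping the pre-sorted data by target length (index 3) with itertools.groupby and sizing each batch by tokens_in_batch. Everything beyond the names appearing in the excerpt is an assumption, not the original code.

    def create_batches(self):
        import itertools
        import random
        if self.for_training:
            # groupby only merges adjacent equal keys, hence the pre-sort by target length
            batches = []
            for length, group in itertools.groupby(self.data, key=lambda x: x[3]):
                group = list(group)
                # aim for roughly tokens_in_batch target tokens per batch
                seqs_per_batch = max(1, self.tokens_in_batch // length)
                batches.extend(group[i:i + seqs_per_batch]
                               for i in range(0, len(group), seqs_per_batch))
            random.shuffle(batches)
            self.all_batches = batches
        else:
            self.all_batches = [[d] for d in self.data]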
Example #25
    def __init__(self, path):
        super().__init__()

        data = pickle.loads(Path(path).read_bytes())

        self.samples = data['samples']
        self.seq_length = data['seq_length']
        self.bpe = yttm.BPE(data['bpe_path'])
        self.vocab = self.bpe.vocab()
        self.vocab_size = len(self.vocab)

        self.pad_idx, self.unk_idx, self.bos_idx, self.eos_idx = list(range(4))  # yttm reserves ids 0-3 for <PAD>, <UNK>, <BOS>, <EOS>
Example #26
    def bpe_tokenize(self):
        self.split_files(self.file_path)

        if self.force or not os.path.isfile(
                self.src_bpe_file) or not os.path.isfile(self.trg_bpe_file):

            self._remove_file(self.src_bpe_file)
            self._remove_file(self.trg_bpe_file)

            yttm.BPE.train(data=self.src_file,
                           vocab_size=self.src_vocab_size,
                           model=self.src_bpe_file)
            yttm.BPE.train(data=self.trg_file,
                           vocab_size=self.trg_vocab_size,
                           model=self.trg_bpe_file)

        # Loading model
        self.src_bpe: yttm.BPE = yttm.BPE(model=self.src_bpe_file)
        self.trg_bpe: yttm.BPE = yttm.BPE(model=self.trg_bpe_file)

        return
Example #27
def make_yttm_tokenizer(train_conll: List[Instance], vocab_size=400):
    tokens = []
    for instance in train_conll:
        tokens += [token.text for token in instance['tokens']]
    text = ' '.join(tokens)

    with open('train_chunks.txt', 'w') as fobj:
        fobj.write(text)
    yttm.BPE.train(data='train_chunks.txt',
                   vocab_size=vocab_size,
                   model='conll_model.yttm')
    return yttm.BPE('conll_model.yttm')
Example #28
def test_vocabulary_consistency():
    generate_artifacts()
    os.remove(BASE_MODEL_FILE)
    yttm.BPE.train(data=TRAIN_FILE, vocab_size=16000, model=BASE_MODEL_FILE)

    bpe = yttm.BPE(BASE_MODEL_FILE)
    assert bpe.vocab_size() == len(bpe.vocab())
    assert bpe.vocab_size() == len(set(bpe.vocab()))
    vc = bpe.vocab()
    for i, subword in enumerate(vc):
        assert i == bpe.subword_to_id(subword)
        assert subword == bpe.id_to_subword(i)
Example #29
    def __init__(self, filename, *inputs, **kwargs):
        super().__init__(*inputs, **kwargs)
        #self.max_len_single_sentence = 1024 # no default special tokens - you can update this value if you add special tokens
        #self.max_len_sentences_pair = 1024 # no default special tokens - you can update this value if you add special tokens

        if os.path.isdir(filename):
            filename = os.path.join(filename, self.def_name)

        self.bpe = yttm.BPE(filename)
        with open(filename, 'rb') as f:
            self.hash = hashlib.sha512(f.read()).hexdigest()[:10]
        self.filename = filename
Example #30
    def __init__(self, bpe_path=None):
        bpe_path = Path(bpe_path)
        print(f"************ bpe_path : {bpe_path}")
        assert bpe_path.exists(), f'BPE model path {str(bpe_path)} does not exist'

        tokenizer = yttm.BPE(model=str(bpe_path))
        self.tokenizer = tokenizer
        self.vocab_size = tokenizer.vocab_size()

        print(f"************ self.tokenizer : {self.tokenizer}")
        print(f"************ self.vocab_size : {self.vocab_size}")