Example #1
def train(args):

    tokenizer = CharBPETokenizer()

    tokenizer.train([args.corpus], vocab_size=1000)

    tokenizer.save("src/dev_scripts/tokenizer.json")
class HuggingFaceTokenizer:
    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        if self.tokenizer is not None:
            return

        tmp_file = tempfile.NamedTemporaryFile()

        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())

        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )
        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]

        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # Note: the original author found that these special tokens were not
        # skipped even with skip_special_tokens=True; the commented-out filter
        # below drops the first four (special) ids as a manual workaround.
        text = self.tokenizer.decode(
            tokens,
            # [token for token in tokens if token > 3],
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences
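A hedged usage sketch for the wrapper above. The cache directory and texts are placeholders, and NUL_token, PAD_token, BOS_token and UNK_token are assumed to be plain string constants defined elsewhere in the original project.

# Hypothetical usage of HuggingFaceTokenizer (paths and texts are placeholders).
tok = HuggingFaceTokenizer(cache_dir="cache/tokenizer", max_length=64, vocab_size=400)
tok.build(["first training sentence", "second training sentence"])

ids = tok.encode("first training sentence")   # lowercased, truncated to max_length
print(ids)
print(tok.decode(ids))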
Example #3
def train():
    """My main man"""

    base = os.environ['DATA_ROOT']
    corpus_path = base + 'Thyme/Text/train+dev+test/*'
    files = glob.glob(corpus_path)

    tokenizer = CharBPETokenizer(lowercase=True)
    tokenizer.train(files=files,
                    vocab_size=10000,
                    min_frequency=3,
                    show_progress=True)
    tokenizer.save('.', name='thyme-tokenizer')
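Assuming an older tokenizers release in which save('.', name='thyme-tokenizer') writes 'thyme-tokenizer-vocab.json' and 'thyme-tokenizer-merges.txt', the model can be restored as below; lowercase=True has to be repeated at load time so that encoding matches the training setup.

from tokenizers import CharBPETokenizer

# File names assume the older two-file save layout described above.
tokenizer = CharBPETokenizer('./thyme-tokenizer-vocab.json',
                             './thyme-tokenizer-merges.txt',
                             lowercase=True)
print(tokenizer.encode("Patient denies chest pain.").tokens)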
Example #4
def train_subword_tokenizer(size, special_tokens, path):
    """Train subword tokenizers for subword encoding
    ref: https://github.com/huggingface/tokenizers

    Args:
        path: path of training corpus.
    """
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        [path+"/corpus_all.txt"],
        vocab_size=size,
        min_frequency=2,
        show_progress=True,
        special_tokens=special_tokens[:3]+["<unk>"],
    )
    tokenizer.save(path, "bpe")
def get_data():
    transcript_folder = os.path.join('data', 'transcripts')
    summary_folder = os.path.join('data', 'summary')

    train_files, train_result_files, test_files, test_result_files = get_dataset_files(transcript_folder,
                                                                                       summary_folder)
    train_data, train_results, test_data, test_results = get_dataset(train_files, train_result_files, test_files,
                                                                     test_result_files)

    tokenizer = CharBPETokenizer()
    all_files = np.concatenate([train_files, train_result_files, test_files, test_result_files])
    tokenizer.train(list(all_files))

    train_data = tokenize_data(tokenizer, train_data)
    test_data = tokenize_data(tokenizer, test_data)

    return train_data, train_results, test_data, test_results
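tokenize_data is not shown in this listing; a minimal sketch of what such a helper might look like, assuming the data sets are lists of raw strings and each one is mapped to its BPE token ids:

def tokenize_data(tokenizer, texts):
    # Hypothetical helper matching the calls above: one id list per raw string.
    return [tokenizer.encode(text).ids for text in texts]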
Example #6
def create_tokenizer_imbd(data_path, file_name, vocab_size):
    #df = pd.read_csv(os.path.join(data_path, file_name))
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        os.path.join(data_path, file_name),
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))

    tokenizer.save(data_path, "tokenizer")
Example #7
def create_tokenizer(data_path, vocab_size):

    tokenizer = CharBPETokenizer()
    files = [
        os.path.join(data_path, f)
        for f in os.listdir(data_path)
        if "uncased_chunk" in f
    ][:20]
    tokenizer.train(
        files,
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"],
    )

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))

    tokenizer.save(data_path, "tokenizer")
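The printed ids above are worth sanity-checking: special tokens passed to train() are added to the vocabulary first, in the order listed. A self-contained sketch of that check on a throwaway corpus:

import tempfile
from tokenizers import CharBPETokenizer

# Write a tiny temporary corpus so the snippet runs on its own.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("a tiny corpus that is just large enough to train on\n")
    corpus = f.name

tok = CharBPETokenizer()
tok.train([corpus], vocab_size=100,
          special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])
print([tok.token_to_id(t)
       for t in ("[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]")])  # typically [0, 1, 2, 3, 4]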
Example #8
    def _cbpe(self):
        tokenizer = CharBPETokenizer(
            vocab=self.conf.vocab,
            merges=self.conf.merges,
            unk_token=self.conf.cbpe_unk_token,
            suffix=self.conf.suffix,
            dropout=self.conf.dropout,
            lowercase=self.conf.lowercase,
            unicode_normalizer=self.conf.unicode_normalizer,
            bert_normalizer=self.conf.bert_normalizer,
            split_on_whitespace_only=self.conf.split_on_whitespace_only,
        )

        tokenizer.train(
            files=self.files,
            vocab_size=self.conf.vocab_size,
            min_frequency=self.conf.min_frequency,
            special_tokens=self.conf.special_tokens,
            limit_alphabet=self.conf.limit_alphabet,
            initial_alphabet=self.conf.initial_alphabet,
            suffix=self.conf.cpbe_train_shuffix,
        )

        return tokenizer
Example #9
class BPETokenizer:
    def __init__(self, text_list, vocab_size, lazy=False):
        if not lazy:
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(text_list,
                                 vocab_size=vocab_size,
                                 special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        encodes = self.tokenizer.encode(text)
        return encodes.ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
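A hypothetical round trip with the BPETokenizer wrapper above. PAD, BOS and EOS are assumed to be string constants defined next to the class, corpus.txt is a placeholder file, and the checkpoints directory is assumed to exist; with an older tokenizers release the two-argument save writes bpe-vocab.json and bpe-merges.txt into it.

bpe = BPETokenizer(["corpus.txt"], vocab_size=5000)
bpe.save("checkpoints", "bpe")

restored = BPETokenizer.load("checkpoints/bpe-vocab.json",
                             "checkpoints/bpe-merges.txt")
ids = restored.encode("hello world")
print(restored.decode(ids))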
Example #10
import json
import argparse
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

from tokenizers import CharBPETokenizer

parser = argparse.ArgumentParser()

parser.add_argument("--corpus",
                    help="Path to text training corpus",
                    default="/home/benet/IRI/How2Sign/metadata/metadata.txt")
parser.add_argument("--saveto",
                    help="Path where to save the model",
                    default="steps/tokenizer.json")
parser.add_argument("--size",
                    help="Number of tokens / vocabulary size",
                    type=int,
                    default=1000)

if __name__ == '__main__':

    args = parser.parse_args()

    tokenizer = CharBPETokenizer()

    tokenizer.train([args.corpus], vocab_size=args.size)

    tokenizer.save(args.saveto)
Example #11
    return bleu_score(outputs, targets)

# function to save model parameters
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)

# function to load pre-saved model parameters
def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

# Common Tokenizers training
tokenizer = CharBPETokenizer()
tokenizer.train(["english_fr.txt", "english_lt.txt", "french.txt", "lithuanian.txt"])

# Data loading
english_lt = open("english_lt.txt", encoding="utf-8").read().split("\n")
lithuanian = open("lithuanian.txt", encoding="utf-8").read().split("\n")
english_fr = open("english_fr.txt", encoding="utf-8").read().split("\n")
french = open("french.txt", encoding="utf-8").read().split("\n")

# Create dataframe
raw_data_child = {
    "English": [line for line in english_lt],
    "Lithuanian": [line for line in lithuanian],
}
raw_data_parent = {
    "English": [line for line in english_fr],
    "French": [line for line in french],
Example #12
class EngGerNewstest(Dataset):
    """
    The newstest 2014 dataset used for testing
    """
    def __init__(self,
                 data_folder,
                 rank=0,
                 val_set=False,
                 world_size=1,
                 seed=0,
                 eng_to_ger=True,
                 vocab_size=37000,
                 MASK="<MASK>",
                 START="<START>",
                 STOP="<STOP>",
                 exp_name="",
                 max_context=None,
                 batch_size=128,
                 val_size=30000,
                 **kwargs):
        """
        rank: int
            the rank in the distributed training
        val_set: bool
            if true, this dataset is created as the validation set
        world_size: int
            the number of processes if using distributed training
        seed: int
            random seed
        data_folder: str
            the path to the folder that should contain a `train.en` and
            a `train.de` file.
        eng_to_ger: bool
            if true, the x values are returned as english ids and the
            y values are german ids. If false, then vice versa
        vocab_size: int
            the number of encodings for the byte-pair encoding scheme
        MASK: str
            the mask token
        START: str
            the start token
        STOP: str
            the stop token
        exp_name: str
            name of the experiment
        max_context: int
            the maximum sequence length
        val_size: int
            the number of samples to be set aside for validation
        """
        self.rank = rank
        print("rank:", self.rank)
        self.world_size = world_size
        self.val_set = val_set
        self.val_size = val_size
        self.batch_size = batch_size
        self.data_folder = os.path.expanduser(data_folder)
        self.en_path = os.path.join(data_folder, "newstest2014.en")
        self.de_path = os.path.join(data_folder, "newstest2014.de")
        self.eng_to_ger = eng_to_ger
        self.vocab_size = vocab_size
        self.MASK = MASK
        self.START = START
        self.STOP = STOP
        self.max_context = max_context
        self.en_tok_path = os.path.join(self.data_folder, "en_tokenizer")
        self.de_tok_path = os.path.join(self.data_folder, "de_tokenizer")
        self.en_arr_path = os.path.join(self.data_folder, "en_bcolz")
        self.de_arr_path = os.path.join(self.data_folder, "de_bcolz")
        self.en_lens_path = os.path.join(self.data_folder, "en_bcolz_lens")
        self.de_lens_path = os.path.join(self.data_folder, "de_bcolz_lens")

        # Train tokenizers
        if rank == 0: print("Tokenizing english..")
        self.en_tokenizer = CharBPETokenizer()
        if os.path.exists(self.en_tok_path):  # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.en_tok_path)
            self.en_tokenizer = ml_utils.datas.load_tokenizer(
                self.en_tokenizer, self.en_tok_path)
        else:
            self.en_tokenizer.train([self.en_path], vocab_size=vocab_size)
            os.mkdir(self.en_tok_path)
            self.en_tokenizer.save_model(self.en_tok_path)
        self.en_tokenizer.add_special_tokens([self.MASK])
        self.en_tokenizer.add_tokens([self.START])
        self.en_tokenizer.add_tokens([self.STOP])
        self.en_mask_idx = self.en_tokenizer.token_to_id(self.MASK)
        self.en_start_idx = self.en_tokenizer.token_to_id(self.START)
        self.en_stop_idx = self.en_tokenizer.token_to_id(self.STOP)

        if rank == 0: print("Tokenizing german..")
        self.de_tokenizer = CharBPETokenizer()
        if os.path.exists(self.de_tok_path):  # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.de_tok_path)
            self.de_tokenizer = ml_utils.datas.load_tokenizer(
                self.de_tokenizer, self.de_tok_path)
        else:
            self.de_tokenizer.train([self.de_path], vocab_size=vocab_size)
            os.mkdir(self.de_tok_path)
            self.de_tokenizer.save_model(self.de_tok_path)
        self.de_tokenizer.add_special_tokens([self.MASK])
        self.de_tokenizer.add_tokens([self.START])
        self.de_tokenizer.add_tokens([self.STOP])
        self.de_mask_idx = self.de_tokenizer.token_to_id(self.MASK)
        self.de_start_idx = self.de_tokenizer.token_to_id(self.START)
        self.de_stop_idx = self.de_tokenizer.token_to_id(self.STOP)

        # Get English sentence lists
        if rank == 0: print("Making english idxs")
        self.en_max_len = 0
        self.en_idxs = []
        self.en_lens = []
        with open(self.en_path, 'r') as f:
            for i, l in tqdm(enumerate(f.readlines())):
                l = l.strip()
                if len(l) > 0:
                    output = self.en_tokenizer.encode(l)
                    ids = [self.en_start_idx]+list(output.ids)\
                                             +[self.en_stop_idx]
                    self.en_idxs.append(ids)
                    self.en_lens.append(len(ids))
                    if len(ids) > self.en_max_len:
                        self.en_max_len = len(ids)
                if exp_name == "test" and i > 100:
                    break
        mask = [self.en_mask_idx for i in range(self.en_max_len)]
        l = 0
        if rank == 0: print("Padding english idxs")
        for i in tqdm(range(len(self.en_idxs))):
            diff = self.en_max_len - len(self.en_idxs[i])
            self.en_idxs[i] = self.en_idxs[i] + mask[:diff]

        # Get German Sentence Lists
        if rank == 0: print("Making german idxs")
        self.de_max_len = 0
        self.de_idxs = []
        self.de_lens = []
        with open(self.de_path, 'r') as f:
            for i, l in tqdm(enumerate(f.readlines())):
                l = l.strip()
                if len(l) > 0:
                    output = self.de_tokenizer.encode(l)
                    ids = [self.de_start_idx]+list(output.ids)\
                                             +[self.de_stop_idx]
                    self.de_idxs.append(ids)
                    self.de_lens.append(len(ids))
                    if len(ids) > self.de_max_len:
                        self.de_max_len = len(ids)
                if exp_name == "test" and i > 100:
                    break
        mask = [self.de_mask_idx for i in range(self.de_max_len)]
        if rank == 0: print("Padding german idxs")
        for i in tqdm(range(len(self.de_idxs))):
            diff = self.de_max_len - len(self.de_idxs[i])
            self.de_idxs[i] = self.de_idxs[i] + mask[:diff]

        if rank == 0: print("Converting to numpy arrays")
        if self.eng_to_ger:
            self.X = np.asarray(self.en_idxs)
            self.X_lens = np.asarray(self.en_lens)
            self.X_tokenizer = self.en_tokenizer
            self.X_mask_idx = self.en_mask_idx
            self.X_start_idx = self.en_start_idx
            self.X_stop_idx = self.en_stop_idx
            self.X_max_len = self.en_max_len

            self.Y = np.asarray(self.de_idxs)
            self.Y_lens = np.asarray(self.de_lens)
            self.Y_tokenizer = self.de_tokenizer
            self.Y_mask_idx = self.de_mask_idx
            self.Y_start_idx = self.de_start_idx
            self.Y_stop_idx = self.de_stop_idx
            self.Y_max_len = self.de_max_len
        else:
            self.X = np.asarray(self.de_idxs)
            self.X_lens = np.asarray(self.de_lens)
            self.X_tokenizer = self.de_tokenizer
            self.X_mask_idx = self.de_mask_idx
            self.X_start_idx = self.de_start_idx
            self.X_stop_idx = self.de_stop_idx
            self.X_max_len = self.de_max_len

            self.Y = np.asarray(self.en_idxs)
            self.Y_lens = np.asarray(self.en_lens)
            self.Y_tokenizer = self.en_tokenizer
            self.Y_mask_idx = self.en_mask_idx
            self.Y_start_idx = self.en_start_idx
            self.Y_stop_idx = self.en_stop_idx
            self.Y_max_len = self.en_max_len

    def __len__(self):
        return len(self.en_idxs)

    #def __getitem__(self,i,l=None):
    #    if l is None:
    #        l = self.X_lens[int(i)]
    #    idxs = np.zeros(1)
    #    margin = 5
    #    while idxs.sum()<25 and margin < 400:
    #        min_l = l-margin
    #        max_l = l+margin
    #        idxs = (self.X_lens>min_l)&(self.X_lens<max_l)
    #        margin += 5
    #    max_l = min(np.max(self.X_lens[idxs]),self.max_context)
    #    if max_l <   50 : batch_size = self.batch_size
    #    elif max_l < 70: batch_size = self.batch_size//2
    #    elif max_l < 100: batch_size = self.batch_size//4
    #    elif max_l < 120: batch_size = self.batch_size//8
    #    elif max_l < 140: batch_size = self.batch_size//16
    #    elif max_l < 160: batch_size = self.batch_size//32
    #    else: batch_size = self.batch_size//64
    #    batch_size = max(16,batch_size)
    #    perm = np.random.permutation(idxs.sum())[:batch_size]
    #    max_l = np.max(self.X_lens[idxs][perm])
    #    x = np.asarray(self.X[idxs][perm,:max_l])
    #    max_l = np.max(self.Y_lens[idxs][perm])
    #    y = np.asarray(self.Y[idxs][perm,:max_l])
    #    return torch.LongTensor(x), torch.LongTensor(y)

    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.Y[idx])

    def get_largest_batch(self, size_num):
        l = 10
        if size_num == 1:
            l = 25
        elif size_num == 2:
            l = 400
        elif size_num == 3:
            l = 130
        elif size_num == 4:
            l = 75
        elif size_num == 5:
            l = 44
        elif size_num == 6:
            l = 94
        elif size_num == 7:
            l = 200
        elif size_num == 8:
            l = 300
        return self.__getitem__(0, l)

    def X_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.X_tokenizer.decode(idxs)

    def Y_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.Y_tokenizer.decode(idxs)
Example #13
class EngGerDataset(Dataset):
    """
    Can be english to german or german to english.
    """
    def __init__(self,
                 data_folder,
                 rank=0,
                 val_set=False,
                 world_size=1,
                 seed=0,
                 eng_to_ger=True,
                 vocab_size=37000,
                 MASK="<MASK>",
                 START="<START>",
                 STOP="<STOP>",
                 exp_name="",
                 max_context=None,
                 batch_size=128,
                 val_size=30000,
                 **kwargs):
        """
        rank: int
            the rank in the distributed training
        val_set: bool
            if true, this dataset is created as the validation set
        world_size: int
            the number of processes if using distributed training
        seed: int
            random seed
        data_folder: str
            the path to the folder that should contain a `train.en` and
            a `train.de` file.
        eng_to_ger: bool
            if true, the x values are returned as english ids and the
            y values are german ids. If false, then vice versa
        vocab_size: int
            the number of encodings for the byte-pair encoding scheme
        MASK: str
            the mask token
        START: str
            the start token
        STOP: str
            the stop token
        exp_name: str
            name of the experiment
        max_context: int
            the maximum sequence length
        val_size: int
            the number of samples to be set aside for validation
        """
        self.rank = rank
        print("rank:", self.rank)
        self.world_size = world_size
        self.val_set = val_set
        self.val_size = val_size
        self.batch_size = batch_size
        self.data_folder = os.path.expanduser(data_folder)
        self.en_path = os.path.join(data_folder, "train.en")
        self.de_path = os.path.join(data_folder, "train.de")
        self.eng_to_ger = eng_to_ger
        self.vocab_size = vocab_size
        self.MASK = MASK
        self.START = START
        self.STOP = STOP
        self.max_context = max_context
        self.en_tok_path = os.path.join(self.data_folder, "en_tokenizer")
        self.de_tok_path = os.path.join(self.data_folder, "de_tokenizer")
        self.en_arr_path = os.path.join(self.data_folder, "en_bcolz")
        self.de_arr_path = os.path.join(self.data_folder, "de_bcolz")
        self.en_lens_path = os.path.join(self.data_folder, "en_bcolz_lens")
        self.de_lens_path = os.path.join(self.data_folder, "de_bcolz_lens")

        # Train tokenizers
        if rank == 0: print("Tokenizing english..")
        self.en_tokenizer = CharBPETokenizer()
        if os.path.exists(self.en_tok_path):  # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.en_tok_path)
            self.en_tokenizer = ml_utils.datas.load_tokenizer(
                self.en_tokenizer, self.en_tok_path)
        else:
            self.en_tokenizer.train([self.en_path], vocab_size=vocab_size)
            os.mkdir(self.en_tok_path)
            self.en_tokenizer.save_model(self.en_tok_path)
        self.en_tokenizer.add_special_tokens([self.MASK])
        self.en_tokenizer.add_tokens([self.START])
        self.en_tokenizer.add_tokens([self.STOP])
        self.en_mask_idx = self.en_tokenizer.token_to_id(self.MASK)
        self.en_start_idx = self.en_tokenizer.token_to_id(self.START)
        self.en_stop_idx = self.en_tokenizer.token_to_id(self.STOP)

        if rank == 0: print("Tokenizing german..")
        self.de_tokenizer = CharBPETokenizer()
        if os.path.exists(self.de_tok_path):  # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.de_tok_path)
            self.de_tokenizer = ml_utils.datas.load_tokenizer(
                self.de_tokenizer, self.de_tok_path)
        else:
            self.de_tokenizer.train([self.de_path], vocab_size=vocab_size)
            os.mkdir(self.de_tok_path)
            self.de_tokenizer.save_model(self.de_tok_path)
        self.de_tokenizer.add_special_tokens([self.MASK])
        self.de_tokenizer.add_tokens([self.START])
        self.de_tokenizer.add_tokens([self.STOP])
        self.de_mask_idx = self.de_tokenizer.token_to_id(self.MASK)
        self.de_start_idx = self.de_tokenizer.token_to_id(self.START)
        self.de_stop_idx = self.de_tokenizer.token_to_id(self.STOP)

        # Get English sentence lists
        if rank == 0: print("Making english idxs")
        if os.path.exists(self.en_arr_path):
            if rank == 0: print("loading from bcolz", self.en_arr_path)
            self.en_idxs = bcolz.carray(rootdir=self.en_arr_path)
            self.en_lens = bcolz.carray(rootdir=self.en_lens_path)
            self.en_max_len = self.en_idxs.shape[-1]
            if exp_name == "test":
                self.val_size = 250
                self.en_idxs = self.en_idxs[:1000]
                self.en_lens = self.en_lens[:1000]
            if self.world_size > 1:
                with temp_seed(seed - rank):
                    sample_perm = np.random.permutation(len(self.en_idxs))
                if not self.val_set:
                    n_samps = (len(self.en_idxs) - self.val_size)
                    n_samps = n_samps // self.world_size
                    indices = sample_perm[rank * n_samps:(rank + 1) * n_samps]
                else:
                    indices = sample_perm[-self.val_size:]
                try:
                    if rank == 0:
                        print("splitting dataset.. ", end="")
                        starttime = time.time()
                    self.en_idxs = self.en_idxs[indices]
                    self.en_lens = self.en_lens[indices]
                    if rank == 0: print("duration:", time.time() - starttime)
                except:
                    temp_idxs = []
                    temp_lens = []
                    if rank == 0:
                        print("Collecting data")
                        rnge = tqdm(indices)
                    else:
                        rnge = indices
                    for i in rnge:
                        temp_idxs.append(self.en_idxs[i])
                        temp_lens.append(self.en_lens[i])
                    self.en_idxs = np.asarray(temp_idxs)
                    self.en_lens = np.asarray(temp_lens)
                    if rank == 0: print("duration:", time.time() - starttime)
        elif world_size == 1:
            self.en_max_len = 0
            self.en_idxs = []
            self.en_lens = []
            with open(self.en_path, 'r') as f:
                for i, l in tqdm(enumerate(f.readlines())):
                    l = l.strip()
                    if len(l) > 0:
                        output = self.en_tokenizer.encode(l)
                        ids = [self.en_start_idx]+list(output.ids)\
                                                 +[self.en_stop_idx]
                        self.en_idxs.append(ids)
                        self.en_lens.append(len(ids))
                        if len(ids) > self.en_max_len:
                            self.en_max_len = len(ids)
                    if exp_name == "test" and i > 100:
                        break
            mask = [self.en_mask_idx for i in range(self.en_max_len)]
            l = 0
            if rank == 0: print("Padding english idxs")
            for i in tqdm(range(len(self.en_idxs))):
                diff = self.en_max_len - len(self.en_idxs[i])
                self.en_idxs[i] = self.en_idxs[i] + mask[:diff]
            if rank == 0: print("Saving to bcolz")
            self.en_idxs = bcolz.carray(self.en_idxs,
                                        rootdir=self.en_arr_path,
                                        dtype="int32")
            self.en_idxs.flush()
            self.en_lens = bcolz.carray(self.en_lens,
                                        rootdir=self.en_lens_path,
                                        dtype="int32")
            self.en_lens.flush()
        else:
            print("Make dataset without using multi-processing!!")
            assert False
        if self.en_max_len > max_context:
            if rank == 0:
                print("Truncating context from", self.en_max_len, "to",
                      self.max_context)
            self.en_max_len = self.max_context

        # Get German Sentence Lists
        if rank == 0: print("Making german idxs")
        if os.path.exists(self.de_arr_path):
            if rank == 0: print("loading from bcolz", self.de_arr_path)
            self.de_idxs = bcolz.carray(rootdir=self.de_arr_path)
            self.de_lens = bcolz.carray(rootdir=self.de_lens_path)
            self.de_max_len = self.de_idxs.shape[-1]
            if exp_name == "test":
                self.val_size = 250
                self.en_idxs = self.en_idxs[:1000]
                self.en_lens = self.en_lens[:1000]
            if self.world_size > 1:
                try:
                    if rank == 0:
                        print("splitting dataset.. ", end="")
                        starttime = time.time()
                    self.de_idxs = self.de_idxs[indices]
                    self.de_lens = self.de_lens[indices]
                    if rank == 0: print("duration:", time.time() - starttime)
                except:
                    temp_idxs = []
                    temp_lens = []
                    try:
                        if rank == 0: print("Collecting data")
                        for i in rnge:
                            temp_idxs.append(self.de_idxs[i])
                            temp_lens.append(self.de_lens[i])
                    except Exception as e:
                        print("Likely error caused by bcolz existing "+\
                                               "for en but not de data")
                        print(e)
                        assert False
                    self.de_idxs = np.asarray(temp_idxs)
                    self.de_lens = np.asarray(temp_lens)
                    if rank == 0: print("duration:", time.time() - starttime)
        else:
            self.de_max_len = 0
            self.de_idxs = []
            self.de_lens = []
            with open(self.de_path, 'r') as f:
                for i, l in tqdm(enumerate(f.readlines())):
                    l = l.strip()
                    if len(l) > 0:
                        output = self.de_tokenizer.encode(l)
                        ids = [self.de_start_idx]+list(output.ids)\
                                                 +[self.de_stop_idx]
                        self.de_idxs.append(ids)
                        self.de_lens.append(len(ids))
                        if len(ids) > self.de_max_len:
                            self.de_max_len = len(ids)
                    if exp_name == "test" and i > 100:
                        break
            mask = [self.de_mask_idx for i in range(self.de_max_len)]
            if rank == 0: print("Padding german idxs")
            for i in tqdm(range(len(self.de_idxs))):
                diff = self.de_max_len - len(self.de_idxs[i])
                self.de_idxs[i] = self.de_idxs[i] + mask[:diff]
            if rank == 0: print("Saving to bcolz")
            self.de_idxs = bcolz.carray(self.de_idxs,
                                        rootdir=self.de_arr_path,
                                        dtype="int32")
            self.de_idxs.flush()
            self.de_lens = bcolz.carray(self.de_lens,
                                        rootdir=self.de_lens_path,
                                        dtype="int32")
            self.de_lens.flush()
        if self.de_max_len > max_context:
            if rank == 0:
                print("Truncating context from", self.de_max_len, "to",
                      self.max_context)
            self.de_max_len = self.max_context

        if rank == 0: print("Converting to numpy arrays")
        if self.eng_to_ger:
            self.X = np.asarray(self.en_idxs)
            self.X_lens = np.asarray(self.en_lens)
            self.X_tokenizer = self.en_tokenizer
            self.X_mask_idx = self.en_mask_idx
            self.X_start_idx = self.en_start_idx
            self.X_stop_idx = self.en_stop_idx
            self.X_max_len = self.en_max_len

            self.Y = np.asarray(self.de_idxs)
            self.Y_lens = np.asarray(self.de_lens)
            self.Y_tokenizer = self.de_tokenizer
            self.Y_mask_idx = self.de_mask_idx
            self.Y_start_idx = self.de_start_idx
            self.Y_stop_idx = self.de_stop_idx
            self.Y_max_len = self.de_max_len
        else:
            self.X = np.asarray(self.de_idxs)
            self.X_lens = np.asarray(self.de_lens)
            self.X_tokenizer = self.de_tokenizer
            self.X_mask_idx = self.de_mask_idx
            self.X_start_idx = self.de_start_idx
            self.X_stop_idx = self.de_stop_idx
            self.X_max_len = self.de_max_len

            self.Y = np.asarray(self.en_idxs)
            self.Y_lens = np.asarray(self.en_lens)
            self.Y_tokenizer = self.en_tokenizer
            self.Y_mask_idx = self.en_mask_idx
            self.Y_start_idx = self.en_start_idx
            self.Y_stop_idx = self.en_stop_idx
            self.Y_max_len = self.en_max_len

    def __len__(self):
        return len(self.en_idxs)

    def __getitem__(self, i, l=None):
        if l is None:
            l = self.X_lens[int(i)]
        idxs = np.zeros(1)
        margin = 5
        while idxs.sum() < 25 and margin < 400:
            min_l = l - margin
            max_l = l + margin
            idxs = (self.X_lens > min_l) & (self.X_lens < max_l)
            margin += 5
        max_l = min(np.max(self.X_lens[idxs]), self.max_context)
        if max_l < 50: batch_size = self.batch_size
        elif max_l < 70: batch_size = self.batch_size // 2
        elif max_l < 100: batch_size = self.batch_size // 4
        elif max_l < 120: batch_size = self.batch_size // 8
        elif max_l < 140: batch_size = self.batch_size // 16
        elif max_l < 160: batch_size = self.batch_size // 32
        else: batch_size = self.batch_size // 64
        batch_size = max(16, batch_size)
        perm = np.random.permutation(idxs.sum())[:batch_size]
        max_l = np.max(self.X_lens[idxs][perm])
        x = np.asarray(self.X[idxs][perm, :max_l])
        max_l = np.max(self.Y_lens[idxs][perm])
        y = np.asarray(self.Y[idxs][perm, :max_l])
        return torch.LongTensor(x), torch.LongTensor(y)

    def get_largest_batch(self, size_num):
        l = 10
        if size_num == 1:
            l = 25
        elif size_num == 2:
            l = 400
        elif size_num == 3:
            l = 130
        elif size_num == 4:
            l = 75
        elif size_num == 5:
            l = 44
        elif size_num == 6:
            l = 94
        elif size_num == 7:
            l = 200
        elif size_num == 8:
            l = 300
        return self.__getitem__(0, l)

    def X_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.X_tokenizer.decode(idxs)

    def Y_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.Y_tokenizer.decode(idxs)
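A hypothetical construction of the dataset above, given as a sketch only: the folder is a placeholder that must contain train.en and train.de, and the class's own module dependencies (bcolz, ml_utils, tqdm, torch) must be importable.

# Sketch: build the dataset and pull one length-bucketed batch.
dataset = EngGerDataset(
    data_folder="data/wmt14_en_de",   # placeholder path
    eng_to_ger=True,
    vocab_size=37000,
    max_context=512,
    batch_size=128,
)
x, y = dataset.get_largest_batch(2)   # (src, tgt) id tensors
print(x.shape, y.shape)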
Example #14
from tokenizers import CharBPETokenizer

tokenizer_code = CharBPETokenizer()
tokenizer_doc = CharBPETokenizer()

# tokenizer.train([list of files to learn the tokenizer from], vocab_size)
# change the file locations if the training files are stored elsewhere
tokenizer_code.train([
    "ncs_preprocessed_data/train-ncs/code.original_subtoken",
    "ncs_preprocessed_data/dev/code.original_subtoken"
],
                     vocab_size=100000)
tokenizer_doc.train([
    "ncs_preprocessed_data/train-ncs/javadoc.original",
    "ncs_preprocessed_data/dev/javadoc.original"
],
                    vocab_size=100000)

print(tokenizer_code.get_vocab())  # learned token -> id mapping
print(tokenizer_doc.get_vocab())

#use the trained tokenizer_code to encode and write in output_file

file_dir = "data/ncs_preprocessed_data/train-CoDesc/"
src_file_name = "code.original_subtoken"
tgt_file_name = "code.bpe"

output_file = open(file_dir + "/" + tgt_file_name, "w")
output_file.close()
output_file = open(file_dir + "/" + tgt_file_name, "a")
with open(file_dir + "/" + src_file_name, 'r') as file:
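    # Hedged completion of the truncated loop: per the comment above, the
    # trained tokenizer_code encodes each source line and the result is
    # written to output_file; space-joined subword pieces are an assumption.
    for line in file:
        pieces = tokenizer_code.encode(line.strip()).tokens
        output_file.write(" ".join(pieces) + "\n")
output_file.close()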
Example #15

    caption_texts = [
        # ('../youtube-speech-text/preprocessed_english_meta.csv', 'Normalized Transcription')
    ]
    if not os.path.exists('raw_corpus.txt'):
        with open('raw_corpus.txt', 'w') as f:
            for csv_filename, col_name in caption_texts:
                texts = list(pd.read_csv(csv_filename)[col_name])
                for t in texts:
                    t = t.replace('<eos>', '')
                    f.write(t + '\n')
    tokenizer = CharBPETokenizer(lowercase=True)

    tokenizer.train(
        ["raw_corpus.txt"],
        vocab_size=1000,
        min_frequency=2,
        special_tokens=[
            "<blank>",
            "<bos>",
            "<unk>",
        ],
    )

    # os.makedirs('./BPE-1000', exist_ok=True)
    tokenizer.save(f'./BPE-1000', '')

    tokenizer = CharBPETokenizer('./BPE-1000/-vocab.json',
                                 './BPE-1000/-merges.txt')
    # with open('.test.pkl', 'w') as f:
    #     pickle.dump(tokenizer, f)

    tokenizer = HuggingFaceTokenizer()
    print(
Example #16
files = glob.glob(args.files)
if not files:
    logger.error(f'No files matched pattern: {args.files}')
    exit(1)

# Initialize an empty tokenizer
# ANY ARGS?
tokenizer = CharBPETokenizer()

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<unk>'],
    suffix='</w>',
    limit_alphabet=args.limit_alphabet,
)

# Save the files
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = CharBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
)

# Test encoding
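# Hedged sketch of the missing test: encode a placeholder sentence with the
# restored tokenizer and inspect the pieces and ids.
encoded = tokenizer.encode("testing the restored tokenizer")
print(encoded.tokens)
print(encoded.ids)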
def train(args):

    tokenizer = CharBPETokenizer()

    tokenizer.train([args.corpus], vocab_size=args.size)
    tokenizer.save(args.output_file)
Example #18
from tokenizers import CharBPETokenizer
import os
import torch
from transformers import RobertaConfig

paths = ['data/train.txt']
# Initialize a tokenizer
tokenizer = CharBPETokenizer(split_on_whitespace_only=True)
# Customize training
tokenizer.train(files=paths,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])
os.makedirs('./tokenizer/Charbpetokenizer', exist_ok=True)
tokenizer.save_model('./tokenizer/Charbpetokenizer')
tokenizer = CharBPETokenizer(
    "./tokenizer/Charbpetokenizer/vocab.json",
    "./tokenizer/Charbpetokenizer/merges.txt",
)

config = RobertaConfig(
    vocab_size=12531,
    max_position_embeddings=130,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
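The hard-coded vocab_size in the config above has to agree with what the tokenizer actually learned, since it sizes the model's embedding table. A small hedged check; RobertaForMaskedLM is used here only as an example model class, and 12531 is assumed to be the value observed for this corpus.

from transformers import RobertaForMaskedLM

print(tokenizer.get_vocab_size(), config.vocab_size)  # these should match
model = RobertaForMaskedLM(config)
print(model.num_parameters())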
Example #19
# coding:utf-8
from tokenizers import CharBPETokenizer
from pathlib import Path

# Initialize a tokenizer
tokenizer = CharBPETokenizer()

# Then train it!
tokenizer.train(["./data/wiki_sunyang.txt"])

# And you can use it
encoded = tokenizer.encode(
    "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming."
)
# print(encoded.tokens)

# And finally save it somewhere
saved_path = Path("./saved_tokenizer/wiki_sunyang")
saved_path.mkdir(exist_ok=True, parents=True)
tokenizer.save(str(saved_path))
Example #20
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")

        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)

    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"],
                                   max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"],
                                 max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # Pad
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)

    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
from tokenizers import CharBPETokenizer
import json
import tqdm

if __name__ == "__main__":
    # Initialize a tokenizer
    tokenizer = CharBPETokenizer()

    # Then train it!
    tokenizer.train(
        [
            "data\\train.txt",
            "D:/数据/wikitext-2-raw-v1/wikitext-2-raw/wiki.train.raw",
            "D:/数据/webtext2019zh/web_text_raw.txt"
        ],
        vocab_size=30000,
        min_frequency=2,
        special_tokens=['<UNK>', '<BOS>', '<EOS>', '<PAD>', '<CLS>', '<SEP>'])

    # Now, let's use it:
    encoded = tokenizer.encode("I can feel the magic, can you?")

    # And finally save it somewhere
    tokenizer.save("./", "bpe.tokenizer.json")
Example #22
# function to save model parameters
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


# function to load pre-saved model parameters
def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])


# Separate Tokenizers training
tokenizer = CharBPETokenizer()
tokenizer.train(["english_lt.txt", "lithuanian.txt"])
eng_tokenizer = CharBPETokenizer()
eng_tokenizer.train(["english_lt.txt"])
lt_tokenizer = CharBPETokenizer()
lt_tokenizer.train(["lithuanian.txt"])

# Data loading
english_txt = open("english_lt.txt", encoding="utf-8").read().split("\n")
lithuanian_txt = open("lithuanian.txt", encoding="utf-8").read().split("\n")

# Create dataframe
raw_data = {
    "English": [line for line in english_txt],
    "Lithuanian": [line for line in lithuanian_txt],
}
df = pd.DataFrame(raw_data, columns=["English", "Lithuanian"])
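A short hedged sketch of putting the pieces above together: encode one row of the dataframe with the per-language tokenizers trained earlier in this example.

row = df.iloc[0]
print(eng_tokenizer.encode(row["English"]).ids)
print(lt_tokenizer.encode(row["Lithuanian"]).ids)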