def main():
    args = cmd_args()
    outdir = args.o if args.o else os.path.dirname(args.i)

    target_special_tokens, subtoken_special_tokens = get_special_tokens(
        args.preset)
    with tempfile.TemporaryDirectory() as tmp_dir:
        targets_file = os.path.join(tmp_dir, "labels.txt")
        subtokens_file = os.path.join(tmp_dir, "subtokens.txt")

        print(f"Creating training files for BPE")
        create_bpe_training_file(args.i, targets_file, subtokens_file)
        if args.preset == Preset.variable:
            print("Variable preset")

        subtoken_tokenizer = SentencePieceBPETokenizer()
        target_tokenizer = SentencePieceBPETokenizer()
        print(f"Training subtoken tokenizer")
        subtoken_tokenizer.add_special_tokens(subtoken_special_tokens)
        print(f"Training target tokenizer")
        target_tokenizer.add_special_tokens(target_special_tokens)

        target_tokenizer.train(files=[targets_file],
                               vocab_size=args.target_vocab)
        subtoken_tokenizer.train(files=[subtokens_file],
                                 vocab_size=args.subtoken_vocab)

    target_tokenizer.save(outdir, "target.bpe")
    subtoken_tokenizer.save(outdir, "subtoken.bpe")
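The two-argument save(directory, name) calls above follow the older tokenizers API, which writes a "<name>-vocab.json" / "<name>-merges.txt" pair into outdir. A minimal reload sketch under that assumption (the output directory and sample string are hypothetical):

# Sketch, not part of the example above: reload the trained target tokenizer.
# File names assume the old save(directory, name) naming convention.
import os
from tokenizers import SentencePieceBPETokenizer

outdir = "out"  # hypothetical output directory used when running the script
target_tokenizer = SentencePieceBPETokenizer(
    os.path.join(outdir, "target.bpe-vocab.json"),
    os.path.join(outdir, "target.bpe-merges.txt"))
print(target_tokenizer.encode("example label").tokens)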
Example #2
def main():
    args = cmd_args()
    outdir = args.o if args.o else os.path.dirname(args.i)

    print(
        f"Training SentencePiece to create a vocabulary of size {args.vocab_size}"
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        train_file = os.path.join(tmp_dir, "train.txt")
        create_bpe_training_file(args.i, train_file)

        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train(files=[train_file], vocab_size=args.vocab_size)

    tokenizer.save(outdir, args.n)
Example #3
def build_bpe(vocab_size=10000):
    # Initialize a tokenizer
    tokenizer = SentencePieceBPETokenizer()

    #mypath = "../../Downloads/riksdagens_protokoll_1920-2020/annual"
    mypath = "../../Desktop/cood/python/machine-learning/old-school/markov-lstm-killer/data/fi"
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    print("ONL", onlyfiles)

    paths = [mypath + "/" + f for f in onlyfiles]

    #paths = paths[:5]

    # COPY FILES
    txts = []
    for path, fname in zip(paths, onlyfiles):
        if path[-4:] == ".txt":
            localpath = "data/" + fname
            txts.append(localpath)

            # Use context managers so both file handles are closed reliably.
            with open(path) as infile, open(localpath, "w") as outfile:
                for line in infile:
                    clean_line = cleanup(line) + "\n"
                    outfile.write(clean_line)

    # Then train it!
    #tokenizer.train([ "../../Downloads/riksdagens_protokoll_1920-2020/annual/prot_2019.txt" ], vocab_size=15000)
    tokenizer.train(txts, vocab_size=vocab_size)

    # Now, let's use it:
    s = "Det politiska arbetet har redan börjat på olika sätt, med resor, besök, möten, politikutveckling, motionsskrivande och mycket annat. Jag har sett att ni redan har varit aktiva under ett antal veckor, och jag kan försäkra er att det även gäller talmanspresidiet. Nu är det dags att med tillförsikt påbörja ett nytt riksdagsår. Jag hoppas att ni alla ser fram emot det lika myck­et som jag gör."
    #s = "Ite en oo viel mitää hyvää kyl sielt syöny."
    #s = "ja kieltämät siihe tommoste kokonaisii sanoi merkitsevät tavumerkit on huomattavasti näppärämpii ku ääniä tarkottavat aakkoset joist pitää rakentaa jokane sana"
    encoded = tokenizer.encode(s)

    print(encoded.ids)
    print(encoded.tokens)
    # And finally save it somewhere
    tokenizer.save("./bpe-fi.tokenizer.json")
Example #4
def train_kenlm_language_model(input_data_paths, output_model_dir):
    output_model_dir = Path(output_model_dir)
    output_model_dir.mkdir(exist_ok=True, parents=True)
    output_model_path = output_model_dir / 'kenlm_model.arpa'
    with log_action('Training tokenizer'):
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train([str(path) for path in input_data_paths], vocab_size=20000)
        tokenizer.save(str(output_model_dir), 'spm_tokenizer')
    with log_action('Tokenizing'):
        tokenized_data_paths = get_temp_filepaths(len(input_data_paths))
        for tokenized_data_path, input_data_path in zip(tokenized_data_paths, input_data_paths):
            encodings = tokenizer.encode_batch(read_lines(input_data_path))
            write_lines([' '.join(encoding.tokens) for encoding in encodings], tokenized_data_path)
    with log_action('Training language model'):
        kenlm_path = input('Please provide the path to the lmplz script (install at https://github.com/kpu/kenlm): ')
        command = (
            f'cat {" ".join([str(path) for path in tokenized_data_paths])} | {kenlm_path} -o 3 > {output_model_path}'
        )
        run_command(command, mute=False)
    for path in tokenized_data_paths:
        path.unlink()
    return output_model_dir
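Once lmplz has written kenlm_model.arpa, the model can be queried from Python with the kenlm bindings. A brief sketch, assuming `pip install kenlm`, the two-argument tokenizer-save naming used above, and hypothetical paths and sample text:

# Sketch: score held-out text with the ARPA model trained above.
import kenlm
from tokenizers import SentencePieceBPETokenizer

tokenizer = SentencePieceBPETokenizer("output_model_dir/spm_tokenizer-vocab.json",
                                      "output_model_dir/spm_tokenizer-merges.txt")
model = kenlm.Model("output_model_dir/kenlm_model.arpa")
encoding = tokenizer.encode("a held-out sentence")
print(model.score(" ".join(encoding.tokens)))  # log10 probability of the token sequence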
Example #5
    # for prefix in prefixes:
    #     input_dir_gs = os.path.join(
    #         STORAGE_BUCKET,
    #         "data/corpus/%s_lower/zhwiki-latest-pages-articles_%s_lower.txt" % (prefix, prefix)
    #     )
    #     input_dir_local = "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
    #     tf.gfile.Copy(input_dir_gs, input_dir_local, overwrite=True)

    for vocab_size in vocab_sizes:
        for prefix in prefixes:
            try:
                tokenizer_name = prefix + "_" + str(vocab_size)
                tokenizer = SentencePieceBPETokenizer()

                tokenizer.train(
                    [
                        "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
                        # "./zhwiki-latest-pages-articles_lower.txt"
                    ],
                    vocab_size=vocab_size,
                    show_progress=True,
                    min_frequency=1,
                    special_tokens=[
                        "<unk>", "[SEP]", "[CLS]", "[PAD]", "[MASK]"
                    ])
                tokenizer.save("data_proc/tokenizers/sentencepiece",
                               tokenizer_name)

            except Exception as e:
                print(e)
Example #6
import argparse
from tokenizers import SentencePieceBPETokenizer
from tokenizers.trainers import BpeTrainer

parser = argparse.ArgumentParser()

parser.add_argument("--corpus_file", type=str)
parser.add_argument("--vocab_size", type=int, default=32000)
parser.add_argument("--limit_alphabet", type=int, default=6000)

args = parser.parse_args()

# Note: options such as clean_text, handle_chinese_chars, strip_accents and
# wordpieces_prefix belong to BertWordPieceTokenizer, not to
# SentencePieceBPETokenizer, so they are not passed here.
tokenizer = SentencePieceBPETokenizer()

tokenizer.train(files=[args.corpus_file],
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size)

tokenizer.save("./", "ch-{}-wpm-{}".format(args.limit_alphabet,
                                           args.vocab_size))
Example #7
class BPE:
    """
    An implementation of Byte-Pair Encoding (BPE) which supports
    - Character BPE
    - Byte BPE
    - WordPiece BPE
    - SentencePiece BPE
    """
    def __init__(self, args):
        self.args = args
        if self.args.type == "byte":
            self.tokenizer = ByteLevelBPETokenizer(
                add_prefix_space=True,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=None,
                merges_file=None,
                dropout=None,
                continuing_subword_prefix=None,
                end_of_word_suffix=None)

        elif self.args.type == "char":
            self.tokenizer = CharBPETokenizer(
                unk_token=unk_token,  # required
                suffix=suffix_token,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=None,
                merges_file=None,
                dropout=None)

        elif self.args.type == "bert":
            self.tokenizer = BertWordPieceTokenizer(
                clean_text=True,  # required
                handle_chinese_chars=True,  # required
                strip_accents=True,  # required
                lowercase=True,  # required
                vocab_file=None,
                # add_special_tokens=True,
                unk_token=BUNK,
                sep_token=BSEP,
                cls_token=BCLS,
                wordpieces_prefix=BPRE)

        elif self.args.type == "sent":
            self.tokenizer = SentencePieceBPETokenizer(
                add_prefix_space=True,  # required
                unk_token=unk_token,
                replacement=rep_token,
                vocab_file=None,
                merges_file=None,
                dropout=None)

        else:
            raise Exception("Not implemented yet")

    @staticmethod
    def load(vocab_file=None):
        if not os.path.exists(vocab_file):
            raise Exception("{} is not exist".format(vocab_file))
        path, filename = os.path.split(vocab_file)
        ttype = filename.split("_")[0]
        merges_file = os.path.join(
            path, filename.replace("vocab.json", "merges.txt"))
        if ttype == "byte":
            if not os.path.exists(merges_file):
                raise Exception("{} is not exist".format(merges_file))
            tokenizer = ByteLevelBPETokenizer(
                add_prefix_space=True,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None,
                continuing_subword_prefix=None,
                end_of_word_suffix=None)

        elif ttype == "char":
            if not os.path.exists(merges_file):
                raise Exception("{} is not exist".format(merges_file))
            tokenizer = CharBPETokenizer(
                unk_token=unk_token,  # required
                suffix=suffix_token,  # required
                lowercase=True,  # required
                unicode_normalizer=None,  # required
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None)

        elif ttype == "bert":
            tokenizer = BertWordPieceTokenizer(
                clean_text=True,  # required
                handle_chinese_chars=True,  # required
                strip_accents=True,  # required
                lowercase=True,  # required
                vocab_file=vocab_file,
                # add_special_tokens=True,
                unk_token=BUNK,
                sep_token=BSEP,
                cls_token=BCLS,
                wordpieces_prefix=BPRE)

        elif ttype == "sent":
            if not os.path.exists(merges_file):
                raise Exception("{} is not exist".format(merges_file))
            tokenizer = SentencePieceBPETokenizer(
                add_prefix_space=True,  # required
                unk_token=unk_token,
                replacement=rep_token,
                vocab_file=vocab_file,
                merges_file=merges_file,
                dropout=None)

        else:
            raise Exception("Not implement yet")

        return tokenizer

    def train(self):
        files, vocab_size, min_frequency = self.args.files, self.args.vocab_size, self.args.min_frequency
        limit_alphabet = self.args.limit_alphabet
        files = glob.glob(files)
        if not files:
            print(f"File does not exist: {args.files}")
            exit(1)

        if self.args.type == "bert":
            # special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
            special_tokens = [BPAD, BUNK, BCLS, BSEP, BMASK]
        else:
            # special_tokens = ["<unk>"]
            special_tokens = [pad_token, unk_token]

        if self.args.type == "byte":
            self.tokenizer.train(files=files,
                                 vocab_size=vocab_size,
                                 min_frequency=min_frequency,
                                 special_tokens=special_tokens,
                                 show_progress=True)

        elif self.args.type == "char":
            self.tokenizer.train(files=files,
                                 vocab_size=vocab_size,
                                 min_frequency=min_frequency,
                                 special_tokens=special_tokens,
                                 limit_alphabet=limit_alphabet,
                                 initial_alphabet=[],
                                 suffix=suffix_token,
                                 show_progress=True)

        elif self.args.type == "bert":
            self.tokenizer.train(files=files,
                                 vocab_size=vocab_size,
                                 min_frequency=min_frequency,
                                 special_tokens=special_tokens,
                                 limit_alphabet=limit_alphabet,
                                 initial_alphabet=[],
                                 wordpieces_prefix=BPRE,
                                 show_progress=True)

        elif self.args.type == "sent":
            self.tokenizer.train(files=files,
                                 vocab_size=vocab_size,
                                 min_frequency=min_frequency,
                                 special_tokens=special_tokens,
                                 limit_alphabet=limit_alphabet,
                                 initial_alphabet=[],
                                 show_progress=True)

        else:
            raise Exception("Not implement yet")

        if not os.path.exists(self.args.out):
            os.mkdir(self.args.out)
        self.tokenizer.save(self.args.out, self.args.type + "_level-bpe")

    @staticmethod
    def tokens2ids(pretrained_tokenizer,
                   sos=False,
                   eos=False,
                   add_special_tokens=False):
        """
        :param pretrained_tokenizer: pretrained tokenizer
        :return: a token2index function
        """
        def f(sent):
            if sos:
                sent = SOT + " " + sent
            if eos:
                sent = sent + " " + EOT
            tokenized_ids = pretrained_tokenizer.encode(
                sent, add_special_tokens=add_special_tokens).ids
            return tokenized_ids

        return f

    @staticmethod
    def collate_fn(padding_value=0, batch_first=True):
        def collate(examples):
            source = pad_sequence([torch.tensor(d[0]) for d in examples],
                                  batch_first=batch_first,
                                  padding_value=padding_value)
            target = pad_sequence([
                torch.tensor(d[1]) if d[1] is not None else torch.empty(0)
                for d in examples
            ],
                                  batch_first=batch_first,
                                  padding_value=padding_value)
            return source, target

        return collate
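A hedged usage sketch for the helpers above, wiring BPE.load, BPE.tokens2ids and BPE.collate_fn into a PyTorch DataLoader; the vocab path and sentence pairs are illustrative assumptions, not part of the original code:

# Sketch (hypothetical data and paths): pair tokens2ids with collate_fn.
from torch.utils.data import DataLoader

tok = BPE.load("out/sent_level-bpe-vocab.json")  # a previously trained "sent" tokenizer
encode = BPE.tokens2ids(tok)

pairs = [("hello world", "hallo welt"), ("good night", "gute nacht")]
data = [(encode(src), encode(tgt)) for src, tgt in pairs]

loader = DataLoader(data, batch_size=2, collate_fn=BPE.collate_fn(padding_value=0))
for source, target in loader:
    print(source.shape, target.shape)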
Example #8
special_tokens = [
    '[PAD]',
    '[UNK]',
    '[SEP]',
    '[P0]',
    '[P1]',
    '[DOC_SEP]'
]

tokenizer = SentencePieceBPETokenizer(unk_token='[UNK]')

texts = [
    path.join(DATA_PATH, item)
    for item in listdir(DATA_PATH)
    if item.endswith('.txt')
]

tokenizer.train(texts, 
    vocab_size=VOCAB_SIZE, 
    min_frequency=10,
    special_tokens=special_tokens
)

SAVE_PATH = path.join(DATA_PATH, 'vocab')
if not path.isdir(SAVE_PATH):
    import os
    os.makedirs(SAVE_PATH)

tokenizer.save(SAVE_PATH, 'en')
Example #9
# Initialize an empty tokenizer
tokenizer = SentencePieceBPETokenizer(add_prefix_space=True)

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<unk>'],
    limit_alphabet=1000
)

# Save the files
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = SentencePieceBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
    add_prefix_space=True
)

# Test encoding
logger.info('Tokens and their ids from SentencePiece with GFP protein sequence: \n MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT')
encoded = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT')
logger.info(encoded.tokens)
logger.info(encoded.ids)
logger.info('done!')
Example #10
nltk.download('stopwords')
# client = storage.Client()
storage_client = storage.Client()
bucket = storage_client.get_bucket('assignment1bdia')

from tokenizers import SentencePieceBPETokenizer

#Initialize a tokenizer
tokenizer = SentencePieceBPETokenizer()

#Then train it!
tokenizer.train('Trained_words.txt')

#And finally save it somewhere
tokenizer.save("tokenizer", "my-bpe")

#Initialize a tokenizer
vocab = "tokenizer/my-bpe-vocab.json"
merges = "tokenizer/my-bpe-merges.txt"
tokenizer = SentencePieceBPETokenizer(vocab, merges)
global_list = []


# scrape from input to output
class Scrape(beam.DoFn):
    def process(self, element):
        inputs_pattern = 'gs://assignment1bdia/data/export_dataframe.csv'
        df = pd.read_csv(inputs_pattern)
        output_csv = pd.DataFrame(columns=['cik', 'year', 'Filings', 'link'])
        counter = 0
Example #11
class TextProcessor:
    def __init__(self, tok_model_path: Optional[str] = None):
        self.languages = {}
        if tok_model_path is not None:
            self.tokenizer = SentencePieceBPETokenizer(
                tok_model_path + "/vocab.json",
                tok_model_path + "/merges.txt",
            )
            with open(os.path.join(tok_model_path, "langs"), "rb") as fp:
                self.languages: Dict[str, int] = pickle.load(fp)
        self.init_properties(self.languages)

    def init_properties(self, languages: Dict[str, int] = None):
        self.max_len = 512
        self.pad_token = "<pad>"
        self.mask_token = "<mask>"
        self.unk_token = "<unk>"
        self.sep_token = "</s>"
        self.bos = "<s>"
        self.special_tokens = [
            self.pad_token, self.bos, self.unk_token, self.mask_token,
            self.sep_token
        ] + list(languages.keys())
        self.languages = languages

    def train_tokenizer(self, paths: List[str], vocab_size: int,
                        to_save_dir: str, languages: Dict[str, int]):
        self.tokenizer = SentencePieceBPETokenizer()
        self.init_properties(languages)
        self.tokenizer.train(files=paths,
                             vocab_size=vocab_size,
                             min_frequency=5,
                             special_tokens=self.special_tokens)
        self.save(directory=to_save_dir)

    def _tokenize(self, line) -> Encoding:
        return self.tokenizer.encode(line)

    def save(self, directory):
        self.tokenizer.save(directory)
        with open(os.path.join(directory, "langs"), "wb") as fp:
            pickle.dump(self.languages, fp)

    def tokenize_one_line(self,
                          line,
                          ignore_middle_eos: bool = False) -> List[int]:
        tokenized = []
        spl = [sen for sen in line.split("</s>") if len(sen.strip()) > 0]
        if spl[0].startswith("<"):
            words = spl[0].strip().split(" ")
            spl[0] = " ".join(words[1:])
            tokenized += [self.token_id(words[0])]

        for sen in spl:
            tokenized += self._tokenize(sen).ids
            if not ignore_middle_eos:
                tokenized += [self.sep_token_id()]
        if ignore_middle_eos:
            tokenized += [self.sep_token_id()]
        return tokenized

    def tokenize_one_sentence(self, line) -> List[int]:
        """
        Assume that the sentence has language id in the first token and end of sentence as the end!
        :param line:
        :return:
        """
        spl = line.strip().split(" ")
        lang_id, sen, eos = spl[0], " ".join(spl[1:-1]), spl[-1]
        tokenized = [self.token_id(lang_id)
                     ] + self._tokenize(sen).ids + [self.token_id(eos)]
        return tokenized

    def tokenize_lines(self,
                       line,
                       blind_split: bool = False,
                       split_len: int = 512) -> List[List[int]]:
        """

        :param line:
        :param blind_split: If True, just splits the tokenized data into chunks without considering that every vector
        should start with a first word in sentence.
        :return:
        """
        tokenized = []
        if len(self.languages) > 0:
            spl = [sen for sen in line.split("</s>") if len(sen.strip()) > 0]
            lang_id = []
            if spl[0].startswith("<"):
                words = spl[0].strip().split(" ")
                lang_id = [self.token_id(words[0])]
                spl[0] = " ".join(words[1:])

            max_len = 0
            for sen in spl:
                toks = self._tokenize(sen).ids
                tokenized += lang_id + toks + [self.sep_token_id()]
                max_len = max(max_len, len(toks) + 1)
        else:
            tokenized = self._tokenize(line.strip()).ids
            # No sentence boundaries in this branch; fall back to the maximum length.
            max_len = self.max_len

        if blind_split:
            num_pads = (split_len - (len(tokenized) % split_len))
            pad_arr = [self.pad_token_id()] * num_pads
            tokenized = np.array(tokenized + pad_arr)
            reshaped = tokenized.reshape((-1, split_len))
            return reshaped
        else:
            return self.split_tokenized(tokenized, min(max_len, self.max_len))

    def tokenize(self, lines) -> List[List[int]]:
        lines = [
            line.strip() for line in lines.strip().split("\n")
            if len(line.strip()) > 0
        ]
        tokenized = self.tokenizer.encode_batch(lines)
        return [tok.ids for tok in tokenized]

    def pad_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.pad_token)

    def mask_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.mask_token)

    def unk_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.unk_token)

    def bos_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.bos)

    def sep_token_id(self) -> int:
        return self.tokenizer.token_to_id(self.sep_token)

    def token_id(self, token: str) -> int:
        tok_id = self.tokenizer.token_to_id(token)
        if tok_id is None:
            return 0
        return tok_id

    def id2token(self, id: int) -> str:
        return self.tokenizer.id_to_token(id)

    def vocab_size(self) -> int:
        return self.tokenizer.get_vocab_size()

    def is_lang(self, id) -> bool:
        return self.tokenizer.id_to_token(id) in self.languages

    def lang_id(self, tok):
        if tok in self.languages:
            return self.languages[tok]
        return 0

    def split_tokenized(self,
                        tokenized: List[int],
                        max_length: int = 512) -> List[List[int]]:
        """
        Based on self.max_len, splits very long sequences to smaller ones.
        Here we assume to not have any overlapping sequences.
        If the first token is a language, we add it to every new sequence.
        :return:
        """
        if len(tokenized) <= max_length:
            sequences = [tokenized]
            sequences[-1] = sequences[-1] + (
                max_length - len(sequences[-1])) * [self.pad_token_id()]
            return sequences

        has_lang = self.is_lang(tokenized[0])
        sequence = tokenized[0:] if has_lang else tokenized

        seq_len = len(sequence)
        sep_id = self.sep_token_id()
        max_len = max_length - 1 if has_lang else max_length

        cur_start = 0
        sequences = []
        built_seq = []
        truncated = False  # whether the previous sequence was truncated due to its length
        used_ends = set()
        while cur_start < seq_len:
            if not truncated or not has_lang:
                cur_end = min(seq_len, cur_start + max_len)
            else:
                cur_end = min(seq_len, cur_start + max_len + 1)
            subseq = sequence[cur_start:cur_end]

            built_seq += subseq
            sep_positions = [
                i for i, id in enumerate(built_seq) if id == sep_id
            ]
            if len(sep_positions) > 0:
                if sep_positions[-1] in used_ends:
                    truncated = True
                else:
                    built_seq = built_seq[:sep_positions[-1] + 1]
                    truncated = False
            else:
                truncated = True

            assert built_seq[-1] == sequence[len(built_seq) - 1]

            if has_lang and len(subseq) < max_len + 1:
                subseq = [tokenized[0]] + subseq

            sequences.append(subseq)

            cur_start = len(built_seq)
            used_ends.add(cur_start - 1)
        if len(sequences[-1]) < max_length:
            sequences[-1] = sequences[-1] + (
                max_length - len(sequences[-1])) * [self.pad_token_id()]
        assert built_seq[-1] == sequence[len(built_seq) - 1]
        return sequences
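A hedged end-to-end sketch of the TextProcessor workflow above: train and save a tokenizer, then reload it and tokenize a line. Paths, language codes, and the sample text are illustrative assumptions, and the save/load step relies on the older tokenizers save(directory) behavior that the class itself expects:

# Sketch (hypothetical paths and languages): train, save, reload, tokenize.
import os

os.makedirs("models/tok", exist_ok=True)
tp = TextProcessor()
tp.train_tokenizer(paths=["data/train.en.txt", "data/train.fr.txt"],
                   vocab_size=32000,
                   to_save_dir="models/tok",
                   languages={"<en>": 0, "<fr>": 1})

tp = TextProcessor(tok_model_path="models/tok")
print(tp.tokenize_one_line("<en> Hello world. </s>"))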