Example #1
    def __init__(self, **kwargs):
        super().__init__()
        self.save_hyperparameters()
        self.tokenizer = Tokenizer.from_file(self.hparams.tokenizer_file)
        self.tokenizer.add_special_tokens(["<s>", "</s>"])
        vocab_size = self.tokenizer.get_vocab_size()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.hparams.embedding_dim)
        current_input_dim = self.hparams.embedding_dim

        if self.hparams.use_transformer:
            self.pos_encoder = PositionalEncoding(self.hparams.embedding_dim, 0.2)
            encoder_layer = nn.TransformerEncoderLayer(d_model=self.hparams.embedding_dim, nhead=8)
            self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
            self.dropout = nn.Dropout(p=0.2)
            self.output = nn.Linear(self.hparams.embedding_dim, out_features=len(streaming_punctuator.data.PUNCTUATIONS))

        else:
            if len(self.hparams.conv_dims) > 0:
                self.conv_dims = [int(s) for s in self.hparams.conv_dims.split(",")]
                self.conv_sizes = [int(s) for s in self.hparams.conv_sizes.split(",")]
                self.conv_dilations = [int(s) for s in self.hparams.conv_dilations.split(",")]
                assert len(self.conv_dims) == len(self.conv_sizes)
                assert len(self.conv_dims) == len(self.conv_dilations)

                conv_layers = []
                current_input_dim = self.hparams.embedding_dim
                for i in range(len(self.conv_dims)):
                    conv_layer = CausalConv1d(current_input_dim, self.conv_dims[i], self.conv_sizes[i], 
                                        dilation=self.conv_dilations[i], 
                                        bias=False)                                
                    conv_layers.append(conv_layer)            
                    conv_layers.append(nn.BatchNorm1d(self.conv_dims[i], affine=False))
                    conv_layers.append(nn.ReLU(inplace=True))
                    current_input_dim = self.conv_dims[i]
                self.conv_layers = nn.Sequential(*conv_layers)
            else:
                self.conv_layers = None

            self.lstm = nn.LSTM(input_size=current_input_dim, 
                    hidden_size=self.hparams.lstm_hidden_size,
                    bidirectional=self.hparams.use_bidirectional,
                    num_layers=self.hparams.lstm_num_layers,
                    batch_first=True)
            self.dropout = nn.Dropout(p=0.2)
            lstm_output_size = self.hparams.lstm_hidden_size + self.hparams.lstm_hidden_size * self.hparams.use_bidirectional
            self.output = nn.Linear(lstm_output_size, out_features=len(streaming_punctuator.data.PUNCTUATIONS))
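# CausalConv1d and PositionalEncoding above are project-specific helpers that are not
# part of this snippet. As a rough, hypothetical sketch (not the project's actual code),
# a causal 1-D convolution is commonly built as a Conv1d with left-only padding of
# (kernel_size - 1) * dilation, so the output at step t never sees inputs after t:
import torch.nn as nn
import torch.nn.functional as F

class CausalConv1d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, dilation=1, bias=True):
        super().__init__()
        self.left_pad = (kernel_size - 1) * dilation
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size,
                              dilation=dilation, bias=bias)

    def forward(self, x):
        # x: (batch, channels, time); pad only the past, then convolve
        return self.conv(F.pad(x, (self.left_pad, 0)))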
Example #2
    def test_line2seqs_label_delay(self):
        tokenizer = Tokenizer.from_file("test/tokenizer.json") 
        tokenizer.add_special_tokens(["<s>", "</s>"])
        token_seq, label_seq = line2seqs("tere ! minu nimi on Baabuu .", tokenizer, label_delay=2)
        print([tokenizer.id_to_token(i) for i in token_seq])
        print(label_seq)
        self.assertEqual(len(token_seq), len(label_seq))
        self.assertEqual([tokenizer.id_to_token(i) for i in token_seq], ['<s>', 'tere</w>', 'minu</w>', 'nimi</w>', 'on</w>', 'Baa', 'bu', 'u</w>', '</s>', '</s>', '</s>'])
        self.assertEqual(label_seq, [-1, -1, -1, 4, 0, 0, 0, -1, -1, 2, -1])


        token_seq, label_seq = line2seqs("tere ! minu nimi on Baabuu .", tokenizer, label_delay=0)
        print([tokenizer.id_to_token(i) for i in token_seq])
        print(label_seq)
        self.assertEqual(len(token_seq), len(label_seq))
        self.assertEqual([tokenizer.id_to_token(i) for i in token_seq], ['<s>', 'tere</w>', 'minu</w>', 'nimi</w>', 'on</w>', 'Baa', 'bu', 'u</w>', "</s>"])
        self.assertEqual(label_seq, [-1, 4, 0, 0, 0, -1, -1, 2, -1])
Example #3
def fetch_encoder(params):
    no_dataset = params.get('no_dataset', False)
    if no_dataset:
        return None

    # Get the first value from the dataset_configs dict
    dataset = next(iter(params['dataset_configs'].values()))
    path = dataset["tokenizer_path"]
    is_pretrained = dataset.get("tokenizer_is_pretrained", False)

    if is_pretrained:
        tok = GPT2TokenizerFast.from_pretrained(path)

        # Will add a padding token id of 50257 at run-time
        tok.add_special_tokens({'pad_token': '<|padding|>'})
        return tok

    return Tokenizer.from_file(path)
Example #4
def init_tokenizer(lang, n, m):
    if n is None and m is None:
        print('neither a size nor a model was specified, but one of them is required')
        exit(1)

    if m is not None:
        tokenizer = AutoTokenizer.from_pretrained(m, use_fast=True)
        return tokenizer

    tokenizer = Tokenizer.from_file(
        str(
            Path('data') / lang / 'preparation' / 'vocabularies' /
            f'{lang}-{str(n).zfill(3)}k.tokenizer.json'))
    tokenizer.post_processor = RobertaProcessing(
        ('</s>', tokenizer.token_to_id('</s>')),
        ('<s>', tokenizer.token_to_id('<s>')),
        trim_offsets=True)
    return tokenizer
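# Hypothetical usage sketch of init_tokenizer, assuming the 8k 'nld' vocabulary file
# exists under data/nld/preparation/vocabularies/. With RobertaProcessing attached,
# every encoded sequence comes back wrapped in <s> ... </s>:
tok = init_tokenizer('nld', 8, None)
enc = tok.encode("een voorbeeld")
print(enc.tokens[0], enc.tokens[-1])  # expected: <s> </s>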
Example #5
    def test_dataloader(self):
        tokenizer = Tokenizer.from_file("test/tokenizer.json") 
        tokenizer.add_special_tokens(["<s>", "</s>"])

        dataset = PunctuationDataset(tokenizer, "test/dev.txt")
        
        batch_size = 8

        random_sampler = RandomSampler(dataset)
        
        batch_iterator = BucketBatchSampler(random_sampler, batch_size=batch_size, drop_last=False, sort_key=lambda x: dataset[x]["length"], bucket_size_multiplier=100)
        dataloader = torch.utils.data.DataLoader(dataset, batch_sampler=batch_iterator, collate_fn=dataset.collate_batch)

        for i in range(2):
            print(f"Testing epoch {i}")
            for j, batch in enumerate(dataloader):
                if j == 0:
                    # make sure that the length difference inside a batch is not > 20%
                    self.assertTrue((batch["lengths"].max() - batch["lengths"].min()) / batch["lengths"].max() < 0.2 )
Example #6
def bleu_eval(args, references):
    """
    BLEU-1 over tokenized SMILES strings
    """
    print("Loading Tokenizer: {}.".format(args.tokenizer))
    tokenizer = Tokenizer.from_file(args.tokenizer)
    scores = []
    for smi in references:
        cur_scores = []
        reference = tokenizer.encode(smi)
        for smi2 in references:
            if smi2 != smi:
                candidate = tokenizer.encode(smi2)
                cur_scores.append(
                    sentence_bleu([reference.tokens],
                                  candidate.tokens,
                                  weights=(1.0, 0, 0, 0)))
        scores.append(np.mean(cur_scores))
    return round(np.mean(scores), 4)
Example #7
    def __init__(self, *args, **kwargs):
        tokenizer_object = kwargs.pop("tokenizer_object", None)
        slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
        fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
        from_slow = kwargs.pop("from_slow", False)

        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
            raise ValueError(
                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
                "have sentencepiece installed."
            )

        if tokenizer_object is not None:
            fast_tokenizer = tokenizer_object
        elif fast_tokenizer_file is not None and not from_slow:
            # We have a serialization from tokenizers which let us directly build the backend
            fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
        elif slow_tokenizer is not None:
            # We need to convert a slow tokenizer to build the backend
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        elif self.slow_tokenizer_class is not None:
            # We need to create and convert a slow tokenizer to build the backend
            slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
            fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
        else:
            raise ValueError(
                "Couldn't instantiate the backend tokenizer from one of: \n"
                "(1) a `tokenizers` library serialization file, \n"
                "(2) a slow tokenizer instance to convert or \n"
                "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
                "You need to have sentencepiece installed to convert a slow tokenizer to a fast one."
            )

        self._tokenizer = fast_tokenizer

        if slow_tokenizer is not None:
            kwargs.update(slow_tokenizer.init_kwargs)

        self._decode_use_source_tokenizer = False

        # We call this after having initialized the backend tokenizer because we update it.
        super().__init__(**kwargs)
Example #8
    def __init__(self, datapath):
        self.training = True
        with open(os.path.join(datapath, 'entity2wikidata.json'), 'r') as f:
            self.entity2wiki = json.load(f)

        with open(os.path.join(datapath, 'relation2wikidata.json'), 'r') as f:
            self.relation2wiki = json.load(f)

        self.relation2wiki['[UNK]'] = {
            'label': '[UNK]',
            'alternatives': [],
        }
        self.entity2wiki['[UNK]'] = {
            'label': '[UNK]',
            'alternatives': [],
        }

        # the 'tokenizer.json' filename is an assumption; the tokenizer file is expected alongside the other data files
        self.tokenizer = Tokenizer.from_file(os.path.join(datapath, 'tokenizer.json'))
        self.pad_token_id = self.tokenizer.token_to_id('[PAD]')

        self.id2entity = {
            value: key
            for key, value in torch.load(os.path.join(datapath,
                                                      'entity2id.pt')).items()
        }
        self.id2entity[len(self.id2entity)] = '[UNK]'

        self.entity2tokens = torch.from_numpy(
            np.array(
                [self.get_entity(idx) for idx in range(len(self.id2entity))]))

        self.id2relation = {
            value: key
            for key, value in torch.load(os.path.join(datapath,
                                                      'rel2id.pt')).items()
        }
        self.id2relation[len(self.id2relation)] = '[UNK]'

        self.relations2tokens = torch.from_numpy(
            np.array([
                self.get_relation(idx) for idx in range(len(self.id2relation))
            ]))
Example #9
def preprocess(fp, suffix, tokenizer):
    tokenizer = Tokenizer.from_file(tokenizer)
    dps_outfile = "output/{}_dps.txt".format(suffix)
    ids_outfile = "output/{}_ids.txt".format(suffix)
    num = 0
    with open(fp) as fin, \
            open(dps_outfile, "w") as fout_dps, \
            open(ids_outfile, "w") as fout_ids:
        for i, line in enumerate(file_tqdm(fin)):
            dp = json.loads(line.strip())
            asts, ids = split(dp, 1000, tokenizer)
            for j, (ast, extended) in enumerate(asts):
                if len(ast) > 1:
                    json.dump([ast, extended], fp=fout_dps)
                    json.dump(ids[j], fp=fout_ids)
                    fout_dps.write("\n")
                    fout_ids.write("\n")
                    num += 1
    logging.info("Wrote {} datapoints to {} and {}".format(
        num, ids_outfile, dps_outfile))
Example #10
    def initialize(self, context):
        self.context = context
        model_dir = context.system_properties.get("model_dir")
        serialized_file = context.manifest["model"]["serializedFile"]
        model_pt_path = model_dir + "/" + serialized_file
        state_dict = torch.load(model_pt_path)

        self.initialized = True

        #self.model          = HTSClassifier().eval()
        #self.model.load_state_dict(state_dict)

        #checkpoint = torch.load(state_dict)
        #self.model.load_state_dict(checkpoint['state_dict'])
        self.model = torch.jit.load(model_pt_path)

        self.tokenizer = Tokenizer.from_file(model_dir + "/tokenizer.json")
        self.padding_length = 64
        self.num_samples = 5
        with open(model_dir + '/index_to_name.pkl', 'rb') as f:
            self.label_enc = pickle.load(f)
Example #11
def load_pretrained_tokenizer(
        tokenizer_file: str,
        cache_dir: Optional[str] = None) -> PreTrainedTokenizerFast:
    """Load BertWordPieceTokenizer from tokenizer.json.
    This is necessary for the following reasons:
    - BertWordPieceTokenizer cannot load from tokenizer.json via .from_file() method
    - Tokenizer.from_file(tokenizer_file) cannot be used because MecabPretokenizer is not a valid native PreTokenizer.
    """
    tokenizer = Tokenizer.from_file(tokenizer_file)
    tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())

    tokenizer_dir = os.path.dirname(tokenizer_file)
    pt_tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
        tokenizer_dir,
        cache_dir=cache_dir,
    )

    # This is necessary for pt_tokenizer.save_pretrained(save_path)
    pt_tokenizer._tokenizer = tokenizer

    return pt_tokenizer
Example #12
def main(args):
    # Load Selfies vocabulary
    idx2selfies, selfies2idx = load_selfies_vocab(args.selfies_vocab)
    # Load Tokenizer
    print('Loading Tokenizer: {}.'.format(args.tokenizer))
    tokenizer = Tokenizer.from_file(args.tokenizer)
    print('Testing with SMILES String: {}'.format(args.test_string))
    encoding = tokenizer.encode(args.test_string)
    print('Encoded string: {}'.format(encoding.tokens))
    decoded = tokenizer.decode(encoding.ids)
    print('Decoded string: {}'.format(decoded))
    print('Tokenizer Loaded.')

    # Create tokenized captions
    print("Creating JSON")
    create_tokenized_smiles_json(tokenizer, args.data_dir, args.data_split, args.config_output_name, args.max_length, args.label_filename, idx2selfies, selfies2idx)
    print("JSON created")

    # Save Images and processed Captions
    print("Processing and Saving Images")
    create_input_files(args, args.data_dir, args.config_output_name, args.image_output_filename, args.output_path, args.img_size)
    print("Done processing dataset")
Example #13
    def __init__(self,
                 keypoints_file,
                 text_file,
                 max_frames,
                 transform,
                 selection,
                 use_rand_tokens=False):

        self.keypoints_data = h5py.File(keypoints_file, "r")

        self.utt_texts = {}
        with open(text_file) as f:
            for line in f:
                parts = line.strip().split(" ")
                utt_id = parts[0]
                self.utt_texts[utt_id] = " ".join(parts[1:])

        text_utt_ids = set(self.utt_texts.keys())
        keypoints_utt_ids = list(self.keypoints_data.keys())
        self.utt_ids = list(text_utt_ids.intersection(keypoints_utt_ids))

        print("IDs in text file:\t", len(text_utt_ids))
        print("IDs in keypoints file:\t", len(keypoints_utt_ids))
        print("IDs in both files:\t", len(self.utt_ids))

        self.max_frames = max_frames

        self.transform = transform

        self.tokenizer = Tokenizer.from_file("tokenizer_models/tokenizer.json")

        self.random_tokens = torch.randint(0, 999, (40, ), dtype=torch.long)

        self.use_rand_tokens = use_rand_tokens

        self.selection = selection
Example #14
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
Example #15
def main():
    parser = argparse.ArgumentParser(description="Train GPT2 Model")
    parser.add_argument("--batch_size", type=int, default=4, help="Specify batch size")
    parser.add_argument("--num_epoch", type=int, default=3, help="Specify number of epochs")
    parser.add_argument("--learning_rate", type=float, default=5e-5, help="Specify AdamW learning rate")

    args = parser.parse_args()

    tokenizer = Tokenizer.from_file("output/tokenizer.json")
    dataset = Dataset("output/train_rq4_dps.txt")

    model = TransformerModel(
        tokenizer.get_vocab_size(),
        CrossEntropyLoss(ignore_index=tokenizer.encode("[PAD]").ids[0]),
        6,
        300,
        1000,
        6,
        1e-05
    )

    training_args = TrainingArgs(
        batch_size = args.batch_size,
        num_epoch = args.num_epoch,
        output_dir = "output",
        optimizer = AdamW(model.parameters(), lr=args.learning_rate),
        save_model_on_epoch = False
    )

    trainer = Trainer(
        model,
        dataset,
        tokenizer,
        training_args
    )

    trainer.train()
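# Note (sketch): tokenizer.encode("[PAD]").ids[0] above relies on "[PAD]" being encoded
# as a single token. When that assumption holds, the tokenizers library offers a direct
# lookup that yields the same id:
pad_id = tokenizer.token_to_id("[PAD]")
loss_fn = CrossEntropyLoss(ignore_index=pad_id)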
Example #16
    def __init__(self,
                 captions_file,
                 tokenizer_path,
                 keys,
                 res=128,
                 text_context_len=64):

        with open(captions_file, "r") as f:
            self.data = json.load(f)
        self.image_keys = list(self.data.keys())
        self.indices = keys

        # image related
        self.t = transforms.Compose(
            [transforms.Resize((res, res)),
             transforms.ToTensor()])

        # text related
        self.textlen = text_context_len
        self.tok = Tokenizer.from_file(tokenizer_path)
        self.text_end_id = self.tok.get_vocab()["<|endoftext|>"]
        self.image_end_id = self.tok.get_vocab()["<|endofimage|>"]

        print("Tokenizer loaded with vocab size:", self.tok.get_vocab_size())
Example #17
def main():
    parser = ArgumentParser()
    parser.add_argument('lang', choices=['nld', 'ita'])
    parser.add_argument('models', nargs='+')
    parser.add_argument('--src', default='small', choices=['full', 'small'])
    parser.add_argument('--file', default='full')
    parser.add_argument('-n', default=5, type=int)
    parser.add_argument('-f', '--force', action='store_true')
    args = parser.parse_args()

    base_path = Path(
        'data') / args.lang / 'evaluation' / 'examples' / args.src / args.file

    src_path = base_path / 'gold.txt'
    if not src_path.exists():
        print(f' > gold path {src_path} does not exist')
        exit(1)

    print(' > loading tokenizer')
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    if args.lang == 'ita':
        tokenizer = GPT2TokenizerFast.from_pretrained(
            'LorenzoDeMattei/GePpeTto')
    else:
        tokenizer_path = Path(
            'data') / args.lang / 'vocabularies' / 'tokenizer.json'
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
        args.n += 1

    print(f' > loading examples from {src_path}')
    examples = []
    with open(src_path) as f:
        for line in f:
            token_ids = tokenizer.encode(line.strip())
            if not isinstance(token_ids, list):
                token_ids = [0] + token_ids.ids
            examples.append(token_ids[:args.n])
    print(f' > loaded {len(examples)} examples')

    for model_name in args.models:
        tgt_path = base_path / f'{model_name.replace("/", "_")}.txt'
        if not args.force and tgt_path.exists():
            print(f'{tgt_path} already exists. skipping')
            continue

        model_path = Path('data') / args.lang / 'models' / model_name
        if not model_path.exists():
            model_path = model_name

        print(f' > loading model {model_path}')
        model = GPT2LMHeadModel.from_pretrained(model_path).cuda()
        model.eval()

        print(' > generating endings for examples')
        generated = [
            generate(input_ids, model, tokenizer)
            for input_ids in tqdm(examples, ncols=80)
        ]
        with open(tgt_path, 'w') as f:
            f.writelines(generated)

        print(f'\nsaved to {tgt_path}')
Example #18
from tokenizers import Tokenizer
import sys
import pickle
import numpy as np
from build_bpe import cleanup
import os

tokenizer = Tokenizer.from_file("bpe-fi.tokenizer.json")

print(tokenizer)
#dfolder = "../../Data/wiki/fi/"
dfolder = "../../Data/finovels/"
files = os.listdir(dfolder)

print("Read files from", dfolder)
print("...")
#s = open(dpath).read().lower()

lines = []

for dpath in files:
    with open(dfolder + dpath) as f:
        print("File:", dpath)

        for line in f:
            clean_line = cleanup(line)
            lines.append(clean_line)

#print("Encode", s[:100], len(s))
print("ENCODE")
encoded_l = tokenizer.encode_batch(lines)
Example #19
def load_jieba_tokenizer(tokenizer_path) -> Tokenizer:
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
    tokenizer.decoder = Decoder.custom(JiebaDecoder())
    return tokenizer
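# JiebaPreTokenizer and JiebaDecoder are custom components not shown in this snippet.
# As a hypothetical sketch following the custom pre-tokenizer pattern from the
# tokenizers documentation, the pre-tokenizer only needs a pre_tokenize(self, pretok)
# method that splits the PreTokenizedString into jieba tokens:
import jieba
from tokenizers import NormalizedString, PreTokenizedString

class JiebaPreTokenizer:
    def jieba_split(self, i: int, normalized: NormalizedString):
        # Return one NormalizedString slice per jieba token.
        return [normalized[start:stop]
                for _, start, stop in jieba.tokenize(str(normalized))]

    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.jieba_split)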
Example #20
def fetch_encoder(config: EncoderConfig):
    if config.is_pretrained:
        return GPT2TokenizerFast.from_pretrained(config.location)

    return Tokenizer.from_file(config.location)
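# Callers of fetch_encoder have to handle two return types: GPT2TokenizerFast returns a
# plain list of ids from encode(), while a tokenizers.Tokenizer returns an Encoding whose
# ids live in .ids (Example #17 above handles exactly this case). A hedged usage sketch,
# where `config` is a hypothetical EncoderConfig instance:
enc = fetch_encoder(config)
out = enc.encode("some text")
ids = out if isinstance(out, list) else out.ids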
Example #21
        return torch.tensor(self.examples[i])


configuration = BertConfig()
model = BertModel(configuration)
configuration = model.config

#tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
#trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()
#files = ['./processed_wiki_ko.txt']
#tokenizer.train(files=files, trainer=trainer)

#tokenizer = Tokenizer.from_file("./wiki_tokenizer.json")
#fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="wiki_tokenizer.json")
tokenizer = Tokenizer.from_file("./wiki_tokenizer.json")
tokenizer.enable_truncation(max_length=512)

#tokenizer._tokenizer.post_processor = BertProcessing(
#        single="[CLS] $A [SEP]",
#        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
#        special_tokens=[
#             ("[CLS]", tokenizer.token_to_id("[CLS]")),
#             ("[SEP]", tokenizer.token_to_id("[SEP]")),
#        ],
#)

tokenizer.post_processor = BertProcessing(
    sep=("[SEP]", tokenizer.token_to_id("[SEP]")),
    cls=("[CLS]", tokenizer.token_to_id("[CLS]")),
)
Example #22
dataset = build_dataset.TrainDataset(filepath)
dataLoader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=0)

config = BertConfig(
    vocab_size=249,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = Tokenizer.from_file(
    "/home/ubuntu/BERT-GAN/BERT_GAN/bAbI_tokenizer.json")

model = BertForMaskedLM(config=config)

training_args = TrainingArguments(
    output_dir="/home/ubuntu/BERT-GAN/BERT_GAN/bAbibert_model",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
Example #23
def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' %
              tpath_expected)

    if train:

        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if its the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/

        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()

        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece()  # WordPiece seems to work (adds back spaces)

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi) ---> ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                         dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab
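# Sketch of the tokenizers calls that cover most of the legacy Vocab features listed in
# the TODO comment above (the file path here is hypothetical; use whatever
# train_tokenizer_vocab saved to the tokenizers directory):
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("tokenizers/example.tokenizer.json")
ntokens = tokenizer.get_vocab_size()           # replaces len(vocab.stoi)
ids = tokenizer.encode("some raw text").ids    # replaces [vocab[token] for token in ...]
first_token = tokenizer.id_to_token(ids[0])    # replaces vocab.itos[idx]
unk_index = tokenizer.token_to_id("[UNK]")     # replaces vocab.unk_index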
Example #24
def main():
    parser = ArgumentParser()
    parser.add_argument('lang')
    parser.add_argument('model')
    parser.add_argument('-n', type=int, default=None)
    args = parser.parse_args()

    with open(Path('data') / args.lang / 'config.json') as f:
        cfg = json.load(f)

    model_path = Path('data') / args.lang / 'models' / args.model

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Using device:', device)

    os.environ['TOKENIZERS_PARALLELISM'] = str(False)
    # tokenizer_tgt = Tokenizer.from_file('tgt.tokenizer.json')
    if args.lang == 'ita':
        tokenizer_tgt = GPT2Tokenizer.from_pretrained(
            'LorenzoDeMattei/GePpeTto')
    else:
        tokenizer_tgt = Tokenizer.from_file(
            str(
                Path('data') / args.lang / 'preparation' / 'vocabularies' /
                'tokenizer.json'))

    # model: GPT2LMHeadModel = EmbeddingTunerModel.load_from_checkpoint(model_path).m
    model = GPT2LMHeadModel.from_pretrained(str(model_path))
    model.to(device)

    if args.n is not None:
        tokenizer_eng = GPT2Tokenizer.from_pretrained('gpt2')

        dict_path = Path(
            'data') / args.lang / 'dictionaries' / f'{args.model}.tsv'
        with open(dict_path) as f_map:
            token_id_map = [
                tokenizer_eng.convert_tokens_to_ids(
                    line.strip().split('\t')[1]) for line in f_map
            ]

        print(f'generating {args.n:,} random texts (unconditioned)')

        out_dir = Path('data') / args.lang / 'results' / 'examples'
        os.makedirs(out_dir, exist_ok=True)
        name = str(int(time()))

        tgt_out_path = out_dir / f'{name}.{args.lang}.txt'
        src_out_path = out_dir / f'{name}.eng.txt'

        print(
            f'generating {args.n} {args.lang} examples to {tgt_out_path} [{src_out_path}]'
        )
        with open(tgt_out_path, 'w') as f_tgt, open(src_out_path,
                                                    'w') as f_eng:
            for i, (tgt, eng) in enumerate(
                    gen(tokenizer_tgt,
                        model,
                        device,
                        n=args.n,
                        tokenizer_eng=tokenizer_eng,
                        token_id_map=token_id_map,
                        cfg=cfg)):
                print(f'{i:,}/{args.n:,}')
                f_tgt.write(tgt + '\n\n')
                f_eng.write(eng + '\n\n')

        return

    while True:
        print('\n##########################################')
        prompt = input(' > ').strip()

        for txt in gen(tokenizer_tgt, model, device, prompt, cfg=cfg):
            print('\n' + txt)
Example #25
    def test_full_serialization_albert(self, albert_base):
        # Check we can read this file.
        # This used to fail because of a BufReader that would fail when the
        # file exceeds the buffer capacity.
        tokenizer = Tokenizer.from_file(albert_base)
Example #26
test_corpus = LabeledCorpus(test_file)

train_inputs = []
train_targets = []

for label, doc in train_corpus:
    train_targets.append(label)
    train_inputs.append(doc)

test_inputs = []
test_targets = []
for label, doc in test_corpus:
    test_targets.append(label)
    test_inputs.append(doc)

tokenizer = Tokenizer.from_file("rust_tokenizer.json")
VOCAB_SIZE = len(tokenizer.get_vocab())

config = {
    "experiment_name": "imdb_lstm",
    "model_config": {
        "output_dim": 2,
        "vocab_size": VOCAB_SIZE,
        "hidden_dim": 200
    },
    "random_seed": 42,
    "iterator_type": "padded_iterator",
    "loss": "cross_entropy",
    "optimizer": "adam",
    "learning_rate": 0.0001,
    "regularization": "l2",
Example #27
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")

        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)

    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"],
                                   max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"],
                                 max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # Pad
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)

    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
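# A library-native alternative (sketch) to stashing pad_token_id as an attribute is to
# register padding on the tokenizer itself. This relies on the same assumption as
# `tokenizer.pad_token_id = vocab_size` above, namely that the trained vocab ended up
# with exactly vocab_size entries so the added "[PAD]" token got id == vocab_size:
tokenizer.enable_padding(pad_id=vocab_size, pad_token="[PAD]")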
Example #28
def cli_main(args=None):

    pl.seed_everything(42)

    parser = ArgumentParser()
    parser.add_argument("--checkpoint", required=False, type=str)
    parser.add_argument("--strict", default=False, action='store_true')
    parser.add_argument("--name", type=str, required=True)

    parser.add_argument("--early_stopping_monitor",
                        type=str,
                        default='tokens_matched_accuracy')
    parser.add_argument("--early_stopping_mode", type=str, default='max')
    parser.add_argument("--early_stopping_min_delta",
                        type=float,
                        default=0.001)
    parser.add_argument("--early_stopping_patience", type=int, default=3)

    parser.add_argument('--tokenizer',
                        help='path to pretrained tokenizer',
                        type=str,
                        required=True)
    parser.add_argument('--dataset',
                        help='datasets dataset name',
                        type=str,
                        required=True)
    parser.add_argument('--languages',
                        help='dataset languages to tokenize',
                        type=str,
                        required=True)

    parser.add_argument('--blm_class',
                        help='Bert Lightning Module to train',
                        type=str)

    dm_class = WMT20DataModule
    parser = dm_class.add_argparse_args(parser)
    parser = BertLightningModule.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)

    args = parser.parse_args(args)

    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    tokenizer = Tokenizer.from_file(args.tokenizer)
    # [UNK],[SEP],[PAD],[MASK],[ECHO],[TRANSLATE]
    special_tokens = [
        '[UNK]',
        '[PAD]',
        '[TRANSLATE]',
        '[ECHO]',
        '[MASK]',
        '[SEP]',
    ]
    assert tokenizer.add_special_tokens(
        special_tokens
    ) == 0, f'one of special tokens not in tokenizer: {special_tokens}'

    dm = dm_class.from_argparse_args(args,
                                     tokenizer=tokenizer,
                                     dataset=args.dataset,
                                     languages=args.languages,
                                     device='cuda')
    dm.setup()

    assert dm.device == 'cuda'

    if args.max_steps == -1:
        args.max_steps = None

    blm_class = BertLightningModule
    if args.blm_class == 'BertTranslateLightningModule':
        blm_class = BertTranslateLightningModule
    elif args.blm_class == 'BertLightningModule':
        blm_class = BertLightningModule
    elif args.blm_class == 'BertIELightningModule':
        blm_class = BertIELightningModule
    else:
        raise ValueError("unknown blm_class")

    if args.checkpoint is not None:
        print("Restoring from checkpoint", args.checkpoint)
        bert_model = blm_class.load_from_checkpoint(args.checkpoint,
                                                    strict=args.strict)
        bert_model.hparams.noam_scaler = args.noam_scaler
        bert_model.hparams.lr = args.lr
        bert_model.hparams.noam_opt_warmup_steps = args.noam_opt_warmup_steps
        bert_model.hparams.scheduler = args.scheduler
        bert_model.hparams.scheduler_patience = args.scheduler_patience
        bert_model.hparams.noam_step_factor = args.noam_step_factor
        bert_model.tokenizer = tokenizer

    else:
        args_dict = vars(args)
        lightning_module_args = {
            k: args_dict[k]
            for k in args_dict.keys() if args_dict[k] is not None
        }
        lightning_module_args['tokenizer'] = tokenizer
        bert_model = blm_class(**lightning_module_args)

    trainer_logger = pl.loggers.TensorBoardLogger("lightning_logs",
                                                  name=args.name)
    early_stop_callback = pl.callbacks.EarlyStopping(
        monitor=args.early_stopping_monitor,
        mode=args.early_stopping_mode,
        min_delta=args.early_stopping_min_delta,
        patience=args.early_stopping_patience,
        verbose=True,
    )
    trainer = pl.Trainer.from_argparse_args(args,
                                            logger=trainer_logger,
                                            callbacks=[early_stop_callback])
    trainer.fit(bert_model, datamodule=dm)
    return dm, bert_model, trainer
Example #29
def get_tokenizer(args):
    if args.encoder_path is None:
        return GPT2TokenizerFast.from_pretrained('gpt2')
    else:
        return Tokenizer.from_file(args.encoder_path)
Example #30
def main(gpu, params):
    """ Loads the dataset and trains the model."""
    rank = params.nr * params.gpus + gpu
    if params.distributed:
        dist.init_process_group(backend='nccl',
                                init_method='env://',
                                world_size=params.world_size,
                                rank=rank)
    seed_all(SEED)

    # get gpu device
    if params.device == 'gpu':
        device = torch.device(gpu)
    else:
        device = 'cpu'

    # only wandb on main process
    if rank == 0 and params.wandb:
        wandb.init(project='mnmt',
                   entity='nlp-mnmt-project',
                   config={
                       k: v
                       for k, v in params.__dict__.items()
                       if isinstance(v, (float, int, str))
                   })
        config = wandb.config
    logger, params = setup(params)

    # load data and train for required experiment
    if len(params.langs) == 2:
        # bilingual translation

        # load tokenizers if continuing
        if params.checkpoint:
            tokenizers = []
            for lang in params.langs:
                tokenizers.append(
                    Tokenizer.from_file(logger.root_path + '/' + lang +
                                        '_tokenizer.json'))
        else:
            if params.tokenizer is not None:
                if len(params.tokenizer) == 2:
                    tokenizers = [
                        Tokenizer.from_file('pretrained/' + tok + '.json')
                        for tok in params.tokenizer
                    ]
                else:
                    print('Wrong number of tokenizers passed. Retraining.')
                    tokenizers = None
            else:
                tokenizers = None

        train_dataloader, val_dataloader, test_dataloader, _ = preprocess.load_and_preprocess(
            params.langs,
            params.batch_size,
            params.vocab_size,
            params.dataset,
            multi=False,
            path=logger.root_path,
            tokenizer=tokenizers,
            distributed=params.distributed,
            world_size=params.world_size,
            rank=rank)

        train(rank,
              device,
              logger,
              params,
              train_dataloader,
              val_dataloader=val_dataloader,
              verbose=params.verbose)

    elif len(params.langs) > 2:
        # multilingual translation

        # load tokenizers if continuing
        if params.checkpoint:
            tokenizer = Tokenizer.from_file(logger.root_path +
                                            '/multi_tokenizer.json')
        else:
            if params.tokenizer is not None:
                tokenizer = Tokenizer.from_file('pretrained/' +
                                                params.tokenizer + '.json')
            else:
                tokenizer = None

        train_dataloader, val_dataloader, test_dataloader, tokenizer = preprocess.load_and_preprocess(
            params.langs,
            params.batch_size,
            params.vocab_size,
            params.dataset,
            multi=True,
            path=logger.root_path,
            tokenizer=tokenizer,
            distributed=params.distributed,
            world_size=params.world_size,
            rank=rank)

        train(rank,
              device,
              logger,
              params,
              train_dataloader,
              val_dataloader=val_dataloader,
              tokenizer=tokenizer,
              verbose=params.verbose)

    else:
        raise NotImplementedError

    # end wandb process to avoid hanging
    if rank == 0 and params.wandb:
        wandb.finish()