def __init__(self, **kwargs):
    super().__init__()
    self.save_hyperparameters()

    self.tokenizer = Tokenizer.from_file(self.hparams.tokenizer_file)
    self.tokenizer.add_special_tokens(["<s>", "</s>"])
    vocab_size = self.tokenizer.get_vocab_size()

    self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                  embedding_dim=self.hparams.embedding_dim)
    current_input_dim = self.hparams.embedding_dim

    if self.hparams.use_transformer:
        self.pos_encoder = PositionalEncoding(self.hparams.embedding_dim, 0.2)
        encoder_layer = nn.TransformerEncoderLayer(d_model=self.hparams.embedding_dim, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
        self.dropout = nn.Dropout(p=0.2)
        self.output = nn.Linear(self.hparams.embedding_dim,
                                out_features=len(streaming_punctuator.data.PUNCTUATIONS))
    else:
        if len(self.hparams.conv_dims) > 0:
            self.conv_dims = [int(s) for s in self.hparams.conv_dims.split(",")]
            self.conv_sizes = [int(s) for s in self.hparams.conv_sizes.split(",")]
            self.conv_dilations = [int(s) for s in self.hparams.conv_dilations.split(",")]
            assert len(self.conv_dims) == len(self.conv_sizes)
            assert len(self.conv_dims) == len(self.conv_dilations)
            conv_layers = []
            current_input_dim = self.hparams.embedding_dim
            for i in range(len(self.conv_dims)):
                conv_layer = CausalConv1d(current_input_dim,
                                          self.conv_dims[i],
                                          self.conv_sizes[i],
                                          dilation=self.conv_dilations[i],
                                          bias=False)
                conv_layers.append(conv_layer)
                conv_layers.append(nn.BatchNorm1d(self.conv_dims[i], affine=False))
                conv_layers.append(nn.ReLU(inplace=True))
                current_input_dim = self.conv_dims[i]
            self.conv_layers = nn.Sequential(*conv_layers)
        else:
            self.conv_layers = None

        self.lstm = nn.LSTM(input_size=current_input_dim,
                            hidden_size=self.hparams.lstm_hidden_size,
                            bidirectional=self.hparams.use_bidirectional,
                            num_layers=self.hparams.lstm_num_layers,
                            batch_first=True)
        self.dropout = nn.Dropout(p=0.2)
        lstm_output_size = self.hparams.lstm_hidden_size + self.hparams.lstm_hidden_size * self.hparams.use_bidirectional
        self.output = nn.Linear(lstm_output_size,
                                out_features=len(streaming_punctuator.data.PUNCTUATIONS))

def test_line2seqs_label_delay(self):
    tokenizer = Tokenizer.from_file("test/tokenizer.json")
    tokenizer.add_special_tokens(["<s>", "</s>"])

    token_seq, label_seq = line2seqs("tere ! minu nimi on Baabuu .", tokenizer, label_delay=2)
    print([tokenizer.id_to_token(i) for i in token_seq])
    print(label_seq)
    self.assertEqual(len(token_seq), len(label_seq))
    self.assertEqual([tokenizer.id_to_token(i) for i in token_seq],
                     ['<s>', 'tere</w>', 'minu</w>', 'nimi</w>', 'on</w>', 'Baa', 'bu', 'u</w>', '</s>', '</s>', '</s>'])
    self.assertEqual(label_seq, [-1, -1, -1, 4, 0, 0, 0, -1, -1, 2, -1])

    token_seq, label_seq = line2seqs("tere ! minu nimi on Baabuu .", tokenizer, label_delay=0)
    print([tokenizer.id_to_token(i) for i in token_seq])
    print(label_seq)
    self.assertEqual(len(token_seq), len(label_seq))
    self.assertEqual([tokenizer.id_to_token(i) for i in token_seq],
                     ['<s>', 'tere</w>', 'minu</w>', 'nimi</w>', 'on</w>', 'Baa', 'bu', 'u</w>', '</s>'])
    self.assertEqual(label_seq, [-1, 4, 0, 0, 0, -1, -1, 2, -1])

def fetch_encoder(params):
    no_dataset = params.get('no_dataset', False)
    if no_dataset:
        return None

    dataset = next(iter(params['dataset_configs'].values()))  # Get the first value from the dict
    path = dataset["tokenizer_path"]
    is_pretrained = dataset.get("tokenizer_is_pretrained", False)

    if is_pretrained:
        tok = GPT2TokenizerFast.from_pretrained(path)
        # Will add a padding token id of 50257 at run-time
        tok.add_special_tokens({'pad_token': '<|padding|>'})
        return tok

    return Tokenizer.from_file(path)

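# Hedged sanity check for the pretrained branch above (not part of the original snippet):
# GPT-2's base vocabulary has 50257 entries (ids 0..50256), so the '<|padding|>' token added
# via add_special_tokens receives id 50257, which is what the comment refers to.
from transformers import GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
tok.add_special_tokens({'pad_token': '<|padding|>'})
assert tok.pad_token_id == 50257
assert len(tok) == 50258
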
def init_tokenizer(lang, n, m):
    if n is None and m is None:
        print('neither size nor model is specified, but one of them is required')
        exit(1)
    if m is not None:
        tokenizer = AutoTokenizer.from_pretrained(m, use_fast=True)
        return tokenizer
    tokenizer = Tokenizer.from_file(
        str(Path('data') / lang / 'preparation' / 'vocabularies' /
            f'{lang}-{str(n).zfill(3)}k.tokenizer.json'))
    tokenizer.post_processor = RobertaProcessing(
        ('</s>', tokenizer.token_to_id('</s>')),
        ('<s>', tokenizer.token_to_id('<s>')),
        trim_offsets=True)
    return tokenizer

def test_dataloader(self):
    tokenizer = Tokenizer.from_file("test/tokenizer.json")
    tokenizer.add_special_tokens(["<s>", "</s>"])
    dataset = PunctuationDataset(tokenizer, "test/dev.txt")

    batch_size = 8
    random_sampler = RandomSampler(dataset)
    batch_iterator = BucketBatchSampler(random_sampler,
                                        batch_size=batch_size,
                                        drop_last=False,
                                        sort_key=lambda x: dataset[x]["length"],
                                        bucket_size_multiplier=100)
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_sampler=batch_iterator,
                                             collate_fn=dataset.collate_batch)
    for i in range(2):
        print(f"Testing epoch {i}")
        for j, batch in enumerate(dataloader):
            if j == 0:
                # make sure that the length difference inside a batch is not > 20%
                self.assertTrue((batch["lengths"].max() - batch["lengths"].min()) / batch["lengths"].max() < 0.2)

def bleu_eval(args, references):
    """BLEU-1 over tokenized SMILES strings (average pairwise score among the references)."""
    print("Loading Tokenizer: {}.".format(args.tokenizer))
    tokenizer = Tokenizer.from_file(args.tokenizer)

    scores = []
    for smi in references:
        cur_scores = []
        for smi2 in references:
            if smi2 != smi:
                reference = tokenizer.encode(smi)
                candidate = tokenizer.encode(smi2)
                # sentence_bleu expects a list of reference token lists
                cur_scores.append(
                    sentence_bleu([reference.tokens], candidate.tokens, weights=(1.0, 0, 0, 0)))
        scores.append(np.mean(cur_scores))
    return round(np.mean(scores), 4)

def __init__(self, *args, **kwargs):
    tokenizer_object = kwargs.pop("tokenizer_object", None)
    slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
    fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
    from_slow = kwargs.pop("from_slow", False)

    if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
        raise ValueError(
            "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
            "have sentencepiece installed."
        )

    if tokenizer_object is not None:
        fast_tokenizer = tokenizer_object
    elif fast_tokenizer_file is not None and not from_slow:
        # We have a serialization from tokenizers which let us directly build the backend
        fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
    elif slow_tokenizer is not None:
        # We need to convert a slow tokenizer to build the backend
        fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
    elif self.slow_tokenizer_class is not None:
        # We need to create and convert a slow tokenizer to build the backend
        slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
        fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
    else:
        raise ValueError(
            "Couldn't instantiate the backend tokenizer from one of: \n"
            "(1) a `tokenizers` library serialization file, \n"
            "(2) a slow tokenizer instance to convert or \n"
            "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
            "You need to have sentencepiece installed to convert a slow tokenizer to a fast one."
        )

    self._tokenizer = fast_tokenizer

    if slow_tokenizer is not None:
        kwargs.update(slow_tokenizer.init_kwargs)

    self._decode_use_source_tokenizer = False

    # We call this after having initialized the backend tokenizer because we update it.
    super().__init__(**kwargs)

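# Hedged usage sketch for the constructor above (file names are placeholders, not paths from
# the original project): the two most common entry points are a live `tokenizers.Tokenizer`
# object and a serialized tokenizer.json file; both end up as the `_tokenizer` backend.
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast

backend = Tokenizer.from_file("tokenizer.json")
fast_from_object = PreTrainedTokenizerFast(tokenizer_object=backend)
fast_from_file = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
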
def __init__(self, datapath):
    self.training = True

    with open(os.path.join(datapath, 'entity2wikidata.json'), 'r') as f:
        self.entity2wiki = json.load(f)
    with open(os.path.join(datapath, 'relation2wikidata.json'), 'r') as f:
        self.relation2wiki = json.load(f)
    self.relation2wiki['[UNK]'] = {
        'label': '[UNK]',
        'alternatives': [],
    }
    self.entity2wiki['[UNK]'] = {
        'label': '[UNK]',
        'alternatives': [],
    }

    # NOTE: `path` (the tokenizer JSON file) is not defined in this snippet; it has to be
    # provided by the surrounding module or derived from `datapath`.
    self.tokenizer = Tokenizer.from_file(path)
    self.pad_token_id = self.tokenizer.token_to_id('[PAD]')

    self.id2entity = {
        value: key
        for key, value in torch.load(os.path.join(datapath, 'entity2id.pt')).items()
    }
    self.id2entity[len(self.id2entity)] = '[UNK]'
    self.entity2tokens = torch.from_numpy(
        np.array([self.get_entity(idx) for idx in range(len(self.id2entity))]))

    self.id2relation = {
        value: key
        for key, value in torch.load(os.path.join(datapath, 'rel2id.pt')).items()
    }
    self.id2relation[len(self.id2relation)] = '[UNK]'
    self.relations2tokens = torch.from_numpy(
        np.array([self.get_relation(idx) for idx in range(len(self.id2relation))]))

def preprocess(fp, suffix, tokenizer):
    tokenizer = Tokenizer.from_file(tokenizer)
    dps_outfile = "output/{}_dps.txt".format(suffix)
    ids_outfile = "output/{}_ids.txt".format(suffix)
    num = 0
    with open(fp) as fin, open(dps_outfile, "w") as fout_dps, open(ids_outfile, "w") as fout_ids:
        # the outer loop index was unused, so enumerate is only needed on the inner loop
        for line in file_tqdm(fin):
            dp = json.loads(line.strip())
            asts, ids = split(dp, 1000, tokenizer)
            for i, (ast, extended) in enumerate(asts):
                if len(ast) > 1:
                    json.dump([ast, extended], fp=fout_dps)
                    json.dump(ids[i], fp=fout_ids)
                    fout_dps.write("\n")
                    fout_ids.write("\n")
                    num += 1
    logging.info("Wrote {} datapoints to {} and {}".format(num, ids_outfile, dps_outfile))

def initialize(self, context):
    self.context = context
    model_dir = context.system_properties.get("model_dir")
    serialized_file = context.manifest["model"]["serializedFile"]
    model_pt_path = model_dir + "/" + serialized_file

    # The TorchScript archive is loaded directly; the eager state_dict path is kept only
    # as commented-out history.
    #state_dict = torch.load(model_pt_path)
    #self.model = HTSClassifier().eval()
    #self.model.load_state_dict(state_dict)
    #checkpoint = torch.load(state_dict)
    #self.model.load_state_dict(checkpoint['state_dict'])
    self.model = torch.jit.load(model_pt_path)

    self.tokenizer = Tokenizer.from_file(model_dir + "/tokenizer.json")
    self.padding_length = 64
    self.num_samples = 5
    with open(model_dir + '/index_to_name.pkl', 'rb') as f:
        self.label_enc = pickle.load(f)

    self.initialized = True

def load_pretrained_tokenizer(
        tokenizer_file: str,
        cache_dir: Optional[str] = None) -> PreTrainedTokenizerFast:
    """Load BertWordPieceTokenizer from tokenizer.json.

    This is necessary for the following reasons:
    - BertWordPieceTokenizer cannot load from tokenizer.json via its .from_file() method
    - Tokenizer.from_file(tokenizer_file) cannot be used on its own because MecabPreTokenizer
      is not a valid native PreTokenizer.
    """
    tokenizer = Tokenizer.from_file(tokenizer_file)
    tokenizer.pre_tokenizer = PreTokenizer.custom(MecabPreTokenizer())

    tokenizer_dir = os.path.dirname(tokenizer_file)
    pt_tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
        tokenizer_dir,
        cache_dir=cache_dir,
    )
    # This is necessary for pt_tokenizer.save_pretrained(save_path)
    pt_tokenizer._tokenizer = tokenizer  # ._tokenizer
    return pt_tokenizer

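# Hedged companion sketch (not from the original project): a PreTokenizer.custom(...)
# component is pure Python, so the backend tokenizer generally cannot be re-serialized while
# it is attached. A common workaround is to swap in a native pre-tokenizer before saving and
# re-attach the custom one afterwards, which mirrors what load_pretrained_tokenizer does on
# the loading side. The function name here is illustrative.
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace


def save_with_custom_pretokenizer(tokenizer: Tokenizer, path: str) -> None:
    custom = tokenizer.pre_tokenizer           # keep a reference to the custom component
    tokenizer.pre_tokenizer = Whitespace()     # temporarily use a serializable stand-in
    try:
        tokenizer.save(path)
    finally:
        tokenizer.pre_tokenizer = custom       # restore the custom pre-tokenizer in memory
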
def main(args):
    # Load Selfies vocabulary
    idx2selfies, selfies2idx = load_selfies_vocab(args.selfies_vocab)

    # Load Tokenizer
    print('Loading Tokenizer: {}.'.format(args.tokenizer))
    tokenizer = Tokenizer.from_file(args.tokenizer)
    print('Testing with SMILES String: {}'.format(args.test_string))
    encoding = tokenizer.encode(args.test_string)
    print('Encoded string: {}'.format(encoding.tokens))
    decoded = tokenizer.decode(encoding.ids)
    print('Decoded string: {}'.format(decoded))
    print('Tokenizer Loaded.')

    # Create tokenized captions
    print("Creating JSON")
    create_tokenized_smiles_json(tokenizer, args.data_dir, args.data_split,
                                 args.config_output_name, args.max_length,
                                 args.label_filename, idx2selfies, selfies2idx)
    print("JSON created")

    # Save Images and processed Captions
    print("Processing and Saving Images")
    create_input_files(args, args.data_dir, args.config_output_name,
                       args.image_output_filename, args.output_path, args.img_size)
    print("Done processing dataset")

def __init__(self, keypoints_file, text_file, max_frames, transform, selection, use_rand_tokens=False):
    self.keypoints_data = h5py.File(keypoints_file, "r")

    self.utt_texts = {}
    # read the text file with a context manager so the handle is closed
    with open(text_file) as text:
        for line in text:
            parts = line.strip().split(" ")
            utt_id = parts[0]
            self.utt_texts[utt_id] = " ".join(parts[1:])

    text_utt_ids = set(self.utt_texts.keys())
    keypoints_utt_ids = list(self.keypoints_data.keys())
    self.utt_ids = list(text_utt_ids.intersection(keypoints_utt_ids))
    print("IDs in text file:\t", len(text_utt_ids))
    print("IDs in keypoints file:\t", len(keypoints_utt_ids))
    print("IDs in both files:\t", len(self.utt_ids))

    self.max_frames = max_frames
    self.transform = transform
    self.tokenizer = Tokenizer.from_file("tokenizer_models/tokenizer.json")
    self.random_tokens = torch.randint(0, 999, (40,), dtype=torch.long)
    self.use_rand_tokens = use_rand_tokens
    self.selection = selection

def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len, strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name), pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))

def main():
    parser = argparse.ArgumentParser(description="Train GPT2 Model")
    parser.add_argument("--batch_size", type=int, default=4, help="Specify batch size")
    parser.add_argument("--num_epoch", type=int, default=3, help="Specify number of epochs")
    parser.add_argument("--learning_rate", type=float, default=5e-5, help="Specify AdamW learning rate")
    args = parser.parse_args()

    tokenizer = Tokenizer.from_file("output/tokenizer.json")
    dataset = Dataset("output/train_rq4_dps.txt")
    model = TransformerModel(
        tokenizer.get_vocab_size(),
        CrossEntropyLoss(ignore_index=tokenizer.encode("[PAD]").ids[0]),
        6,
        300,
        1000,
        6,
        1e-05,
    )
    training_args = TrainingArgs(
        batch_size=args.batch_size,
        num_epoch=args.num_epoch,
        output_dir="output",
        optimizer=AdamW(model.parameters(), lr=args.learning_rate),
        save_model_on_epoch=False,
    )
    trainer = Trainer(model, dataset, tokenizer, training_args)
    trainer.train()

def __init__(self, captions_file, tokenizer_path, keys, res=128, text_context_len=64):
    with open(captions_file, "r") as f:
        self.data = json.load(f)
    self.image_keys = list(self.data.keys())
    self.indices = keys

    # image related
    self.t = transforms.Compose(
        [transforms.Resize((res, res)), transforms.ToTensor()])

    # text related
    self.textlen = text_context_len
    self.tok = Tokenizer.from_file(tokenizer_path)
    self.text_end_id = self.tok.get_vocab()["<|endoftext|>"]
    self.image_end_id = self.tok.get_vocab()["<|endofimage|>"]
    print("Tokenizer loaded with vocab size:", self.tok.get_vocab_size())

def main():
    parser = ArgumentParser()
    parser.add_argument('lang', choices=['nld', 'ita'])
    parser.add_argument('models', nargs='+')
    parser.add_argument('--src', default='small', choices=['full', 'small'])
    parser.add_argument('--file', default='full')
    parser.add_argument('-n', default=5, type=int)
    parser.add_argument('-f', '--force', action='store_true')
    args = parser.parse_args()

    base_path = Path('data') / args.lang / 'evaluation' / 'examples' / args.src / args.file
    src_path = base_path / 'gold.txt'
    if not src_path.exists():
        print(f' > gold path {src_path} does not exist')
        exit(1)

    print(' > loading tokenizer')
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    if args.lang == 'ita':
        tokenizer = GPT2TokenizerFast.from_pretrained('LorenzoDeMattei/GePpeTto')
    else:
        tokenizer_path = Path('data') / args.lang / 'vocabularies' / 'tokenizer.json'
        tokenizer = Tokenizer.from_file(str(tokenizer_path))

    args.n += 1
    print(f' > loading examples from {src_path}')
    examples = []
    with open(src_path) as f:
        for line in f:
            token_ids = tokenizer.encode(line.strip())
            if type(token_ids) != list:
                token_ids = [0] + token_ids.ids
            examples.append(token_ids[:args.n])
    print(f' > loaded {len(examples)} examples')

    for model_name in args.models:
        tgt_path = base_path / f'{model_name.replace("/", "_")}.txt'
        if not args.force and tgt_path.exists():
            print(f'{tgt_path} already exists. skipping')
            continue

        model_path = Path('data') / args.lang / 'models' / model_name
        if not model_path.exists():
            model_path = model_name
        print(f' > loading model {model_path}')
        model = GPT2LMHeadModel.from_pretrained(model_path).cuda()
        model.eval()

        print(' > generating endings for examples')
        generated = [
            generate(input_ids, model, tokenizer)
            for input_ids in tqdm(examples, ncols=80)
        ]
        with open(tgt_path, 'w') as f:
            f.writelines(generated)
        print(f'\nsaved to {tgt_path}')

from tokenizers import Tokenizer
import sys
import pickle
import numpy as np
from build_bpe import cleanup
import os

tokenizer = Tokenizer.from_file("bpe-fi.tokenizer.json")
print(tokenizer)

#dfolder = "../../Data/wiki/fi/"
dfolder = "../../Data/finovels/"
files = os.listdir(dfolder)
print("Read files from", dfolder)
print("...")

#s = open(dpath).read().lower()
lines = []
for dpath in files:
    with open(dfolder + dpath) as f:
        print("File:", dpath)
        for line in f:
            clean_line = cleanup(line)
            lines.append(clean_line)

#print("Encode", s[:100], len(s))
print("ENCODE")
encoded_l = tokenizer.encode_batch(lines)

def load_jieba_tokenizer(tokenizer_path) -> Tokenizer:
    tokenizer = Tokenizer.from_file(str(tokenizer_path))
    tokenizer.pre_tokenizer = PreTokenizer.custom(JiebaPreTokenizer())
    tokenizer.decoder = Decoder.custom(JiebaDecoder())
    return tokenizer

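# Hedged usage sketch for load_jieba_tokenizer (the path and text are placeholders): the
# custom pre-tokenizer and decoder are re-attached after from_file because Python components
# are not stored in tokenizer.json, so encoding and decoding only behave as intended once
# they have been set again.
tokenizer = load_jieba_tokenizer("jieba.tokenizer.json")
encoding = tokenizer.encode("今天天气很好")
print(encoding.tokens)
print(tokenizer.decode(encoding.ids))
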
def fetch_encoder(config: EncoderConfig):
    if config.is_pretrained:
        return GPT2TokenizerFast.from_pretrained(config.location)
    return Tokenizer.from_file(config.location)

# (tail of the dataset class defined above this snippet)
def __getitem__(self, i):
    return torch.tensor(self.examples[i])


configuration = BertConfig()
model = BertModel(configuration)
configuration = model.config

#tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
#trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
#tokenizer.pre_tokenizer = Whitespace()
#files = ['./processed_wiki_ko.txt']
#tokenizer.train(files=files, trainer=trainer)
#tokenizer = Tokenizer.from_file("./wiki_tokenizer.json")
#fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="wiki_tokenizer.json")

tokenizer = Tokenizer.from_file("./wiki_tokenizer.json")
tokenizer.enable_truncation(max_length=512)

#tokenizer._tokenizer.post_processor = BertProcessing(
#    single="[CLS] $A [SEP]",
#    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
#    special_tokens=[
#        ("[CLS]", tokenizer.token_to_id("[CLS]")),
#        ("[SEP]", tokenizer.token_to_id("[SEP]")),
#    ],
#)
tokenizer.post_processor = BertProcessing(
    sep=("[SEP]", tokenizer.token_to_id("[SEP]")),
    cls=("[CLS]", tokenizer.token_to_id("[CLS]")))

dataset = build_dataset.TrainDataset(filepath)
dataLoader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=0)

config = BertConfig(
    vocab_size=249,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = Tokenizer.from_file(
    "/home/ubuntu/BERT-GAN/BERT_GAN/bAbI_tokenizer.json")

model = BertForMaskedLM(config=config)

training_args = TrainingArguments(
    output_dir="/home/ubuntu/BERT-GAN/BERT_GAN/bAbibert_model",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,

def train_tokenizer_vocab(dataset, style='BPE', force_retrain=True):
    """
    if force_retrain: overwrite the stored tokenizer from tokenizers dir (by retraining)
    else: load the tokenizer if it exists
    """
    assert dataset in VALID_DATASETS
    assert style in VALID_TOKENIZATIONS

    tpath_expected = default_tpath(dataset, style)

    train = True
    if not force_retrain and os.path.isfile(tpath_expected):
        tokenizer = Tokenizer.from_file(tpath_expected)
        train = False
    else:
        print('%s tokenizer file does not exist; training new tokenizer' % tpath_expected)

    if train:
        # load data associated with one of the valid datasets (from /data/ directory)
        datafiles = load_dataset(dataset)

        # Steps for each algo (e.g. BPE):
        # - init Tokenizer using algo
        # - specify algo specific trainer
        # - specify any pre-processing of text (will affect decoding)
        #   see: https://huggingface.co/docs/tokenizers/python/latest/components.html#decoders
        # - different training calls if it's the arxiv dataset or wikitext
        #   see https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/
        if style == 'BPE':
            tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
            trainer = BpeTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = ByteLevel()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.ByteLevel()
        else:
            assert style == 'WordLevel'
            tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
            trainer = WordLevelTrainer(
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
            tokenizer.pre_tokenizer = Whitespace()
            if dataset == 'arxiv':
                tokenizer.train_from_iterator(datafiles, trainer=trainer)
            else:
                tokenizer.train(datafiles, trainer=trainer)
            tokenizer.decoder = decoders.WordPiece()  # WordPiece seems to work (adds back spaces)

        # Save to tokenizers directory
        tokenizer.save(tpath_expected)

    # Generate vocab object based on tokenizer.decoder() method
    # ... TODO implement the same vocabulary functionality, or ensure it is present in Tokenizer
    #     and then code it elsewhere...
    # Features we need to match:
    #   from torchtext.legacy.vocab import Vocab as RetiredVocab
    #   ntokens = len(vocab.stoi)  --->  ntokens = tokenizer.(...)
    #   data = [torch.tensor([vocab[token] for token in tokenizer(item)],
    #                        dtype=torch.long) for item in raw_text_iter]
    #   tokenized_text_ints = torch.tensor([vocab[token] for token in tokenized_text], dtype=torch.long)
    #   running_context_string = ' '.join([vocab.itos[src[k]] for k in range(src.shape[0])])
    #   unk_index = vocab.unk_index
    vocab = None

    return tokenizer, vocab

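# Hedged sketch of the vocab shim the TODO above asks for: a thin Vocab-like wrapper built
# from the trained Tokenizer, exposing stoi/itos/unk_index the way torchtext's legacy Vocab
# did. The class name (TokenizerVocab) is illustrative, not from the original project.
class TokenizerVocab:
    def __init__(self, tokenizer, unk_token="[UNK]"):
        self.stoi = tokenizer.get_vocab()                  # token -> id
        self.itos = {i: t for t, i in self.stoi.items()}   # id -> token
        self.unk_index = tokenizer.token_to_id(unk_token)

    def __len__(self):
        return len(self.stoi)

    def __getitem__(self, token):
        # fall back to the unknown-token id, mirroring legacy Vocab's default behaviour
        return self.stoi.get(token, self.unk_index)

# e.g. tokenizer, _ = train_tokenizer_vocab('arxiv'); vocab = TokenizerVocab(tokenizer)
# ntokens = len(vocab)
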
def main():
    parser = ArgumentParser()
    parser.add_argument('lang')
    parser.add_argument('model')
    parser.add_argument('-n', type=int, default=None)
    args = parser.parse_args()

    with open(Path('data') / args.lang / 'config.json') as f:
        cfg = json.load(f)

    model_path = Path('data') / args.lang / 'models' / args.model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('Using device:', device)

    os.environ['TOKENIZERS_PARALLELISM'] = str(False)
    # tokenizer_tgt = Tokenizer.from_file('tgt.tokenizer.json')
    if args.lang == 'ita':
        tokenizer_tgt = GPT2Tokenizer.from_pretrained('LorenzoDeMattei/GePpeTto')
    else:
        tokenizer_tgt = Tokenizer.from_file(
            str(Path('data') / args.lang / 'preparation' / 'vocabularies' / 'tokenizer.json'))

    # model: GPT2LMHeadModel = EmbeddingTunerModel.load_from_checkpoint(model_path).m
    model = GPT2LMHeadModel.from_pretrained(str(model_path))
    model.to(device)

    if args.n is not None:
        tokenizer_eng = GPT2Tokenizer.from_pretrained('gpt2')
        dict_path = Path('data') / args.lang / 'dictionaries' / f'{args.model}.tsv'
        with open(dict_path) as f_map:
            token_id_map = [
                tokenizer_eng.convert_tokens_to_ids(line.strip().split('\t')[1])
                for line in f_map
            ]

        print(f'generating {args.n:,} random texts (unconditioned)')
        out_dir = Path('data') / args.lang / 'results' / 'examples'
        os.makedirs(out_dir, exist_ok=True)
        name = str(int(time()))
        tgt_out_path = out_dir / f'{name}.{args.lang}.txt'
        src_out_path = out_dir / f'{name}.eng.txt'
        print(f'generating {args.n} {args.lang} examples to {tgt_out_path} [{src_out_path}]')
        with open(tgt_out_path, 'w') as f_tgt, open(src_out_path, 'w') as f_eng:
            for i, (tgt, eng) in enumerate(
                    gen(tokenizer_tgt,
                        model,
                        device,
                        n=args.n,
                        tokenizer_eng=tokenizer_eng,
                        token_id_map=token_id_map,
                        cfg=cfg)):
                print(f'{i:,}/{args.n:,}')
                f_tgt.write(tgt + '\n\n')
                f_eng.write(eng + '\n\n')
        return

    while True:
        print('\n##########################################')
        prompt = input(' > ').strip()
        for txt in gen(tokenizer_tgt, model, device, prompt, cfg=cfg):
            print('\n' + txt)

def test_full_serialization_albert(self, albert_base):
    # Check that we can read this file.
    # This used to fail because of a BufReader that broke when the file
    # exceeded the buffer capacity.
    tokenizer = Tokenizer.from_file(albert_base)

test_corpus = LabeledCorpus(test_file)

train_inputs = []
train_targets = []
for label, doc in train_corpus:
    train_targets.append(label)
    train_inputs.append(doc)

test_inputs = []
test_targets = []
for label, doc in test_corpus:
    test_targets.append(label)
    test_inputs.append(doc)

tokenizer = Tokenizer.from_file("rust_tokenizer.json")
VOCAB_SIZE = len(tokenizer.get_vocab())

config = {
    "experiment_name": "imdb_lstm",
    "model_config": {
        "output_dim": 2,
        "vocab_size": VOCAB_SIZE,
        "hidden_dim": 200
    },
    "random_seed": 42,
    "iterator_type": "padded_iterator",
    "loss": "cross_entropy",
    "optimizer": "adam",
    "learning_rate": 0.0001,
    "regularization": "l2",

def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")
        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)
    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"], max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"], max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # Pad
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)
    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)

def cli_main(args=None):
    pl.seed_everything(42)

    parser = ArgumentParser()
    parser.add_argument("--checkpoint", required=False, type=str)
    parser.add_argument("--strict", default=False, action='store_true')
    parser.add_argument("--name", type=str, required=True)
    parser.add_argument("--early_stopping_monitor", type=str, default='tokens_matched_accuracy')
    parser.add_argument("--early_stopping_mode", type=str, default='max')
    parser.add_argument("--early_stopping_min_delta", type=float, default=0.001)
    parser.add_argument("--early_stopping_patience", type=int, default=3)
    parser.add_argument('--tokenizer', help='path to pretrained tokenizer', type=str, required=True)
    parser.add_argument('--dataset', help='datasets dataset name', type=str, required=True)
    parser.add_argument('--languages', help='dataset languages to tokenize', type=str, required=True)
    parser.add_argument('--blm_class', help='Bert Lightning Module to train', type=str)

    dm_class = WMT20DataModule
    parser = dm_class.add_argparse_args(parser)
    parser = BertLightningModule.add_model_specific_args(parser)
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args(args)

    import os
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    tokenizer = Tokenizer.from_file(args.tokenizer)
    # [UNK],[SEP],[PAD],[MASK],[ECHO],[TRANSLATE]
    special_tokens = [
        '[UNK]',
        '[PAD]',
        '[TRANSLATE]',
        '[ECHO]',
        '[MASK]',
        '[SEP]',
    ]
    assert tokenizer.add_special_tokens(
        special_tokens
    ) == 0, f'one of special tokens not in tokenizer: {special_tokens}'

    dm = dm_class.from_argparse_args(args,
                                     tokenizer=tokenizer,
                                     dataset=args.dataset,
                                     languages=args.languages,
                                     device='cuda')
    dm.setup()
    assert dm.device == 'cuda'

    if args.max_steps == -1:
        args.max_steps = None

    blm_class = BertLightningModule
    if args.blm_class == 'BertTranslateLightningModule':
        blm_class = BertTranslateLightningModule
    elif args.blm_class == 'BertLightningModule':
        blm_class = BertLightningModule
    elif args.blm_class == 'BertIELightningModule':
        blm_class = BertIELightningModule
    else:
        raise ValueError("unknown blm_class")

    if args.checkpoint is not None:
        print("Restoring from checkpoint", args.checkpoint)
        bert_model = blm_class.load_from_checkpoint(args.checkpoint, strict=args.strict)
        bert_model.hparams.noam_scaler = args.noam_scaler
        bert_model.hparams.lr = args.lr
        bert_model.hparams.noam_opt_warmup_steps = args.noam_opt_warmup_steps
        bert_model.hparams.scheduler = args.scheduler
        bert_model.hparams.scheduler_patience = args.scheduler_patience
        bert_model.hparams.noam_step_factor = args.noam_step_factor
        bert_model.tokenizer = tokenizer
    else:
        args_dict = vars(args)
        lightning_module_args = {
            k: args_dict[k]
            for k in args_dict.keys() if args_dict[k] is not None
        }
        lightning_module_args['tokenizer'] = tokenizer
        bert_model = blm_class(**lightning_module_args)

    trainer_logger = pl.loggers.TensorBoardLogger("lightning_logs", name=args.name)
    early_stop_callback = pl.callbacks.EarlyStopping(
        monitor=args.early_stopping_monitor,
        mode=args.early_stopping_mode,
        min_delta=args.early_stopping_min_delta,
        patience=args.early_stopping_patience,
        verbose=True,
    )
    trainer = pl.Trainer.from_argparse_args(args,
                                            logger=trainer_logger,
                                            callbacks=[early_stop_callback])
    trainer.fit(bert_model, datamodule=dm)

    return dm, bert_model, trainer

def get_tokenizer(args):
    if args.encoder_path is None:
        return GPT2TokenizerFast.from_pretrained('gpt2')
    else:
        return Tokenizer.from_file(args.encoder_path)

def main(gpu, params):
    """Loads the dataset and trains the model."""
    rank = params.nr * params.gpus + gpu
    if params.distributed:
        dist.init_process_group(backend='nccl',
                                init_method='env://',
                                world_size=params.world_size,
                                rank=rank)

    seed_all(SEED)

    # get gpu device
    if params.device == 'gpu':
        device = torch.device(gpu)
    else:
        device = 'cpu'

    # only wandb on main process
    if rank == 0 and params.wandb:
        wandb.init(project='mnmt',
                   entity='nlp-mnmt-project',
                   config={
                       k: v
                       for k, v in params.__dict__.items()
                       if isinstance(v, (float, int, str))
                   })
        config = wandb.config

    logger, params = setup(params)

    # load data and train for required experiment
    if len(params.langs) == 2:
        # bilingual translation

        # load tokenizers if continuing
        if params.checkpoint:
            tokenizers = []
            for lang in params.langs:
                tokenizers.append(
                    Tokenizer.from_file(logger.root_path + '/' + lang + '_tokenizer.json'))
        else:
            if params.tokenizer is not None:
                if len(params.tokenizer) == 2:
                    tokenizers = [
                        Tokenizer.from_file('pretrained/' + tok + '.json')
                        for tok in params.tokenizer
                    ]
                else:
                    print('Wrong number of tokenizers passed. Retraining.')
                    tokenizers = None
            else:
                tokenizers = None

        train_dataloader, val_dataloader, test_dataloader, _ = preprocess.load_and_preprocess(
            params.langs,
            params.batch_size,
            params.vocab_size,
            params.dataset,
            multi=False,
            path=logger.root_path,
            tokenizer=tokenizers,
            distributed=params.distributed,
            world_size=params.world_size,
            rank=rank)

        train(rank,
              device,
              logger,
              params,
              train_dataloader,
              val_dataloader=val_dataloader,
              verbose=params.verbose)

    elif len(params.langs) > 2:
        # multilingual translation

        # load tokenizer if continuing
        if params.checkpoint:
            tokenizer = Tokenizer.from_file(logger.root_path + '/multi_tokenizer.json')
        else:
            if params.tokenizer is not None:
                tokenizer = Tokenizer.from_file('pretrained/' + params.tokenizer + '.json')
            else:
                tokenizer = None

        train_dataloader, val_dataloader, test_dataloader, tokenizer = preprocess.load_and_preprocess(
            params.langs,
            params.batch_size,
            params.vocab_size,
            params.dataset,
            multi=True,
            path=logger.root_path,
            tokenizer=tokenizer,
            distributed=params.distributed,
            world_size=params.world_size,
            rank=rank)

        train(rank,
              device,
              logger,
              params,
              train_dataloader,
              val_dataloader=val_dataloader,
              tokenizer=tokenizer,
              verbose=params.verbose)
    else:
        raise NotImplementedError

    # end wandb process to avoid hanging
    if rank == 0 and params.wandb:
        wandb.finish()
