import re

import spacy
from torchtext.data import Field, BucketIterator
from torchtext.datasets import IWSLT


def load_dataset(batch_size):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        # replace raw <url>...</url> markup with a placeholder before tokenizing
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    train, val, test = IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src, min_freq=3)
    EN.build_vocab(train.trg, min_freq=3)
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, sort_within_batch=True,
        sort_key=lambda e: len(e.src), repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
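# A minimal usage sketch for load_dataset above (batch size chosen arbitrarily).
# Because the fields set include_lengths=True, each batch attribute is a
# (token_ids, lengths) pair, with token_ids shaped (seq_len, batch_size).
train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=32)
for batch in train_iter:
    src_tokens, src_lengths = batch.src
    trg_tokens, trg_lengths = batch.trg
    print(src_tokens.shape, src_lengths.shape, trg_tokens.shape)
    break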
def iwslt():
    for src_lang, tgt_lang in zip([FR, EN], [EN, DE]):
        bpe_dir = join(ROOT_BPE_DIR, IWSLT.name,
                       IWSLT.base_dirname.format(src_lang[1:], tgt_lang[1:]))
        if os.path.exists(bpe_dir):
            LOGGER.info('BPE IWSLT for {}-{} exists, skipping...'.format(
                src_lang[1:], tgt_lang[1:]))
            continue
        os.makedirs(bpe_dir)

        # Download
        corpus_dir = join(ROOT_CORPUS_DIR, IWSLT.name,
                          IWSLT.base_dirname.format(src_lang[1:], tgt_lang[1:]))
        LOGGER.info('downloading in {}...'.format(corpus_dir))
        IWSLT.dirname = IWSLT.base_dirname.format(src_lang[1:], tgt_lang[1:])
        IWSLT.urls = [
            IWSLT.base_url.format(src_lang[1:], tgt_lang[1:], IWSLT.dirname)
        ]
        IWSLT.download(root=ROOT_CORPUS_DIR, check=corpus_dir)
        IWSLT.clean(corpus_dir)

        # Tokenize
        token_dir = join(ROOT_TOK_DIR, IWSLT.name,
                         IWSLT.base_dirname.format(src_lang[1:], tgt_lang[1:]))
        os.makedirs(token_dir)
        suffix_langs = [
            (src_lang[1:] + '-' + tgt_lang[1:] + src_lang, src_lang),
            (src_lang[1:] + '-' + tgt_lang[1:] + tgt_lang, tgt_lang)
        ]
        prefixs = ['train', 'IWSLT16.TED.tst2013']
        for prefix, (suffix, lang) in product(prefixs, suffix_langs):
            in_file = join(corpus_dir, prefix + '.' + suffix)
            tok_file = join(token_dir, prefix + '.' + suffix)
            bpe_file = join(bpe_dir, prefix + '.' + suffix)
            _tokenize(in_file=in_file, out_file=tok_file, lang=lang)
            _apply_bpe(in_file=tok_file, out_file=bpe_file)
def load(self):
    # for tokenizing the english sentences
    spacy_en = spacy.load('en')
    # for tokenizing the german sentences
    spacy_de = spacy.load('de')

    def tokenize_de(text):
        # tokenizes the german text into a list of strings (tokens);
        # reversing the source sentence with [::-1] is sometimes reported to
        # give better results, but the reversal is left disabled here
        return [tok.text for tok in spacy_de.tokenizer(text)]  # append [::-1] to reverse

    def tokenize_en(text):
        # tokenizes the english text into a list of strings (tokens)
        return [tok.text for tok in spacy_en.tokenizer(text)]

    self.SRC = Field(tokenize=tokenize_de, init_token='<sos>',
                     eos_token='<eos>', lower=True)
    self.TRG = Field(tokenize=tokenize_en, init_token='<sos>',
                     eos_token='<eos>', lower=True)
    self.train_data, self.valid_data, self.test_data = IWSLT.splits(
        exts=('.de', '.en'), fields=(self.SRC, self.TRG))

    print("Number of training samples: {}".format(len(self.train_data.examples)))
    print("Number of validation samples: {}".format(len(self.valid_data.examples)))
    print("Number of testing samples: {}".format(len(self.test_data.examples)))
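# A hedged follow-up sketch: `loader` stands in for an instance of whatever class
# defines load() above, and the vocab/batch settings here are illustrative only.
import torch
from torchtext.data import BucketIterator

loader.load()
loader.SRC.build_vocab(loader.train_data, min_freq=2)
loader.TRG.build_vocab(loader.train_data, min_freq=2)
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (loader.train_data, loader.valid_data, loader.test_data),
    batch_size=64,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))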
def main(colab_args=None, do_train=True):
    if colab_args:
        args = colab_args
    else:
        parser = argparse.ArgumentParser()
        parser.add_argument(
            "--output_dir",
            type=str,
            required=True,
            help="The output directory where the model predictions and checkpoints will be written.",
        )
        parser.add_argument(
            "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
        )
        parser.add_argument(
            "--model_name_or_path",
            default=None,
            type=str,
            help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
        )
        parser.add_argument(
            "--data_path", default=None, type=str, help="The csv file for training the model"
        )
        parser.add_argument(
            "--config_name",
            default=None,
            type=str,
            help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
        )
        parser.add_argument(
            "--tokenizer_name",
            default=None,
            type=str,
            help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
        )
        parser.add_argument(
            "--cache_dir",
            default=None,
            type=str,
            help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
        )
        parser.add_argument(
            "--block_size",
            default=-1,
            type=int,
            help="Optional input sequence length after tokenization."
            " The training dataset will be truncated in blocks of this size for training."
            " Defaults to the model max input length for single sentence inputs (taking into account special tokens).",
        )
        parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, help="Batch size per GPU/CPU for training.")
        parser.add_argument(
            "--gradient_accumulation_steps",
            type=int,
            default=1,
            help="Number of update steps to accumulate before performing a backward/update pass.",
        )
        parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
        parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
        parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
        parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
        parser.add_argument(
            "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
        )
        parser.add_argument(
            "--max_steps",
            default=-1,
            type=int,
            help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.",
        )
        parser.add_argument("--log_dir", default=".", type=str, help="Directory to store the logs.")
        parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
        parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
        parser.add_argument("--logging_steps", type=int, default=500, help="Log every X update steps.")
        parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X update steps.")
        parser.add_argument(
            "--save_total_limit",
            type=int,
            default=None,
            help="Limit the total amount of checkpoints; delete the older checkpoints in the output_dir. Does not delete by default.",
        )
        parser.add_argument(
            "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
        )
        parser.add_argument(
            "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
        )
        parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
        args = parser.parse_args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and do_train  # the parser defines no --do_train flag, so use the function argument
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU
    device = torch.device('cuda:{}'.format(torch.cuda.current_device()) if torch.cuda.is_available() else "cpu")
    args.n_gpu = 0 if device.type == 'cpu' else torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    # Set seed
    set_seed(args)

    # setup tokenizer and model
    tok = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)
    plc = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)
    train_data, valid_data, test_data = IWSLT.splits(exts=('.en', '.de'), fields=(tok, plc))
    tok.build_vocab(train_data, min_freq=1)
    plc.build_vocab(train_data, min_freq=1)
    train_dataloader, valid_dataloader, test_dataloader = BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=args.per_gpu_train_batch_size, device=args.device)

    config = data_globals.config
    config.vocab_size = len(tok.vocab)
    if args.model_name_or_path is None:
        # start from initial model
        print('### LOADING INITIAL MODEL ###')
        model = VideoTransformer(config=config, args=args)
        model.apply(initialize_weights)
    else:
        # start from checkpoint
        print('### LOADING MODEL FROM CHECKPOINT:', args.model_name_or_path, '###')
        model = VideoTransformer.from_pretrained(config=config, args=args)

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)
    val(args, model, test_dataloader)

    # Training
    if do_train:
        global_step, tr_loss = train(args, model, train_dataloader, test_dataloader)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
    else:
        inference(args, model, valid_data, tok)
import torch
from torchtext.data import Field, BucketIterator
from torchtext.datasets import IWSLT, Multi30k

from src.manager import BaselineModelManager, PointerSoftmaxModelManager

src = Field(batch_first=True, include_lengths=True, lower=True)
trg = Field(batch_first=True, include_lengths=True, lower=True)
train_data, val, test = IWSLT.splits(
    exts=('.en', '.de'), fields=(src, trg),
    filter_pred=lambda x: max(len(vars(x)['src']), len(vars(x)['trg'])) <= 50)

# build vocab using train data only
src.build_vocab(train_data, min_freq=2, max_size=7700)
trg.build_vocab(train_data, min_freq=2, max_size=9500)

device = torch.device('cuda')
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, val, test), batch_size=32, device=device)

pad_idx = trg.vocab.stoi['<pad>']
N_EPOCHS = 10

model = BaselineModelManager(src_vocab=src.vocab, tgt_vocab=trg.vocab, pad_idx=pad_idx)
# model = PointerSoftmaxModelManager(src_vocab=src.vocab, tgt_vocab=trg.vocab, pad_idx=pad_idx)
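# A small sketch of consuming one batch from the iterators above. With
# include_lengths=True each field yields a (token_ids, lengths) pair, and
# batch_first=True makes token_ids (batch_size, seq_len); the model managers'
# training API is project-specific and not shown here.
for batch in train_iterator:
    src_ids, src_lengths = batch.src
    trg_ids, trg_lengths = batch.trg
    print(src_ids.shape, src_lengths.shape, trg_ids.shape)
    break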
import numpy as np
import math

from torchtext.data import Field
from torchtext.datasets import IWSLT

src_spec = Field(tokenize="spacy", tokenizer_language="en",
                 init_token='<sos>', eos_token='<eos>', lower=True)
trg_spec = Field(tokenize="spacy", tokenizer_language="fr",
                 init_token='<sos>', eos_token='<eos>', lower=True)

train_data, valid_data, test_data = IWSLT.splits(exts=('.en', '.fr'),
                                                 fields=(src_spec, trg_spec))
len(train_data.examples), len(valid_data.examples), len(test_data.examples)

# inspect a few examples
vars(train_data.examples[111])
vars(train_data.examples[11111])
vars(train_data.examples[111111])

src_spec.build_vocab(train_data, min_freq=2)
trg_spec.build_vocab(train_data, min_freq=2)
len(src_spec.vocab), len(trg_spec.vocab)
src_spec.vocab.stoi["cat"], trg_spec.vocab.stoi["chat"]
src_spec.vocab.itos[0], src_spec.vocab.itos[1], src_spec.vocab.itos[2], src_spec.vocab.itos[3]
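# A brief sketch of how the source field above numericalizes raw text; the sentence
# is arbitrary and out-of-vocabulary tokens map to the default '<unk>' index (0).
tokens = src_spec.preprocess("The cat sits on the mat.")  # tokenize + lowercase
ids = [src_spec.vocab.stoi[t] for t in tokens]
print(tokens, ids)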
src_spec = Field(
    tokenize="spacy",
    tokenizer_language="en",
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    batch_first=True,
    fix_length=100)
trg_spec = Field(
    tokenize="spacy",
    tokenizer_language="xx",  # no language-specific tokenizer available for cz
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    batch_first=True,
    fix_length=100)

train_data, valid_data, test_data = IWSLT.splits(
    exts=('.en', '.cs'), fields=(src_spec, trg_spec),
    test='IWSLT16.TED.tst2013')  # 2014 does not exist
len(train_data.examples), len(valid_data.examples), len(test_data.examples)

vars(train_data.examples[111])
vars(train_data.examples[11111])
vars(train_data.examples[111111])

src_spec.build_vocab(train_data, min_freq=2)
trg_spec.build_vocab(train_data, min_freq=2)
len(src_spec.vocab), len(trg_spec.vocab)
src_spec.vocab.stoi["hi"], trg_spec.vocab.stoi["ahoj"]
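# A hedged sketch of batching the en-cs data above; since both fields set
# batch_first=True and fix_length=100, each full batch tensor comes out as
# (batch_size, 100). Device and batch size are illustrative.
import torch
from torchtext.data import BucketIterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=64, device=device)
batch = next(iter(train_iter))
print(batch.src.shape, batch.trg.shape)  # (64, 100) each for a full batch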