Code example #1
File: utils.py Project: liyc7711/seq2seq-1
import re

import spacy
from torchtext.data import Field, BucketIterator  # torchtext.legacy.data on torchtext 0.9-0.11
from torchtext.datasets import IWSLT


def load_dataset(batch_size):
    # spaCy < 3.0 shortcut model names; newer releases need e.g. 'de_core_news_sm'
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    # mask <url>...</url> spans before tokenizing
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')
    EN = Field(tokenize=tokenize_en,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')
    train, val, test = IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src, min_freq=3)
    EN.build_vocab(train.trg, min_freq=3)
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test),
        batch_size=batch_size,
        sort_within_batch=True,
        sort_key=lambda e: len(e.src),
        repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
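
As a usage sketch (not part of the original file): the returned iterators can be consumed as below. Because both fields set include_lengths=True, each batch attribute is a (tensor, lengths) pair, and tensors are sequence-first since batch_first is left at its default.

# Illustrative usage only; assumes the torchtext legacy API used above and that
# the spaCy 'de'/'en' models are installed.
train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=32)
print(len(DE.vocab), len(EN.vocab))

for batch in train_iter:
    src, src_lengths = batch.src   # include_lengths=True -> (tensor, lengths)
    trg, trg_lengths = batch.trg
    # src has shape (src_len, batch_size); batch_first defaults to False
    break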
Code example #2
def iwslt():
    for src_lang, tgt_lang in zip([FR, EN], [EN, DE]):
        bpe_dir = join(ROOT_BPE_DIR, IWSLT.name,
                       IWSLT.base_dirname.format(src_lang[1:], tgt_lang[1:]))
        if os.path.exists(bpe_dir):
            LOGGER.info('BPE IWSLT for {}-{} exists, skipping...'.format(
                src_lang[1:], tgt_lang[1:]))
            continue
        os.makedirs(bpe_dir)

        # Download
        corpus_dir = join(
            ROOT_CORPUS_DIR, IWSLT.name,
            IWSLT.base_dirname.format(src_lang[1:], tgt_lang[1:]))
        LOGGER.info('downloading in {}...'.format(corpus_dir))
        IWSLT.dirname = IWSLT.base_dirname.format(src_lang[1:], tgt_lang[1:])
        IWSLT.urls = [
            IWSLT.base_url.format(src_lang[1:], tgt_lang[1:], IWSLT.dirname)
        ]
        IWSLT.download(root=ROOT_CORPUS_DIR, check=corpus_dir)
        IWSLT.clean(corpus_dir)

        # Tokenize
        token_dir = join(ROOT_TOK_DIR, IWSLT.name,
                         IWSLT.base_dirname.format(src_lang[1:], tgt_lang[1:]))
        os.makedirs(token_dir)
        suffix_langs = [
            (src_lang[1:] + '-' + tgt_lang[1:] + src_lang, src_lang),
            (src_lang[1:] + '-' + tgt_lang[1:] + tgt_lang, tgt_lang)
        ]
        prefixs = ['train', 'IWSLT16.TED.tst2013']
        for prefix, (suffix, lang) in product(prefixs, suffix_langs):
            in_file = join(corpus_dir, prefix + '.' + suffix)
            tok_file = join(token_dir, prefix + '.' + suffix)
            bpe_file = join(bpe_dir, prefix + '.' + suffix)
            _tokenize(in_file=in_file, out_file=tok_file, lang=lang)
            _apply_bpe(in_file=tok_file, out_file=bpe_file)
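
For reference, the FR, EN and DE constants are defined elsewhere in the project and are not shown here; the [1:] slicing suggests they are extension strings such as '.fr'. Under that assumption, the suffixes built above resolve as follows (illustrative only, not the project's code):

# Illustrative only: how suffix_langs resolves if FR == '.fr' and EN == '.en'.
src_lang, tgt_lang = '.fr', '.en'
suffix_langs = [
    (src_lang[1:] + '-' + tgt_lang[1:] + src_lang, src_lang),  # ('fr-en.fr', '.fr')
    (src_lang[1:] + '-' + tgt_lang[1:] + tgt_lang, tgt_lang),  # ('fr-en.en', '.en')
]
# Combined with the prefixes, this names files such as
# 'train.fr-en.fr' and 'IWSLT16.TED.tst2013.fr-en.en'.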
Code example #3
    def load(self):
        # for tokenizing the English sentences
        spacy_en = spacy.load('en')
        # for tokenizing the German sentences
        spacy_de = spacy.load('de')

        def tokenize_de(text):
            # Tokenizes the German text into a list of string tokens.
            # Reversing the source sentence (the [::-1] commented out below) is a
            # trick reported to give better seq2seq results; it is disabled here.
            return [tok.text for tok in spacy_de.tokenizer(text)]  # append [::-1] to reverse

        def tokenize_en(text):
            # Tokenizes the English text into a list of string tokens.
            return [tok.text for tok in spacy_en.tokenizer(text)]

        self.SRC = Field(tokenize=tokenize_de, init_token='<sos>', eos_token='<eos>', lower=True)
        self.TRG = Field(tokenize=tokenize_en, init_token='<sos>', eos_token='<eos>', lower=True)
        self.train_data, self.valid_data, self.test_data = IWSLT.splits(exts=('.de', '.en'),
                                                                        fields=(self.SRC, self.TRG))
        print("Number of training samples: {}".format(len(self.train_data.examples)))
        print("Number of validation samples: {}".format(len(self.valid_data.examples)))
        print("Number of testing samples: {}".format(len(self.test_data.examples)))
Code example #4
def main(colab_args=None, do_train=True):
    if colab_args:
        args = colab_args
    else:
        parser = argparse.ArgumentParser()

        parser.add_argument(
            "--output_dir",
            type=str,
            required=True,
            help="The output directory where the model predictions and checkpoints will be written.",
        )
        parser.add_argument(
            "--should_continue", action="store_true", help="Whether to continue from latest checkpoint in output_dir"
        )
        parser.add_argument(
            "--model_name_or_path",
            default=None,
            type=str,
            help="The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
        )
        parser.add_argument(
            "--data_path",
            default=None,
            type=str,
            help="The csv file for training the model"
        )
        parser.add_argument(
            "--config_name",
            default=None,
            type=str,
            help="Optional pretrained config name or path if not the same as model_name_or_path. If both are None, initialize a new config.",
        )
        parser.add_argument(
            "--tokenizer_name",
            default=None,
            type=str,
            help="Optional pretrained tokenizer name or path if not the same as model_name_or_path. If both are None, initialize a new tokenizer.",
        )
        parser.add_argument(
            "--cache_dir",
            default=None,
            type=str,
            help="Optional directory to store the pre-trained models downloaded from s3 (instead of the default one)",
        )
        parser.add_argument(
            "--block_size",
            default=-1,
            type=int,
            help="Optional input sequence length after tokenization."
                 "The training dataset will be truncated in block of this size for training."
                 "Default to the model max input length for single sentence inputs (take into account special tokens).",
        )
        parser.add_argument("--per_gpu_train_batch_size", default=4, type=int,
                            help="Batch size per GPU/CPU for training.")
        parser.add_argument(
            "--gradient_accumulation_steps",
            type=int,
            default=1,
            help="Number of updates steps to accumulate before performing a backward/update pass.",
        )
        parser.add_argument("--learning_rate", default=1e-5, type=float, help="The initial learning rate for Adam.")
        parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
        parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
        parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
        parser.add_argument(
            "--num_train_epochs", default=1.0, type=float, help="Total number of training epochs to perform."
        )
        parser.add_argument(
            "--max_steps",
            default=-1,
            type=int,
            help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
        )
        parser.add_argument("--log_dir", default=".", type=str, help="Directory to store the logs.")
        parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
        parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
        parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
        parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
        parser.add_argument(
            "--save_total_limit",
            type=int,
            default=None,
            help="Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default",
        )
        parser.add_argument(
            "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
        )
        parser.add_argument(
            "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
        )
        parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
        args = parser.parse_args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
            os.path.exists(args.output_dir)
            and os.listdir(args.output_dir)
            and do_train  # do_train is a function argument, not an argparse option
            and not args.overwrite_output_dir
            and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU
    device = torch.device('cuda:{}'.format(torch.cuda.current_device()) if torch.cuda.is_available() else "cpu")
    args.n_gpu = 0 if device.type == 'cpu' else torch.cuda.device_count()  # compare device.type, not the device object
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )

    # Set seed
    set_seed(args)

    # setup tokenizer and model
    # (tokenize_en / tokenize_de are assumed to be tokenizer helpers defined elsewhere in the module)
    tok = Field(tokenize=tokenize_en,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True,
                batch_first=True)

    plc = Field(tokenize=tokenize_de,
                init_token='<sos>',
                eos_token='<eos>',
                lower=True,
                batch_first=True)

    train_data, valid_data, test_data = IWSLT.splits(exts=('.en', '.de'), fields=(tok, plc))

    tok.build_vocab(train_data, min_freq=1)
    plc.build_vocab(train_data, min_freq=1)

    train_dataloader, valid_dataloader, test_dataloader = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=args.per_gpu_train_batch_size,
        device=args.device)

    config = data_globals.config
    config.vocab_size = len(tok.vocab)

    if args.model_name_or_path is None:
        # start from initial model
        print('### LOADING INITIAL MODEL ###')
        model = VideoTransformer(config=config, args=args)
        model.apply(initialize_weights)
    else:
        # start from checkpoint
        print('### LOADING MODEL FROM CHECKPOINT:', args.model_name_or_path, '###')
        model = VideoTransformer.from_pretrained(config=config, args=args)

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    val(args, model, test_dataloader)

    # Training
    if do_train:
        global_step, tr_loss = train(args, model, train_dataloader, test_dataloader)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
    else:
        inference(args, model, valid_data, tok)
Code example #5
import torch
from torchtext.data import Field, BucketIterator
from torchtext.datasets import IWSLT, Multi30k

from src.manager import BaselineModelManager, PointerSoftmaxModelManager

src = Field(batch_first=True, include_lengths=True, lower=True)
trg = Field(batch_first=True, include_lengths=True, lower=True)

train_data, val, test = IWSLT.splits(
    exts=('.en', '.de'),
    fields=(src, trg),
    filter_pred=lambda x: max(len(vars(x)['src']), len(vars(x)['trg'])) <= 50)

# build vocab using train data only
src.build_vocab(train_data, min_freq=2, max_size=7700)
trg.build_vocab(train_data, min_freq=2, max_size=9500)

device = torch.device('cuda')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, val, test), batch_size=32, device=device)

pad_idx = trg.vocab.stoi['<pad>']

N_EPOCHS = 10

model = BaselineModelManager(src_vocab=src.vocab,
                             tgt_vocab=trg.vocab,
                             pad_idx=pad_idx)
# model = PointerSoftmaxModelManager(src_vocab=src.vocab, tgt_vocab=trg.vocab, pad_idx=pad_idx)
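
Because both fields set include_lengths=True and batch_first=True, every batch attribute is a (tensor, lengths) pair with batch-major tensors. A minimal sketch of unpacking one batch (the manager classes' training interface is project-specific and not shown here):

# Illustrative only: inspect one batch from the iterators built above.
batch = next(iter(train_iterator))
src_tokens, src_lengths = batch.src   # src_tokens: (batch_size, src_len)
trg_tokens, trg_lengths = batch.trg   # trg_tokens: (batch_size, trg_len)
print(src_tokens.shape, src_lengths.shape, trg_tokens.shape)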
Code example #6
import numpy as np
import math

from torchtext.data import Field
from torchtext.datasets import IWSLT

src_spec = Field(tokenize="spacy",
                 tokenizer_language="en",
                 init_token='<sos>',
                 eos_token='<eos>',
                 lower=True)

trg_spec = Field(tokenize="spacy",
                 tokenizer_language="fr",
                 init_token='<sos>',
                 eos_token='<eos>',
                 lower=True)

train_data, valid_data, test_data = IWSLT.splits(exts=('.en', '.fr'),
                                                 fields=(src_spec, trg_spec))
len(train_data.examples), len(valid_data.examples), len(test_data.examples)

vars(train_data.examples[111])
vars(train_data.examples[11111])
vars(train_data.examples[111111])

src_spec.build_vocab(train_data, min_freq=2)
trg_spec.build_vocab(train_data, min_freq=2)

len(src_spec.vocab), len(trg_spec.vocab)

src_spec.vocab.stoi["cat"], trg_spec.vocab.stoi["chat"]

src_spec.vocab.itos[0], src_spec.vocab.itos[1], src_spec.vocab.itos[2], src_spec.vocab.itos[3]
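
To turn the lookups above into model-ready tensors, the legacy Field.process call pads and numericalizes a list of token lists. A small illustrative sketch (not part of the original file):

# Illustrative only: numericalize one training example with the built vocabularies.
example = train_data.examples[111]
src_tensor = src_spec.process([example.src])   # shape (src_len + 2, 1): <sos> and <eos> added
trg_tensor = trg_spec.process([example.trg])
print(src_tensor.shape, trg_tensor.shape)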
Code example #7
File: transformer.py Project: ykopylov/torchbook
from torchtext.data import Field
from torchtext.datasets import IWSLT

# Head of the source-side Field, truncated in the extract; reconstructed to
# mirror trg_spec below, assuming the English spaCy tokenizer for the source side.
src_spec = Field(tokenize="spacy",
                 tokenizer_language="en",
                 init_token='<sos>',
                 eos_token='<eos>',
                 lower=True,
                 batch_first=True,
                 fix_length=100)

trg_spec = Field(
    tokenize="spacy",
    tokenizer_language="xx",  # no language-specific tokenizer available for cz
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    batch_first=True,
    fix_length=100)

train_data, valid_data, test_data = IWSLT.splits(
    exts=('.en', '.cs'),
    fields=(src_spec, trg_spec),
    test='IWSLT16.TED.tst2013')  # 2014 does not exist

len(train_data.examples), len(valid_data.examples), len(test_data.examples)

vars(train_data.examples[111])
vars(train_data.examples[11111])
vars(train_data.examples[111111])

src_spec.build_vocab(train_data, min_freq=2)
trg_spec.build_vocab(train_data, min_freq=2)

len(src_spec.vocab), len(trg_spec.vocab)

src_spec.vocab.stoi["hi"], trg_spec.vocab.stoi["ahoj"]