Example #1
            tokens += batch.ntokens
            if i % 50 == 1:
                elapsed = time.time() - start
                self.logger.info("Epoch Step: %d Loss: %f Tokens per Sec: %f",
                                 i, loss / batch.ntokens, tokens / elapsed)
                start = time.time()
                tokens = 0

        return total_loss / total_tokens

    def postprocess(self):
        pass


if __name__ == "__main__":
    ctx = Context(desc="Learning-fix based on Transformer")
    logger = ctx.logger

    logger.info("Build Data Process Engine based on input parsed dataset ...")
    engine = DataProcessEngine(ctx)

    logger.info("Preparing dataset and build model for trani ...")
    engine.preprocess(data_source_type="small")

    logger.info("Training and evaluating the model ...")
    engine.run(loss_func=SimpleLossComputeWithLablSmoothing,
               opt=get_std_opt(engine.model))

    logger.info("Testing and data clean ...")
    engine.postprocess()
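
get_std_opt is used above but not shown in this snippet; a minimal sketch of what it might look like, following the Annotated Transformer convention (the factor of 2 and warmup of 4000 are assumptions, and NoamOpt is taken from the surrounding project):

import torch

def get_std_opt(model):
    # Standard NoamOpt wrapper around Adam, scaled by the model's embedding size.
    return NoamOpt(model.src_embed[0].d_model, 2, 4000,
                   torch.optim.Adam(model.parameters(), lr=0,
                                    betas=(0.9, 0.98), eps=1e-9))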
Example #2
import logging
from nmt.utils.pipe import shared_tokens_generator, source_tokens_generator, target_tokens_generator
from nmt.utils.context import Context
from benchmarks.example.datasets import TranslationDataset, TokenizedTranslationDataset
from benchmarks.example.datasets import IndexedInputTargetTranslationDataset
from benchmarks.example.datasets import IndexedInputTargetTranslationDatasetOnTheFly, TranslationDatasetOnTheFly
from benchmarks.example.dictionaries import IndexDictionary

if __name__ == "__main__":
    context = Context(desc="dataset")
    logger = context.logger

    if logger.isEnabledFor(logging.DEBUG):
        # Preparing raw train/val datasets: a file with one (src, tgt) pair per line
        # src-train.txt + tgt-train.txt --> raw-train.txt
        # src-val.txt + tgt-val.txt --> raw-val.txt
        logger.debug("The raw train and validation datasets are being generated ...")
        TranslationDataset.prepare(context)

    # the train dataset as a list [(src, tgt), ..., (src, tgt)], built from raw-train.txt
    logger.info(
        "The train dataset [(src, tgt), ..., (src, tgt)] is being generated ...")
    translation_dataset = TranslationDataset(context, 'train')

    if logger.isEnabledFor(logging.DEBUG):
        # the train dataset as a list [(src, tgt), ..., (src, tgt)], built on the fly from src-train.txt and tgt-train.txt
        logger.debug(
            "The train dataset [(src, tgt), ..., (src, tgt)] is being generated on the fly ..."
        )
        translation_dataset_on_the_fly = TranslationDatasetOnTheFly(
            context, 'train')
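
The IndexDictionary imported above maps tokens to integer ids; its constructor is not shown in this snippet. As a generic illustration of the idea only (the build_index helper and the reserved-id convention below are hypothetical, not the project's API):

from collections import Counter

def build_index(token_streams, vocabulary_size=None):
    # Count tokens across all streams and keep the most frequent ones.
    counts = Counter(tok for stream in token_streams for tok in stream)
    most_common = counts.most_common(vocabulary_size)
    # Reserve low ids for special tokens such as <pad>/<unk>/<s>/</s>.
    return {token: idx for idx, (token, _) in enumerate(most_common, start=4)}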
Example #3
        out = model(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            ctx.logger.info("Epoch Step: %d Loss: %f Tokens per Sec: %f",
                            i, loss / batch.ntokens, tokens / elapsed)
            start = time.time()
            tokens = 0
    return total_loss / total_tokens

if __name__ == "__main__":
    # Train the simple copy task.
    ctx = Context(desc="Train")
    logger = ctx.logger
    vocab_size = 11  # V_Size
    criterion = LabelSmoothing(size=vocab_size, padding_idx=0, smoothing=0.0)
    model = build_model(ctx, vocab_size, vocab_size)
    logger.info(model)
    model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
                        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

    for epoch in range(100):
        logger.debug("Training Epoch %d", epoch)
        model.train()
        run_epoch(data_gen(vocab_size, 30, 20),
                  model,
                  SimpleLossCompute(model.generator, criterion, model_opt),
                  ctx)
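
data_gen is not included in this snippet; a sketch of the copy-task generator in the spirit of the Annotated Transformer (it assumes the project's Batch class, and the fixed sequence length of 10 is an assumption):

import torch

def data_gen(V, batch_size, nbatches):
    # Yield random integer sequences where the target equals the source (the copy task).
    for _ in range(nbatches):
        data = torch.randint(1, V, (batch_size, 10))
        data[:, 0] = 1  # every sequence starts with the start symbol
        yield Batch(data, data, pad=0)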
Example #4
                              target="train",
                              dataset=data_type,
                              padding=pad_idx,
                              src_vocab=None,
                              tgt_vocab=None)

    src_vocab = train_dataset.src_vocab
    tgt_vocab = train_dataset.tgt_vocab

    logger.info(f"Preparing eval dataset ... ")
    eval_dataset = LFDataset(ctx=context,
                             target="eval",
                             dataset=data_type,
                             padding=pad_idx,
                             src_vocab=src_vocab,
                             tgt_vocab=tgt_vocab)

    logger.info(f"Preparing test dataset ... ")
    test_dataset = LFDataset(ctx=context,
                             target="test",
                             dataset=data_type,
                             padding=pad_idx,
                             src_vocab=src_vocab,
                             tgt_vocab=tgt_vocab)

    return train_dataset, eval_dataset, test_dataset


if __name__ == "__main__":
    dataset_generation(Context("Learning-fix based on Transformer"))
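
If LFDataset follows the usual torch.utils.data.Dataset protocol (an assumption; the batch size and default collate behaviour below are illustrative only), the returned splits could be wrapped in DataLoaders like this:

from torch.utils.data import DataLoader

train_dataset, eval_dataset, test_dataset = dataset_generation(
    Context("Learning-fix based on Transformer"))
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False)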
Example #5
def run():
    ctx = Context("Train_MultiGPU")
    logger = ctx.logger
    nums_batch = ctx.batch_size
    epochs = ctx.epochs

    # For data loading.
    from torchtext import data, datasets
    logger.info(f"Preparing dataset with batch size ... ")
    import spacy

    # !pip install torchtext spacy
    # !python -m spacy download en
    # !python -m spacy download de

    logger.info("Load en/de data from local ...")
    spacy_de = spacy.load('de', path=ctx.project_raw_dir)
    spacy_en = spacy.load('en', path=ctx.project_raw_dir)

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    # tokenize_en("I am a Chinese")  --> ['I', 'am', 'a', 'Chinese']
    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Preparing dataset
    logger.info("Build SRC and TGT Fields ...")
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en,
                     init_token=BOS_WORD,
                     eos_token=EOS_WORD,
                     pad_token=BLANK_WORD)
    logger.info(
        "Splitting the dataset into train, val and test using the SRC/TGT fields ...")
    MAX_LEN = 150
    # Split the dataset under the root path into train, val, and test datasets
    train, val, test = datasets.IWSLT.splits(
        exts=('.de', '.en'
              ),  # A tuple containing the extension to path for each language.
        fields=(
            SRC, TGT
        ),  # A tuple containing the fields that will be used for data in each language.
        root=ctx.project_raw_dir,  # Root dataset storage directory.
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(
            vars(x)['trg']) <= MAX_LEN)

    logger.info("Build vocabularies for src and tgt ...")
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

    # GPUs to use
    devices = ctx.device_id  # e.g. [0, 1, 2, 3]
    pad_idx = TGT.vocab.stoi["<blank>"]
    logger.info("Build Model ...")
    model = build_model(ctx, len(SRC.vocab), len(TGT.vocab))
    if ctx.is_cuda:
        model.cuda()

    # Print out log info for debug ...
    logger.info(model)

    criterion = LabelSmoothing(size=len(TGT.vocab),
                               padding_idx=pad_idx,
                               smoothing=0.1)
    if ctx.is_cuda:
        criterion.cuda()

    logger.info("Generating Training and Validating Batch datasets ...")
    train_iter = MyIterator(train,
                            batch_size=nums_batch,
                            device=ctx.device,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=True)
    logger.info(
        f"Training Dataset: epoch[{epochs}], iterations[{train_iter.iterations}], batch size [{nums_batch}]"
    )

    valid_iter = MyIterator(val,
                            batch_size=nums_batch,
                            device=ctx.device,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=False)

    logger.info(
        f"Validation Dataset: epoch[{epochs}], iterations[{valid_iter.iterations}], batch size [{nums_batch}]"
    )

    if ctx.is_gpu_parallel:
        # Using multiple GPU resource to train ...
        model_parallel = nn.DataParallel(model, device_ids=devices)
        loss_func = MultiGPULossCompute
    elif ctx.is_cuda:
        # Using Single GPU resource to train ...
        model_parallel = model
        loss_func = SimpleLossCompute
    else:
        # Using Single CPU resource to train ...
        model_parallel = model
        loss_func = SimpleLossCompute

    logger.info("Training Process is begining ...")

    # Train from scratch; set to False to load a saved model from checkpoint instead
    if True:
        model_opt = NoamOpt(model_size=model.src_embed[0].d_model,
                            factor=1,
                            warmup=2000,
                            optimizer=torch.optim.Adam(model.parameters(),
                                                       lr=0,
                                                       betas=(0.9, 0.98),
                                                       eps=1e-9))
        for epoch in range(epochs):
            # Set model in train
            model_parallel.train()
            run_epoch((rebatch(pad_idx, b) for b in train_iter),
                      model_parallel,
                      loss_func(model.generator,
                                criterion,
                                devices,
                                opt=model_opt), ctx)

            # Evaluation Model
            model_parallel.eval()

            # Get loss
            loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
                             model_parallel,
                             loss_func(model.generator,
                                       criterion,
                                       devices,
                                       opt=None), ctx)
            logger.info("The loss is %d", loss)
    else:
        model = torch.load("iwslt.pt")

    logger.info("Training is over and Evaluate Model  ...")
    for i, batch in enumerate(valid_iter):
        src = batch.src.transpose(0, 1)[:1]
        src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
        out = greedy_decode(model,
                            src,
                            src_mask,
                            max_len=60,
                            start_symbol=TGT.vocab.stoi["<s>"])
        print("Translation:", end="\t")
        for i in range(1, out.size(1)):
            sym = TGT.vocab.itos[out[0, i]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        print("Target:", end="\t")
        for i in range(1, batch.trg.size(0)):
            sym = TGT.vocab.itos[batch.trg.data[i, 0]]
            if sym == "</s>": break
            print(sym, end=" ")
        print()
        break
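
For reference, the NoamOpt schedule configured above (factor=1, warmup=2000) follows the learning-rate formula from "Attention Is All You Need"; a stand-alone sketch of that formula:

def noam_rate(step, model_size, factor=1, warmup=2000):
    # lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
    return factor * (model_size ** -0.5) * min(step ** -0.5, step * warmup ** -1.5)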
Example #6
                    smoothing_function=smoothing_function.method3)
                line = "{bleu_score}\t{source}\t{target}\t|\t{prediction}".format(
                    bleu_score=sentence_bleu_score,
                    source=source,
                    target=target,
                    prediction=prediction)
                file.write(line + '\n')

        return corpus_bleu(list_of_references,
                           hypotheses,
                           smoothing_function=smoothing_function.method3)


if __name__ == "__main__":

    context = Context("Evaluation")
    logger = context.logger

    logger.info('Constructing dictionaries...')
    source_dictionary = IndexDictionary.load(
        context.proj_processed_dir,
        mode='source',
        vocabulary_size=context.vocabulary_size)
    target_dictionary = IndexDictionary.load(
        context.proj_processed_dir,
        mode='target',
        vocabulary_size=context.vocabulary_size)

    logger.info('Building model...')
    model = build_model(context, source_dictionary.vocabulary_size,
                        target_dictionary.vocabulary_size)
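
The BLEU computation at the top of this example uses NLTK's smoothed sentence- and corpus-level BLEU; a minimal self-contained illustration of the same calls (the sentences here are made up):

from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

smoothing_function = SmoothingFunction()
reference = "the cat sat on the mat".split()
hypothesis = "the cat sat on a mat".split()
# Sentence-level BLEU for one hypothesis against its reference(s).
print(sentence_bleu([reference], hypothesis,
                    smoothing_function=smoothing_function.method3))
# Corpus-level BLEU over all (references, hypothesis) pairs.
print(corpus_bleu([[reference]], [hypothesis],
                  smoothing_function=smoothing_function.method3))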
Example #7
        hypothesises, attentions = [], []
        for i, (times, k) in enumerate(ks[:num_candidates]):
            hypothesis, attention = beam.get_hypothesis(times, k)
            hypothesises.append(hypothesis)
            attentions.append(attention)

        self.attentions = attentions
        self.hypothesises = [[token.item() for token in h]
                             for h in hypothesises]
        hs = [self.postprocess(h) for h in self.hypothesises]
        return list(reversed(hs))


if __name__ == "__main__":

    context = Context(desc="Prediction")
    logger = context.logger

    logger.info('Constructing dictionaries...')
    source_dictionary = IndexDictionary.load(
        context.project_processed_dir,
        mode='source',
        vocabulary_size=context.vocabulary_size)
    target_dictionary = IndexDictionary.load(
        context.project_processed_dir,
        mode='target',
        vocabulary_size=context.vocabulary_size)

    logger.info('Building model...')
    model = build_model(context, source_dictionary.vocabulary_size,
                        target_dictionary.vocabulary_size)
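
Prediction scripts like this one usually restore trained weights right after build_model; a hedged sketch assuming a checkpoint path is available on the context (the checkpoint_path attribute name is hypothetical):

import torch

state_dict = torch.load(context.checkpoint_path, map_location='cpu')  # hypothetical attribute
model.load_state_dict(state_dict)
model.eval()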
Example #8
    else:
        loss_function = TokenCrossEntropyLoss()

    accuracy_function = AccuracyMetric()

    if ctx.optimizer == 'Noam':
        optimizer = NoamOptimizer(model.parameters(), d_model=ctx.d_model)
    elif ctx.optimizer == 'Adam':
        optimizer = Adam(model.parameters(), lr=ctx.lr)
    else:
        raise NotImplementedError()

    logger.info('Start training...')
    trainer = TransformerTrainer(model=model,
                                 train_dataloader=train_dataloader,
                                 val_dataloader=val_dataloader,
                                 loss_function=loss_function,
                                 metric_function=accuracy_function,
                                 optimizer=optimizer,
                                 run_name=run_name,
                                 ctx=ctx)

    trainer.run(ctx.epochs)

    return trainer


if __name__ == '__main__':
    run_trainer_standalone(
        Context(desc="Train Example Project with GPU Resource!"))