Example no. 1
    def preprocess(self, data_source_type="small"):

        self.logger.info(
            f"Loading {data_source_type} data from disk and parsing it into batches ..."
        )
        train_dataset, eval_dataset, test_dataset = dataset_generation(
            self.context, data_type=data_source_type)

        self.logger.info("Build iteral dataset ... ")
        self.train_iter = DataLoader(train_dataset,
                                     batch_size=self.nums_batch,
                                     shuffle=True,
                                     collate_fn=input_target_collate_fn)

        self.eval_iter = DataLoader(eval_dataset,
                                    batch_size=self.nums_batch,
                                    shuffle=True,
                                    collate_fn=input_target_collate_fn)

        self.test_iter = DataLoader(test_dataset,
                                    batch_size=self.nums_batch,
                                    shuffle=True,
                                    collate_fn=custom_collate_fn)

        self.logger.info("Build src/tgt Vocabulary ...")
        self.src_vocab = train_dataset.src_vocab
        self.tgt_vocab = train_dataset.tgt_vocab

        self.logger.info("Build transformer model ...")
        self.model = build_model(self.context, len(self.src_vocab),
                                 len(self.tgt_vocab))
        if self.context.is_cuda:
            self.model.cuda()
        self.logger.debug(self.model)
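
The collate functions passed to DataLoader above (input_target_collate_fn, custom_collate_fn) are not shown in these examples. Below is a minimal sketch of what a pad-and-stack collate function might look like, assuming each dataset item is a (source, target) pair of token-id lists and that 0 is the padding index:

import torch

def input_target_collate_fn(batch, pad_idx=0):
    # Hypothetical sketch: pad each (source, target) pair of token-id lists
    # to the longest sequence in the batch and stack them into LongTensors.
    sources, targets = zip(*batch)
    max_src = max(len(s) for s in sources)
    max_tgt = max(len(t) for t in targets)
    src_tensor = torch.full((len(batch), max_src), pad_idx, dtype=torch.long)
    tgt_tensor = torch.full((len(batch), max_tgt), pad_idx, dtype=torch.long)
    for i, (s, t) in enumerate(zip(sources, targets)):
        src_tensor[i, :len(s)] = torch.tensor(s, dtype=torch.long)
        tgt_tensor[i, :len(t)] = torch.tensor(t, dtype=torch.long)
    return src_tensor, tgt_tensor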
Example no. 2
    def preprocess(self, data_source_type="small"):

        self.logger.info(f"Loading {data_source_type} data from disk and parse it as a bunch of batches ...")
        train_dataset, eval_dataset, test_dataset = dataset_generation(self.context, data_type=data_source_type)

        self.logger.info("Build iteral dataset ... ")
        self.train_iter = generated_iter_dataset(self.context, train_dataset, self.nums_batch)
        self.eval_iter = generated_iter_dataset(self.context, eval_dataset, self.nums_batch)
        self.test_iter = generated_iter_dataset(self.context, test_dataset, self.nums_batch)

        self.logger.info("Build src/tgt Vocabulary ...")
        self.src_vocab = train_dataset.src_vocab
        self.tgt_vocab = train_dataset.tgt_vocab

        self.logger.info("Build transformer model ...")
        self.model = build_model(self.context, len(self.src_vocab), len(self.tgt_vocab))
        # self.model.cuda() if self.context.is_cuda else None
        self.logger.debug(self.model)
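
generated_iter_dataset is not defined in these snippets; judging from Example no. 1, it is presumably a thin wrapper around DataLoader. A sketch, with the helper name and signature taken from the call sites above:

from torch.utils.data import DataLoader

def generated_iter_dataset(context, dataset, batch_size):
    # Hypothetical sketch: wrap a dataset in a shuffling DataLoader,
    # mirroring the explicit construction shown in Example no. 1.
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=True,
                      collate_fn=input_target_collate_fn)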
Example no. 3
        if i % 50 == 1:
            elapsed = time.time() - start
            ctx.logger.info("Epoch Step: %d Loss: %f Tokens per Sec: %f", i,
                            loss / batch.ntokens, tokens / elapsed)
            start = time.time()
            tokens = 0
    return total_loss / total_tokens


if __name__ == "__main__":
    # Train the simple copy task.
    ctx = Context(desc="Train")
    logger = ctx.logger
    vocab_size = 11  # V_Size
    criterion = LabelSmoothing(size=vocab_size, padding_idx=0, smoothing=0.0)
    model = build_model(ctx, vocab_size, vocab_size)
    logger.info(model)
    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 400,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))

    for epoch in range(100):
        logger.debug("Training Epoch %d", epoch)
        model.train()
        run_epoch(data_gen(vocab_size, 30, 20), model,
                  SimpleLossCompute(model.generator, criterion, model_opt),
                  ctx)

        model.eval()
        run_epoch(data_gen(vocab_size, 30, 5), model,
                  SimpleLossCompute(model.generator, criterion, None),
                  ctx)
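
NoamOpt, used above with factor 1 and 400 warmup steps, implements the warmup-then-decay learning-rate schedule from the Transformer paper. It is not reproduced in these snippets; a minimal sketch of the standard implementation, with the interface assumed from the call sites:

class NoamOpt:
    # Sketch of the Noam schedule:
    #   lr = factor * model_size ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self._step = 0

    def rate(self, step=None):
        step = step or self._step
        return self.factor * (self.model_size ** -0.5 *
                              min(step ** -0.5, step * self.warmup ** -1.5))

    def step(self):
        # Update the learning rate of the wrapped optimizer, then step it.
        self._step += 1
        rate = self.rate()
        for group in self.optimizer.param_groups:
            group['lr'] = rate
        self.optimizer.step()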
Example no. 4

if __name__ == "__main__":

    context = Context(desc="Prediction")
    logger = context.logger

    logger.info('Constructing dictionaries...')
    source_dictionary = IndexDictionary.load(
        context.project_processed_dir,
        mode='source',
        vocabulary_size=context.vocabulary_size)
    target_dictionary = IndexDictionary.load(
        context.project_processed_dir,
        mode='target',
        vocabulary_size=context.vocabulary_size)

    logger.info('Building model...')
    model = build_model(context, source_dictionary.vocabulary_size,
                        target_dictionary.vocabulary_size)

    logger.info("Building Predictor ....")
    predictor = Predictor(ctx=context,
                          m=model,
                          src_dictionary=source_dictionary,
                          tgt_dictionary=target_dictionary)

    logger.info("Get Predict Result ...")
    for index, candidate in enumerate(predictor.predict_one()):
        logger.info(f'Candidate {index} : {candidate}')
Example no. 5
def run():
    ctx = Context("Train_MultiGPU")
    logger = ctx.logger
    nums_batch = ctx.batch_size
    epochs = ctx.epochs

    # For data loading.
    from torchtext import data, datasets
    logger.info(f"Preparing dataset with batch size ... ")
    import spacy

    # !pip install torchtext spacy
    # !python -m spacy download en
    # !python -m spacy download de

    logger.info("Load en/de data from local ...")
    spacy_de = spacy.load('de', path=ctx.project_raw_dir)
    spacy_en = spacy.load('en', path=ctx.project_raw_dir)


    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    # tokenize_en("I am Chinese")  --> ['I', 'am', 'Chinese']
    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Preparing dataset
    logger.info("Build SRC and TGT Fields ...")
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                     eos_token=EOS_WORD, pad_token=BLANK_WORD)
    logger.info("Split datasets into train, val and test using SRC/TGT fileds ...")
    MAX_LEN = 150
    # Split the dataset in the root path into train, val, and test sets
    train, val, test = datasets.IWSLT.splits(
        exts=('.de', '.en'),  # A tuple containing the extension to path for each language.
        fields=(SRC, TGT),  # A tuple containing the fields that will be used for data in each language.
        root=ctx.project_raw_dir,  # Root dataset storage directory.
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN)

    logger.info("Build vocabularies for src and tgt ...")
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

    # GPUs to use
    devices = ctx.device_id   #  [0, 1, 2, 3]
    pad_idx = TGT.vocab.stoi["<blank>"]
    logger.info("Build Model ...")
    model = build_model(ctx, len(SRC.vocab), len(TGT.vocab))
    if ctx.is_cuda:
        model.cuda()

    # Log the model structure for debugging
    logger.info(model)


    criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    if ctx.is_cuda:
        criterion.cuda()

    logger.info("Generating Training and Validating Batch datasets ...")
    train_iter = MyIterator(train, batch_size=nums_batch, device=ctx.device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    logger.info(f"Trainning Dataset: epoch[{epochs}], iterations[{train_iter.iterations}], batch size [{nums_batch}]")

    valid_iter = MyIterator(val, batch_size=nums_batch, device=ctx.device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)

    logger.info(f"Validate Dataset: epoch[{epochs}], iterations[{valid_iter.iterations}], batch size [{nums_batch}]")

    if ctx.is_gpu_parallel:
        # Using multiple GPU resource to train ...
        model_parallel = nn.DataParallel(model, device_ids=devices)
        loss_func = MultiGPULossCompute
    elif ctx.is_cuda:
        # Using Single GPU resource to train ...
        model_parallel = model
        loss_func = SimpleLossCompute
    else:
        # Using Single CPU resource to train ...
        model_parallel = model
        loss_func = SimpleLossCompute

    logger.info("Training Process is begining ...")

    # Training or load model from checkpoint
    if True:
        model_opt = NoamOpt(model_size = model.src_embed[0].d_model,
                            factor = 1,
                            warmup = 2000,
                            optimizer = torch.optim.Adam(model.parameters(),
                                                         lr=0, betas=(0.9, 0.98), eps=1e-9))
        for epoch in range(epochs):
            # Set model in train
            model_parallel.train()
            run_epoch((rebatch(pad_idx, b) for b in train_iter),
                      model_parallel,
                      loss_func(model.generator, criterion, devices, opt=model_opt),
                      ctx)

            # Evaluation Model
            model_parallel.eval()

            # Get loss
            loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
                             model_parallel,
                             loss_func(model.generator, criterion, devices, opt=None),
                             ctx)
            logger.info("The loss is %d", loss)
    else:
        model = torch.load("iwslt.pt")

    logger.info("Training is over and Evaluate Model  ...")
    for i, batch in enumerate(valid_iter):
        src = batch.src.transpose(0, 1)[:1]
        src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
        out = greedy_decode(model, src, src_mask,
                            max_len=60, start_symbol=TGT.vocab.stoi["<s>"])
        print("Translation:", end="\t")
        for i in range(1, out.size(1)):
            sym = TGT.vocab.itos[out[0, i]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        print("Target:", end="\t")
        for i in range(1, batch.trg.size(0)):
            sym = TGT.vocab.itos[batch.trg.data[i, 0]]
            if sym == "</s>": break
            print(sym, end=" ")
        print()
        break
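
greedy_decode, called in the evaluation loop above, is not included in these snippets. Assuming the model exposes the encode/decode/generator interface of the Annotated Transformer, plus a subsequent_mask helper for the decoder's causal mask, it looks roughly like this:

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    # Sketch: encode the source once, then extend the target one token at a
    # time, always taking the highest-probability next token.
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for _ in range(max_len - 1):
        out = model.decode(memory, src_mask, ys,
                           subsequent_mask(ys.size(1)).type_as(src.data))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)],
                       dim=1)
    return ys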
Example no. 6

if __name__ == "__main__":

    context = Context(desc="Prediction")
    logger = context.logger
    data_source_type = "small"
    logger.info(
        f"Loading {data_source_type} data from disk and parsing it into batches ..."
    )
    train_dataset, eval_dataset, test_dataset = dataset_generation(
        context, data_type=data_source_type)

    logger.info('Constructing dictionaries...')
    source_dictionary = train_dataset.src_vocab
    target_dictionary = train_dataset.tgt_vocab

    logger.info('Building model...')
    model = build_model(context, len(source_dictionary),
                        len(target_dictionary))

    logger.info("Building Predictor ....")
    predictor = Predictor(ctx=context,
                          m=model,
                          src_dictionary=source_dictionary,
                          tgt_dictionary=target_dictionary)

    logger.info("Get Predict Result ...")
    for index, candidate in enumerate(predictor.predict()):
        logger.info(f'Candidate {index} : {candidate}')
Example no. 7
def run_trainer_standalone(ctx):
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)
    logger = ctx.logger

    run_name = ("d_model={d_model}-"
                "layers_count={layers_count}-"
                "heads_count={heads_count}-"
                "pe={positional_encoding}-"
                "optimizer={optimizer}-"
                "{timestamp}").format(**ctx.config,
                                      timestamp=datetime.now().strftime(
                                          "%Y_%m_%d_%H_%M_%S"))

    logger.info(f'Run name : {run_name}')
    logger.info('Constructing dictionaries...')
    source_dictionary = IndexDictionary.load(
        ctx.project_processed_dir,
        mode='source',
        vocabulary_size=ctx.vocabulary_size)
    logger.info(
        f'Source dictionary vocabulary Size: {source_dictionary.vocabulary_size} tokens'
    )

    target_dictionary = IndexDictionary.load(
        ctx.project_processed_dir,
        mode='target',
        vocabulary_size=ctx.vocabulary_size)
    logger.info(
        f'Target dictionary vocabulary Size: {target_dictionary.vocabulary_size} tokens'
    )

    logger.info('Building model...')
    model = build_model(ctx, source_dictionary.vocabulary_size,
                        target_dictionary.vocabulary_size)

    logger.info(model)
    logger.info(
        'Encoder : {parameters_count} parameters'.format(parameters_count=sum(
            [p.nelement() for p in model.encoder.parameters()])))
    logger.info(
        'Decoder : {parameters_count} parameters'.format(parameters_count=sum(
            [p.nelement() for p in model.decoder.parameters()])))
    logger.info('Total : {parameters_count} parameters'.format(
        parameters_count=sum([p.nelement() for p in model.parameters()])))

    logger.info('Loading datasets...')
    train_dataset = IndexedInputTargetTranslationDataset(ctx=ctx,
                                                         phase='train')

    val_dataset = IndexedInputTargetTranslationDataset(ctx=ctx, phase='val')

    train_dataloader = DataLoader(train_dataset,
                                  batch_size=ctx.batch_size,
                                  shuffle=True,
                                  collate_fn=input_target_collate_fn)

    val_dataloader = DataLoader(val_dataset,
                                batch_size=ctx.batch_size,
                                collate_fn=input_target_collate_fn)

    if ctx.label_smoothing > 0.0:
        loss_function = LabelSmoothingLoss(
            ctx=ctx,
            label_smoothing=ctx.label_smoothing,
            vocabulary_size=target_dictionary.vocabulary_size)
    else:
        loss_function = TokenCrossEntropyLoss()

    accuracy_function = AccuracyMetric()

    if ctx.optimizer == 'Noam':
        optimizer = NoamOptimizer(model.parameters(), d_model=ctx.d_model)
    elif ctx.optimizer == 'Adam':
        optimizer = Adam(model.parameters(), lr=ctx.lr)
    else:
        raise NotImplementedError()

    logger.info('Start training...')
    trainer = TransformerTrainer(model=model,
                                 train_dataloader=train_dataloader,
                                 val_dataloader=val_dataloader,
                                 loss_function=loss_function,
                                 metric_function=accuracy_function,
                                 optimizer=optimizer,
                                 run_name=run_name,
                                 ctx=ctx)

    trainer.run(ctx.epochs)

    return trainer
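
LabelSmoothingLoss and TokenCrossEntropyLoss above are project helpers that are not shown in these snippets. A generic sketch of token-level label smoothing with KLDivLoss, assuming the model outputs log-probabilities and index 0 is padding (the real class also takes a ctx argument, omitted here):

import torch
import torch.nn as nn

class LabelSmoothingLoss(nn.Module):
    # Hypothetical sketch: spread `label_smoothing` probability mass over the
    # non-target, non-padding classes and compare with the model's
    # log-probabilities via KL divergence.
    def __init__(self, label_smoothing, vocabulary_size, padding_idx=0):
        super().__init__()
        self.criterion = nn.KLDivLoss(reduction='sum')
        self.padding_idx = padding_idx
        self.confidence = 1.0 - label_smoothing
        self.smoothing = label_smoothing
        self.vocabulary_size = vocabulary_size

    def forward(self, log_probs, targets):
        # log_probs: (tokens, vocab) log-probabilities; targets: (tokens,) ids
        true_dist = torch.full_like(
            log_probs, self.smoothing / (self.vocabulary_size - 2))
        true_dist.scatter_(1, targets.unsqueeze(1), self.confidence)
        true_dist[:, self.padding_idx] = 0
        true_dist[targets == self.padding_idx] = 0
        return self.criterion(log_probs, true_dist)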