            tokens += batch.ntokens
            if i % 50 == 1:
                elapsed = time.time() - start
                self.logger.info("Epoch Step: %d Loss: %f Tokens per Sec: %f",
                                 i, loss / batch.ntokens, tokens / elapsed)
                start = time.time()
                tokens = 0
        return total_loss / total_tokens

    def postprocess(self):
        pass


if __name__ == "__main__":
    ctx = Context(desc="Learning-fix based on Transformer")
    logger = ctx.logger

    logger.info("Building the data process engine based on the parsed input dataset ...")
    engine = DataProcessEngine(ctx)

    logger.info("Preparing dataset and building model for training ...")
    engine.preprocess(data_source_type="small")

    logger.info("Training and evaluating the model ...")
    engine.run(loss_func=SimpleLossComputeWithLablSmoothing,
               opt=get_std_opt(engine.model))

    logger.info("Testing and data cleanup ...")
    engine.postprocess()
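# For reference, a minimal sketch of the "Noam" learning-rate schedule behind
# get_std_opt(), following the Annotated Transformer convention:
#     lr = factor * d_model^(-0.5) * min(step^(-0.5), step * warmup^(-1.5))
# The factor/warmup defaults and the model.src_embed[0].d_model attribute are
# assumptions here; this project's get_std_opt may differ.
import torch


class NoamOptSketch:
    "Optimizer wrapper that rescales the learning rate every step."

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self._step = 0

    def rate(self, step=None):
        step = step or self._step
        return self.factor * (self.model_size ** (-0.5) *
                              min(step ** (-0.5), step * self.warmup ** (-1.5)))

    def step(self):
        self._step += 1
        for group in self.optimizer.param_groups:
            group['lr'] = self.rate()
        self.optimizer.step()


def get_std_opt_sketch(model):
    # factor=2 and warmup=4000 are the Annotated Transformer defaults.
    return NoamOptSketch(model.src_embed[0].d_model, 2, 4000,
                         torch.optim.Adam(model.parameters(), lr=0,
                                          betas=(0.9, 0.98), eps=1e-9))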
import logging

from nmt.utils.pipe import shared_tokens_generator, source_tokens_generator, target_tokens_generator
from nmt.utils.context import Context
from benchmarks.example.datasets import TranslationDataset, TokenizedTranslationDataset
from benchmarks.example.datasets import IndexedInputTargetTranslationDataset
from benchmarks.example.datasets import IndexedInputTargetTranslationDatasetOnTheFly, TranslationDatasetOnTheFly
from benchmarks.example.dictionaries import IndexDictionary

if __name__ == "__main__":
    context = Context(desc="dataset")
    logger = context.logger

    if logger.isEnabledFor(logging.DEBUG):
        # Prepare the raw train/val datasets: one file with a (src, tgt) pair per line
        #   src-train.txt + tgt-train.txt --> raw-train.txt
        #   src-val.txt   + tgt-val.txt   --> raw-val.txt
        logger.debug("The raw train and validation datasets are being generated ...")
        TranslationDataset.prepare(context)

    # A list of train pairs [(src, tgt), ..., (src, tgt)], built from raw-train.txt
    logger.info("The train dataset [(src, tgt), ..., (src, tgt)] is being generated ...")
    translation_dataset = TranslationDataset(context, 'train')

    if logger.isEnabledFor(logging.DEBUG):
        # The same list of train pairs, built on the fly from src-train.txt and tgt-train.txt
        logger.debug("The train dataset [(src, tgt), ..., (src, tgt)] is being generated on the fly ...")
        translation_dataset_on_the_fly = TranslationDatasetOnTheFly(context, 'train')
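    # A plausible continuation sketch for building the index dictionaries from
    # the tokenized training data, using the helpers imported above. The
    # constructor signatures of TokenizedTranslationDataset and IndexDictionary
    # are assumptions based on their names; the original file may wire them
    # differently.
    logger.info("Building source and target dictionaries (sketch) ...")
    tokenized_dataset = TokenizedTranslationDataset(context, 'train')
    source_dictionary = IndexDictionary(source_tokens_generator(tokenized_dataset), mode='source')
    target_dictionary = IndexDictionary(target_tokens_generator(tokenized_dataset), mode='target')
    # Indexed (input, target) pairs would then be produced from these dictionaries,
    # e.g. via IndexedInputTargetTranslationDataset / ...OnTheFly.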
        out = model(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
        tokens += batch.ntokens
        if i % 50 == 1:
            elapsed = time.time() - start
            ctx.logger.info("Epoch Step: %d Loss: %f Tokens per Sec: %f",
                            i, loss / batch.ntokens, tokens / elapsed)
            start = time.time()
            tokens = 0
    return total_loss / total_tokens


if __name__ == "__main__":
    # Train the simple copy task.
    ctx = Context(desc="Train")
    logger = ctx.logger

    vocab_size = 11  # V_Size
    criterion = LabelSmoothing(size=vocab_size, padding_idx=0, smoothing=0.0)
    model = build_model(ctx, vocab_size, vocab_size)
    logger.info(model)

    model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400,
                        torch.optim.Adam(model.parameters(), lr=0,
                                         betas=(0.9, 0.98), eps=1e-9))

    for epoch in range(100):
        logger.debug("Training Epoch %d", epoch)
        model.train()
        run_epoch(data_gen(vocab_size, 30, 20), model,
                  SimpleLossCompute(model.generator, criterion, model_opt), ctx)
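# A minimal sketch of the data_gen used above for the synthetic copy task, in
# the style of the Annotated Transformer: random integer sequences where the
# target simply equals the source. The Batch wrapper's signature is assumed.
import numpy as np
import torch


def data_gen_sketch(V, batch_size, nbatches, seq_len=10):
    "Yield nbatches of random src/tgt batches for the copy task."
    for _ in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch_size, seq_len)))
        data[:, 0] = 1  # reserve index 1 as the start-of-sequence token
        src = data.clone()
        tgt = data.clone()
        yield Batch(src, tgt, pad=0)  # Batch is assumed to build masks and trg_y internally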
target="train", dataset=data_type, padding=pad_idx, src_vocab=None, tgt_vocab=None) src_vocab = train_dataset.src_vocab tgt_vocab = train_dataset.tgt_vocab logger.info(f"Preparing eval dataset ... ") eval_dataset = LFDataset(ctx=context, target="eval", dataset=data_type, padding=pad_idx, src_vocab=src_vocab, tgt_vocab=tgt_vocab) logger.info(f"Preparing test dataset ... ") test_dataset = LFDataset(ctx=context, target="test", dataset=data_type, padding=pad_idx, src_vocab=src_vocab, tgt_vocab=tgt_vocab) return train_dataset, eval_dataset, test_dataset if __name__ == "__main__": dataset_generation(Context("Learning-fix based on Transformer"))
def run():
    ctx = Context("Train_MultiGPU")
    logger = ctx.logger
    nums_batch = ctx.batch_size
    epochs = ctx.epochs

    # For data loading.
    from torchtext import data, datasets

    logger.info("Preparing dataset with batch size %d ...", nums_batch)

    import spacy
    # !pip install torchtext spacy
    # !python -m spacy download en
    # !python -m spacy download de

    logger.info("Loading en/de spaCy models from the local path ...")
    spacy_de = spacy.load('de', path=ctx.project_raw_dir)
    spacy_en = spacy.load('en', path=ctx.project_raw_dir)

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    # tokenize_en("I am a Chinese") --> ['I', 'am', 'a', 'Chinese']
    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    # Preparing dataset
    logger.info("Build SRC and TGT fields ...")
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD,
                     eos_token=EOS_WORD, pad_token=BLANK_WORD)

    logger.info("Splitting the dataset into train, val and test using the SRC/TGT fields ...")
    MAX_LEN = 150
    # Split the dataset in the root path into train, val and test datasets
    train, val, test = datasets.IWSLT.splits(
        exts=('.de', '.en'),  # A tuple containing the extension to path for each language.
        fields=(SRC, TGT),    # A tuple containing the fields that will be used for data in each language.
        root=ctx.project_raw_dir,  # Root dataset storage directory.
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN)

    logger.info("Building vocabularies for src and tgt ...")
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)

    # GPUs to use
    devices = ctx.device_id  # [0, 1, 2, 3]
    pad_idx = TGT.vocab.stoi["<blank>"]

    logger.info("Build Model ...")
    model = build_model(ctx, len(SRC.vocab), len(TGT.vocab))
    model.cuda() if ctx.is_cuda else None

    # Print out log info for debugging ...
    logger.info(model)

    criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
    criterion.cuda() if ctx.is_cuda else None

    logger.info("Generating training and validation batch datasets ...")
    train_iter = MyIterator(train, batch_size=nums_batch, device=ctx.device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=True)
    logger.info(f"Training dataset: epoch[{epochs}], iterations[{train_iter.iterations}], batch size[{nums_batch}]")

    valid_iter = MyIterator(val, batch_size=nums_batch, device=ctx.device,
                            repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn, train=False)
    logger.info(f"Validation dataset: epoch[{epochs}], iterations[{valid_iter.iterations}], batch size[{nums_batch}]")

    if ctx.is_gpu_parallel:
        # Using multiple GPU resources to train ...
        model_parallel = nn.DataParallel(model, device_ids=devices)
        loss_func = MultiGPULossCompute
    elif ctx.is_cuda:
        # Using a single GPU resource to train ...
        model_parallel = model
        loss_func = SimpleLossCompute
    else:
        # Using a single CPU resource to train ...
        model_parallel = model
        loss_func = SimpleLossCompute

    logger.info("Training process is beginning ...")

    # Train the model, or load it from a checkpoint
    if True:
        model_opt = NoamOpt(model_size=model.src_embed[0].d_model,
                            factor=1,
                            warmup=2000,
                            optimizer=torch.optim.Adam(model.parameters(), lr=0,
                                                       betas=(0.9, 0.98), eps=1e-9))
        for epoch in range(epochs):
            # Set the model to training mode
            model_parallel.train()
            run_epoch((rebatch(pad_idx, b) for b in train_iter),
                      model_parallel,
                      loss_func(model.generator, criterion, devices, opt=model_opt),
                      ctx)

            # Evaluate the model
            model_parallel.eval()

            # Get the validation loss
            loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
                             model_parallel,
                             loss_func(model.generator, criterion, devices, opt=None),
                             ctx)
            logger.info("The loss is %f", loss)
    else:
        model = torch.load("iwslt.pt")

    logger.info("Training is over; evaluating the model ...")
    for i, batch in enumerate(valid_iter):
        src = batch.src.transpose(0, 1)[:1]
        src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
        out = greedy_decode(model, src, src_mask, max_len=60,
                            start_symbol=TGT.vocab.stoi["<s>"])
        print("Translation:", end="\t")
        for i in range(1, out.size(1)):
            sym = TGT.vocab.itos[out[0, i]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        print("Target:", end="\t")
        for i in range(1, batch.trg.size(0)):
            sym = TGT.vocab.itos[batch.trg.data[i, 0]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        break
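# Sketches of two helpers referenced above but not shown in this excerpt:
# batch_size_fn (dynamic batching by padded token count) and greedy_decode,
# both in the style of the Annotated Transformer. The model.encode/decode/
# generator interface and the exact signatures in this project are assumptions.
import torch

max_src_in_batch, max_tgt_in_batch = 0, 0


def batch_size_fn_sketch(new, count, sofar):
    "Return how many padded tokens the batch would contain after adding `new`."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch, max_tgt_in_batch = 0, 0
    max_src_in_batch = max(max_src_in_batch, len(new.src))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.trg) + 2)
    return max(count * max_src_in_batch, count * max_tgt_in_batch)


def subsequent_mask_sketch(size):
    "Lower-triangular mask that hides future target positions."
    return torch.tril(torch.ones(1, size, size, dtype=torch.bool))


def greedy_decode_sketch(model, src, src_mask, max_len, start_symbol):
    "Decode one token at a time, always choosing the most probable next token."
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for _ in range(max_len - 1):
        out = model.decode(memory, src_mask, ys,
                           subsequent_mask_sketch(ys.size(1)).type_as(src.data))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        ys = torch.cat([ys, next_word.view(1, 1).type_as(src.data)], dim=1)
    return ys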
                    smoothing_function=smoothing_function.method3)
                line = "{bleu_score}\t{source}\t{target}\t|\t{prediction}".format(
                    bleu_score=sentence_bleu_score,
                    source=source,
                    target=target,
                    prediction=prediction)
                file.write(line + '\n')

        return corpus_bleu(list_of_references, hypotheses,
                           smoothing_function=smoothing_function.method3)


if __name__ == "__main__":
    context = Context("Evaluation")
    logger = context.logger

    logger.info('Constructing dictionaries...')
    source_dictionary = IndexDictionary.load(context.proj_processed_dir,
                                             mode='source',
                                             vocabulary_size=context.vocabulary_size)
    target_dictionary = IndexDictionary.load(context.proj_processed_dir,
                                             mode='target',
                                             vocabulary_size=context.vocabulary_size)

    logger.info('Building model...')
    model = build_model(context, source_dictionary.vocabulary_size,
                        target_dictionary.vocabulary_size)
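# A short usage note on the NLTK BLEU calls above: sentence_bleu expects a
# list of reference token lists plus one hypothesis token list, and
# corpus_bleu expects the same structures batched per sentence. The tokens
# below are illustrative only.
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu, sentence_bleu

smoothing = SmoothingFunction()
references = [["the", "cat", "sat", "on", "the", "mat"]]  # one or more references per sentence
hypothesis = ["the", "cat", "is", "on", "the", "mat"]

single_score = sentence_bleu(references, hypothesis, smoothing_function=smoothing.method3)
corpus_score = corpus_bleu([references], [hypothesis], smoothing_function=smoothing.method3)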
        hypothesises, attentions = [], []
        for i, (times, k) in enumerate(ks[:num_candidates]):
            hypothesis, attention = beam.get_hypothesis(times, k)
            hypothesises.append(hypothesis)
            attentions.append(attention)

        self.attentions = attentions
        self.hypothesises = [[token.item() for token in h] for h in hypothesises]
        hs = [self.postprocess(h) for h in self.hypothesises]
        return list(reversed(hs))


if __name__ == "__main__":
    context = Context(desc="Prediction")
    logger = context.logger

    logger.info('Constructing dictionaries...')
    source_dictionary = IndexDictionary.load(context.project_processed_dir,
                                             mode='source',
                                             vocabulary_size=context.vocabulary_size)
    target_dictionary = IndexDictionary.load(context.project_processed_dir,
                                             mode='target',
                                             vocabulary_size=context.vocabulary_size)

    logger.info('Building model...')
    model = build_model(context, source_dictionary.vocabulary_size,
                        target_dictionary.vocabulary_size)
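# A minimal sketch of what self.postprocess typically does for a decoded
# hypothesis: strip special-token indices and map the rest back to words.
# The attribute names (start/end/pad indices, the dictionary API) are
# assumptions, not taken from the original class.
def postprocess_sketch(self, indexed_hypothesis):
    "Drop start/end/pad indices and convert the remaining indices to tokens."
    special = {self.start_index, self.end_index, self.pad_index}
    tokens = [self.target_dictionary.index_to_token(idx)
              for idx in indexed_hypothesis if idx not in special]
    return ' '.join(tokens)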
    else:
        loss_function = TokenCrossEntropyLoss()
    accuracy_function = AccuracyMetric()

    if ctx.optimizer == 'Noam':
        optimizer = NoamOptimizer(model.parameters(), d_model=ctx.d_model)
    elif ctx.optimizer == 'Adam':
        optimizer = Adam(model.parameters(), lr=ctx.lr)
    else:
        raise NotImplementedError()

    logger.info('Start training...')
    trainer = TransformerTrainer(model=model,
                                 train_dataloader=train_dataloader,
                                 val_dataloader=val_dataloader,
                                 loss_function=loss_function,
                                 metric_function=accuracy_function,
                                 optimizer=optimizer,
                                 run_name=run_name,
                                 ctx=ctx)
    trainer.run(ctx.epochs)

    return trainer


if __name__ == '__main__':
    run_trainer_standalone(
        Context(desc="Train Example Project with GPU Resource!"))
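# A minimal sketch of a token-level cross-entropy loss that ignores padding,
# which is what TokenCrossEntropyLoss above presumably implements. The padding
# index and the (outputs, targets) shapes are assumptions.
import torch
import torch.nn as nn


class TokenCrossEntropyLossSketch(nn.Module):

    def __init__(self, pad_index=0):
        super().__init__()
        self.pad_index = pad_index
        self.criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=pad_index)

    def forward(self, outputs, targets):
        # outputs: (batch, seq_len, vocab_size); targets: (batch, seq_len)
        vocab_size = outputs.size(-1)
        flat_outputs = outputs.reshape(-1, vocab_size)
        flat_targets = targets.reshape(-1)
        token_count = (flat_targets != self.pad_index).sum()
        loss = self.criterion(flat_outputs, flat_targets)
        return loss, token_count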