def preprocess(self, data_source_type="small"): self.logger.info( f"Loading {data_source_type} data from disk and parse it as a bunch of batches ..." ) train_dataset, eval_dataset, test_dataset = dataset_generation( self.context, data_type=data_source_type) self.logger.info("Build iteral dataset ... ") self.train_iter = DataLoader(train_dataset, batch_size=self.nums_batch, shuffle=True, collate_fn=input_target_collate_fn) self.eval_iter = DataLoader(eval_dataset, batch_size=self.nums_batch, shuffle=True, collate_fn=input_target_collate_fn) self.test_iter = DataLoader(test_dataset, batch_size=self.nums_batch, shuffle=True, collate_fn=custom_collate_fn) self.logger.info("Build src/tgt Vocabulary ...") self.src_vocab = train_dataset.src_vocab self.tgt_vocab = train_dataset.tgt_vocab self.logger.info("Build transformer model ...") self.model = build_model(self.context, len(self.src_vocab), len(self.tgt_vocab)) self.model.cuda() if self.context.is_cuda else None self.logger.debug(self.model)
def preprocess(self, data_source_type="small"): self.logger.info(f"Loading {data_source_type} data from disk and parse it as a bunch of batches ...") train_dataset, eval_dataset, test_dataset = dataset_generation(self.context, data_type=data_source_type) self.logger.info("Build iteral dataset ... ") self.train_iter = generated_iter_dataset(self.context, train_dataset, self.nums_batch) self.eval_iter = generated_iter_dataset(self.context, eval_dataset, self.nums_batch) self.test_iter = generated_iter_dataset(self.context, test_dataset, self.nums_batch) self.logger.info("Build src/tgt Vocabulary ...") self.src_vocab = train_dataset.src_vocab self.tgt_vocab = train_dataset.tgt_vocab self.logger.info("Build transformer model ...") self.model = build_model(self.context, len(self.src_vocab), len(self.tgt_vocab)) # self.model.cuda() if self.context.is_cuda else None self.logger.debug(self.model)
        if i % 50 == 1:
            elapsed = time.time() - start
            ctx.logger.info("Epoch Step: %d Loss: %f Tokens per Sec: %f",
                            i, loss / batch.ntokens, tokens / elapsed)
            start = time.time()
            tokens = 0
    return total_loss / total_tokens


if __name__ == "__main__":
    # Train the simple copy task.
    ctx = Context(desc="Train")
    logger = ctx.logger
    vocab_size = 11  # V_Size
    criterion = LabelSmoothing(size=vocab_size, padding_idx=0, smoothing=0.0)
    model = build_model(ctx, vocab_size, vocab_size)
    logger.info(model)
    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 400,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9))

    for epoch in range(100):
        logger.debug("Training Epoch %d", epoch)
        model.train()
        run_epoch(data_gen(vocab_size, 30, 20), model,
                  SimpleLossCompute(model.generator, criterion, model_opt), ctx)
        model.eval()
        # Evaluation pass: no optimizer, so no parameter updates.
        run_epoch(data_gen(vocab_size, 30, 5), model,
                  SimpleLossCompute(model.generator, criterion, None), ctx)
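# data_gen is the synthetic batch generator for the copy task. A minimal sketch,
# following the standard Annotated Transformer recipe (random token ids in
# [1, V), first column forced to the start symbol 1, identical src and tgt,
# padding index 0); the repo's actual version may differ. Batch is assumed to
# come from the same module as run_epoch.
import numpy as np
import torch


def data_gen_sketch(V, batch, nbatches, seq_len=10):
    """Yield `nbatches` batches of random copy-task data."""
    for _ in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, seq_len)))
        data[:, 0] = 1  # every sequence starts with the start symbol
        yield Batch(data, data, 0)  # src == tgt: the model must learn to copy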
if __name__ == "__main__": context = Context(desc="Prediction") logger = context.logger logger.info('Constructing dictionaries...') source_dictionary = IndexDictionary.load( context.project_processed_dir, mode='source', vocabulary_size=context.vocabulary_size) target_dictionary = IndexDictionary.load( context.project_processed_dir, mode='target', vocabulary_size=context.vocabulary_size) logger.info('Building model...') model = build_model(context, source_dictionary.vocabulary_size, target_dictionary.vocabulary_size) logger.info("Building Predictor ....") predictor = Predictor(ctx=context, m=model, src_dictionary=source_dictionary, tgt_dictionary=target_dictionary) logger.info("Get Predict Result ...") for index, candidate in enumerate(predictor.predict_one()): logger.info(f'Candidate {index} : {candidate}')
def run(): ctx = Context("Train_MultiGPU") logger = ctx.logger nums_batch = ctx.batch_size epochs = ctx.epochs # For data loading. from torchtext import data, datasets logger.info(f"Preparing dataset with batch size ... ") import spacy # !pip install torchtext spacy # !python -m spacy download en # !python -m spacy download de logger.info("Load en/de data from local ...") spacy_de = spacy.load('de', path=ctx.project_raw_dir) spacy_en = spacy.load('en', path=ctx.project_raw_dir) def tokenize_de(text): return [tok.text for tok in spacy_de.tokenizer(text)] # tokenize_en("I am a Chinese") --> ['I', 'am', 'a', 'Chinese'] def tokenize_en(text): return [tok.text for tok in spacy_en.tokenizer(text)] # Preparing dataset logger.info("Build SRC and TGT Fields ...") BOS_WORD = '<s>' EOS_WORD = '</s>' BLANK_WORD = "<blank>" SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD) TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD, eos_token=EOS_WORD, pad_token=BLANK_WORD) logger.info("Split datasets into train, val and test using SRC/TGT fileds ...") MAX_LEN = 150 # Spilt dataset in root path into train, val, and test dataset train, val, test = datasets.IWSLT.splits( exts=('.de', '.en'), # A tuple containing the extension to path for each language. fields=(SRC, TGT), # A tuple containing the fields that will be used for data in each language. root=ctx.project_raw_dir, # Root dataset storage directory. filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN) logger.info("Build vocabularies for src and tgt ...") MIN_FREQ = 2 SRC.build_vocab(train.src, min_freq=MIN_FREQ) TGT.build_vocab(train.trg, min_freq=MIN_FREQ) # GPUs to use devices = ctx.device_id # [0, 1, 2, 3] pad_idx = TGT.vocab.stoi["<blank>"] logger.info("Build Model ...") model = build_model(ctx, len(SRC.vocab), len(TGT.vocab)) model.cuda() if ctx.is_cuda else None # Print out log info for debug ... logger.info(model) criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1) criterion.cuda() if ctx.is_cuda else None logger.info("Generating Training and Validating Batch datasets ...") train_iter = MyIterator(train, batch_size=nums_batch, device=ctx.device, repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), batch_size_fn=batch_size_fn, train=True) logger.info(f"Trainning Dataset: epoch[{epochs}], iterations[{train_iter.iterations}], batch size [{nums_batch}]") valid_iter = MyIterator(val, batch_size=nums_batch, device=ctx.device, repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)), batch_size_fn=batch_size_fn, train=False) logger.info(f"Validate Dataset: epoch[{epochs}], iterations[{valid_iter.iterations}], batch size [{nums_batch}]") if ctx.is_gpu_parallel: # Using multiple GPU resource to train ... model_parallel = nn.DataParallel(model, device_ids=devices) loss_func = MultiGPULossCompute elif ctx.is_cuda: # Using Single GPU resource to train ... model_parallel = model loss_func = SimpleLossCompute else: # Using Single CPU resource to train ... 
        model_parallel = model
        loss_func = SimpleLossCompute

    logger.info("Training process is beginning ...")
    # Train from scratch, or load a model from a checkpoint.
    if True:
        model_opt = NoamOpt(model_size=model.src_embed[0].d_model,
                            factor=1,
                            warmup=2000,
                            optimizer=torch.optim.Adam(model.parameters(),
                                                       lr=0,
                                                       betas=(0.9, 0.98),
                                                       eps=1e-9))
        for epoch in range(epochs):
            # Set the model to training mode.
            model_parallel.train()
            run_epoch((rebatch(pad_idx, b) for b in train_iter),
                      model_parallel,
                      loss_func(model.generator, criterion, devices, opt=model_opt),
                      ctx)
            # Evaluate the model.
            model_parallel.eval()
            # Get the validation loss.
            loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter),
                             model_parallel,
                             loss_func(model.generator, criterion, devices, opt=None),
                             ctx)
            logger.info("The loss is %f", loss)
    else:
        model = torch.load("iwslt.pt")

    logger.info("Training is over; evaluating the model ...")
    for i, batch in enumerate(valid_iter):
        src = batch.src.transpose(0, 1)[:1]
        src_mask = (src != SRC.vocab.stoi["<blank>"]).unsqueeze(-2)
        out = greedy_decode(model, src, src_mask,
                            max_len=60,
                            start_symbol=TGT.vocab.stoi["<s>"])
        print("Translation:", end="\t")
        for j in range(1, out.size(1)):
            sym = TGT.vocab.itos[out[0, j]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        print("Target:", end="\t")
        for j in range(1, batch.trg.size(0)):
            sym = TGT.vocab.itos[batch.trg.data[j, 0]]
            if sym == "</s>":
                break
            print(sym, end=" ")
        print()
        break
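# rebatch and greedy_decode are used above but defined elsewhere in the repo.
# Minimal sketches, following the standard Annotated Transformer versions;
# Batch and subsequent_mask are assumed to come from the same module, and the
# repo's actual implementations may differ in detail.

def rebatch_sketch(pad_idx, batch):
    """Fix torchtext's (seq_len, batch) tensor order into (batch, seq_len)."""
    src = batch.src.transpose(0, 1)
    trg = batch.trg.transpose(0, 1)
    return Batch(src, trg, pad_idx)


def greedy_decode_sketch(model, src, src_mask, max_len, start_symbol):
    """Decode one token at a time, always taking the argmax of the next-token distribution."""
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for _ in range(max_len - 1):
        out = model.decode(memory, src_mask, ys,
                           subsequent_mask(ys.size(1)).type_as(src.data))
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word.item())],
                       dim=1)
    return ys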
if __name__ == "__main__": context = Context(desc="Prediction") logger = context.logger data_source_type = "small" logger.info( f"Loading {data_source_type} data from disk and parse it as a bunch of batches ..." ) train_dataset, eval_dataset, test_dataset = dataset_generation( context, data_type=data_source_type) logger.info('Constructing dictionaries...') source_dictionary = train_dataset.src_vocab target_dictionary = train_dataset.tgt_vocab logger.info('Building model...') model = build_model(context, len(source_dictionary), len(target_dictionary)) logger.info("Building Predictor ....") predictor = Predictor(ctx=context, m=model, src_dictionary=source_dictionary, tgt_dictionary=target_dictionary) logger.info("Get Predict Result ...") for index, candidate in enumerate(predictor.predict()): logger.info(f'Candidate {index} : {candidate}')
def run_trainer_standalone(ctx):
    random.seed(0)
    np.random.seed(0)
    torch.manual_seed(0)

    logger = ctx.logger
    run_name = ("d_model={d_model}-"
                "layers_count={layers_count}-"
                "heads_count={heads_count}-"
                "pe={positional_encoding}-"
                "optimizer={optimizer}-"
                "{timestamp}").format(**ctx.config,
                                      timestamp=datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
    logger.info(f'Run name : {run_name}')

    logger.info('Constructing dictionaries...')
    source_dictionary = IndexDictionary.load(ctx.project_processed_dir,
                                             mode='source',
                                             vocabulary_size=ctx.vocabulary_size)
    logger.info(f'Source dictionary vocabulary size: {source_dictionary.vocabulary_size} tokens')
    target_dictionary = IndexDictionary.load(ctx.project_processed_dir,
                                             mode='target',
                                             vocabulary_size=ctx.vocabulary_size)
    logger.info(f'Target dictionary vocabulary size: {target_dictionary.vocabulary_size} tokens')

    logger.info('Building model...')
    model = build_model(ctx, source_dictionary.vocabulary_size,
                        target_dictionary.vocabulary_size)
    logger.info(model)
    logger.info('Encoder : {parameters_count} parameters'.format(
        parameters_count=sum(p.nelement() for p in model.encoder.parameters())))
    logger.info('Decoder : {parameters_count} parameters'.format(
        parameters_count=sum(p.nelement() for p in model.decoder.parameters())))
    logger.info('Total : {parameters_count} parameters'.format(
        parameters_count=sum(p.nelement() for p in model.parameters())))

    logger.info('Loading datasets...')
    train_dataset = IndexedInputTargetTranslationDataset(ctx=ctx, phase='train')
    val_dataset = IndexedInputTargetTranslationDataset(ctx=ctx, phase='val')
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=ctx.batch_size,
                                  shuffle=True,
                                  collate_fn=input_target_collate_fn)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=ctx.batch_size,
                                collate_fn=input_target_collate_fn)

    if ctx.label_smoothing > 0.0:
        loss_function = LabelSmoothingLoss(ctx=ctx,
                                           label_smoothing=ctx.label_smoothing,
                                           vocabulary_size=target_dictionary.vocabulary_size)
    else:
        loss_function = TokenCrossEntropyLoss()

    accuracy_function = AccuracyMetric()

    if ctx.optimizer == 'Noam':
        optimizer = NoamOptimizer(model.parameters(), d_model=ctx.d_model)
    elif ctx.optimizer == 'Adam':
        optimizer = Adam(model.parameters(), lr=ctx.lr)
    else:
        raise NotImplementedError()

    logger.info('Start training...')
    trainer = TransformerTrainer(model=model,
                                 train_dataloader=train_dataloader,
                                 val_dataloader=val_dataloader,
                                 loss_function=loss_function,
                                 metric_function=accuracy_function,
                                 optimizer=optimizer,
                                 run_name=run_name,
                                 ctx=ctx)
    trainer.run(ctx.epochs)
    return trainer
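# Both NoamOpt (used in the scripts above) and NoamOptimizer implement the
# warmup learning-rate schedule from "Attention Is All You Need":
#     lr = factor * d_model^(-0.5) * min(step^(-0.5), step * warmup^(-1.5))
# i.e. the rate rises linearly for `warmup` steps, then decays as the inverse
# square root of the step number. A minimal sketch of the rate computation,
# with factor and warmup defaulting to the values passed to NoamOpt above:
def noam_rate_sketch(step, d_model, factor=1, warmup=2000):
    """Learning rate at a given optimization step (step >= 1)."""
    return factor * d_model ** (-0.5) * min(step ** (-0.5), step * warmup ** (-1.5))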