def train(train_file, validation_file, batch_size, epoch_limit, file_name, gpu_mode):
    """Train the model on pileup images, validating and checkpointing every epoch.

    Each batch of images is cut into overlapping windows of ``seq_len`` rows;
    windows are randomly sub-sampled (see the comments in the loop) and the
    kept ones are used for one SGD step each.

    :param train_file: Source handed to ``PileupDataset`` for the training set.
    :param validation_file: Handed to ``validate`` at the end of every epoch.
    :param batch_size: Batch size of the training ``DataLoader``. It is also
        the unit in which the "could be"/"chosen" counters are incremented and
        therefore the denominator of the reported running loss.
    :param epoch_limit: Number of epochs to run.
    :param file_name: Prefix of every checkpoint and final model file written.
    :param gpu_mode: When true the model is wrapped in ``DataParallel`` and
        moved to CUDA, batches are moved to CUDA and loader memory is pinned.
    """
    transformations = transforms.Compose([transforms.ToTensor()])

    sys.stderr.write(TextColor.PURPLE + 'Loading data\n' + TextColor.END)
    train_data_set = PileupDataset(train_file, transformations)
    train_loader = DataLoader(train_data_set,
                              batch_size=batch_size,
                              shuffle=True,
                              num_workers=16,
                              pin_memory=gpu_mode)
    sys.stderr.write(TextColor.PURPLE + 'Data loading finished\n' + TextColor.END)

    model = Model()
    if gpu_mode:
        model = torch.nn.DataParallel(model).cuda()
    # DataParallel does not forward custom methods of the wrapped network, so
    # init_hidden has to be looked up on the underlying module.
    core_model = model.module if isinstance(model, torch.nn.DataParallel) else model

    # Loss and Optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

    # Train the Model
    sys.stderr.write(TextColor.PURPLE + 'Training starting\n' + TextColor.END)
    seq_len = 3
    iteration_jump = 1
    for epoch in range(epoch_limit):
        total_loss = 0
        total_images = 0
        total_could_be = 0
        for i, (images, labels) in enumerate(train_loader):
            # if batch size not distributable among all GPUs then skip
            # NOTE(review): the 8 is hard-coded; presumably the GPU count of
            # the original training machine - confirm before changing.
            if gpu_mode is True and images.size(0) % 8 != 0:
                continue

            hidden = core_model.init_hidden(images.size(0))

            images = Variable(images, requires_grad=False)
            labels = Variable(labels, requires_grad=False)
            if gpu_mode:
                images = images.cuda()
                labels = labels.cuda()

            for row in range(0, images.size(2), iteration_jump):
                # segmentation of image. Currently using seq_len
                if row + seq_len > images.size(2):
                    continue

                x = images[:, :, row:row + seq_len, :]
                y = labels[:, row:row + seq_len]

                total_variation = torch.sum(y).data[0]
                total_could_be += batch_size

                # Windows whose labels sum to zero are kept only when the
                # random draw is <= 5 (out of 100).
                if total_variation == 0 and random.uniform(0, 1) * 100 > 5:
                    continue
                # Windows whose label sum per batch element is below 0.02 are
                # dropped when a uniform draw falls below that ratio.
                elif random.uniform(0, 1) < total_variation / batch_size < 0.02:
                    continue

                # Forward + Backward + Optimize
                optimizer.zero_grad()
                outputs = model(x, hidden)
                hidden = repackage_hidden(hidden)

                # outputs = outputs.view(1, outputs.size(0), -1) required for CTCLoss
                loss = criterion(outputs.contiguous().view(-1, 3), y.contiguous().view(-1))
                loss.backward()
                optimizer.step()

                # loss count
                total_images += batch_size
                total_loss += loss.data[0]

            # Every window of the batches seen so far may have been skipped by
            # the sub-sampling above; report 0.0 instead of dividing by zero.
            average_loss = total_loss / total_images if total_images else 0.0
            sys.stderr.write(TextColor.BLUE + "EPOCH: " + str(epoch) + " Batches done: " + str(i + 1))
            sys.stderr.write(" Loss: " + str(average_loss) + "\n" + TextColor.END)
            print(str(epoch) + "\t" + str(i + 1) + "\t" + str(average_loss))

        # After each epoch do validation
        validate(validation_file, batch_size, gpu_mode, model, seq_len)

        epoch_loss = total_loss / total_images if total_images else 0.0
        sys.stderr.write(TextColor.YELLOW + 'Could be: ' + str(total_could_be) + ' Chosen: '
                         + str(total_images) + "\n" + TextColor.END)
        sys.stderr.write(TextColor.YELLOW + 'EPOCH: ' + str(epoch))
        sys.stderr.write(' Loss: ' + str(epoch_loss) + "\n" + TextColor.END)

        torch.save(model, file_name + '_checkpoint_' + str(epoch) + '.pkl')
        torch.save(model.state_dict(), file_name + '_checkpoint_' + str(epoch) + '-params' + '.pkl')

    sys.stderr.write(TextColor.PURPLE + 'Finished training\n' + TextColor.END)

    # The messages below name the files that are actually written.
    final_model_path = file_name + '_final.pkl'
    torch.save(model, final_model_path)
    sys.stderr.write(TextColor.PURPLE + 'Model saved as:' + final_model_path + '\n' + TextColor.END)

    final_params_path = file_name + '_final_params' + '.pkl'
    torch.save(model.state_dict(), final_params_path)
    sys.stderr.write(TextColor.PURPLE + 'Model parameters saved as:' + final_params_path + '\n' + TextColor.END)
class BERTable:
    """Wrapper that builds a vocabulary and a model for a table and pre-trains it.

    The constructor derives the per-type column index lists, builds a
    ``Vocab`` from the data frame and instantiates ``Model``; ``pretrain``
    runs the optimisation loop and ``save`` persists model weights and
    vocabulary.
    """

    def __init__(self,
                 df,
                 column_type,
                 embedding_dim=5,
                 n_layers=5,
                 dim_feedforward=100,
                 n_head=5,
                 dropout=0.15,
                 ns_exponent=0.75,
                 share_category=False,
                 use_pos=False,
                 device='cpu'):
        """Build the vocabulary and the model for ``df``.

        :param df: Table the vocabulary is built from.
        :param column_type: One entry per column, each one of ``'numerical'``,
            ``'categorical'`` or ``'vector'``; any other value raises
            ``KeyError``.
        :param embedding_dim: Forwarded to ``Model`` and to the data loader.
        :param n_layers: Forwarded to ``Model``.
        :param dim_feedforward: Forwarded to ``Model``.
        :param n_head: Forwarded to ``Model``.
        :param dropout: Forwarded to ``Model``.
        :param ns_exponent: Forwarded to ``Vocab``.
        :param share_category: Forwarded to ``Vocab``.
        :param use_pos: Forwarded to ``Model`` and to the data loader.
        :param device: Device the model is moved to while pre-training.
        """
        self.logger = create_logger(name="BERTable")

        # Positional index of every column, grouped by its declared type.
        self.col_type = {'numerical': [], 'categorical': [], 'vector': []}
        for i, data_type in enumerate(column_type):
            self.col_type[data_type].append(i)

        self.embedding_dim = embedding_dim
        self.use_pos = use_pos
        self.device = device

        self.vocab = Vocab(df, self.col_type, share_category, ns_exponent)
        vocab_size = {
            'numerical': len(self.vocab.item2idx['numerical']),
            'categorical': len(self.vocab.item2idx['categorical'])
        }
        # NOTE(review): df is indexed with the positional indices stored in
        # col_type - this assumes the column labels are those integers; confirm.
        vector_dims = [np.shape(df[col])[1] for col in self.col_type['vector']]
        tab_len = len(column_type)

        self.model = Model(vocab_size, self.col_type, use_pos, vector_dims,
                           embedding_dim, dim_feedforward, tab_len, n_layers,
                           n_head, dropout)

    def pretrain(self,
                 df,
                 max_epochs=3,
                 lr=1e-4,
                 lr_weight=None,
                 loss_clip=None,
                 n_sample=4,
                 mask_rate=0.15,
                 replace_rate=0.8,
                 batch_size=32,
                 shuffle=True,
                 num_workers=1):
        """Pre-train the model on ``df`` with Adam.

        :param df: Table to train on; converted to indices by the vocabulary.
        :param max_epochs: Number of passes over the data.
        :param lr: Learning rate (coerced with ``float``).
        :param lr_weight: Weight of each column type's loss in the total
            loss. ``None`` means 0.33 for each of ``'numerical'``,
            ``'categorical'`` and ``'vector'``.
        :param loss_clip: Stored on the model as ``loss_clip``. ``None``
            means ``[0, 100]``.
        :param n_sample: Forwarded to ``create_dataloader``.
        :param mask_rate: Forwarded to ``create_dataloader``.
        :param replace_rate: Forwarded to ``create_dataloader``.
        :param batch_size: Forwarded to ``create_dataloader``.
        :param shuffle: Forwarded to ``create_dataloader``.
        :param num_workers: Used for the index conversion and the data loader.
        """
        # Fresh objects per call: the previous mutable defaults were shared
        # between calls, and the default list was stored on the model.
        if lr_weight is None:
            lr_weight = {'numerical': 0.33, 'categorical': 0.33, 'vector': 0.33}
        if loss_clip is None:
            loss_clip = [0, 100]

        self.model.loss_clip = loss_clip

        self.logger.info("[-] Converting to indices")
        data = self.vocab.convert(df, num_workers)

        self.model.to(self.device)
        self.model.train()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=float(lr))

        self.logger.info("[-] Start Pretraining")
        process_bar = tqdm(range(max_epochs),
                           desc="[Progress]",
                           total=max_epochs,
                           leave=True,
                           position=0)

        for epoch in process_bar:
            # The loader is rebuilt every epoch.
            generator = create_dataloader(data,
                                          self.col_type,
                                          self.vocab,
                                          self.embedding_dim,
                                          self.use_pos,
                                          batch_size,
                                          num_workers,
                                          mask_rate=mask_rate,
                                          replace_rate=replace_rate,
                                          n_sample=n_sample,
                                          shuffle=shuffle)
            metric_bar = tqdm([0],
                              desc="[Metric]",
                              bar_format="{desc} {postfix}",
                              leave=False,
                              position=2)
            epoch_bar = tqdm(generator,
                             desc="[Epoch]",
                             leave=False,
                             position=1)

            loss_history = {'numerical': [], 'categorical': [], 'vector': []}
            # Defined up front so the epoch log below also works when the
            # loader yields no batch at all.
            display = ''
            for batch_data in epoch_bar:
                batch_data = transfer(batch_data, self.device)
                _, losses = self.model.forward(batch_data, mode='train')

                # Each type's loss is divided by its number of columns and
                # scaled by its weight; types without columns are left out.
                loss = sum(
                    losses[data_type] / len(self.col_type[data_type]) * lr_weight[data_type]
                    for data_type in self.col_type
                    if len(self.col_type[data_type]) > 0)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Running mean of every loss over the epoch so far.
                display = ''
                for types in losses:
                    loss_history[types].append(losses[types].item())
                    display += f'{types}: {np.mean(loss_history[types]):5.2f} '
                metric_bar.set_postfix_str(display)

            process_bar.write(f'[Log] Epoch {epoch:0>2d}| ' + display)
            epoch_bar.close()
            metric_bar.close()

        process_bar.close()
        self.model.cpu()

    # Disabled inference path, kept for reference.
    # def transform(self, df, batch_size=32, num_workers=1):
    #     self.logger.info("[-] Converting to indices")
    #     data = self.vocab.convert(df, num_workers)
    #     generator = create_dataloader(
    #         data, self.col_type, self.vocab,
    #         self.embedding_dim, self.use_pos,
    #         batch_size, num_workers, mode='test')
    #     self.logger.info("[-] Start Transforming")
    #     process_bar = tqdm(
    #         generator,
    #         desc=f"[Process]",
    #         leave=False,
    #         position=0)
    #     self.model.to(self.device)
    #     self.model.eval()
    #     df_t = []
    #     for batch_data in process_bar:
    #         batch_data = transfer(batch_data, self.device)
    #         feature = self.model.forward(batch_data, mode='test')
    #         df_t += list(feature.cpu().detach().numpy())
    #     process_bar.close()
    #     self.model.cpu()
    #     return df_t

    def save(self, model_path='model.ckpt', vocab_path='vocab.pkl'):
        """Write the model's state dict and the pickled vocabulary to disk.

        :param model_path: Destination of ``torch.save`` for the state dict.
        :param vocab_path: Destination of the pickled ``Vocab`` object.
        """
        torch.save(self.model.state_dict(), model_path)
        with open(vocab_path, 'wb') as file:
            pkl.dump(self.vocab, file)
def training_process(device, nb_class_labels, model_path, result_dir, patience, epochs,
                     do_pre_train, tr_feat_path, tr_labels_path, val_feat_path,
                     val_labels_path, tr_batch_size, val_batch_size, adapt_patience,
                     adapt_epochs, d_lr, tgt_lr, update_cnt, factor):
    """Implements the complete training process of the AUDASC method.

    The function loads the pickled features/labels, optionally pre-trains the
    non-adapted model together with the label classifier, reloads both from
    their saved state dicts and finally runs the adversarial domain
    adaptation step.

    :param device: The device that we will use.
    :type device: str
    :param nb_class_labels: The amount of labels for label classification.
    :type nb_class_labels: int
    :param model_path: The path of previously saved model (if any)
    :type model_path: str
    :param result_dir: The directory to save newly pre-trained model.
    :type result_dir: str
    :param patience: The patience for the pre-training step.
    :type patience: int
    :param epochs: The epochs for the pre-training step.
    :type epochs: int
    :param do_pre_train: Flag to indicate if we do pre-training.
    :type do_pre_train: bool
    :param tr_feat_path: The path for loading the training features.
    :type tr_feat_path: str
    :param tr_labels_path: The path for loading the training labels.
    :type tr_labels_path: str
    :param val_feat_path: The path for loading the validation features.
    :type val_feat_path: str
    :param val_labels_path: The path for loading the validation labels.
    :type val_labels_path: str
    :param tr_batch_size: The batch used for pre-training.
    :type tr_batch_size: int
    :param val_batch_size: The batch size used for validation.
    :type val_batch_size: int
    :param adapt_patience: The patience for the domain adaptation step.
    :type adapt_patience: int
    :param adapt_epochs: The epochs for the domain adaptation step.
    :type adapt_epochs: int
    :param d_lr: The learning rate for the discriminator.
    :type d_lr: float
    :param tgt_lr: The learning rate for the adapted model.
    :type tgt_lr: float
    :param update_cnt: An update controller for adversarial loss
    :type update_cnt: int
    :param factor: the coefficient used to be multiplied by classification loss.
    :type factor: int
    :raises ValueError: If the directory holding the saved state dicts does not exist.
    """
    tr_feat = device_exchange(file_io.load_pickled_features(tr_feat_path), device=device)
    tr_labels = device_exchange(file_io.load_pickled_features(tr_labels_path), device=device)
    val_feat = device_exchange(file_io.load_pickled_features(val_feat_path), device=device)
    val_labels = device_exchange(file_io.load_pickled_features(val_labels_path), device=device)

    loss_func = functional.cross_entropy

    non_adapted_cnn = Model().to(device)
    label_classifier = LabelClassifier(nb_class_labels).to(device)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    makedirs(result_dir, exist_ok=True)

    if do_pre_train:
        state_dict_path = result_dir

        printing.info_msg('Pre-training step')

        optimizer_source = torch.optim.Adam(
            list(non_adapted_cnn.parameters()) + list(label_classifier.parameters()),
            lr=1e-4)

        # Pre-training only uses the 'A' entry of the features and labels.
        pre_training.pre_training(
            model=non_adapted_cnn,
            label_classifier=label_classifier,
            optimizer=optimizer_source,
            tr_batch_size=tr_batch_size,
            val_batch_size=val_batch_size,
            tr_feat=tr_feat['A'],
            tr_labels=tr_labels['A'],
            val_feat=val_feat['A'],
            val_labels=val_labels['A'],
            epochs=epochs,
            criterion=loss_func,
            patience=patience,
            result_dir=state_dict_path)

        del optimizer_source
    else:
        printing.info_msg('Loading a pre-trained non-adapted model')
        state_dict_path = model_path

    if not path.exists(state_dict_path):
        raise ValueError('The path for loading the pre trained model does not exist!')

    # Both branches reload from disk. After pre-training this presumably
    # picks up the checkpoint pre_training wrote to result_dir - confirm.
    # map_location lets a checkpoint saved on another device be loaded here.
    non_adapted_cnn.load_state_dict(
        torch.load(path.join(state_dict_path, 'non_adapted_cnn.pytorch'),
                   map_location=device))
    label_classifier.load_state_dict(
        torch.load(path.join(state_dict_path, 'label_classifier.pytorch'),
                   map_location=device))

    printing.info_msg('Training the Adversarial Adaptation Model')

    # The target model starts as a copy of the non-adapted model's weights.
    target_cnn = Model().to(device)
    target_cnn.load_state_dict(non_adapted_cnn.state_dict())
    discriminator = Discriminator(2).to(device)

    target_model_opt = torch.optim.Adam(target_cnn.parameters(), lr=tgt_lr)
    discriminator_opt = torch.optim.Adam(discriminator.parameters(), lr=d_lr)

    # The same cross-entropy function is passed for all three loss arguments.
    domain_adaptation.domain_adaptation(
        non_adapted_cnn, target_cnn, label_classifier, discriminator,
        target_model_opt, discriminator_opt,
        loss_func, loss_func, loss_func,
        tr_feat, tr_labels, val_feat, val_labels,
        adapt_epochs, update_cnt, result_dir, adapt_patience, device, factor)
class Train:
    """Step-based training driver that owns the model, optimizer and checkpoints."""

    def __init__(self, model_name, corpus_dataset):
        """Create the model and optimizer for ``corpus_dataset``.

        :param model_name: Name stored in every checkpoint under ``'name'``.
        :param corpus_dataset: Provides ``get_data_loader(batch_size)`` and
            ``vocabulary``.
        """
        self._config = TrainConfig()
        self._model_name = model_name
        self._data_loader = corpus_dataset.get_data_loader(self._config.batch_size)
        self._vocabulary = corpus_dataset.vocabulary
        self._model = Model(vocabulary=corpus_dataset.vocabulary, training=True)
        # TODO: Support for other optimizers
        self._optimizer = optim.Adam(self._model.parameters(),
                                     lr=self._config.learning_rate)
        # -1 marks "never trained"; train() turns it into 0 on first use.
        self._global_step = -1
        self._train_logger = logging.getLogger('Train')
        logging.basicConfig(level=logging.INFO)

    def train_step(self, input_seqs, input_lengths, target_seqs, masks):
        """Run one optimisation step on a single batch.

        The four arguments are forwarded to the model unchanged, followed by
        the current global step.
        """
        self._optimizer.zero_grad()
        step_loss, print_loss, _ = self._model(input_seqs, input_lengths,
                                               target_seqs, masks,
                                               self._global_step)
        self._train_logger.info('Step {}: Training loss: {}'.format(
            self._global_step, print_loss))
        step_loss.backward()
        if self._config.use_gradient_clipping:
            _ = nn.utils.clip_grad_norm_(self._model.parameters(),
                                         self._config.gradient_clipping_value)
        self._optimizer.step()

    def train(self, num_steps, save_num_steps, save_folder='./data/models/train_dev'):
        """Train until the global step reaches ``num_steps``.

        The data loader is iterated repeatedly. A checkpoint is written every
        ``save_num_steps`` steps and once more at the end unless the last
        step was just saved.

        :param num_steps: Global step at which training stops.
        :param save_num_steps: Checkpoint interval in steps; ``0`` or
            ``None`` disables the periodic checkpoints.
        :param save_folder: Directory the checkpoints are written to.
        """
        if self._global_step < 0:
            self._global_step = 0
        elif self._global_step >= num_steps:
            logging.info(
                'Global step past number of steps requested. No training needed. Global Step = {}. '
                'Num training steps = {}'.format(self._global_step, num_steps))
            return

        stop_training = False
        while not stop_training:
            batches_seen = 0
            for input_seqs, input_lengths, target_seqs, masks in self._data_loader:
                batches_seen += 1
                self.train_step(input_seqs, input_lengths, target_seqs, masks)
                self._global_step += 1

                # A falsy interval would otherwise raise ZeroDivisionError.
                if save_num_steps and self._global_step % save_num_steps == 0:
                    self.save_checkpoint(save_folder)
                    just_saved = True
                else:
                    just_saved = False

                if self._global_step >= num_steps:
                    stop_training = True
                    logging.info('Finished training at step {}'.format(
                        self._global_step))
                    if not just_saved:
                        self.save_checkpoint(save_folder)
                    break

            if batches_seen == 0:
                # A loader that yields nothing can never advance the step
                # counter; stop instead of spinning forever.
                logging.warning(
                    'Data loader produced no batches; stopping at step {}'.format(
                        self._global_step))
                break

    def save_checkpoint(self, save_folder):
        """Write name, step, model, optimizer and vocabulary to ``save_folder``.

        The file is named ``checkpoint-<global_step>.tar``; the folder is
        created when missing.
        """
        makedirs(save_folder, exist_ok=True)
        save_path = path.join(save_folder,
                              'checkpoint-{}.tar'.format(self._global_step))
        logging.info('Saving checkpoint at step {}'.format(self._global_step))
        torch.save(
            {
                'name': self._model_name,
                'global_step': self._global_step,
                'model': self._model.state_dict(),
                'optimizer': self._optimizer.state_dict(),
                'vocabulary': self._vocabulary.__dict__,
            }, save_path)
        logging.info('Checkpoint saved at {}'.format(save_path))

    @staticmethod
    def load_from_checkpoint(checkpoint_path, corpus_dataset):
        """Rebuild a ``Train`` object from a file written by ``save_checkpoint``.

        :param checkpoint_path: Path of the checkpoint file.
        :param corpus_dataset: Dataset used to construct the new object; its
            vocabulary's attributes are replaced by the stored ones.
        :return: The restored ``Train`` instance.
        """
        # SECURITY: torch.load unpickles the file; only load checkpoints from
        # a trusted source.
        checkpoint = torch.load(checkpoint_path)
        train_obj = Train(checkpoint['name'], corpus_dataset)
        train_obj._vocabulary.__dict__ = checkpoint['vocabulary']
        train_obj._global_step = checkpoint['global_step']
        train_obj._model.load_state_dict(checkpoint['model'])
        # save_checkpoint stores the optimizer state; without restoring it a
        # resumed run would restart Adam's moment estimates from scratch.
        if 'optimizer' in checkpoint:
            train_obj._optimizer.load_state_dict(checkpoint['optimizer'])
        train_obj._train_logger.info(
            'Restored from checkpoint {}'.format(checkpoint_path))
        return train_obj