def prepare_encoder(self, priming_data):
    random.seed(len(priming_data))

    if self._prepared:
        raise Exception('You can only call "prepare_encoder" once for a given encoder.')

    self.onehot_encoder.prepare_encoder(priming_data)

    input_len = self.onehot_encoder._lang.n_words
    self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length

    if self.use_autoencoder:
        logging.info('Preparing a categorical autoencoder, this might take a while')

        embeddings_layer_len = self.max_encoded_length

        self.net = DefaultNet(dynamic_parameters={},
                              shape=[input_len, embeddings_layer_len, input_len],
                              selfaware=False)

        criterion = torch.nn.CrossEntropyLoss()
        optimizer = Ranger(self.net.parameters())

        gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion,
                  device=self.net.device, name=self.name,
                  input_encoder=self.onehot_encoder.encode,
                  output_encoder=self._encoder_targets)

        batch_size = min(200, int(len(priming_data) / 50))

        priming_data_str = [str(x) for x in priming_data]
        train_data_loader = DataLoader(list(zip(priming_data_str, priming_data_str)),
                                       batch_size=batch_size, shuffle=True)
        test_data_loader = None

        best_model, error, training_time = gym.fit(train_data_loader, test_data_loader,
                                                   desired_error=self.desired_error,
                                                   max_time=self.max_training_time,
                                                   callback=self._train_callback,
                                                   eval_every_x_epochs=1,
                                                   max_unimproving_models=5)

        self.net = best_model.to(self.net.device)

        modules = [module for module in self.net.modules()
                   if type(module) != torch.nn.Sequential and type(module) != DefaultNet]
        self.encoder = torch.nn.Sequential(*modules[0:2]).eval()
        self.decoder = torch.nn.Sequential(*modules[2:3]).eval()
        logging.info('Categorical autoencoder ready')

    self._prepared = True
class DistilBertEncoder:
    def __init__(self, is_target=False, aim=ENCODER_AIM.BALANCE):
        self.name = 'Text Transformer Encoder'
        self._tokenizer = None
        self._model = None
        self._pad_id = None
        self._pytorch_wrapper = torch.FloatTensor
        self._max_len = None
        self._max_ele = None
        self._prepared = False
        self._model_type = None
        self.desired_error = 0.01
        self.max_training_time = CONFIG.MAX_ENCODER_TRAINING_TIME
        self._head = None
        # Possible: speed, balance, accuracy
        self.aim = aim

        if self.aim == ENCODER_AIM.SPEED:
            # Uses more memory, takes very long to train and outputs weird debugging statements
            # to the command line; consider waiting until it gets better or try to investigate
            # why this happens (changing the pretrained model doesn't seem to help)
            self._classifier_model_class = AlbertForSequenceClassification
            self._embeddings_model_class = AlbertModel
            self._tokenizer_class = AlbertTokenizer
            self._pretrained_model_name = 'albert-base-v2'
            self._model_max_len = 768
        if self.aim == ENCODER_AIM.BALANCE:
            self._classifier_model_class = DistilBertForSequenceClassification
            self._embeddings_model_class = DistilBertModel
            self._tokenizer_class = DistilBertTokenizer
            self._pretrained_model_name = 'distilbert-base-uncased'
            self._model_max_len = 768
        if self.aim == ENCODER_AIM.ACCURACY:
            self._classifier_model_class = DistilBertForSequenceClassification
            self._embeddings_model_class = DistilBertModel
            self._tokenizer_class = DistilBertTokenizer
            self._pretrained_model_name = 'distilbert-base-uncased'
            self._model_max_len = 768

        device_str = "cuda" if CONFIG.USE_CUDA else "cpu"
        if CONFIG.USE_DEVICE is not None:
            device_str = CONFIG.USE_DEVICE
        self.device = torch.device(device_str)

    def _train_callback(self, error, real_buff, predicted_buff):
        logging.info(f'{self.name} reached a loss of {error} while training !')

    @staticmethod
    def categorical_train_function(model, data, gym, test=False):
        input, real = data
        input = input.to(gym.device)
        labels = torch.tensor([torch.argmax(x) for x in real]).to(gym.device)

        outputs = gym.model(input, labels=labels)
        loss, logits = outputs[:2]

        if not test:
            loss.backward()
            gym.optimizer.step()
            gym.scheduler.step()
            gym.optimizer.zero_grad()

        return loss

    @staticmethod
    def numerical_train_function(model, data, gym, backbone, test=False):
        input, real = data
        input = input.to(gym.device)
        real = real.to(gym.device)

        embeddings = backbone(input)[0][:, 0, :]
        outputs = gym.model(embeddings)
        loss = gym.loss_criterion(outputs, real)

        if not test:
            loss.backward()
            gym.optimizer.step()
            gym.scheduler.step()
            gym.optimizer.zero_grad()

        return loss

    def prepare_encoder(self, priming_data, training_data=None):
        if self._prepared:
            raise Exception('You can only call "prepare_encoder" once for a given encoder.')

        priming_data = [x if x is not None else '' for x in priming_data]

        self._max_len = min(max([len(x) for x in priming_data]), self._model_max_len)
        self._tokenizer = self._tokenizer_class.from_pretrained(self._pretrained_model_name)
        self._pad_id = self._tokenizer.convert_tokens_to_ids([self._tokenizer.pad_token])[0]
        # @TODO: Support multiple targets if they are all categorical or train for the categorical target if it's a mix (maybe ?)
        # @TODO: Attach a language modeling head and/or use GPT2 and/or provide outputs better suited
        # to a LM head (which will be the mixer) if the output is text
        if training_data is not None and 'targets' in training_data \
                and len(training_data['targets']) == 1 \
                and training_data['targets'][0]['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL \
                and CONFIG.TRAIN_TO_PREDICT_TARGET:
            self._model_type = 'classifier'
            self._model = self._classifier_model_class.from_pretrained(
                self._pretrained_model_name,
                num_labels=len(set(training_data['targets'][0]['unencoded_output'])) + 1
            ).to(self.device)
            batch_size = 10

            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [p for n, p in self._model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.000001
            }, {
                'params': [p for n, p in self._model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }]

            optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
            scheduler = get_linear_schedule_with_warmup(optimizer,
                                                        num_warmup_steps=10,
                                                        num_training_steps=len(priming_data) * 15 / 20)

            gym = Gym(model=self._model, optimizer=optimizer, scheduler=scheduler,
                      loss_criterion=None, device=self.device, name=self.name)

            input = [self._tokenizer.encode(x[:self._max_len], add_special_tokens=True)
                     for x in priming_data]
            tokenized_max_len = max([len(x) for x in input])
            input = torch.tensor([x + [self._pad_id] * (tokenized_max_len - len(x)) for x in input])

            real = training_data['targets'][0]['encoded_output']

            merged_data = list(zip(input, real))

            train_data_loader = DataLoader(merged_data[:int(len(merged_data) * 9 / 10)],
                                           batch_size=batch_size, shuffle=True)
            test_data_loader = DataLoader(merged_data[int(len(merged_data) * 9 / 10):],
                                          batch_size=batch_size, shuffle=True)

            best_model, error, training_time = gym.fit(
                train_data_loader=train_data_loader,
                test_data_loader=test_data_loader,
                desired_error=self.desired_error,
                max_time=self.max_training_time,
                callback=self._train_callback,
                eval_every_x_epochs=1,
                max_unimproving_models=10,
                custom_train_func=partial(self.categorical_train_function, test=False),
                custom_test_func=partial(self.categorical_train_function, test=True))

            self._model = best_model.to(self.device)

        # Checking training_data here as well lets a call without training data fall through to
        # the plain embeddings generator instead of raising a TypeError
        elif training_data is not None and 'targets' in training_data \
                and all([x['output_type'] == COLUMN_DATA_TYPES.NUMERIC
                         or x['output_type'] == COLUMN_DATA_TYPES.CATEGORICAL
                         for x in training_data['targets']]) \
                and CONFIG.TRAIN_TO_PREDICT_TARGET:
            self.desired_error = 0.01
            self._model_type = 'generic_target_predictor'
            self._model = self._embeddings_model_class.from_pretrained(
                self._pretrained_model_name).to(self.device)
            batch_size = 10

            self._head = DefaultNet(ds=None,
                                    dynamic_parameters={},
                                    shape=funnel(768,
                                                 sum([len(x['encoded_output'][0])
                                                      for x in training_data['targets']]),
                                                 depth=5),
                                    selfaware=False)

            no_decay = ['bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [{
                'params': [p for n, p in self._head.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': 0.000001
            }, {
                'params': [p for n, p in self._head.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }]

            optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=5e-5, eps=1e-8)
            # optimizer = Ranger(self._head.parameters(), lr=5e-5)

            # num_training_steps is kind of an estimation
            scheduler = get_linear_schedule_with_warmup(optimizer,
                                                        num_warmup_steps=10,
                                                        num_training_steps=len(priming_data) * 15 / 20)

            criterion = torch.nn.MSELoss()

            gym = Gym(model=self._head, optimizer=optimizer, scheduler=scheduler,
                      loss_criterion=criterion, device=self.device, name=self.name)

            input = [self._tokenizer.encode(x[:self._max_len], add_special_tokens=True)
                     for x in priming_data]
            tokenized_max_len = max([len(x) for x in input])
            input = torch.tensor([x + [self._pad_id] * (tokenized_max_len - len(x)) for x in input])

            real = [[]] * len(training_data['targets'][0]['encoded_output'])
            for i in range(len(real)):
                for target in training_data['targets']:
                    real[i] = real[i] + target['encoded_output'][i]
            real = torch.tensor(real)

            merged_data = list(zip(input, real))

            train_data_loader = DataLoader(merged_data[:int(len(merged_data) * 9 / 10)],
                                           batch_size=batch_size, shuffle=True)
            test_data_loader = DataLoader(merged_data[int(len(merged_data) * 9 / 10):],
                                          batch_size=batch_size, shuffle=True)

            self._model.eval()

            best_model, error, training_time = gym.fit(
                train_data_loader=train_data_loader,
                test_data_loader=test_data_loader,
                desired_error=self.desired_error,
                max_time=self.max_training_time,
                callback=self._train_callback,
                eval_every_x_epochs=1,
                max_unimproving_models=10,
                custom_train_func=partial(self.numerical_train_function, backbone=self._model, test=False),
                custom_test_func=partial(self.numerical_train_function, backbone=self._model, test=True))

            self._head = best_model.to(self.device)

        else:
            self._model_type = 'embeddings_generator'
            self._model = self._embeddings_model_class.from_pretrained(
                self._pretrained_model_name).to(self.device)

        self._prepared = True

    def encode(self, column_data):
        encoded_representation = []
        self._model.eval()
        with torch.no_grad():
            for text in column_data:
                if text is None:
                    text = ''
                input = torch.tensor(
                    self._tokenizer.encode(text[:self._max_len], add_special_tokens=True)
                ).to(self.device).unsqueeze(0)

                if self._model_type == 'generic_target_predictor':
                    embeddings = self._model(input)
                    output = self._head(embeddings[0][:, 0, :])
                    encoded_representation.append(output.tolist()[0])

                elif self._model_type == 'classifier':
                    output = self._model(input)
                    logits = output[0]
                    predicted_targets = logits[0].tolist()
                    encoded_representation.append(predicted_targets)

                else:
                    output = self._model(input)
                    embeddings = output[0][:, 0, :].cpu().numpy()[0]
                    encoded_representation.append(embeddings)

        return self._pytorch_wrapper(encoded_representation)

    def decode(self, encoded_values_tensor, max_length=100):
        # When text is an output... a bit trickier to handle this case, thinking on it
        pass
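# Hedged usage sketch, not part of the original module: it shows the intended call pattern for
# DistilBertEncoder — prepare once on the priming data, then encode() to get one vector per row.
# The sample texts are made up, and no training_data is passed, so the encoder falls back to the
# plain 'embeddings_generator' mode and returns CLS-token embeddings.
if __name__ == '__main__':
    sample_column = ['The package arrived broken', 'Great product, would buy again', None]

    text_encoder = DistilBertEncoder()
    text_encoder.prepare_encoder(sample_column)

    encoded = text_encoder.encode(sample_column)
    print(encoded.shape)  # one 768-dimensional embedding per input text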
class CategoricalAutoEncoder:
    def __init__(self, is_target=False):
        self._pytorch_wrapper = torch.FloatTensor
        self._prepared = False
        self.name = 'Categorical Autoencoder'
        self.net = None
        self.encoder = None
        self.decoder = None
        self.onehot_encoder = OneHotEncoder()
        self.desired_error = 0.01
        self.use_autoencoder = None
        if is_target:
            self.max_encoded_length = None
        else:
            self.max_encoded_length = 100
        self.max_training_time = CONFIG.MAX_ENCODER_TRAINING_TIME

    def _train_callback(self, error, real_buff, predicted_buff):
        logging.info(f'{self.name} reached a loss of {error} while training !')

    def _encoder_targets(self, data):
        oh_encoded_categories = self.onehot_encoder.encode(data)
        target = oh_encoded_categories.cpu().numpy()
        target_indexes = np.where(target > 0)[1]
        targets_c = torch.LongTensor(target_indexes)
        labels = targets_c.to(self.net.device)
        return labels

    def prepare_encoder(self, priming_data):
        random.seed(len(priming_data))

        if self._prepared:
            raise Exception('You can only call "prepare_encoder" once for a given encoder.')

        self.onehot_encoder.prepare_encoder(priming_data)

        input_len = self.onehot_encoder._lang.n_words
        self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length

        if self.use_autoencoder:
            logging.info('Preparing a categorical autoencoder, this might take a while')

            embeddings_layer_len = self.max_encoded_length

            self.net = DefaultNet(ds=None, dynamic_parameters={},
                                  shape=[input_len, embeddings_layer_len, input_len],
                                  selfaware=False)

            criterion = torch.nn.CrossEntropyLoss()
            optimizer = Ranger(self.net.parameters())

            gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion,
                      device=self.net.device, name=self.name,
                      input_encoder=self.onehot_encoder.encode,
                      output_encoder=self._encoder_targets)

            batch_size = min(200, int(len(priming_data) / 50))

            train_data_loader = DataLoader(list(zip(priming_data, priming_data)),
                                           batch_size=batch_size, shuffle=True)
            test_data_loader = None

            best_model, error, training_time = gym.fit(train_data_loader, test_data_loader,
                                                       desired_error=self.desired_error,
                                                       max_time=self.max_training_time,
                                                       callback=self._train_callback,
                                                       eval_every_x_epochs=1,
                                                       max_unimproving_models=5)

            self.net = best_model.to(self.net.device)

            modules = [module for module in self.net.modules()
                       if type(module) != torch.nn.Sequential and type(module) != DefaultNet]
            self.encoder = torch.nn.Sequential(*modules[0:2])
            self.decoder = torch.nn.Sequential(*modules[2:3])
            logging.info('Categorical autoencoder ready')

        self._prepared = True

    def encode(self, column_data):
        oh_encoded_tensor = self.onehot_encoder.encode(column_data)
        if not self.use_autoencoder:
            return oh_encoded_tensor
        else:
            oh_encoded_tensor = oh_encoded_tensor.to(self.net.device)
            embeddings = self.encoder(oh_encoded_tensor)
            return embeddings

    def decode(self, encoded_data):
        if not self.use_autoencoder:
            return self.onehot_encoder.decode(encoded_data)
        else:
            oh_encoded_tensor = self.decoder(encoded_data)
            oh_encoded_tensor = oh_encoded_tensor.to('cpu')
            decoded_categories = self.onehot_encoder.decode(oh_encoded_tensor)
            return decoded_categories
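# Hedged usage sketch, not part of the original module: when is_target=True the class above sets
# max_encoded_length to None, so use_autoencoder stays False and encode()/decode() simply pass
# through the underlying OneHotEncoder. The category values are illustrative.
if __name__ == '__main__':
    target_encoder = CategoricalAutoEncoder(is_target=True)
    target_encoder.prepare_encoder(['red', 'green', 'blue'])

    one_hot = target_encoder.encode(['green'])  # plain one-hot row, no learned bottleneck
    print(target_encoder.decode(one_hot))       # expected to recover ['green']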
class CategoricalAutoEncoder(BaseEncoder):
    def __init__(self, is_target=False, max_encoded_length=100):
        super().__init__(is_target)
        self._prepared = False
        self.name = 'Categorical Autoencoder'
        self.net = None
        self.encoder = None
        self.decoder = None
        self.predict_proba = None  # whether to return the belief distribution as well
        self.onehot_encoder = OneHotEncoder(is_target=self.is_target)
        self.desired_error = 0.01
        self.use_autoencoder = None
        if self.is_target:
            self.max_encoded_length = None
        else:
            self.max_encoded_length = max_encoded_length
        self.max_training_time = 7200

    def _train_callback(self, error, real_buff, predicted_buff):
        log.info(f'{self.name} reached a loss of {error} while training !')

    def _encoder_targets(self, data):
        oh_encoded_categories = self.onehot_encoder.encode(data)
        target = oh_encoded_categories.cpu().numpy()
        target_indexes = np.where(target > 0)[1]
        targets_c = torch.LongTensor(target_indexes)
        labels = targets_c.to(self.net.device)
        return labels

    def to(self, device, available_devices):
        if self.use_autoencoder:
            self.net = self.net.to(device, available_devices)
        return self

    def prepare(self, priming_data):
        random.seed(len(priming_data))

        if self._prepared:
            raise Exception('You can only call "prepare" once for a given encoder.')

        self.onehot_encoder.prepare(priming_data)

        input_len = self.onehot_encoder._lang.n_words
        self.use_autoencoder = self.max_encoded_length is not None and input_len > self.max_encoded_length

        if self.use_autoencoder:
            log.info('Preparing a categorical autoencoder, this might take a while')

            embeddings_layer_len = self.max_encoded_length

            self.net = DefaultNet(dynamic_parameters={},
                                  shape=[input_len, embeddings_layer_len, input_len])

            criterion = torch.nn.CrossEntropyLoss()
            optimizer = Ranger(self.net.parameters())

            gym = Gym(model=self.net, optimizer=optimizer, scheduler=None, loss_criterion=criterion,
                      device=self.net.device, name=self.name,
                      input_encoder=self.onehot_encoder.encode,
                      output_encoder=self._encoder_targets)

            batch_size = min(200, int(len(priming_data) / 50))

            priming_data_str = [str(x) for x in priming_data]
            train_data_loader = DataLoader(list(zip(priming_data_str, priming_data_str)),
                                           batch_size=batch_size, shuffle=True)
            test_data_loader = None

            best_model, error, training_time = gym.fit(train_data_loader, test_data_loader,
                                                       desired_error=self.desired_error,
                                                       max_time=self.max_training_time,
                                                       callback=self._train_callback,
                                                       eval_every_x_epochs=1,
                                                       max_unimproving_models=5)

            self.net = best_model.to(self.net.device)

            modules = [module for module in self.net.modules()
                       if type(module) != torch.nn.Sequential and type(module) != DefaultNet]
            self.encoder = torch.nn.Sequential(*modules[0:2]).eval()
            self.decoder = torch.nn.Sequential(*modules[2:3]).eval()
            log.info('Categorical autoencoder ready')

        self._prepared = True

    def encode(self, column_data):
        if not column_data:
            column_data = ['']
        oh_encoded_tensor = self.onehot_encoder.encode(column_data)

        if not self.use_autoencoder:
            return oh_encoded_tensor
        else:
            with torch.no_grad():
                oh_encoded_tensor = oh_encoded_tensor.to(self.net.device)
                embeddings = self.encoder(oh_encoded_tensor)
                return embeddings

    def decode(self, encoded_data):
        self.onehot_encoder.predict_proba = self.predict_proba
        if not self.use_autoencoder:
            return self.onehot_encoder.decode(encoded_data)
        else:
            with torch.no_grad():
                oh_encoded_tensor = self.decoder(encoded_data)
                oh_encoded_tensor = oh_encoded_tensor.to('cpu')
                return self.onehot_encoder.decode(oh_encoded_tensor)
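# Hedged usage sketch, not part of the original module: with more distinct categories than
# max_encoded_length, prepare() trains the autoencoder and encode()/decode() round-trip through
# the learned bottleneck; with fewer categories the encoder degrades to plain one-hot output.
# The synthetic category names are illustrative.
if __name__ == '__main__':
    categories = [f'category_{i}' for i in range(500)]

    cat_encoder = CategoricalAutoEncoder(max_encoded_length=100)
    cat_encoder.prepare(categories)

    embeddings = cat_encoder.encode(categories[:5])  # tensor of shape (5, 100)
    labels = cat_encoder.decode(embeddings)          # best-effort reconstruction of the labels
    print(labels)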