def learn(self, from_data, test_data=None, callback_on_iter=None,
          eval_every_x_epochs=20, stop_training_after_seconds=None,
          stop_model_building_after_seconds=None):
    """
    Train and save a model (you can use this to retrain a model from data).

    :param from_data: (Pandas DataFrame) The data to learn from
    :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
    :param callback_on_iter: A function called on every evaluation cycle
    :param eval_every_x_epochs: How many epochs to wait between calculating the test error and accuracy

    :return: self
    """

    # Helper that roughly auto-determines the data type of each column.
    # NOTE: this assumes the data is clean and only returns 'CATEGORICAL', 'NUMERIC' or 'TEXT'
    def type_map(col_name):
        col_pd_type = from_data[col_name].dtype
        col_pd_type = str(col_pd_type)

        if col_pd_type in ['int64', 'float64', 'timedelta']:
            return COLUMN_DATA_TYPES.NUMERIC
        elif col_pd_type in ['bool', 'category']:
            return COLUMN_DATA_TYPES.CATEGORICAL
        else:
            # If the number of uniques is less than 100, or less than
            # 10% of the total number of rows, keep it as categorical
            unique = from_data[col_name].nunique()
            if unique < 100 or unique < len(from_data[col_name]) / 10:
                return COLUMN_DATA_TYPES.CATEGORICAL
            # else assume it's text
            return COLUMN_DATA_TYPES.TEXT

    # Generate the configuration and set the order of the input and output columns
    if self._generate_config is True:
        self._input_columns = [col for col in from_data if col not in self._output_columns]
        self.config = {
            'input_features': [{'name': col, 'type': type_map(col)} for col in self._input_columns],
            'output_features': [{'name': col, 'type': type_map(col)} for col in self._output_columns]
        }
        self.config = predictor_config_schema.validate(self.config)
        logging.info('Automatically generated a configuration')
        logging.info(self.config)
    else:
        self._output_columns = [col['name'] for col in self.config['output_features']]
        self._input_columns = [col['name'] for col in self.config['input_features']]

    if stop_training_after_seconds is None:
        stop_training_after_seconds = round(from_data.shape[0] * from_data.shape[1] / 5)

    if stop_model_building_after_seconds is None:
        stop_model_building_after_seconds = stop_training_after_seconds * 3

    from_data_ds = DataSource(from_data, self.config)
    if test_data is not None:
        test_data_ds = DataSource(test_data, self.config)
    else:
        test_data_ds = from_data_ds.extractRandomSubset(0.1)

    from_data_ds.training = True

    mixer_class = NnMixer
    mixer_params = {}

    if 'mixer' in self.config:
        if 'class' in self.config['mixer']:
            mixer_class = self.config['mixer']['class']
        if 'attrs' in self.config['mixer']:
            mixer_params = self.config['mixer']['attrs']

    # Initialize data sources
    if len(from_data_ds) > 100:
        nr_subsets = 3
    else:
        # Don't use k-fold cross validation for very small input sizes
        nr_subsets = 1

    from_data_ds.prepare_encoders()
    from_data_ds.create_subsets(nr_subsets)

    try:
        mixer_class({}).fit_data_source(from_data_ds)
    except Exception:
        # Not all mixers might require this
        pass

    input_size = len(from_data_ds[0][0])
    training_data_length = len(from_data_ds)

    test_data_ds.transformer = from_data_ds.transformer
    test_data_ds.encoders = from_data_ds.encoders
    test_data_ds.output_weights = from_data_ds.output_weights
    test_data_ds.create_subsets(nr_subsets)

    if 'optimizer' in self.config:
        optimizer = self.config['optimizer']()

        while True:
            training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

            # Some heuristics...
            if training_time_per_iteration > input_size:
                if training_time_per_iteration > min((training_data_length / (4 * input_size)), 16 * input_size):
                    break

            optimizer.total_trials = optimizer.total_trials - 1
            if optimizer.total_trials < 8:
                optimizer.total_trials = 8
                break

        training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

        best_parameters = optimizer.evaluate(
            lambda dynamic_parameters: Predictor.evaluate_mixer(
                self.config,
                mixer_class,
                mixer_params,
                from_data_ds,
                test_data_ds,
                dynamic_parameters,
                max_training_time=training_time_per_iteration,
                max_epochs=None))

        logging.info('Using hyperparameter set: %s', best_parameters)
    else:
        best_parameters = {}

    self._mixer = mixer_class(best_parameters, self.config)

    for param in mixer_params:
        if hasattr(self._mixer, param):
            setattr(self._mixer, param, mixer_params[param])
        else:
            logging.warning(
                'trying to set mixer param {param} but mixer class {mixerclass} does not have such a parameter'
                .format(param=param, mixerclass=str(type(self._mixer))))

    def callback_on_iter_w_acc(epoch, training_error, test_error, delta_mean):
        callback_on_iter(epoch, training_error, test_error, delta_mean,
                         self.calculate_accuracy(test_data_ds))

    self._mixer.fit(train_ds=from_data_ds,
                    test_ds=test_data_ds,
                    callback=callback_on_iter_w_acc,
                    stop_training_after_seconds=stop_training_after_seconds,
                    eval_every_x_epochs=eval_every_x_epochs)

    self.train_accuracy = self.calculate_accuracy(test_data_ds)

    # Train some alternative mixers
    if CONFIG.HELPER_MIXERS and self.has_boosting_mixer and (
            CONFIG.FORCE_HELPER_MIXERS or len(from_data_ds) < 12 * pow(10, 3)):
        try:
            self._helper_mixers = self.train_helper_mixers(
                from_data_ds,
                test_data_ds,
                self._mixer.quantiles[self._mixer.quantiles_pair[0] + 1:self._mixer.quantiles_pair[1] + 1])
        except Exception as e:
            logging.warning(f'Failed to train helper mixers with error: {e}')

    return self
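# --- Illustrative usage sketch (not part of the original source) ------------
# A minimal sketch of how learn() above is typically driven, assuming the
# surrounding class is lightwood's Predictor and that it accepts an `output`
# argument naming the target column(s); the DataFrame and column names below
# are hypothetical.
#
#   import pandas as pd
#   import lightwood
#
#   df = pd.DataFrame({'x': list(range(200)), 'y': [v * 2 for v in range(200)]})
#   predictor = lightwood.Predictor(output=['y'])
#   predictor.learn(from_data=df, eval_every_x_epochs=10)
#   print(predictor.train_accuracy)  # set by learn() from the held-out test split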
def learn(self, from_data, test_data=None, callback_on_iter=None,
          eval_every_x_epochs=20, stop_training_after_seconds=None,
          stop_model_building_after_seconds=None):
    """
    Train and save a model (you can use this to retrain a model from data).

    :param from_data: (Pandas DataFrame) The data to learn from
    :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
    :param callback_on_iter: A function called on every evaluation cycle
    :param eval_every_x_epochs: How many epochs to wait between calculating the test error and accuracy

    :return: self
    """

    # Helper that roughly auto-determines the data type of each column.
    # NOTE: this assumes the data is clean and only returns 'CATEGORICAL', 'NUMERIC' or 'TEXT'
    def type_map(col_name):
        col_pd_type = from_data[col_name].dtype
        col_pd_type = str(col_pd_type)

        if col_pd_type in ['int64', 'float64', 'timedelta']:
            return COLUMN_DATA_TYPES.NUMERIC
        elif col_pd_type in ['bool', 'category']:
            return COLUMN_DATA_TYPES.CATEGORICAL
        else:
            # If the number of uniques is less than 100, or less than
            # 10% of the total number of rows, keep it as categorical
            unique = from_data[col_name].nunique()
            if unique < 100 or unique < len(from_data[col_name]) / 10:
                return COLUMN_DATA_TYPES.CATEGORICAL
            # else assume it's text
            return COLUMN_DATA_TYPES.TEXT

    # Generate the configuration and set the order of the input and output columns
    if self._generate_config is True:
        self._input_columns = [col for col in from_data if col not in self._output_columns]
        self.config = {
            'input_features': [{'name': col, 'type': type_map(col)} for col in self._input_columns],
            'output_features': [{'name': col, 'type': type_map(col)} for col in self._output_columns]
        }
        logging.info('Automatically generated a configuration')
        logging.info(self.config)
    else:
        self._output_columns = [col['name'] for col in self.config['output_features']]
        self._input_columns = [col['name'] for col in self.config['input_features']]

    # @TODO Make Cross Entropy Loss work with multiple outputs
    if len(self.config['output_features']) == 1 and \
            self.config['output_features'][0]['type'] in (COLUMN_DATA_TYPES.CATEGORICAL,):
        is_categorical_output = True
    else:
        is_categorical_output = False

    if stop_training_after_seconds is None:
        stop_training_after_seconds = round(from_data.shape[0] * from_data.shape[1] / 5)

    if stop_model_building_after_seconds is None:
        stop_model_building_after_seconds = stop_training_after_seconds * 3

    from_data_ds = DataSource(from_data, self.config)
    if test_data is not None:
        test_data_ds = DataSource(test_data, self.config)
    else:
        test_data_ds = from_data_ds.extractRandomSubset(0.1)

    from_data_ds.training = True

    mixer_class = NnMixer
    mixer_params = {}

    if 'mixer' in self.config:
        if 'class' in self.config['mixer']:
            mixer_class = self.config['mixer']['class']
        if 'attrs' in self.config['mixer']:
            mixer_params = self.config['mixer']['attrs']

    # Initialize data sources
    nr_subsets = 3
    from_data_ds.prepare_encoders()
    from_data_ds.create_subsets(nr_subsets)

    try:
        mixer_class({}).fit_data_source(from_data_ds)
    except Exception:
        # Not all mixers might require this
        pass

    input_size = len(from_data_ds[0][0])
    training_data_length = len(from_data_ds)

    test_data_ds.transformer = from_data_ds.transformer
    test_data_ds.encoders = from_data_ds.encoders
    test_data_ds.output_weights = from_data_ds.output_weights
    test_data_ds.create_subsets(nr_subsets)

    if 'optimizer' in self.config:
        optimizer = self.config['optimizer']()

        while True:
            training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

            # Some heuristics...
            if training_time_per_iteration > input_size:
                if training_time_per_iteration > min((training_data_length / (4 * input_size)), 16 * input_size):
                    break

            optimizer.total_trials = optimizer.total_trials - 1
            if optimizer.total_trials < 8:
                optimizer.total_trials = 8
                break

        training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

        best_parameters = optimizer.evaluate(
            lambda dynamic_parameters: Predictor.evaluate_mixer(
                mixer_class,
                mixer_params,
                from_data_ds,
                test_data_ds,
                dynamic_parameters,
                is_categorical_output,
                max_training_time=training_time_per_iteration,
                max_epochs=None))

        logging.info('Using hyperparameter set: %s', best_parameters)
    else:
        best_parameters = {}

    mixer = mixer_class(best_parameters, is_categorical_output=is_categorical_output)
    self._mixer = mixer

    for param in mixer_params:
        if hasattr(mixer, param):
            setattr(mixer, param, mixer_params[param])
        else:
            logging.warning(
                'trying to set mixer param {param} but mixer class {mixerclass} does not have such a parameter'
                .format(param=param, mixerclass=str(type(mixer))))

    started = time.time()
    epoch = 0
    eval_next_on_epoch = eval_every_x_epochs
    stop_training = False

    for subset_iteration in [1, 2]:
        if stop_training:
            break
        for subset_id in [*from_data_ds.subsets.keys()]:
            if stop_training:
                break

            subset_train_ds = from_data_ds.subsets[subset_id]
            subset_test_ds = test_data_ds.subsets[subset_id]

            lowest_error = None
            last_test_error = None
            last_subset_test_error = None
            test_error_delta_buff = []
            subset_test_error_delta_buff = []
            best_model = None

            # Iterate over iter_fit and track the epoch and mixer error
            for epoch, training_error in enumerate(mixer.iter_fit(subset_train_ds)):
                logging.info('training iteration {iter_i}, error {error}'.format(
                    iter_i=epoch, error=training_error))

                if epoch >= eval_next_on_epoch:
                    # Prime the model on each subset for a bit
                    if subset_iteration == 1:
                        break

                    eval_next_on_epoch += eval_every_x_epochs

                    test_error = mixer.error(test_data_ds)
                    subset_test_error = mixer.error(subset_test_ds)

                    if lowest_error is None or test_error < lowest_error:
                        lowest_error = test_error
                        best_model = mixer.get_model_copy()

                    if last_subset_test_error is None:
                        subset_test_error_delta_buff.append(0)
                    else:
                        subset_test_error_delta_buff.append(last_subset_test_error - subset_test_error)
                    last_subset_test_error = subset_test_error

                    if last_test_error is None:
                        test_error_delta_buff.append(0)
                    else:
                        test_error_delta_buff.append(last_test_error - test_error)
                    last_test_error = test_error

                    delta_mean = np.mean(test_error_delta_buff[-10:])
                    subset_delta_mean = np.mean(subset_test_error_delta_buff[-10:])

                    if callback_on_iter is not None:
                        callback_on_iter(epoch, training_error, test_error, delta_mean,
                                         self.calculate_accuracy(test_data_ds))

                    # Stop if the model is overfitting
                    if delta_mean < 0 and len(test_error_delta_buff) > 9:
                        stop_training = True

                    # Stop if we're past the time limit allocated for training
                    if (time.time() - started) > stop_training_after_seconds:
                        stop_training = True

                    # If the training subset is overfitting on its associated testing subset
                    if subset_delta_mean < 0 and len(subset_test_error_delta_buff) > 9:
                        break

                    if stop_training:
                        mixer.update_model(best_model)
                        self._mixer = mixer
                        self.train_accuracy = self.calculate_accuracy(test_data_ds)
                        self.overall_certainty = mixer.overall_certainty()

                        if subset_id == 'full':
                            logging.info('Finished training model !')
                        else:
                            logging.info('Finished fitting on {subset_id} of {no_subsets} subset'.format(
                                subset_id=subset_id, no_subsets=len(from_data_ds.subsets.keys())))
                        break

    self._mixer.encoders = from_data_ds.encoders
    return self
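# --- Illustrative callback sketch (not part of the original source) ---------
# The versions of learn() above call callback_on_iter with five arguments:
# (epoch, training_error, test_error, delta_mean, accuracy). A minimal sketch
# of a compatible callback; the message format and the way it is wired up via
# the callback_on_iter parameter are illustrative only.
#
#   def report_progress(epoch, training_error, test_error, delta_mean, accuracy):
#       print(f'epoch={epoch} train_err={training_error:.4f} '
#             f'test_err={test_error:.4f} delta={delta_mean:.4f} acc={accuracy}')
#
#   predictor.learn(from_data=df, callback_on_iter=report_progress)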
def learn(self, from_data, test_data=None, callback_on_iter=None,
          eval_every_x_epochs=20, stop_training_after_seconds=None,
          stop_model_building_after_seconds=None):
    """
    Train and save a model (you can use this to retrain a model from data).

    :param from_data: (Pandas DataFrame) The data to learn from
    :param test_data: (Pandas DataFrame) The data to test accuracy and learn_error from
    :param callback_on_iter: A function called on every evaluation cycle
    :param eval_every_x_epochs: How many epochs to wait between calculating the test error and accuracy

    :return: self
    """

    # Helper that roughly auto-determines the data type of each column.
    # NOTE: this assumes the data is clean and only returns 'CATEGORICAL', 'NUMERIC' or 'TEXT'
    def type_map(col_name):
        col_pd_type = from_data[col_name].dtype
        col_pd_type = str(col_pd_type)

        if col_pd_type in ['int64', 'float64', 'timedelta']:
            return COLUMN_DATA_TYPES.NUMERIC
        elif col_pd_type in ['bool', 'category']:
            return COLUMN_DATA_TYPES.CATEGORICAL
        else:
            # If the number of uniques is less than 100, or less than
            # 10% of the total number of rows, keep it as categorical
            unique = from_data[col_name].nunique()
            if unique < 100 or unique < len(from_data[col_name]) / 10:
                return COLUMN_DATA_TYPES.CATEGORICAL
            # else assume it's text
            return COLUMN_DATA_TYPES.TEXT

    # Generate the configuration and set the order of the input and output columns
    if self._generate_config is True:
        self._input_columns = [col for col in from_data if col not in self._output_columns]
        self.config = {
            'input_features': [{'name': col, 'type': type_map(col)} for col in self._input_columns],
            'output_features': [{'name': col, 'type': type_map(col)} for col in self._output_columns]
        }
        self.config = predictor_config_schema.validate(self.config)
        logging.info('Automatically generated a configuration')
        logging.info(self.config)
    else:
        self._output_columns = [col['name'] for col in self.config['output_features']]
        self._input_columns = [col['name'] for col in self.config['input_features']]

    if stop_training_after_seconds is None:
        stop_training_after_seconds = round(from_data.shape[0] * from_data.shape[1] / 5)

    if stop_model_building_after_seconds is None:
        stop_model_building_after_seconds = stop_training_after_seconds * 3

    from_data_ds = DataSource(from_data, self.config)
    if test_data is not None:
        test_data_ds = DataSource(test_data, self.config)
    else:
        test_data_ds = from_data_ds.extractRandomSubset(0.1)

    from_data_ds.training = True

    mixer_class = NnMixer
    mixer_params = {}

    if 'mixer' in self.config:
        if 'class' in self.config['mixer']:
            mixer_class = self.config['mixer']['class']
        if 'attrs' in self.config['mixer']:
            mixer_params = self.config['mixer']['attrs']

    # Initialize data sources
    nr_subsets = 3
    from_data_ds.prepare_encoders()
    from_data_ds.create_subsets(nr_subsets)

    try:
        mixer_class({}).fit_data_source(from_data_ds)
    except Exception:
        # Not all mixers might require this
        pass

    input_size = len(from_data_ds[0][0])
    training_data_length = len(from_data_ds)

    test_data_ds.transformer = from_data_ds.transformer
    test_data_ds.encoders = from_data_ds.encoders
    test_data_ds.output_weights = from_data_ds.output_weights
    test_data_ds.create_subsets(nr_subsets)

    if 'optimizer' in self.config:
        optimizer = self.config['optimizer']()

        while True:
            training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

            # Some heuristics...
            if training_time_per_iteration > input_size:
                if training_time_per_iteration > min((training_data_length / (4 * input_size)), 16 * input_size):
                    break

            optimizer.total_trials = optimizer.total_trials - 1
            if optimizer.total_trials < 8:
                optimizer.total_trials = 8
                break

        training_time_per_iteration = stop_model_building_after_seconds / optimizer.total_trials

        best_parameters = optimizer.evaluate(
            lambda dynamic_parameters: Predictor.evaluate_mixer(
                self.config,
                mixer_class,
                mixer_params,
                from_data_ds,
                test_data_ds,
                dynamic_parameters,
                max_training_time=training_time_per_iteration,
                max_epochs=None))

        logging.info('Using hyperparameter set: %s', best_parameters)
    else:
        best_parameters = {}

    # Train some alternative mixers
    if CONFIG.HELPER_MIXERS and self.has_boosting_mixer and (
            CONFIG.FORCE_HELPER_MIXERS or len(from_data_ds) < 12 * pow(10, 3)):
        try:
            self._helper_mixers = self.train_helper_mixers(from_data_ds, test_data_ds)
        except Exception as e:
            logging.warning(f'Failed to train helper mixers with error: {e}')

    mixer = mixer_class(best_parameters, self.config)
    self._mixer = mixer

    for param in mixer_params:
        if hasattr(mixer, param):
            setattr(mixer, param, mixer_params[param])
        else:
            logging.warning(
                'trying to set mixer param {param} but mixer class {mixerclass} does not have such a parameter'
                .format(param=param, mixerclass=str(type(mixer))))

    started = time.time()
    log_reasure = time.time()  # timestamp of the last periodic progress log
    first_run = True
    stop_training = False

    for subset_iteration in [1, 2]:
        if stop_training:
            break

        subset_id_arr = [*from_data_ds.subsets.keys()]
        for subset_id in subset_id_arr:
            started_subset = time.time()
            if stop_training:
                break

            subset_train_ds = from_data_ds.subsets[subset_id]
            subset_test_ds = test_data_ds.subsets[subset_id]

            lowest_error = None
            last_test_error = None
            last_subset_test_error = None
            test_error_delta_buff = []
            subset_test_error_delta_buff = []
            best_model = None
            best_selfaware_model = None

            # Iterate over iter_fit and track the epoch and mixer error
            for epoch, training_error in enumerate(
                    mixer.iter_fit(subset_train_ds, initialize=first_run, subset_id=subset_id)):
                first_run = False

                # Log this every now and then so that the user knows it's running
                if (int(time.time()) - log_reasure) > 30:
                    log_reasure = time.time()
                    logging.info(f'Lightwood training, iteration {epoch}, training error {training_error}')

                # Prime the model on each subset for a bit
                if subset_iteration == 1:
                    break

                # Once the training error is getting smaller, enable dropout to teach
                # the network to predict without certain features
                if subset_iteration > 1 and training_error < 0.4 and not from_data_ds.enable_dropout:
                    eval_every_x_epochs = max(1, int(eval_every_x_epochs / 2))
                    logging.info('Enabled dropout !')
                    from_data_ds.enable_dropout = True
                    lowest_error = None
                    last_test_error = None
                    last_subset_test_error = None
                    test_error_delta_buff = []
                    subset_test_error_delta_buff = []
                    continue

                # If the selfaware network isn't able to train, go back to the original network
                if subset_iteration > 1 and (
                        np.isnan(training_error) or np.isinf(training_error) or training_error > pow(10, 5)
                ) and not mixer.stop_selfaware_training:
                    mixer.start_selfaware_training = False
                    mixer.stop_selfaware_training = True
                    lowest_error = None
                    last_test_error = None
                    last_subset_test_error = None
                    test_error_delta_buff = []
                    subset_test_error_delta_buff = []
                    continue

                # Once we are past the priming/warmup period, start training the selfaware network
                if (subset_iteration > 1 and not mixer.is_selfaware
                        and self.config['mixer']['selfaware']
                        and not mixer.stop_selfaware_training
                        and training_error < 0.35):
                    logging.info('Started selfaware training !')
                    mixer.start_selfaware_training = True
                    lowest_error = None
                    last_test_error = None
                    last_subset_test_error = None
                    test_error_delta_buff = []
                    subset_test_error_delta_buff = []
                    continue

                if epoch % eval_every_x_epochs == 0:
                    test_error = mixer.error(test_data_ds)
                    subset_test_error = mixer.error(subset_test_ds, subset_id=subset_id)
                    logging.info(
                        f'Subset test error: {subset_test_error} on subset {subset_id}, '
                        f'overall test error: {test_error}')

                    if lowest_error is None or test_error < lowest_error:
                        lowest_error = test_error
                        if mixer.is_selfaware:
                            best_selfaware_model = mixer.get_model_copy()
                        else:
                            best_model = mixer.get_model_copy()

                    if last_subset_test_error is not None:
                        subset_test_error_delta_buff.append(last_subset_test_error - subset_test_error)
                    last_subset_test_error = subset_test_error

                    if last_test_error is not None:
                        test_error_delta_buff.append(last_test_error - test_error)
                    last_test_error = test_error

                    delta_mean = np.mean(test_error_delta_buff[-5:])
                    subset_delta_mean = np.mean(subset_test_error_delta_buff[-5:])

                    if callback_on_iter is not None:
                        callback_on_iter(epoch, training_error, test_error, delta_mean,
                                         self.calculate_accuracy(test_data_ds))

                    ## Stop if the model is overfitting
                    # if delta_mean <= 0 and len(test_error_delta_buff) > 4:
                    #     stop_training = True

                    # Stop if we're past the time limit allocated for training
                    if (time.time() - started) > stop_training_after_seconds:
                        stop_training = True

                    # If the training subset is overfitting on its associated testing subset
                    if (subset_delta_mean <= 0 and len(subset_test_error_delta_buff) > 4) or \
                            (time.time() - started_subset) > stop_training_after_seconds / len(from_data_ds.subsets.keys()):
                        logging.info('Finished fitting on {subset_id} of {no_subsets} subset'.format(
                            subset_id=subset_id, no_subsets=len(from_data_ds.subsets.keys())))

                        if mixer.is_selfaware:
                            if best_selfaware_model is not None:
                                mixer.update_model(best_selfaware_model)
                        else:
                            mixer.update_model(best_model)

                        if subset_id == subset_id_arr[-1]:
                            stop_training = True
                        elif not stop_training:
                            break

                    if stop_training:
                        if mixer.is_selfaware:
                            mixer.update_model(best_selfaware_model)
                        else:
                            mixer.update_model(best_model)

                        self._mixer = mixer
                        self.train_accuracy = self.calculate_accuracy(test_data_ds)
                        self.overall_certainty = mixer.overall_certainty()
                        logging.info('Finished training model !')
                        break

    self._mixer.build_confidence_normalization_data(test_data_ds)
    self._mixer.encoders = from_data_ds.encoders

    return self
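# --- Illustrative config sketch (not part of the original source) -----------
# learn() falls back to NnMixer unless the config carries a 'mixer' entry with
# 'class' and/or 'attrs' keys, and the last version above also reads
# config['mixer']['selfaware']. A minimal sketch of such a config, assuming
# MyMixer is a user-defined mixer class; feature names/types are hypothetical
# and any further requirements of predictor_config_schema are not shown.
#
#   config = {
#       'input_features': [{'name': 'x', 'type': COLUMN_DATA_TYPES.NUMERIC}],
#       'output_features': [{'name': 'y', 'type': COLUMN_DATA_TYPES.NUMERIC}],
#       'mixer': {
#           'class': MyMixer,          # used instead of NnMixer
#           'attrs': {'epochs': 200},  # applied to the mixer via setattr()
#           'selfaware': False,
#       },
#   }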