def shouldContinue(self):
    """
    Check whether training should keep running.

    Reads the persisted model metadata row and honors externally-set
    stop/kill flags.

    :return: True to continue training, False to stop.
    """
    # If the metadata row is gone there is nothing to train against.
    model_data = self.persistent_model_metadata.find_one(
        {'model_name': self.model_name})  # type: PersistentModelMetadata

    if model_data is None:
        return False

    if model_data.stop_training:
        logging.info('[FORCED] Stopping model training....')
        return False

    if model_data.kill_training:
        logging.info('[FORCED] Stopping model training....')
        # kill also wipes the persisted model state
        self.persistent_model_metadata.delete()
        self.ml_model_info.delete()
        return False

    return True
def saveToDisk(self, local_files):
    """
    Persist the model to disk and remove previously stored files of this model.

    :param local_files: return objects from a previous save (or None); their
                        ``.path`` files are deleted best-effort.
    :return: the list of return objects produced by the underlying model save.
    """
    if local_files is not None:
        for file_response_object in local_files:
            try:
                os.remove(file_response_object.path)
            except OSError:
                # best-effort cleanup; a missing/locked file is not fatal
                logging.info('Could not delete file {path}'.format(
                    path=file_response_object.path))

    file_id = '{model_name}.{ml_model_name}.{config_hash}'.format(
        model_name=self.model_name,
        ml_model_name=self.ml_model_name,
        config_hash=self.config_hash)

    return_objects = self.data_model_object.saveToDisk(file_id)
    file_ids = [ret.file_id for ret in return_objects]

    # record where the files live so the model can be reloaded later
    self.ml_model_info.fs_file_ids = file_ids
    self.ml_model_info.update()

    return return_objects
def __init__(self, model_name, data=None):
    """
    Load the basic information needed to locate and restore a trained model.

    :param model_name: the model to load
    :param data: optional data to make predictions on
    """
    self.model_name = model_name
    self.data = data

    self.persistent_model_metadata = PersistentModelMetadata()
    self.persistent_model_metadata.model_name = self.model_name
    self.persistent_ml_model_info = PersistentMlModelInfo()
    self.persistent_ml_model_info.model_name = self.model_name

    self.persistent_model_metadata = self.persistent_model_metadata.find_one(
        self.persistent_model_metadata.getPkey())

    # load the most accurate model (highest r_squared first)
    info = self.persistent_ml_model_info.find(
        {'model_name': self.model_name},
        order_by=[('r_squared', -1)],
        limit=1)

    if info is not None and len(info) > 0:
        self.persistent_ml_model_info = info[0]  # type: PersistentMlModelInfo
    else:
        # TODO: Make sure we have a model for this
        logging.info('No model found')
        return

    self.ml_model_name = self.persistent_ml_model_info.ml_model_name
    self.config_serialized = self.persistent_ml_model_info.config_serialized
    fs_file_ids = self.persistent_ml_model_info.fs_file_ids

    # ml_model_name is stored as '<framework>.<dummy>.<name>' — split and
    # rebuild the import path for the concrete model class
    self.framework, self.dummy, self.ml_model_name = self.ml_model_name.split('.')
    self.ml_model_module_path = ('mindsdb.libs.ml_models.' + self.framework +
                                 '.models.' + self.ml_model_name + '.' +
                                 self.ml_model_name)
    self.ml_model_class_name = convert_snake_to_cammelcase_string(
        self.ml_model_name)

    self.ml_model_module = importlib.import_module(self.ml_model_module_path)
    self.ml_model_class = getattr(self.ml_model_module,
                                  self.ml_model_class_name)

    # the last time it was saved into GridFS, assume it was now
    self.gfs_save_head_time = time.time()

    logging.info('Starting model...')
    self.data_model_object = self.ml_model_class.loadFromDisk(
        file_ids=fs_file_ids)

    if self.data is not None:
        self._loadData(data)
def predict(self, data=None):
    """
    Run the loaded model over the prediction sampler and return predictions
    in diff form.

    :param data: optional new data to load before predicting
    :return: diffs, a list of dictionaries with pointers describing where to
             place each denormalized prediction value
    """
    if data is not None:
        self._loadData(data)

    self.predict_sampler.variable_wrapper = self.ml_model_class.variable_wrapper
    self.predict_sampler.variable_unwrapper = self.ml_model_class.variable_unwrapper

    ret_diffs = []
    for batch in self.predict_sampler:
        logging.info('predicting batch...')

        if self.data_model_object.use_full_text_input:
            ret = self.data_model_object.forward(
                batch.getInput(flatten=self.data_model_object.flatInput),
                full_text_input=batch.getFullTextInput())
        else:
            ret = self.data_model_object.forward(
                batch.getInput(flatten=self.data_model_object.flatInput))

        # the model may already return a per-column dict; otherwise deflatten
        if not isinstance(ret, dict):
            ret_dict = batch.deflatTarget(ret)
        else:
            ret_dict = ret

        # unwrap framework variables and denormalize each predicted row
        ret_dict_denorm = {}
        for col in ret_dict:
            ret_dict[col] = self.ml_model_class.variable_unwrapper(ret_dict[col])
            for row in ret_dict[col]:
                if col not in ret_dict_denorm:
                    ret_dict_denorm[col] = []
                ret_dict_denorm[col] += [
                    denorm(row, self.persistent_model_metadata.column_stats[col])
                ]

        ret_diffs += [{
            'group_pointer': batch.group_pointer,
            'column_pointer': batch.column_pointer,
            'start_pointer': batch.start_pointer,
            'end_pointer': batch.end_pointer,
            'ret_dict': ret_dict_denorm
        }]

    return ret_diffs
def start(data, model_name):
    """
    Worker entry point: run predictions for a model over vectorized data.

    :param data: the vectorized data to predict on
    :param model_name: name of the model whose stats/state should be pulled
    :return: the prediction diffs produced by the worker
    """
    worker = PredictWorker(data, model_name)
    logging.info('Inferring from model and data...')
    return worker.predict()
def run(self):
    """
    Prepare the transaction's input data from a prepared DataFrame, then
    split rows into train/test/validation index sets (optionally grouped
    by the model's group-by column).
    """
    # Handle transactions differently depending on the type of query
    # For now we only support LEARN and PREDICT

    # Train metadata is the metadata that was used when training the model,
    # note: that we need this train metadata even if we are predicting, so we can understand about the model
    train_metadata = None

    if self.transaction.metadata.type == TRANSACTION_PREDICT:
        # extract this from the persistent_model_metadata
        train_metadata = TransactionMetadata()
        train_metadata.setFromDict(
            self.transaction.persistent_model_metadata.train_metadata)
    elif self.transaction.metadata.type == TRANSACTION_LEARN:
        # Pull this straight from the the current transaction
        train_metadata = self.transaction.metadata
    else:
        # We cannot proceed without train metadata
        self.session.logging.error(
            'Do not support transaction {type}'.format(
                type=self.transaction.metadata.type))
        self.transaction.error = True
        # NOTE(review): traceback.print_exc() prints and returns None, so
        # errorMsg is set to None here — verify intent
        self.transaction.errorMsg = traceback.print_exc(1)
        return

    result = self.getPreparedInputDF(train_metadata)

    columns = list(result.columns.values)
    data_array = list(result.values.tolist())

    self.transaction.input_data.columns = columns

    # make sure that the column we are trying to predict is on the input_data
    # else fail, because we cannot predict data we dont have
    # TODO: Revise this, I may pass a source data that doesnt have the column I want to predict and that may still be ok if we are making a prediction that is not time series
    if len(data_array[0]) > 0 and self.transaction.metadata.model_predict_columns:
        for col_target in self.transaction.metadata.model_predict_columns:
            if col_target not in self.transaction.input_data.columns:
                err = 'Trying to predict column {column} but column not in source data'.format(
                    column=col_target)
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

    self.transaction.input_data.data_array = data_array

    # extract test data if this is a learn transaction and there is a test query
    if self.transaction.metadata.type == TRANSACTION_LEARN:
        # if a test_data set was given use it
        if self.transaction.metadata.test_from_data:
            df = self.transaction.metadata.test_from_data.df
            test_result = df.where((pandas.notnull(df)), None)
            columns = list(test_result.columns.values)
            data_array = test_result.values.tolist()

            # Make sure that test adn train sets match column wise
            if columns != self.transaction.input_data.columns:
                err = 'Trying to get data for test but columns in train set and test set dont match'
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

            # test rows are appended after the train rows; their indexes are
            # everything past the original data_array length
            total_data_array = len(self.transaction.input_data.data_array)
            total_test_array = len(data_array)
            test_indexes = [
                i for i in range(total_data_array,
                                 total_data_array + total_test_array)
            ]
            self.transaction.input_data.test_indexes = test_indexes
            # make the input data relevant
            self.transaction.input_data.data_array += data_array

            # we later use this to either regenerate or not
            test_prob = 0
        else:
            test_prob = CONFIG.TEST_TRAIN_RATIO

        validation_prob = CONFIG.TEST_TRAIN_RATIO / (1 - test_prob)

        group_by = self.transaction.metadata.model_group_by

        if group_by:
            try:
                group_by_index = self.transaction.input_data.columns.index(group_by)
            except:
                # NOTE(review): bare except — a missing group-by column is
                # reported and aborts the run
                group_by_index = None
                err = 'Trying to group by, {column} but column not in source data'.format(
                    column=group_by)
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

            # get unique group by values
            #all_group_by_items_query = ''' select {group_by_column} as grp, count(1) as total from ( {query} ) sub group by {group_by_column}'''.format(group_by_column=group_by, query=self.transaction.metadata.model_query)
            #self.transaction.session.logging.debug('About to pull GROUP BY query {query}'.format(query=all_group_by_items_query))

            uniques = result.groupby([group_by]).size()
            all_group_by_values = uniques.index.tolist()
            uniques_counts = uniques.values.tolist()
            # create a list of values in group by, this is because result is array of array we want just array
            all_group_by_counts = {
                value: uniques_counts[i]
                for i, value in enumerate(all_group_by_values)
            }

            max_group_by = max(list(all_group_by_counts.values()))
            self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

            # we will fill these depending on the test_prob and validation_prob
            test_group_by_values = []
            validation_group_by_values = []
            train_group_by_values = []

            # split the data into test, validation, train by group by data
            for group_by_value in all_group_by_values:
                # depending on a random number if less than x_prob belongs to such group
                # remember that test_prob can be 0 or the config value depending on if the test test was passed as a query
                if float(random.random()) < test_prob and len(train_group_by_values) > 0:
                    test_group_by_values += [group_by_value]
                # elif float(random.random()) < validation_prob:
                #     validation_group_by_values += [group_by_value]
                else:
                    train_group_by_values += [group_by_value]

        # assumes input_data.test_indexes / train_indexes / validation_indexes
        # are initialized to lists elsewhere — TODO confirm
        for i, row in enumerate(self.transaction.input_data.data_array):
            in_test = True if i in self.transaction.input_data.test_indexes else False
            if not in_test:
                if group_by:
                    group_by_value = row[group_by_index]
                    if group_by_value in test_group_by_values:
                        self.transaction.input_data.test_indexes += [i]
                    elif group_by_value in train_group_by_values:
                        self.transaction.input_data.train_indexes += [i]
                    elif group_by_value in validation_group_by_values:
                        self.transaction.input_data.validation_indexes += [i]
                else:
                    # remember that test_prob can be 0 or the config value depending on if the test test was passed as a query
                    if float(random.random()) <= test_prob or len(self.transaction.input_data.test_indexes) == 0:
                        self.transaction.input_data.test_indexes += [i]
                    elif float(random.random()) <= validation_prob or len(self.transaction.input_data.validation_indexes) == 0:
                        self.transaction.input_data.validation_indexes += [i]
                    else:
                        self.transaction.input_data.train_indexes += [i]

        # last resort: if nothing landed in the test set, carve it off the
        # tail of the train indexes
        if len(self.transaction.input_data.test_indexes) == 0:
            logging.debug('Size of test set is zero, last split')
            ratio = CONFIG.TEST_TRAIN_RATIO
            if group_by and len(self.transaction.input_data.train_indexes) > 2000:
                # it seems to be a good practice to not overfit, to double the ratio, as time series data tends to be abundant
                ratio = ratio * 2
            test_size = int(len(self.transaction.input_data.train_indexes) * ratio)
            self.transaction.input_data.test_indexes = self.transaction.input_data.train_indexes[-test_size:]
            self.transaction.input_data.train_indexes = self.transaction.input_data.train_indexes[:-test_size]

        logging.info('- Test: {size} rows'.format(
            size=len(self.transaction.input_data.test_indexes)))
        logging.info('- Train: {size} rows'.format(
            size=len(self.transaction.input_data.train_indexes)))
def train(self):
    """
    Train the model over all configured learning rates.

    Tests once per epoch; whenever a new lowest test error is reached the
    model is saved to disk. Between learning rates, the best stored model
    is reloaded so the next rate continues from the best state.

    :return: None
    """
    last_epoch = 0
    lowest_error = None
    highest_accuracy = 0
    local_files = None

    for i in range(len(self.data_model_object.learning_rates)):
        self.data_model_object.setLearningRateIndex(i)

        for train_ret in self.data_model_object.trainModel(self.train_sampler):
            logging.debug(
                'Training State epoch:{epoch}, batch:{batch}, loss:{loss}'.format(
                    epoch=train_ret.epoch,
                    batch=train_ret.batch,
                    loss=train_ret.loss))

            # test (and possibly save) the model once per new epoch
            if last_epoch != train_ret.epoch:
                last_epoch = train_ret.epoch

                logging.debug(
                    'New epoch:{epoch}, testing and calculating error'.format(
                        epoch=last_epoch))
                test_ret = self.data_model_object.testModel(self.test_sampler)
                logging.info(
                    'Test Error:{error}, Accuracy:{accuracy} | Best Accuracy so far: {best_accuracy}'
                    .format(error=test_ret.error,
                            accuracy=test_ret.accuracy,
                            best_accuracy=highest_accuracy))

                is_it_lowest_error_epoch = False
                # if lowest error save model
                if lowest_error is None:  # was: `lowest_error in [None]`
                    lowest_error = test_ret.error
                if lowest_error > test_ret.error:
                    is_it_lowest_error_epoch = True
                    lowest_error = test_ret.error
                    highest_accuracy = test_ret.accuracy
                    logging.info(
                        '[SAVING MODEL] Lowest ERROR so far! - Test Error: {error}, Accuracy: {accuracy}'
                        .format(error=test_ret.error,
                                accuracy=test_ret.accuracy))
                    logging.debug(
                        'Lowest ERROR so far! Saving: model {model_name}, {data_model} config:{config}'
                        .format(model_name=self.model_name,
                                data_model=self.ml_model_name,
                                config=self.ml_model_info.config_serialized))

                    # save model local file
                    local_files = self.saveToDisk(local_files)
                    # throttle model saving into GridFS to 10 minutes
                    # self.saveToGridFs(local_files, throttle=True)

                    # save model predicted - real vectors
                    logging.debug(
                        'Saved: model {model_name}:{ml_model_name} state vars into db [OK]'
                        .format(model_name=self.model_name,
                                ml_model_name=self.ml_model_name))

                # check if continue training
                if not self.shouldContinue():
                    return

                # save/update model loss, error, confusion_matrix
                self.registerModelData(train_ret, test_ret,
                                       is_it_lowest_error_epoch)

        logging.info(
            'Loading model from store for retrain on new learning rate {lr}'.format(
                lr=self.data_model_object.learning_rates[i][LEARNING_RATE_INDEX]))

        # after its done with the first batch group, get the one with the
        # lowest error and keep training from there
        ml_model_info = self.ml_model_info.find_one({
            'model_name': self.model_name,
            'ml_model_name': self.ml_model_name,
            'config_serialized': json.dumps(self.config)
        })

        if ml_model_info is None:
            # TODO: Make sure we have a model for this
            logging.info('No model found in storage')
            return

        fs_file_ids = ml_model_info.fs_file_ids
        self.data_model_object = self.ml_model_class.loadFromDisk(
            file_ids=fs_file_ids)
def __init__(self,
             data,
             model_name,
             ml_model_name='pytorch.models.column_based_fcnn',
             config=None):
    """
    Set up a training run: persist the ml-model info record, resolve the
    model class from its module path, build train/test samplers and kick
    off training.

    :param data: the prepared model data (train/test sets)
    :type data: ModelData
    :param model_name: name of the model being trained
    :param ml_model_name: dotted identifier '<framework>.<dummy>.<data_model>'
    :param config: hyperparameter config dict; defaults to an empty dict
    """
    self.data = data
    self.model_name = model_name
    self.ml_model_name = ml_model_name
    # None-sentinel instead of a mutable default argument ({} shared
    # across calls was a latent bug)
    self.config = {} if config is None else config
    self.config_serialized = json.dumps(self.config)
    self.config_hash = hashtext(self.config_serialized)

    # get basic variables defined
    self.persistent_model_metadata = PersistentModelMetadata().find_one(
        {'model_name': self.model_name})

    self.ml_model_info = PersistentMlModelInfo()
    self.ml_model_info.model_name = self.model_name
    self.ml_model_info.ml_model_name = self.ml_model_name
    self.ml_model_info.config_serialized = self.config_serialized
    self.ml_model_info.insert()

    self.framework, self.dummy, self.data_model_name = self.ml_model_name.split('.')
    self.ml_model_module_path = ('mindsdb.libs.ml_models.' +
                                 self.ml_model_name + '.' +
                                 self.data_model_name)
    self.ml_model_class_name = convert_snake_to_cammelcase_string(
        self.data_model_name)

    self.ml_model_module = importlib.import_module(self.ml_model_module_path)
    self.ml_model_class = getattr(self.ml_model_module,
                                  self.ml_model_class_name)

    self.train_sampler = Sampler(
        self.data.train_set,
        metadata_as_stored=self.persistent_model_metadata,
        ignore_types=self.ml_model_class.ignore_types,
        sampler_mode=SAMPLER_MODES.LEARN)
    self.test_sampler = Sampler(
        self.data.test_set,
        metadata_as_stored=self.persistent_model_metadata,
        ignore_types=self.ml_model_class.ignore_types,
        sampler_mode=SAMPLER_MODES.LEARN)

    self.train_sampler.variable_wrapper = self.ml_model_class.variable_wrapper
    self.test_sampler.variable_wrapper = self.ml_model_class.variable_wrapper

    self.sample_batch = self.train_sampler.getSampleBatch()

    # the last time it was saved into GridFS, assume it was now
    self.gfs_save_head_time = time.time()

    logging.info('Starting model...')
    self.data_model_object = self.ml_model_class(self.sample_batch)
    logging.info('Training model...')
    self.train()
def run(self):
    """
    Pull the transaction's input data from the configured sqlite storage,
    then split rows into train/test/validation index sets (optionally
    grouped by the model's group-by column).
    """
    # Handle transactions differently depending on the type of query
    # For now we only support LEARN and PREDICT
    train_metadata = self.transaction.metadata

    if self.transaction.metadata.type == TRANSACTION_PREDICT:
        self.populatePredictQuery()
        # for predictions, reuse the metadata that was stored at train time
        train_metadata = TransactionMetadata()
        train_metadata.setFromDict(self.transaction.persistent_model_metadata.train_metadata)
    elif self.transaction.metadata.type not in [TRANSACTION_PREDICT, TRANSACTION_LEARN]:
        self.session.logging.error('Do not support transaction {type}'.format(type=self.transaction.metadata.type))
        self.transaction.error = True
        # NOTE(review): traceback.print_exc() prints and returns None, so
        # errorMsg is set to None here — verify intent
        self.transaction.errorMsg = traceback.print_exc(1)
        return

    query = self.prepareFullQuery(train_metadata)

    try:
        self.transaction.session.logging.info('About to pull query {query}'.format(query=query))
        conn = sqlite3.connect(self.transaction.metadata.storage_file)
        # NOTE(review): inconsistent logger — other paths use self.session.logging
        self.logging.info(self.transaction.metadata.model_query)
        df = pandas.read_sql_query(query, conn)
        result = df.where((pandas.notnull(df)), None)
        df = None  # clean memory
    except Exception:
        self.session.logging.error(traceback.print_exc())
        self.transaction.error = True
        self.transaction.errorMsg = traceback.print_exc(1)
        return

    columns = list(result.columns.values)
    data_array = list(result.values.tolist())

    self.transaction.input_data.columns = columns

    # fail early if a column we are asked to predict is missing from source
    if len(data_array[0]) > 0 and self.transaction.metadata.model_predict_columns:
        for col_target in self.transaction.metadata.model_predict_columns:
            if col_target not in self.transaction.input_data.columns:
                err = 'Trying to predict column {column} but column not in source data'.format(column=col_target)
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

    self.transaction.input_data.data_array = data_array

    # extract test data if this is a learn transaction and there is a test query
    if self.transaction.metadata.type == TRANSACTION_LEARN:
        if self.transaction.metadata.model_test_query:
            try:
                # NOTE(review): query_wrapper, order_by_string and
                # where_not_null_string are not defined in this method —
                # presumably module-level or outer-scope names; verify
                test_query = query_wrapper.format(orig_query=self.transaction.metadata.model_test_query, order_by_string=order_by_string, where_not_null_string=where_not_null_string)
                self.transaction.session.logging.info('About to pull TEST query {query}'.format(query=test_query))
                #drill = self.session.drill.query(test_query, timeout=CONFIG.DRILL_TIMEOUT)
                df = pandas.read_sql_query(test_query, conn)
                result = df.where((pandas.notnull(df)), None)
                df = None
                #result = vars(drill)['data']
            except Exception:
                # If testing offline, get results from a .cache file
                self.session.logging.error(traceback.print_exc())
                self.transaction.error = True
                self.transaction.errorMsg = traceback.print_exc(1)
                return

            columns = list(result.columns.values)
            data_array = result.values.tolist()

            # Make sure that test adn train sets match column wise
            if columns != self.transaction.input_data.columns:
                err = 'Trying to get data for test but columns in train set and test set dont match'
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

            # test rows are appended after the train rows; their indexes
            # are everything past the original data_array length
            total_data_array = len(self.transaction.input_data.data_array)
            total_test_array = len(data_array)
            test_indexes = [i for i in range(total_data_array, total_data_array + total_test_array)]
            self.transaction.input_data.test_indexes = test_indexes
            # make the input data relevant
            self.transaction.input_data.data_array += data_array

            # we later use this to either regenerate or not
            test_prob = 0
        else:
            test_prob = CONFIG.TEST_TRAIN_RATIO

        validation_prob = CONFIG.TEST_TRAIN_RATIO / (1 - test_prob)

        group_by = self.transaction.metadata.model_group_by

        if group_by:
            try:
                group_by_index = self.transaction.input_data.columns.index(group_by)
            except:
                # NOTE(review): bare except — a missing group-by column is
                # reported and aborts the run
                group_by_index = None
                err = 'Trying to group by, {column} but column not in source data'.format(column=group_by)
                self.session.logging.error(err)
                self.transaction.error = True
                self.transaction.errorMsg = err
                return

            # get unique group by values
            all_group_by_items_query = ''' select {group_by_column} as grp, count(1) as total from ( {query} ) sub group by {group_by_column}'''.format(group_by_column=group_by, query=self.transaction.metadata.model_query)
            self.transaction.session.logging.debug('About to pull GROUP BY query {query}'.format(query=all_group_by_items_query))
            df = pandas.read_sql_query(all_group_by_items_query, conn)
            result = df.where((pandas.notnull(df)), None)

            # create a list of values in group by, this is because result is array of array we want just array
            all_group_by_counts = {i[0]: i[1] for i in result.values.tolist()}
            all_group_by_values = all_group_by_counts.keys()

            max_group_by = max(list(all_group_by_counts.values()))
            self.transaction.persistent_model_metadata.max_group_by_count = max_group_by

            # we will fill these depending on the test_prob and validation_prob
            test_group_by_values = []
            validation_group_by_values = []
            train_group_by_values = []

            # split the data into test, validation, train by group by data
            for group_by_value in all_group_by_values:
                # depending on a random number if less than x_prob belongs to such group
                # remember that test_prob can be 0 or the config value depending on if the test test was passed as a query
                if float(random.random()) < test_prob and len(train_group_by_values) > 0:
                    test_group_by_values += [group_by_value]
                # elif float(random.random()) < validation_prob:
                #     validation_group_by_values += [group_by_value]
                else:
                    train_group_by_values += [group_by_value]

        # assumes input_data.test_indexes / train_indexes / validation_indexes
        # are initialized to lists elsewhere — TODO confirm
        for i, row in enumerate(self.transaction.input_data.data_array):
            in_test = True if i in self.transaction.input_data.test_indexes else False
            if not in_test:
                if group_by:
                    group_by_value = row[group_by_index]

                    if group_by_value in test_group_by_values:
                        self.transaction.input_data.test_indexes += [i]
                    elif group_by_value in train_group_by_values:
                        self.transaction.input_data.train_indexes += [i]
                    elif group_by_value in validation_group_by_values:
                        self.transaction.input_data.validation_indexes += [i]
                else:
                    # remember that test_prob can be 0 or the config value depending on if the test test was passed as a query
                    if float(random.random()) <= test_prob or len(self.transaction.input_data.test_indexes) == 0:
                        self.transaction.input_data.test_indexes += [i]
                    elif float(random.random()) <= validation_prob or len(self.transaction.input_data.validation_indexes) == 0:
                        self.transaction.input_data.validation_indexes += [i]
                    else:
                        self.transaction.input_data.train_indexes += [i]

        # last resort: if nothing landed in the test set, carve it off the
        # tail of the train indexes
        if len(self.transaction.input_data.test_indexes) == 0:
            logging.debug('Size of test set is zero, last split')
            ratio = CONFIG.TEST_TRAIN_RATIO
            if group_by and len(self.transaction.input_data.train_indexes) > 2000:
                # it seems to be a good practice to not overfit, to double the ratio, as time series data tends to be abundant
                ratio = ratio * 2
            test_size = int(len(self.transaction.input_data.train_indexes) * ratio)
            self.transaction.input_data.test_indexes = self.transaction.input_data.train_indexes[-test_size:]
            self.transaction.input_data.train_indexes = self.transaction.input_data.train_indexes[:-test_size]

        logging.info('- Test: {size} rows'.format(size=len(self.transaction.input_data.test_indexes)))
        logging.info('- Train: {size} rows'.format(size=len(self.transaction.input_data.train_indexes)))