def get_critical_error_record_ids(self, dataset_name):
    """
    Returns the record IDs in a dataset whose human-labeled angle differs
    from the latest deployed model's predicted angle by a large margin
    (>= 0.8)
    """
    record_ids = []
    sql_query = '''
        DROP TABLE IF EXISTS latest_deployment;
        CREATE TEMP TABLE latest_deployment AS (
            SELECT
                model_id,
                epoch
            FROM deploy
            ORDER BY TIMESTAMP DESC
            LIMIT 1
        );

        SELECT
            records.record_id
        FROM records
        LEFT JOIN predictions
            ON records.dataset = predictions.dataset
            AND records.record_id = predictions.record_id
        LEFT JOIN latest_deployment AS deploy
            ON predictions.model_id = deploy.model_id
            AND predictions.epoch = deploy.epoch
        WHERE LOWER(records.dataset) LIKE LOWER('%{dataset}%')
            AND ABS(records.angle - predictions.angle) >= 0.8
        ORDER BY record_id ASC
    '''.format(dataset=dataset_name)
    rows = get_sql_rows(
        host=self.postgres_host,
        sql=sql_query,
        postgres_pool=self.postgres_pool)
    for row in rows:
        record_id = row['record_id']
        record_ids.append(record_id)
    return record_ids
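
# Every query in these methods goes through a get_sql_rows() helper that is
# defined elsewhere in the project. A minimal sketch of what such a helper
# could look like is shown below, assuming psycopg2 and a dict-style cursor.
# The (host, sql, postgres_pool) signature is taken from the calls in this
# file, but the body and the 'autopilot' database name are illustrative
# assumptions, not the project's actual implementation:
#
# import psycopg2
# import psycopg2.extras
#
# def get_sql_rows(host, sql, postgres_pool=None):
#     # Prefer a pooled connection when one is provided
#     if postgres_pool is not None:
#         connection = postgres_pool.getconn()
#     else:
#         connection = psycopg2.connect(host=host, dbname='autopilot')
#     try:
#         with connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor) as cursor:
#             # With a multi-statement string, fetchall() returns the rows of
#             # the final SELECT
#             cursor.execute(sql)
#             rows = cursor.fetchall()
#         connection.commit()
#     finally:
#         if postgres_pool is not None:
#             postgres_pool.putconn(connection)
#         else:
#             connection.close()
#     return rows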
def get_toggle_status(self, web_page, name, detail):
    """
    Checks if a user has turned on a given toggle

    Parameters
    ----------
    web_page : string
        The web page where the user would have set the toggle
    name : string
        The type of toggle
    detail : string
        Any other details about the toggle

    Returns
    ----------
    is_on : boolean
        Whether the toggle is turned on or not
    """
    sql_query = '''
        DROP TABLE IF EXISTS latest;
        CREATE TEMP TABLE latest AS (
            SELECT
                detail,
                is_on,
                ROW_NUMBER() OVER(
                    PARTITION BY web_page, name, detail
                    ORDER BY event_ts DESC
                ) AS temporal_rank
            FROM toggles
            WHERE LOWER(web_page) LIKE LOWER('%{web_page}%')
                AND LOWER(name) LIKE LOWER('%{name}%')
                AND LOWER(detail) LIKE LOWER('%{detail}%')
        );

        SELECT
            is_on
        FROM latest
        WHERE temporal_rank = 1
            AND is_on = TRUE
    '''.format(web_page=web_page, name=name, detail=detail)
    rows = get_sql_rows(
        host=self.postgres_host,
        sql=sql_query,
        postgres_pool=self.postgres_pool)
    is_on = False
    if len(rows) > 0:
        first_row = rows[0]
        is_on = first_row['is_on']
    return is_on
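
# Example usage (a sketch; 'reader' stands for an instance of the class these
# methods belong to, and the page / toggle names are placeholders rather than
# values taken from this project's UI):
#
# if reader.get_toggle_status(web_page='datasets',
#                             name='flag',
#                             detail='dataset_1_18-04-15'):
#     print('The user has turned this toggle on')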
def get_dataset_record_ids(self, dataset_name):
    record_ids = []
    sql_query = '''
        SELECT
            record_id
        FROM records
        WHERE UPPER(dataset) LIKE UPPER('{dataset_name}')
        ORDER BY record_id ASC
    '''.format(dataset_name=dataset_name)
    rows = get_sql_rows(
        host=self.postgres_host,
        sql=sql_query,
        postgres_pool=self.postgres_pool)
    for row in rows:
        record_id = row['record_id']
        record_ids.append(record_id)
    return record_ids
def read_flag(self, dataset, record_id):
    """
    Returns the is_flagged status of a single record, or False if the
    record isn't found
    """
    sql_query = '''
        SELECT
            is_flagged
        FROM records
        WHERE LOWER(dataset) LIKE LOWER('%{dataset}%')
            AND record_id = {record_id}
    '''.format(dataset=dataset, record_id=record_id)
    rows = get_sql_rows(
        host=self.postgres_host,
        sql=sql_query,
        postgres_pool=self.postgres_pool)
    is_flagged = False
    if len(rows) > 0:
        first_row = rows[0]
        is_flagged = first_row['is_flagged']
    return is_flagged
def get_flagged_record_count(self, dataset_name):
    count = 0
    sql_query = '''
        SELECT
            COUNT(*) AS count
        FROM records
        WHERE LOWER(dataset) LIKE LOWER('%{dataset}%')
            AND is_flagged = TRUE
    '''.format(dataset=dataset_name)
    rows = get_sql_rows(
        host=self.postgres_host,
        sql=sql_query,
        postgres_pool=self.postgres_pool)
    if len(rows) > 0:
        first_row = rows[0]
        count = first_row['count']
    return count
def get_flagged_record_ids(self, dataset_name):
    record_ids = []
    sql_query = '''
        SELECT
            record_id
        FROM records
        WHERE LOWER(dataset) LIKE LOWER('%{dataset}%')
            AND is_flagged = TRUE
    '''.format(dataset=dataset_name)
    rows = get_sql_rows(
        host=self.postgres_host,
        sql=sql_query,
        postgres_pool=self.postgres_pool)
    for row in rows:
        record_id = row['record_id']
        record_ids.append(record_id)
    return record_ids
def get_dataset_selections(self, dataset_type):
    """
    Gets the user-selected train or validation datasets that are stored
    in Postgres. Users choose their selections in the UI

    Parameters
    ----------
    dataset_type : string
        Whether to pull train or validation datasets. Must be either
        "train" or "validation"

    Returns
    ----------
    datasets : list of strings
        The names of the selected datasets
    """
    sql_query = '''
        DROP TABLE IF EXISTS latest;
        CREATE TEMP TABLE latest AS (
            SELECT
                detail,
                is_on,
                ROW_NUMBER() OVER(
                    PARTITION BY web_page, name, detail
                    ORDER BY event_ts DESC
                ) AS temporal_rank
            FROM toggles
            WHERE LOWER(name) LIKE '%{dataset_type}%'
        );

        SELECT
            detail AS dataset
        FROM latest
        WHERE temporal_rank = 1
            AND is_on = TRUE
    '''.format(dataset_type=dataset_type)
    rows = get_sql_rows(
        host=self.postgres_host,
        sql=sql_query,
        postgres_pool=self.postgres_pool)
    datasets = []
    if len(rows) > 0:
        for row in rows:
            dataset = row['dataset']
            datasets.append(dataset)
    return datasets
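
# Example usage (a sketch; 'reader' again stands for an instance of the class
# these methods belong to). The docstring above says the model, unlike the
# API, pulls these selections from Postgres, so a training run could gather
# them like this:
#
# train_datasets = reader.get_dataset_selections(dataset_type='train')
# validation_datasets = reader.get_dataset_selections(dataset_type='validation')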
def get_starting_epoch(self):
    """
    Looks up the most recently completed epoch ID for the model and adds
    one to it.

    Previously I would track the epoch ID in the checkpoint file path,
    but this made it harder to prune old checkpoint files and required
    some extra custom code that I felt wasn't worth it. Also, I realized
    that I can't recall a time when I wanted to go back to an old model.
    If the latest model was worse I always preferred to make it better by
    training it on all of the data again rather than loading the older
    model. I think I had this preference because I felt that if I
    couldn't get back an approximation of the old model with the same
    data, then what the model unlearned had been learned by random chance
    in the first place and wasn't worth trying to replicate or wasn't
    stable enough to last through additional training.

    This function should only get called during retraining, i.e., when a
    model already exists, since you'll always have a starting_epoch of 0
    for new models.

    Returns
    -------
    starting_epoch: int
    """
    # Get the highest completed epoch ID for this model from Postgres
    sql = """
        SELECT
            MAX(epoch) AS previous_epoch
        FROM epochs
        WHERE model_id = {model_id}
        GROUP BY model_id
    """.format(model_id=self.model_id)
    rows = get_sql_rows(host=self.postgres_host, sql=sql)
    if len(rows) > 0:
        previous_epoch = int(rows[0]['previous_epoch'])
        starting_epoch = previous_epoch + 1
        return starting_epoch
    else:
        return 0
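
# A hypothetical sketch of how the starting epoch could be used to resume
# training via Keras's initial_epoch argument. The fit() call and its
# argument values are illustrative only, not this project's actual training
# loop:
#
# start_epoch = trainer.get_starting_epoch()
# trainer.model.fit(
#     trainer.train_generator,
#     validation_data=trainer.validation_generator,
#     validation_steps=trainer.validation_steps,
#     epochs=trainer.n_epochs,
#     initial_epoch=start_epoch,
#     callbacks=[trainer.checkpoint_callback, trainer.progress_callback])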
def get_dataset_names(self):
    """
    Returns all dataset names from the records table, sorted by the
    numeric ID embedded in each name (so dataset_2_... comes before
    dataset_10_..., which a plain alphabetical sort would get wrong)
    """
    ordered_datasets = []
    id_to_dataset = {}
    sql_query = '''
        SELECT DISTINCT
            dataset
        FROM records
        ORDER BY dataset ASC
    '''
    rows = get_sql_rows(
        host=self.postgres_host,
        sql=sql_query,
        postgres_pool=self.postgres_pool)
    for row in rows:
        dataset = row['dataset']
        # Pull the numeric ID out of names like 'dataset_<id>_<date>'
        number = int(
            re.search(r'(?<=dataset_)([0-9]*)(?=_)', dataset).group(1))
        id_to_dataset[number] = dataset
    # Sort by the numeric ID rather than the raw string
    sorted_tuples = sorted(id_to_dataset.items(), key=operator.itemgetter(0))
    for number, dataset in sorted_tuples:
        ordered_datasets.append(dataset)
    return ordered_datasets
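
# For example, the lookbehind / lookahead regex above extracts the numeric ID
# from a dataset name of the form 'dataset_<id>_<date>' (the date below is a
# made-up placeholder):
#
# >>> re.search(r'(?<=dataset_)([0-9]*)(?=_)', 'dataset_12_18-04-15').group(1)
# '12'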
def get_image_path_from_db(self, dataset_name, record_id):
    """
    Gets the image path from the Postgres database. Probably not helpful
    if you're importing a dataset and the dataset therefore isn't already
    in the database. This is helpful when you're consuming the image
    later, for example during review
    """
    sql = f"""
        SELECT
            image_path
        FROM records
        WHERE dataset = '{dataset_name}'
            AND record_id = {record_id}
    """
    rows = get_sql_rows(host=None, sql=sql, postgres_pool=self.postgres_pool)
    if len(rows) > 0:
        return rows[0]['image_path']
    else:
        return None
def __init__(self,
             data_path,
             postgres_host,
             port,
             model_base_directory,
             model_id=None,
             total_epochs=50,
             batch_size=50,
             image_scale=8,
             crop_percent=50,
             overfit=False,
             angle_only=True,
             n_channels=3):
    """
    Create a Trainer object

    Parameters
    ----------
    data_path : str
        The absolute path to the directory immediately above the dataset
        folders. If you have datasets like /root/data/dataset_1_18-04-15
        and /root/data/dataset_2_18-04-15, then your base_directory
        should be /root/data
    postgres_host: str
        Name of the Postgres host to connect to for details about records
        and other things. If record_reader.py is running in a Docker
        container the host would be the name of the container, e.g.,
        postgres-11-1, but if record_reader.py is running in PyCharm on
        the Mac, then you would use localhost
    port: int
        The port of the Tornado microservice that is used to report to
        the UI the current epoch, batch, loss, and model ID
    model_base_directory: str
        The directory that contains all of the models. For example, if
        you have two models, /root/model/1 and /root/model/2, then you
        should specify /root/model. For simplicity the code assumes all
        your models are organized under the one base directory. Nothing
        about where the model is stored is saved in the DB because the
        model_base_directory is something you will use frequently, so
        you'll probably know it from either repetition or
        already-working examples of your code
    model_id: int
        Specify this value if you want to continue training an existing
        model. The code will expect to find an immediate child directory
        of model_base_directory that matches the model ID and will fail
        if such a directory doesn't exist, because you can't resume
        training a model that doesn't exist. If you don't specify a
        model ID the code assumes you are training a new model and will
        automatically pick a model ID for you
    total_epochs: int
        The model is not trained for this many additional epochs; it is
        trained until its epoch ID reaches total_epochs - 1. For
        retraining, this means nothing will happen if you specify
        total_epochs=5 but your model has already trained for 10; it
        won't train for 5 more. For new models this makes no difference
        because epoch_id starts at 0
    batch_size : int
        The number of records per batch
    image_scale: int
        Divide the image's dimensions by this number to get the new
        size. For example, if you specify 8 the image will shrink to
        1/8th of its original size. If you specify 1 the image won't
        shrink at all
    crop_percent: int
        The percentage of the top portion of the image that should be
        taken off. Through trial and error this has proven to be an
        effective technique. Other drivers have come to the same
        conclusion. Nothing of importance happens in the top half of the
        image. The top half only contains distractions. The model
        performs better if it has zero chance of fitting to that source
        of randomness. Example: 50, to cut off the top half
    overfit : boolean
        Indicates whether the model should be trained and validated on
        the same data. I use this when I'm training on images that the
        model got horribly wrong (or recorded disengagements that
        occurred during a recorded deployment)
    angle_only : boolean
        Whether to focus on angle only. Possibly focuses the model's
        attention on the most egregious errors, like turning right when
        the car should turn left
    is_for_model : boolean
        Will this be used to feed data to a model, as opposed to an API?
        The API doesn't care about train / validation selections, but
        the model does and pulls the selections from Postgres. (Note:
        this is not an argument to this constructor; it is hard-coded to
        True when the Trainer creates its RecordReader below)
    n_channels: int
        The number of color channels in your image dataset. Should be 3
        if color (RGB) and 1 if black and white. Used to define the
        shape of the Keras model input
    """
    self.data_path = data_path
    self.postgres_host = postgres_host
    self.batch_size = batch_size
    self.overfit = overfit
    self.angle_only = angle_only
    self.record_reader = RecordReader(base_directory=self.data_path,
                                      postgres_host=self.postgres_host,
                                      batch_size=self.batch_size,
                                      overfit=self.overfit,
                                      angle_only=self.angle_only,
                                      is_for_model=True)
    self.port = port
    self.model_base_directory = model_base_directory
    self.model_id = model_id
    self.n_epochs = int(total_epochs)
    self.image_scale = int(image_scale)
    self.crop_percent = crop_percent
    self.image_height_pixels = int(
        (240 * (self.crop_percent / 100.0)) / self.image_scale)
    self.image_width_pixels = int(320 / self.image_scale)
    self.n_channels = n_channels
    self.input_shape = (self.image_height_pixels,
                        self.image_width_pixels,
                        self.n_channels)
    self.train_generator = DataGenerator(record_reader=self.record_reader,
                                         partition_type='train',
                                         image_scale=int(self.image_scale),
                                         crop_percent=self.crop_percent,
                                         batch_size=self.batch_size)
    self.validation_generator = DataGenerator(
        record_reader=self.record_reader,
        partition_type='validation',
        image_scale=int(self.image_scale),
        crop_percent=self.crop_percent,
        batch_size=self.batch_size)

    """
    If you specify all of the batches then it will take a very long time
    to evaluate (in some cases 5+ minutes per epoch) because the model
    will have to process every single image in the validation set.
    Assuming your random sample is representative of the entire dataset,
    picking some arbitrarily small number of steps (batches) should give
    you a sufficiently accurate representation of the error
    """
    self.validation_steps = 3

    """
    If you specify a model ID the code assumes you're retraining. Don't
    specify a model ID if you want to train a new model. The system
    doesn't trust that users can safely come up with their own versions
    because you'll need to check both the file system and Postgres
    """
    if self.model_id:  # Existing model
        self.model_directory = os.path.join(self.model_base_directory,
                                            str(self.model_id))
        # Check for a common user error and provide a helpful error message
        if not os.path.exists(self.model_directory):
            print(
                "The model doesn't exist at {dir}.".format(
                    dir=self.model_directory),
                "Did you specify the right path and model ID?",
                "Also, don't specify the model ID if you want to train a new model.",
                "The system will automatically determine a new model's model ID."
            )
            exit(1)
        self.start_epoch = self.get_starting_epoch()
        # Load the model
        saved_model_path = os.path.join(self.model_directory, 'model.hdf5')
        self.model = load_keras_model(file_path=saved_model_path)
    else:  # New model
        """
        Model IDs are tracked in two places: in Postgres and in the file
        system. It's possible these two places could get out of sync
        with each other because of an unforeseen bug, so to be extra
        safe when creating a new model's ID I take the largest ID from
        the two sources and increment it
        """
        # Get the highest model ID from the file system
        folders = os.listdir(self.model_base_directory)
        model_ids = []
        for folder in folders:
            """
            Each model's folder should be its ID, and each ID should be
            an int. Ignore the folder if it's not an int
            """
            try:
                model_id = int(folder)
                model_ids.append(model_id)
            except ValueError:
                pass
        # Assign a default in case this will be the first model
        highest_folder_id = 0
        if len(model_ids) > 0:
            highest_folder_id = max(model_ids)

        # Get the highest model ID from Postgres
        sql = """
            SELECT COALESCE(
                MAX(model_id),
                0) AS model_id
            FROM models
        """
        highest_db_id = int(
            get_sql_rows(host=self.postgres_host, sql=sql)[0]['model_id'])

        # The new model ID is the highest known model ID + 1
        highest_model_id = max(highest_folder_id, highest_db_id)
        self.model_id = highest_model_id + 1

        # Track the model in the file system
        self.model_directory = os.path.join(self.model_base_directory,
                                            str(self.model_id))
        os.makedirs(self.model_directory)

        # Track the model in the database
        models_sql = '''
            INSERT INTO models(
                model_id,
                created_timestamp,
                crop,
                scale
            ) VALUES (
                {model_id},
                NOW(),
                {crop},
                {scale}
            )
        '''.format(model_id=self.model_id,
                   crop=self.crop_percent,
                   scale=self.image_scale)
        execute_sql(host=self.postgres_host, sql=models_sql)

        # Create the model
        architecture = Architecture(input_shape=self.input_shape)
        self.model = architecture.to_model()
        self.start_epoch = 0

    # The Keras way of tracking the current batch ID in a training epoch
    # TODO: Get epoch ID from directory if model is retraining
    checkpoint_path = os.path.join(self.model_directory, 'model.hdf5')
    self.checkpoint_callback = ModelCheckpoint(filepath=checkpoint_path,
                                               verbose=1,
                                               save_best_only=True)
    self.progress_callback = ProgressCallBack(model_id=self.model_id,
                                              postgres_host=postgres_host,
                                              epoch_id=self.start_epoch)
    self.microservice_thread = Thread(target=self.start_microservice,
                                      kwargs={
                                          'port': self.port,
                                          'progress_callback': self.progress_callback
                                      })
    self.microservice_thread.daemon = True
    self.microservice_thread.start()
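
# Worked example of the input-shape arithmetic above: with the defaults
# crop_percent=50 and image_scale=8, a 240x320 camera frame becomes
# int((240 * 0.5) / 8) = 15 pixels tall and int(320 / 8) = 40 pixels wide,
# so input_shape is (15, 40, 3) for RGB images.
#
# A hypothetical construction of a Trainer for a brand-new model (the paths,
# host, and port below are placeholders, not values taken from this project's
# configs):
#
# trainer = Trainer(
#     data_path='/root/data',
#     postgres_host='localhost',
#     port=8888,
#     model_base_directory='/root/model',
#     total_epochs=50,
#     batch_size=50,
#     image_scale=8,
#     crop_percent=50)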