    def get_critical_error_record_ids(self, dataset_name):
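        """
        Gets the IDs of records whose predicted angle differs from the
        human-labeled angle by at least 0.8, i.e., the records the
        model got badly wrong
        """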
        record_ids = []
        sql_query = '''
            DROP TABLE IF EXISTS latest_deployment;
            CREATE TEMP TABLE latest_deployment AS (
              SELECT
                model_id,
                epoch
              FROM deploy
              ORDER BY TIMESTAMP DESC
              LIMIT 1
            );

            SELECT
              records.record_id
            FROM records
            LEFT JOIN predictions
              ON records.dataset = predictions.dataset
                AND records.record_id = predictions.record_id
            LEFT JOIN latest_deployment AS deploy
              ON predictions.model_id = deploy.model_id
                AND predictions.epoch = deploy.epoch
            WHERE LOWER(records.dataset) LIKE LOWER('%{dataset}%')
              AND ABS(records.angle - predictions.angle) >= 0.8
            ORDER BY record_id ASC
            '''.format(dataset=dataset_name)
        rows = get_sql_rows(host=self.postgres_host,
                            sql=sql_query,
                            postgres_pool=self.postgres_pool)
        for row in rows:
            record_id = row['record_id']
            record_ids.append(record_id)
        return record_ids
    def get_toggle_status(self, web_page, name, detail):
        """
        Checks if a user has turned on a given toggle

        Parameters
        ----------
        web_page : string
            The web page where the user would have set the toggle
        name : string
            The type of toggle
        detail : string
            Any other details about the toggle

        Returns
        -------
        is_on : boolean
            Whether the toggle is turned on or not
        """
        sql_query = '''
            DROP TABLE IF EXISTS latest;
            CREATE TEMP TABLE latest AS (
              SELECT
                detail,
                is_on,
                ROW_NUMBER() OVER(
                  PARTITION BY
                    web_page,
                    name,
                    detail
                  ORDER BY
                    event_ts DESC
                ) AS temporal_rank
              FROM toggles
              WHERE LOWER(web_page) LIKE LOWER('%{web_page}%')
                AND LOWER(name) LIKE LOWER('%{name}%')
                AND LOWER(detail) LIKE LOWER('%{detail}%')
            );

            SELECT
              is_on
            FROM latest
            WHERE temporal_rank = 1
              AND is_on = TRUE
        '''.format(web_page=web_page, name=name, detail=detail)
        rows = get_sql_rows(host=self.postgres_host,
                            sql=sql_query,
                            postgres_pool=self.postgres_pool)
        is_on = False
        if len(rows) > 0:
            first_row = rows[0]
            is_on = first_row['is_on']
        return is_on
    def get_dataset_record_ids(self, dataset_name):
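        """
        Gets all record IDs for a dataset, sorted ascending
        """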
        record_ids = []
        sql_query = '''
            SELECT
              record_id
            FROM records
            WHERE UPPER(dataset) LIKE UPPER('{dataset_name}')
            ORDER BY record_id ASC
        '''.format(dataset_name=dataset_name)
        rows = get_sql_rows(host=self.postgres_host,
                            sql=sql_query,
                            postgres_pool=self.postgres_pool)
        for row in rows:
            record_id = row['record_id']
            record_ids.append(record_id)
        return record_ids
    def read_flag(self, dataset, record_id):
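        """
        Looks up whether a given record has been flagged
        """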
        sql_query = '''
            SELECT
              is_flagged
            FROM records
            WHERE LOWER(dataset) LIKE LOWER('%{dataset}%')
              AND record_id = {record_id}
        '''.format(dataset=dataset, record_id=record_id)
        rows = get_sql_rows(host=self.postgres_host,
                            sql=sql_query,
                            postgres_pool=self.postgres_pool)
        is_flagged = False
        if len(rows) > 0:
            first_row = rows[0]
            is_flagged = first_row['is_flagged']
        return is_flagged
    def get_flagged_record_count(self, dataset_name):
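        """
        Counts the flagged records in a dataset
        """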
        count = 0
        sql_query = '''
            SELECT
              COUNT(*) AS count
            FROM records
            WHERE LOWER(dataset) LIKE LOWER('%{dataset}%')
              AND is_flagged = TRUE
        '''.format(dataset=dataset_name)
        rows = get_sql_rows(host=self.postgres_host,
                            sql=sql_query,
                            postgres_pool=self.postgres_pool)
        if len(rows) > 0:
            first_row = rows[0]
            count = first_row['count']
        return count
    def get_flagged_record_ids(self, dataset_name):
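        """
        Gets the IDs of all flagged records in a dataset
        """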
        record_ids = []
        sql_query = '''
            SELECT
              record_id
            FROM records
            WHERE LOWER(dataset) LIKE LOWER('%{dataset}%')
              AND is_flagged = TRUE
        '''.format(dataset=dataset_name)
        rows = get_sql_rows(host=self.postgres_host,
                            sql=sql_query,
                            postgres_pool=self.postgres_pool)
        for row in rows:
            record_id = row['record_id']
            record_ids.append(record_id)
        return record_ids
    def get_dataset_selections(self, dataset_type):
        """
        Gets the user-selected train or validation datasets that are
        stored in Postgres. Users choose their selections in the UI

        Parameters
        ----------
        dataset_type : string
            Whether to pull train or validation datasets. Must be
            either "train" or "validation"
        """
        sql_query = '''
            DROP TABLE IF EXISTS latest;
            CREATE TEMP TABLE latest AS (
              SELECT
                detail,
                is_on,
                ROW_NUMBER() OVER(
                  PARTITION BY
                    web_page,
                    name,
                    detail
                  ORDER BY
                    event_ts DESC
                ) AS temporal_rank
              FROM toggles
              WHERE LOWER(name) LIKE '%{dataset_type}%'
            );

            SELECT
              detail AS dataset
            FROM latest
            WHERE temporal_rank = 1
              AND is_on = TRUE
        '''.format(dataset_type=dataset_type)
        rows = get_sql_rows(host=self.postgres_host,
                            sql=sql_query,
                            postgres_pool=self.postgres_pool)
        datasets = []
        if len(rows) > 0:
            for row in rows:
                dataset = row['dataset']
                datasets.append(dataset)
        return datasets
    def get_starting_epoch(self):
        """
        Looks up the most recently completed epoch ID for the model
        and adds one to it. Previously I would track epoch ID in the
        checkpoint file path, but this made it harder to prune old
        checkpoint files and required some extra custom code that I
        felt wasn't worth it. Also, I realized that I can't recall a
        time where I wanted to go back to an old model. If the latest
        model was worse I always preferred to make it better by
        training it on all of the data again rather than loading the
        older model. I think I had this preference because I felt
        that if I couldn't get back an approximation of the old model
        with the same data, then what the model unlearned had been
        learned by random chance in the first place and wasn't worth
        trying to replicate or wasn't stable enough to last through
        additional training.

        This function should only get called during retraining, i.e.,
        when a model already exists, since you'll always have a
        starting_epoch of 0 for new models.

        Returns
        -------
        starting_epoch: int
        """

        # Get the last completed epoch for this model from Postgres
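        # For example, if the epochs table shows MAX(epoch) = 9 for this
        # model, the function returns 10 and training resumes at epoch 10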
        sql = """
            SELECT
                MAX(epoch) AS previous_epoch
            FROM epochs
            WHERE
                model_id = {model_id}
            GROUP BY model_id
        """.format(model_id=self.model_id)
        rows = get_sql_rows(host=self.postgres_host, sql=sql)
        if len(rows) > 0:
            previous_epoch = int(rows[0]['previous_epoch'])
            starting_epoch = previous_epoch + 1
            return starting_epoch
        else:
            return 0
    def get_dataset_names(self):
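        """
        Gets all dataset names, ordered by the numeric ID embedded in
        each name (e.g., the 3 in dataset_3_18-04-15)
        """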
        ordered_datasets = []
        id_to_dataset = {}
        sql_query = '''
            SELECT DISTINCT
              dataset
            FROM records
            ORDER BY dataset ASC
        '''
        rows = get_sql_rows(host=self.postgres_host,
                            sql=sql_query,
                            postgres_pool=self.postgres_pool)
        for row in rows:
            dataset = row['dataset']
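            # Extract the numeric ID from the dataset name, e.g.,
            # 'dataset_3_18-04-15' -> 3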
            number = int(
                re.search(r'(?<=dataset_)([0-9]*)(?=_)', dataset).group(1))
            id_to_dataset[number] = dataset
        sorted_tuples = sorted(id_to_dataset.items(),
                               key=operator.itemgetter(0))
        for number, dataset in sorted_tuples:
            ordered_datasets.append(dataset)
        return ordered_datasets
    def get_image_path_from_db(self, dataset_name, record_id):
        """
        Gets the image path from the Postgres database. Probably not
        helpful if you're importing a dataset and the dataset therefore
        isn't already in the database. This is helpful when you're
        consuming the image later, for example during review
        """
        sql = f"""
            SELECT
                image_path
            FROM records
            WHERE
                dataset = '{dataset_name}'
                AND record_id = {record_id}
        """
        rows = get_sql_rows(host=self.postgres_host,
                            sql=sql,
                            postgres_pool=self.postgres_pool)
        if len(rows) > 0:
            return rows[0]['image_path']
        else:
            return None
    def __init__(self,
                 data_path,
                 postgres_host,
                 port,
                 model_base_directory,
                 model_id=None,
                 total_epochs=50,
                 batch_size=50,
                 image_scale=8,
                 crop_percent=50,
                 overfit=False,
                 angle_only=True,
                 n_channels=3):
        """
        Create a Trainer object

        Parameters
        ----------
        data_path : str
            The absolute path to the directory immediately above the
            dataset folders. If you have datasets like /root/data/dataset_1_18-04-15
            and /root/data/dataset_2_18-04-15, then your data_path
            should be /root/data
        postgres_host: str
            Name of the Postgres host to connect to for details about records
            and other things. If record_reader.py is running in a Docker container
            the host would be the name of the container, e.g., postgres-11-1 but
            if record_reader.py is running in PyCharm on the Mac, then you would
            use localhost.
        port: int
            The port of the Tornado microservice that is used to report to the UI
            the current epoch, batch, loss, and model ID
        model_base_directory: str
            The directory that contains all of the models. For example, if you
            have two models: /root/model/1 and /root/model/2, then you should
            specify /root/model. For simplicity the code assumes all your models
            are organized under this one base directory. Nothing about where the
            model is stored is saved in the DB because the model_base_directory
            is something you use often enough that you'll know it from repetition
            or from already-working examples of your code
        model_id: int
            Specify this value if you want to continue training an existing model.
            The code will expect to find an immediate child directory to
            model_base_directory that matches the model ID and will fail if such a
            directory doesn't exist because you can't resume training a model that
            doesn't exist. The code will automatically pick a model ID for you if
            you don't specify one and assumes you are training a new model
        total_epochs: int
            The model is trained until its epoch ID reaches total_epochs, not
            for total_epochs additional iterations. For retraining, this means
            nothing will happen if you specify total_epochs=5 but your model
            has already trained for 10 epochs; it won't train for 5 more. For
            new models this makes no difference because the epoch ID starts at 0
        batch_size : int
            The number of records per batch
        image_scale: int
            The factor by which each image is shrunk. For example, if you specify 8
            the image will shrink to 1/8th of its original size. If you specify 1
            then the image won't shrink at all
        crop_percent: int
            The percentage of the top portion of the image that should be taken
            off. Through trial and error this has proven to be an effective
            technique. Other drivers have come to the same conclusion. Nothing
            of importance happens in the top half of the image. The top half only
            contains distractions. The model performs better if it has zero
            chance of fitting to that source of randomness
            Example: 50, to cut off the top half
        overfit : boolean
            Indicates whether the model should be trained and validated
            on the same data. I use this when I'm training on images
            that the model got horribly wrong (or recorded disengagements
            that occurred during a recorded deployment)
        angle_only : boolean
            Whether to focus on the angle only. This possibly focuses the
            model's attention on the most egregious errors, like turning
            right when the car should turn left
        n_channels: int
            The number of color channels in your image dataset. Should be
            3 if color (RGB) and 1 if black and white. Used to define the
            shape of the Keras model input
        """

        self.data_path = data_path
        self.postgres_host = postgres_host
        self.batch_size = batch_size
        self.overfit = overfit
        self.angle_only = angle_only
        self.record_reader = RecordReader(base_directory=self.data_path,
                                          postgres_host=self.postgres_host,
                                          batch_size=self.batch_size,
                                          overfit=self.overfit,
                                          angle_only=self.angle_only,
                                          is_for_model=True)
        self.port = port
        self.model_base_directory = model_base_directory
        self.model_id = model_id
        self.n_epochs = int(total_epochs)
        self.image_scale = int(image_scale)
        self.crop_percent = crop_percent
        self.image_height_pixels = int(
            (240 * (self.crop_percent / 100.0)) / self.image_scale)
        self.image_width_pixels = int(320 / self.image_scale)
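        # For example, with the defaults crop_percent=50 and image_scale=8
        # on a 240x320 frame: height = (240 * 0.5) / 8 = 15 pixels and
        # width = 320 / 8 = 40 pixels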
        self.n_channels = n_channels
        self.input_shape = (self.image_height_pixels, self.image_width_pixels,
                            self.n_channels)

        self.train_generator = DataGenerator(record_reader=self.record_reader,
                                             partition_type='train',
                                             image_scale=int(self.image_scale),
                                             crop_percent=self.crop_percent,
                                             batch_size=self.batch_size)

        self.validation_generator = DataGenerator(
            record_reader=self.record_reader,
            partition_type='validation',
            image_scale=int(self.image_scale),
            crop_percent=self.crop_percent,
            batch_size=self.batch_size)
        """
        If you specify all of the batches then it will take a very long
        time to evaluate (in some cases 5+ minutes per epoch) because
        the model will have to process every single image in the
        validation set. Assuming your random sample is representative
        of the entire dataset, picking some arbitrarily small number of
        steps (batches) should give you a sufficiently accurate
        representation of the error
        """
        self.validation_steps = 3
        """
        If you specify a model ID the code assumes you're retraining. Don't specify a
        model ID if you want to train a new model. The system doesn't trust users
        to come up with their own model IDs because doing that safely requires
        checking both the file system and Postgres
        """
        if self.model_id:  # Existing model
            self.model_directory = os.path.join(self.model_base_directory,
                                                str(self.model_id))

            # Check for a common user error and provide a helpful error message
            if not os.path.exists(self.model_directory):
                print(
                    "The model doesn't exist at {dir}.".format(
                        dir=self.model_directory),
                    "Did you specify the right path and model ID?",
                    "Also, don't specify the model ID if you want to train a new model.",
                    "The system will automatically determine a new model's model ID."
                )
                exit(1)

            self.start_epoch = self.get_starting_epoch()

            # Load the model
            saved_model_path = os.path.join(self.model_directory, 'model.hdf5')
            self.model = load_keras_model(file_path=saved_model_path)

        else:  # New model
            """
            Model IDs are tracked in two places: in Postgres and in the file
            system. It's possible these two places could get out of sync with
            each other because of an unforeseen bug, so to be extra safe when
            creating a new model's ID I take the largest ID from the two
            sources and increment it
            """
            # Get the highest model ID from the file system
            folders = os.listdir(self.model_base_directory)
            model_ids = []
            for folder in folders:
                """
                Each model's folder should be its ID, and each ID should be
                an int. Ignore the folder if it's not an int
                """
                try:
                    model_id = int(folder)
                    model_ids.append(model_id)
                except ValueError:
                    pass

            # Assign a default in case this will be the first model
            highest_folder_id = 0

            if len(model_ids) > 0:
                highest_folder_id = max(model_ids)

            # Get the highest model ID from Postgres
            sql = """
                SELECT
                    COALESCE(MAX(model_id), 0) AS model_id
                FROM models
            """
            highest_db_id = int(
                get_sql_rows(host=self.postgres_host, sql=sql)[0]['model_id'])

            # The new model ID is highest known model ID + 1
            highest_model_id = max(highest_folder_id, highest_db_id)
            self.model_id = highest_model_id + 1

            # Track the model in the file system
            self.model_directory = os.path.join(self.model_base_directory,
                                                str(self.model_id))
            os.makedirs(self.model_directory)

            # Track the model in the database
            models_sql = '''
                INSERT INTO models(
                    model_id,
                    created_timestamp,
                    crop,
                    scale
                ) VALUES (
                    {model_id},
                    NOW(),
                    {crop},
                    {scale}
                )
                '''.format(model_id=self.model_id,
                           crop=self.crop_percent,
                           scale=self.image_scale)
            execute_sql(host=self.postgres_host, sql=models_sql)

            # Create the model
            architecture = Architecture(input_shape=self.input_shape)
            self.model = architecture.to_model()

            self.start_epoch = 0

        # Keras's ModelCheckpoint saves the model whenever the validation loss improves
        # TODO: Get epoch ID from directory if model is retraining
        checkpoint_path = os.path.join(self.model_directory, 'model.hdf5')
        self.checkpoint_callback = ModelCheckpoint(filepath=checkpoint_path,
                                                   verbose=1,
                                                   save_best_only=True)
        self.progress_callback = ProgressCallBack(model_id=self.model_id,
                                                  postgres_host=postgres_host,
                                                  epoch_id=self.start_epoch)

        self.microservice_thread = Thread(
            target=self.start_microservice,
            kwargs={'port': self.port,
                    'progress_callback': self.progress_callback})
        self.microservice_thread.daemon = True
        self.microservice_thread.start()
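        # Hypothetical usage sketch (the values below are illustrative
        # assumptions, not taken from the source):
        #   trainer = Trainer(data_path='/root/data',
        #                     postgres_host='localhost',
        #                     port=8885,
        #                     model_base_directory='/root/model')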