Example #1
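Stores a freshly trained model in PostgreSQL: the method inserts a row into the model table, reads the generated id back by matching on the start timestamp, and bulk-inserts the per-class average precision into the evaluation table via psycopg2's execute_values.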
    def _update_pg_db(self):
        postgres_con = create_postgres_connection()
        # the public psycopg2.extensions alias avoids the protected _psycopg module
        postgres_cur: psycopg2.extensions.cursor = postgres_con.cursor()

        # store new model in db
        start_datetime = datetime.fromtimestamp(self._start_time)
        postgres_cur.execute(
            """INSERT INTO model (date, dir_name, inference_stored)
            VALUES (%s, %s, %s) """,
            (start_datetime, get_datetime_str(self._start_time), False))
        # query new model id
        postgres_con.commit()
        postgres_cur.execute(
            """SELECT id
             FROM model
             WHERE date = %s""", (start_datetime, ))
        model_id = postgres_cur.fetchone()[0]

        # insert precision per class for new model
        execute_values(
            postgres_cur,
            "INSERT INTO evaluation (modelid, classid, precision) VALUES %s",
            [(model_id, class_id, precision) for class_id, precision in
             self.average_precision_per_class.items()])
        postgres_con.commit()

        postgres_cur.close()
        postgres_con.close()
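The insert-then-select round trip above also works, but PostgreSQL's RETURNING clause yields the generated id in the same statement and avoids any ambiguity if two models ever share a date. A minimal sketch reusing the names from the snippet:

        postgres_cur.execute(
            """INSERT INTO model (date, dir_name, inference_stored)
            VALUES (%s, %s, %s)
            RETURNING id""",
            (start_datetime, get_datetime_str(self._start_time), False))
        model_id = postgres_cur.fetchone()[0]  # generated id, no second query needed
        postgres_con.commit()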
Example #2
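Validates a training request and primes its Redis state: the endpoint checks that at least one concept class has enough positive annotations, then, under training_start_lock, resets the bookkeeping keys, queues the selected GPUs, stores the batch size, and signals the training process through shared memory.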
def start_training(redis_pool: redis.ConnectionPool,
                   sv_instance: SharedValues):
    redis_con = redis.Redis(connection_pool=redis_pool)
    # input validation
    gpu_selection, batch_size = input_validation_train_infer(
        redis_con, sv_instance.compatible_gpus)

    # check run conditions
    postgres_con = create_postgres_connection()
    postgres_cur = postgres_con.cursor()
    postgres_cur.execute(
        """SELECT count(*)
        FROM imageannotation
        JOIN class c on c.id = imageannotation.classid
        WHERE classtypeid = %s
        GROUP BY classid
        HAVING count(*) > %s
        LIMIT 1""",
        (CLASS_TYPE_CONCEPT, int(os.environ["TRAIN_MIN_ANNO_PER_CLASS"])))
    prerequisite = postgres_cur.fetchone()
    postgres_cur.close()
    postgres_con.close()
    if not prerequisite:
        redis_con.close()
        abort(
            400,
            f"Not enough data with {os.environ['TRAIN_MIN_ANNO_PER_CLASS']} as minimum number of positive "
            "samples per concept")

    with training_start_lock:
        redis_reset_startup(redis_con, "Training",
                            os.environ['REDIS_KEY_TRAINING_RUN'],
                            os.environ['REDIS_KEY_TRAINING_TIME'],
                            os.environ['REDIS_KEY_TRAINING_TIME_ETE'],
                            os.environ['REDIS_KEY_TRAINING_EXCEPTION'],
                            os.environ['REDIS_KEY_TRAINING_CURRENT'],
                            os.environ['REDIS_KEY_TRAINING_TOTAL'])
        redis_con.delete(os.environ['REDIS_KEY_TRAINING_GPUS'])
        for gpu_idx in reversed(gpu_selection):
            redis_con.lpush(os.environ['REDIS_KEY_TRAINING_GPUS'], gpu_idx)
        redis_con.set(os.environ['REDIS_KEY_TRAINING_BATCH_SIZE'], batch_size)
        redis_con.delete(os.environ['REDIS_KEY_TRAINING_MAP'])
        redis_con.delete(os.environ['REDIS_KEY_TRAINING_LOSS'])
        redis_con.delete(os.environ['REDIS_KEY_TRAINING_LR'])

        # set event in shared memory for export start
        sv_instance.training.start.set()

    sse_send_training_data(sv_instance, redis_con)
    redis_con.close()
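Because LPUSH prepends, pushing in reversed order leaves the Redis list in the original GPU order; a single RPUSH call achieves the same result in one round trip. A sketch, assuming the selection is non-empty:

    redis_con.delete(os.environ['REDIS_KEY_TRAINING_GPUS'])
    # RPUSH appends, so the list ends up in gpu_selection order
    redis_con.rpush(os.environ['REDIS_KEY_TRAINING_GPUS'], *gpu_selection)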
Example #3
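Guards against duplicate inference runs: the check fetches the inference_stored flag of the newest model that has evaluations for the concept class type and aborts with HTTP 400 if the flag is already set.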
def check_if_inference_already_stored():
    postgres_con = create_postgres_connection()
    postgres_cur = postgres_con.cursor()
    postgres_cur.execute(
        """SELECT inference_stored
        FROM model
        JOIN evaluation e on model.id = e.modelid
        JOIN class c on c.id = e.classid
        WHERE classtypeid = %s
        ORDER BY date DESC 
        LIMIT 1
        """, (CLASS_TYPE_CONCEPT, ))
    model_res = postgres_cur.fetchone()
    postgres_cur.close()
    postgres_con.close()
    if model_res and model_res[0]:
        abort(400, "Inference has already run for this model")
Example #4
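Runs a detection export: the function maps the Django app name to a class type, loads the latest prediction model, reads the export options (threshold, start time) from Redis, calls export_class_detections, and finally writes the exported model identifier and the stop state back to Redis.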
def start_export(sv_instance: SharedValues, app_name: str) -> None:
    if app_name == os.environ['DJANGO_APP_NAME_CONCEPT']:
        class_type_id = CLASS_TYPE_CONCEPT
    else:
        class_type_id = CLASS_TYPE_PERSON

    postgres_con = create_postgres_connection()
    postgres_cur = postgres_con.cursor()

    latest_model = get_latest_prediction_model_data(postgres_cur, class_type_id)
    redis_con = create_redis_connection()
    if not latest_model:
        set_export_stop(redis_con, sv_instance, app_name, exception="No predictions to export")
        redis_con.close()
        return

    options = {
        'app_name': app_name,
        'class_type_id': class_type_id,
        'threshold': int(redis_con.get(os.environ['REDIS_KEY_EXPORT_THRESHOLD'].format(app_name))),
        'model_id': latest_model[0],
        'model_date_str': get_datetime_str(int(latest_model[1].timestamp())),
        'time': int(redis_con.get(os.environ['REDIS_KEY_EXPORT_TIME'].format(app_name)))
    }
    redis_con.close()

    try:
        export_class_detections(options, postgres_cur, sv_instance, '/tmp/viva/', '/export/')
        export_exception = None
    except Exception as e:
        export_exception = str(e)
        print(repr(e), file=sys.stderr)
        print(traceback.format_exc(), file=sys.stderr)
    except KeyboardInterrupt:
        # release the db handles before bailing out
        postgres_cur.close()
        postgres_con.close()
        return

    postgres_cur.close()
    postgres_con.close()

    redis_con = create_redis_connection()
    redis_con.set(os.environ['REDIS_KEY_EXPORT_MODEL_IDENT'].format(app_name), options['model_date_str'])
    set_export_stop(redis_con, sv_instance, app_name, exception=export_exception)
    redis_con.close()
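The cursor and connection above have to be closed on three separate paths (success, exception, interrupt); contextlib.closing centralizes that so both handles are released on every exit. A sketch of the export step under that pattern:

from contextlib import closing

with closing(create_postgres_connection()) as postgres_con, \
        closing(postgres_con.cursor()) as postgres_cur:
    export_class_detections(options, postgres_cur, sv_instance, '/tmp/viva/', '/export/')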
Example #5
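The full training routine: it collects the trainable classes and their annotations, builds a stratified train/validation split, compiles the network inside a MirroredStrategy scope, publishes progress counters to Redis, and then trains in two phases, first with the backbone frozen and then with the last layers unfrozen at a reduced learning rate.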
def train_viva_net(options: dict, sv_instance: SharedValues):
    postgres_con = create_postgres_connection()
    postgres_cur = postgres_con.cursor()

    db_class_ids = [str(c[0]) for c in
                    get_train_classes(postgres_cur, int(os.environ["TRAIN_MIN_ANNO_PER_CLASS"]),
                                      class_type=CLASS_TYPE_CONCEPT)]
    if not db_class_ids:
        raise InsufficientTrainingData(
            "not enough data with {} as minimum number of positive samples per concept".format(
                os.environ["TRAIN_MIN_ANNO_PER_CLASS"]))

    num_classes = len(db_class_ids)
    data = {}
    class_counts = {}

    for db_class_id in db_class_ids:
        class_results = list(get_annotations_for_class(postgres_cur, int(db_class_id)))
        class_counts[db_class_id] = {"neg": len([1 for res in class_results if not res[1]])}
        parse_class_annotations(data, db_class_id, class_results)

    train, val = multi_label_stratify(data, db_class_ids, class_counts)

    # create pandas data frames
    pd_train = create_pd_frame(train, db_class_ids)
    pd_val = create_pd_frame(val, db_class_ids, is_val=True)

    # get generators
    train_gen = get_generator(pd_train, db_class_ids, options['batch_size'])
    val_gen = get_generator(pd_val, db_class_ids, options['batch_size'], for_test=True)

    # set_memory_growth is a function, not a flag; enable it per physical GPU
    for gpu in tf.config.list_physical_devices('GPU'):
        tf.config.experimental.set_memory_growth(gpu, True)
    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        train_net = get_model_compiled(num_classes, float(os.environ['TRAINING_LR_INIT']))
    gen_class_map = {i: int(db_class_id) for i, db_class_id in enumerate(db_class_ids)}

    # close db connection
    postgres_cur.close()
    postgres_con.close()

    redis_con = create_redis_connection()
    redis_con.set(os.environ['REDIS_KEY_TRAINING_TOTAL'], len(train_gen) *
                  (TRAINING_MAX_EPOCHS_FIRST_PHASE + TRAINING_MAX_EPOCHS_2ND_PHASE))
    redis_con.set(os.environ['REDIS_KEY_TRAINING_CURRENT'], 0)
    redis_con.set(os.environ['REDIS_KEY_TRAINING_STEPS_PER_EPOCH'], len(train_gen))
    redis_con.set(os.environ['REDIS_KEY_TRAINING_TIME_ETE'], int(datetime.timestamp(datetime.now())))
    sse_send_training_data(sv_instance, redis_con)
    redis_con.close()

    # callbacks
    eval_callback = EvaluationCallback(val_gen, gen_class_map, options['start_time'], save_model=False)
    reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                  factor=float(os.environ['TRAINING_LR_REDUCE_FACTOR']),
                                                  patience=int(os.environ['TRAINING_LR_REDUCE_PATIENCE']),
                                                  min_lr=0.000001)
    stop_callback = keras.callbacks.EarlyStopping(monitor="mean_average_precision", mode="max",
                                                  min_delta=TRAINING_DELTA_STOP,
                                                  patience=TRAINING_DELTA_PATIENCE, verbose=1)
    epoch_callback = RedisSSECallback(sv_instance, len(train_gen))
    logger = NBatchLogger(TRAINING_LOG_STEP)

    print("First phase is running...")
    train_net.fit(train_gen,
                  epochs=TRAINING_MAX_EPOCHS_FIRST_PHASE,
                  workers=int(os.environ['TRAINING_WORKER_COUNT']),
                  callbacks=[eval_callback, logger, reduce_lr, epoch_callback, stop_callback],
                  use_multiprocessing=False,
                  max_queue_size=200,
                  verbose=0,
                  )

    def unfreeze_model(model):
        for layer in model.layers[-20:]:
            if not isinstance(layer, tf.keras.layers.BatchNormalization):
                layer.trainable = True

        optimizer = tf.keras.optimizers.Adam(learning_rate=float(os.environ['TRAINING_LR_INIT']) * 0.1)
        model.compile(
            optimizer=optimizer, loss=lambda y_true, y_pred: focal_crossentropy(y_true, y_pred, alpha=0.8)
        )

    unfreeze_model(train_net)
    eval_callback = EvaluationCallback(val_gen, gen_class_map, options['start_time'], save_model=True)

    print("Second phase is running...")
    train_net.fit(train_gen,
                  epochs=TRAINING_MAX_EPOCHS_2ND_PHASE,
                  workers=int(os.environ['TRAINING_WORKER_COUNT']),
                  callbacks=[eval_callback, logger, reduce_lr, epoch_callback, stop_callback],
                  use_multiprocessing=False,
                  max_queue_size=200,
                  verbose=0,
                  )
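EarlyStopping and ReduceLROnPlateau can only monitor keys that appear in the epoch logs, and mean_average_precision is not a metric Keras computes on its own; it presumably comes from EvaluationCallback, which runs earlier in the callbacks list and writes into the shared logs dict. A minimal sketch of that pattern, with compute_map as a hypothetical mAP helper:

import tensorflow as tf

class EvaluationCallback(tf.keras.callbacks.Callback):
    def __init__(self, val_gen):
        super().__init__()
        self.val_gen = val_gen

    def on_epoch_end(self, epoch, logs=None):
        if logs is not None:
            # keys written here are visible to callbacks later in the list,
            # e.g. EarlyStopping(monitor="mean_average_precision")
            logs["mean_average_precision"] = compute_map(self.model, self.val_gen)  # hypothetical helper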