def inference_viva_net(options: dict, sv_instance: SharedValues,
                       model: Tuple[int, datetime, bool, str]):
    postgres_pool = create_postgres_pool()
    postgres_con = postgres_pool.getconn()
    postgres_cur = postgres_con.cursor()

    print("Loading images ...")
    model_id = model[0]
    data = list(get_images(postgres_cur, DRA_KEYFRAMES_ID))
    data_ids = [item[0] for item in data]
    data = [os.path.join(DOCKER_ATTACH_MEDIA, item[1]) for item in data]

    predict_gen = tf.data.Dataset.from_generator(
        lambda: gen(data),
        output_types=tf.float32,
        output_shapes=[DEFAULT_TARGET_SIZE[0], DEFAULT_TARGET_SIZE[1], 3])
    predict_gen = predict_gen.batch(options['batch_size']).prefetch(
        tf.data.experimental.AUTOTUNE)
    predict_gen = iter(predict_gen)

    redis_con = create_redis_connection()
    redis_con.set(os.environ['REDIS_KEY_INFERENCE_TOTAL'], len(data))
    redis_con.set(os.environ['REDIS_KEY_INFERENCE_CURRENT'], 0)
    sse_send_inference_data(sv_instance, redis_con)

    # Classes must not be deleted from the database, otherwise the mapping
    # from model outputs to class ids will fail
    with open(os.path.join(FILE_PATH_TFS_MODEL_DIR, CLASS_MAP_FILE_NAME), "r") as f:
        db_class_ids = [int(x.strip()) for x in f.readlines()]

    # delete all previous image predictions for the current model
    delete_predictions_of_model(postgres_cur, model_id)
    postgres_con.commit()
    postgres_cur.close()
    postgres_pool.putconn(postgres_con)

    # start internal multiprocessing - start a process for each TFS server to query
    tfs_client_manager(data_ids, db_class_ids, model_id, options, predict_gen,
                       len(data), sv_instance, redis_con)
    redis_con.close()

    postgres_con = postgres_pool.getconn()
    postgres_cur = postgres_con.cursor()
    previous_prediction_model = get_previous_stored_model(postgres_cur)
    set_inference_done(postgres_cur, model_id)
    postgres_con.commit()
    if previous_prediction_model is not None:
        delete_predictions_of_model(postgres_cur, previous_prediction_model)
        postgres_con.commit()
    postgres_cur.close()
    postgres_pool.putconn(postgres_con)
    postgres_pool.closeall()
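
# NOTE: `gen` (consumed by tf.data.Dataset.from_generator above) is defined
# elsewhere. The following is only a minimal sketch of what such a generator
# could look like, assuming keyframes are loaded as RGB float32 images scaled
# to [0, 1] and resized to DEFAULT_TARGET_SIZE; the actual preprocessing
# (e.g. model-specific normalization) may differ.
def gen_sketch(image_paths: list):
    for path in image_paths:
        # load and resize each keyframe to the expected input size
        img = tf.keras.preprocessing.image.load_img(
            path, target_size=(DEFAULT_TARGET_SIZE[0], DEFAULT_TARGET_SIZE[1]))
        # convert to a float32 array and scale pixel values to [0, 1]
        yield tf.keras.preprocessing.image.img_to_array(img) / 255.0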
def start_export(redis_pool: redis.ConnectionPool, sv_instance: SharedValues) -> None:
    # input validation
    if any(x not in request.form for x in ["threshold", "app"]):
        abort(400, "Missing parameter")
    try:
        threshold = int(request.form['threshold'])
        if threshold < 0 or threshold > 100:
            raise ValueError()
    except ValueError:
        abort(400, "Wrong parameter value")
        return  # unreachable (abort raises), but avoids an "unbound variable" warning
    app_name = request.form['app']
    if app_name not in DJANGO_APP_NAMES:
        abort(400, "Wrong parameter value")

    # check run conditions
    redis_con = redis.Redis(connection_pool=redis_pool)
    if app_name == os.environ['DJANGO_APP_NAME_CONCEPT'] and \
            (redis_con.get(os.environ['REDIS_KEY_INFERENCE_RUN']) or "0") == "1":
        redis_con.close()
        abort(400, "Inference is currently running")

    with export_start_lock:
        redis_reset_startup(
            redis_con, "Export",
            os.environ['REDIS_KEY_EXPORT_RUN'].format(app_name),
            os.environ['REDIS_KEY_EXPORT_TIME'].format(app_name),
            os.environ['REDIS_KEY_EXPORT_TIME_ETE'].format(app_name),
            os.environ['REDIS_KEY_EXPORT_EXCEPTION'].format(app_name),
            os.environ['REDIS_KEY_EXPORT_CURRENT'].format(app_name),
            os.environ['REDIS_KEY_EXPORT_TOTAL'].format(app_name))
        redis_con.set(
            os.environ['REDIS_KEY_EXPORT_THRESHOLD'].format(app_name),
            threshold)

        # set event in shared memory to start the export
        sv_instance.export[app_name].start.set()

        sse_send_export_data(sv_instance, app_name, redis_con)
        if app_name == os.environ['DJANGO_APP_NAME_CONCEPT']:
            sse_send_inference_data(sv_instance, redis_con)
    redis_con.close()
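
# NOTE: `redis_reset_startup` (used by start_export and start_inference) is
# defined elsewhere. The following sketch only illustrates its presumed role:
# marking the job as running, recording the start time and clearing leftover
# progress/error state from a previous run. The exact key semantics are
# assumptions, not the project's actual implementation.
def redis_reset_startup_sketch(redis_con: redis.Redis, job_name: str,
                               run_key: str, time_key: str, time_ete_key: str,
                               exception_key: str, current_key: str,
                               total_key: str) -> None:
    print("Starting {:s} ...".format(job_name))
    redis_con.set(run_key, 1)
    redis_con.set(time_key, int(datetime.timestamp(datetime.now())))
    # clear the end-to-end timer and any exception from a previous run
    redis_con.delete(time_ete_key, exception_key)
    # reset progress counters
    redis_con.set(current_key, 0)
    redis_con.set(total_key, 0)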
def start_inference(redis_pool: redis.ConnectionPool, sv_instance: SharedValues) -> None:
    redis_con = redis.Redis(connection_pool=redis_pool)

    # input validation
    gpu_selection, batch_size = input_validation_train_infer(
        redis_con, sv_instance.compatible_gpus)

    # check run conditions
    if (redis_con.get(os.environ['REDIS_KEY_EXPORT_RUN'].format(
            os.environ['DJANGO_APP_NAME_CONCEPT'])) or "0") == "1":
        redis_con.close()
        abort(400, "Export is currently running")
    check_if_inference_already_stored()

    with inference_start_lock:
        redis_reset_startup(redis_con, "Inference",
                            os.environ['REDIS_KEY_INFERENCE_RUN'],
                            os.environ['REDIS_KEY_INFERENCE_TIME'],
                            os.environ['REDIS_KEY_INFERENCE_TIME_ETE'],
                            os.environ['REDIS_KEY_INFERENCE_EXCEPTION'],
                            os.environ['REDIS_KEY_INFERENCE_CURRENT'],
                            os.environ['REDIS_KEY_INFERENCE_TOTAL'])

        redis_con.delete(os.environ['REDIS_KEY_INFERENCE_GPUS'])
        for gpu_idx in reversed(gpu_selection):
            redis_con.lpush(os.environ['REDIS_KEY_INFERENCE_GPUS'], gpu_idx)
        redis_con.set(os.environ['REDIS_KEY_INFERENCE_BATCH_SIZE'], batch_size)

        # set event in shared memory to start the inference
        sv_instance.inference.start.set()

        sse_send_inference_data(sv_instance, redis_con)
        sse_send_export_data(sv_instance, os.environ['DJANGO_APP_NAME_CONCEPT'],
                             redis_con)
    redis_con.close()
def tfs_client_manager(data_ids: list, db_class_ids: list, model_id: int,
                       options: dict, predict_gen: tf.data.Dataset,
                       data_length: int, sv_instance: SharedValues,
                       redis_con: redis.Redis):
    batch_size = options["batch_size"]
    batch_count = data_length
    batch_add_count = 0
    batch_queue = multiprocessing.Queue()  # FIFO queue

    process_count = len(options['gpu_selection'])
    process_count = 1 if process_count == 0 else min(8, process_count)
    process_list = []
    for idx in range(process_count):
        process = multiprocessing.Process(
            target=process_batch_queue,
            args=(batch_queue, batch_size, model_id, data_ids, db_class_ids,
                  idx, sv_instance))
        process_list.append(process)
        process.start()

    sv_instance.inference.num_finished.value = 0
    sv_instance.inference.running.value = 1
    last_finished_step = 0
    redis_con.set(os.environ['REDIS_KEY_INFERENCE_TIME_ETE'],
                  int(datetime.timestamp(datetime.now())))

    while True:
        q_size = batch_queue.qsize()

        # Feed the queue
        if q_size < process_count * 4 and batch_add_count < batch_count:
            if batch_add_count != batch_count and batch_add_count > process_count * 4 \
                    and batch_count > process_count > q_size and batch_count % process_count == 0:
                print(
                    "Warning: Batch queue is nearly empty ({:d}). "
                    "Performance may be reduced because the queue cannot be refilled quickly enough!"
                    .format(q_size))
            batch_queue.put((batch_add_count, next(predict_gen)))
            batch_add_count += 1

        # stop monitoring and manipulation of the queue; running = 0 signals
        # the clients to stop once queue.get returns nothing
        if batch_add_count == batch_count:
            sv_instance.inference.running.value = 0

        process_alive = [process.is_alive() for process in process_list]
        if not any(process_alive):
            break
        if q_size > process_count:
            time.sleep(0.2)

        # Update database and notify clients whenever the count of finished
        # images reaches the next defined step
        new_finished_step = math.floor(
            sv_instance.inference.num_finished.value /
            (max(12, batch_size) * process_count))
        if last_finished_step != new_finished_step:
            redis_con.set(os.environ['REDIS_KEY_INFERENCE_CURRENT'],
                          sv_instance.inference.num_finished.value)
            threading.Thread(target=sse_send_inference_data,
                             args=(sv_instance, redis_con)).start()
            last_finished_step = new_finished_step

    sse_send_inference_data(sv_instance, redis_con)
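
# NOTE: `process_batch_queue` (the per-process worker started above) is defined
# elsewhere. The following sketch only illustrates the consumption loop such a
# worker presumably runs and how the `running` flag plus an empty queue
# terminate it; the actual TFS query and prediction storage are omitted.
# It assumes `import queue` at module level and that `num_finished` is a
# multiprocessing.Value.
def process_batch_queue_sketch(batch_queue: multiprocessing.Queue, batch_size: int,
                               model_id: int, data_ids: list, db_class_ids: list,
                               process_idx: int, sv_instance: SharedValues) -> None:
    while True:
        try:
            batch_idx, batch = batch_queue.get(timeout=2)
        except queue.Empty:
            # the manager sets running to 0 once all batches were queued;
            # an empty queue after that means there is nothing left to do
            if sv_instance.inference.running.value == 0:
                break
            continue
        # ... query the TFS server assigned to process_idx, map its outputs via
        # db_class_ids and store predictions for model_id (omitted) ...
        with sv_instance.inference.num_finished.get_lock():
            sv_instance.inference.num_finished.value += len(batch)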
def inference_update():
    redis_con = redis.Redis(connection_pool=redis_pool)
    sse_send_inference_data(sv_instance, redis_con)
    redis_con.close()
    return ""