Example #1
def depth_Estimation(args):
	model_name = args.model_name
	#Setting up the network
	print("Loading model....")
	download_model_if_doesnt_exist(model_name)
	encoder_path = os.path.join("models", model_name, "encoder.pth")
	depth_decoder_path = os.path.join("models", model_name, "depth.pth")

	# LOADING PRETRAINED MODEL
	encoder = networks.ResnetEncoder(18, False)
	depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4))

	loaded_dict_enc = torch.load(encoder_path, map_location='cpu')
	filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
	encoder.load_state_dict(filtered_dict_enc)

	loaded_dict = torch.load(depth_decoder_path, map_location='cpu')
	depth_decoder.load_state_dict(loaded_dict)

	encoder.eval()
	depth_decoder.eval()

	#Loading image
	print("Loading image....")
	image_path = args.image_path
	input_image = pil.open(image_path).convert('RGB')
	original_width, original_height = input_image.size
	feed_height = loaded_dict_enc['height']
	feed_width = loaded_dict_enc['width']
	input_image_resized = input_image.resize((feed_width, feed_height), pil.LANCZOS)

	input_image_pytorch = transforms.ToTensor()(input_image_resized).unsqueeze(0)
	input_npy = input_image_pytorch.squeeze().cpu().numpy()  # numpy copy of the network input (unused below)


	#prediction of disparity image
	with torch.no_grad():
		features = encoder(input_image_pytorch)
		outputs = depth_decoder(features)
		disp = outputs[("disp", 0)]

	# Scaling to the original resolution
	disp_resized = torch.nn.functional.interpolate(
		disp, (original_height, original_width), mode="bilinear",
		align_corners=False)  # interpolate the disparity to fit the original resolution of the image
	disp_resized_np = disp_resized.squeeze().cpu().numpy()  # convert the PyTorch tensor to a numpy array

	print("resized disp: " + str(disp_resized_np.shape))
	print("Range of depth in image")
	scaled, dep = disp_to_depth(disp_resized_np, 0.1, 1000)  # convert disparity to depth between 0.1 and 1000 units
	print("min -> " + str(dep.min()) + ", max -> " + str(dep.max()))

	# Preview of the RGB and depth images
	rgb = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
	depth = dep.reshape((rgb.shape[0],rgb.shape[1]),order='C')
	plot(rgb,depth)

	return rgb, depth
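
For reference, the disp_to_depth helper used throughout these examples is the monodepth2 function that turns the network's sigmoid output into a depth map clamped between min_depth and max_depth; a sketch of it follows (check your checkout of the upstream repo for the authoritative version):

def disp_to_depth(disp, min_depth, max_depth):
    """Convert the network's sigmoid output into a depth prediction."""
    min_disp = 1 / max_depth
    max_disp = 1 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1 / scaled_disp
    return scaled_disp, depth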
Example #2
    def __init__(self,
                 model_path=os.path.join(os.path.dirname(__file__), "models"),
                 model_name="stereo_640x192"):
        self.model_name = model_name
        self.model_path = os.path.join(model_path, self.model_name)
        torch.set_grad_enabled(False)

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        download_model_if_doesnt_exist(self.model_name, self.model_path)
        self.encoder_path = os.path.join(self.model_path, "encoder.pth")
        self.depth_decoder_path = os.path.join(self.model_path, "depth.pth")

        # LOADING PRETRAINED MODEL
        self.encoder = ResnetEncoder(18, False)
        self.loaded_dict_enc = torch.load(self.encoder_path,
                                          map_location=self.device)

        # extract the height and width of image that this model was trained with
        self.feed_height = self.loaded_dict_enc['height']
        self.feed_width = self.loaded_dict_enc['width']
        self.filtered_dict_enc = {
            k: v
            for k, v in self.loaded_dict_enc.items()
            if k in self.encoder.state_dict()
        }
        self.encoder.load_state_dict(self.filtered_dict_enc)
        self.encoder.to(self.device)
        self.encoder.eval()

        self.depth_decoder = DepthDecoder(num_ch_enc=self.encoder.num_ch_enc,
                                          scales=range(4))

        self.loaded_dict = torch.load(self.depth_decoder_path,
                                      map_location=self.device)
        self.depth_decoder.load_state_dict(self.loaded_dict)

        self.depth_decoder.to(self.device)
        self.depth_decoder.eval()

        # Set up the ROS service and publisher
        self.mono_depth_service = rospy.Service(
            "MonoDepthService", MonoDepth, self.mono_depth_service_callback)
        self.mono_depth_publisher = rospy.Publisher('mono_depth_img',
                                                    Image,
                                                    queue_size=10)
Example #3
def setup_network(model_name="mono_640x192"):
    download_model_if_doesnt_exist(model_name)
    encoder_path = os.path.join("models", model_name, "encoder.pth")
    depth_decoder_path = os.path.join("models", model_name, "depth.pth")

    # LOADING PRETRAINED MODEL
    encoder = networks.ResnetEncoder(18, False)
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict_enc = torch.load(encoder_path, map_location='cpu')
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    loaded_dict = torch.load(depth_decoder_path, map_location='cpu')
    depth_decoder.load_state_dict(loaded_dict)

    encoder.eval()
    depth_decoder.eval()

    return encoder, depth_decoder, loaded_dict_enc
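
A minimal usage sketch for setup_network() above; the image path is an assumption, and the CPU device matches the map_location used in the loader:

import PIL.Image as pil
import torch
from torchvision import transforms

encoder, depth_decoder, loaded_dict_enc = setup_network("mono_640x192")
input_image = pil.open("assets/test_image.jpg").convert("RGB")
# The checkpoint stores the feed size the network was trained with
input_image = input_image.resize(
    (loaded_dict_enc["width"], loaded_dict_enc["height"]), pil.LANCZOS)
with torch.no_grad():
    features = encoder(transforms.ToTensor()(input_image).unsqueeze(0))
    disp = depth_decoder(features)[("disp", 0)]  # 1 x 1 x H x W disparity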
Example #4
def test_cam(args):

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # Extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Loading complete, initializing the camera")

    # Initialize camera to capture image stream
    # Change the value to 0 when using default camera
    #video_stream = WebcamVideoStream(src=args.webcam).start()

    if not args.no_display:
        # Object to display images
        image_display = DisplayImage(not args.no_process)

    # Flag that records when 'q' is pressed to break out of inference loop below
    quit_inference = False

    def on_release(key):
        if key == keyboard.KeyCode.from_char('q'):
            nonlocal quit_inference
            quit_inference = True
            #s.close()
            return False

    keyboard.Listener(on_release=on_release).start()

    # Number of frames to capture to calculate fps
    num_frames = 5
    curr_time = np.zeros(num_frames)

    with torch.no_grad():
        print("Loop has started")
        host = "0.0.0.0"
        port = 5015
        s = socket.socket()
        try:
            s.bind((host, port))
        except socket.error as e:
            print(str(e))
        print("Socket setup")
        connected = True
        bufferSize = 8192
        #c, addr = s.accept()
        #print("Connected to :", addr[0], ":",addr[1])
        first_loop = True
        connection_ready = False
        while True:
            if quit_inference:
                if args.no_display:
                    print('-> Done')
                break

            if first_loop:
                frame = cv2.imread('assets/test_image.jpg')
                print("Read test image")
                first_loop = False
            elif not connection_ready:
                s.listen(10)
                c, addr = s.accept()
                print("Connected to: ", addr[0], ":", addr[1])
                connection_ready = True
                continue
            else:
                try:

                    # Read the fixed-size header, e.g. b'SIZE 12345'
                    data = c.recv(11)
                    print("header: " + str(data))
                    if data.startswith(b'SIZE'):
                        bufferSize = int(data.split()[1])
                        c.sendall("yes".encode())
                        # NOTE: recv may return fewer bytes than requested; a
                        # read loop would be more robust here
                        data = bytearray(c.recv(bufferSize))

                    frame_np = np.asarray(data, dtype=np.uint8)
                    frame = cv2.imdecode(frame_np, cv2.IMREAD_COLOR)
                except socket.error:
                    connected = False
                    print("Connection lost, reconnecting")
                    while not connected:
                        try:
                            # Accept a fresh connection on the listening socket
                            s.listen(10)
                            c, addr = s.accept()
                            print("Reconnection worked")
                            connected = True
                        except socket.error as e:
                            print(e)
                    continue

            # Capture frame-by-frame (webcam path disabled; frames arrive over
            # the socket above)
            #frame = video_stream.read()

            # Calculate the fps
            print("Got frame")
            curr_time[1:] = curr_time[:-1]
            curr_time[0] = time.time()
            fps = num_frames / (curr_time[0] - curr_time[-1])

            # Convert the OpenCV BGR frame to RGB for the model
            input_image = pil.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            print("Prediction starting")
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="nearest")

            # Get the predict depth
            scaled_disp, pred_depth = disp_to_depth(disp_resized, 0.1, 100)
            pred_depth_np = pred_depth.squeeze().cpu().detach().numpy()

            # Initialize a 3x4 depth map (3 rows x 4 columns)
            depth_map = np.zeros([3, 4])
            grid_width = original_width // 4
            grid_height = original_height // 3
            for i in range(len(depth_map)):
                for j in range(len(depth_map[0])):
                    # Average the depth over each grid cell: row i spans
                    # grid_height pixels vertically, column j spans grid_width
                    # pixels horizontally
                    depth_map[i][j] = get_avg_depth(pred_depth_np,
                                                    grid_height * i,
                                                    grid_width * j,
                                                    grid_height * (i + 1),
                                                    grid_width * (j + 1))

            # Giving a simple decision logic
            if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1 or depth_map[
                    0, 2] <= 1 or depth_map[1, 2] <= 1:
                if depth_map[1, 1] <= 1 and depth_map[1, 2] <= 1:
                    print("Dangerous!!! AHEAD")
                else:
                    if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1:
                        print("Dangerous!!! LEFT")
                    if depth_map[0, 2] <= 1 or depth_map[1, 2] <= 1:
                        print("Dangerous!!! RIGHT")
            elif np.sum(depth_map[0:2, 0:1]) <= 7 or np.sum(
                    depth_map[0:2, 2:3]) <= 7:
                if np.sum(depth_map[0:2, 0:1]) <= 7:
                    print("Careful!! LEFT")
                if np.sum(depth_map[0:2, 2:3]) <= 7:
                    print("Careful!! RIGHT")
            else:
                print("Clear")

            if not args.no_display:
                # DISPLAY
                # Generate color-mapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().detach().numpy()
                image_display.display(frame,
                                      disp_resized_np,
                                      fps,
                                      original_width,
                                      original_height,
                                      blended=not args.no_blend)
            else:
                print(f"FPS: {fps}")


    # When everything is done, stop the camera stream (disabled along with the
    # webcam capture above)
    #video_stream.stop()
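
The receive loop above expects a client that first sends an 11-byte SIZE header, waits for a "yes" acknowledgement, and then sends one JPEG-encoded frame. A minimal client sketch under those assumptions (host, port, and image path are placeholders):

import socket

import cv2

sock = socket.create_connection(("127.0.0.1", 5015))
ok, buf = cv2.imencode(".jpg", cv2.imread("assets/test_image.jpg"))
payload = buf.tobytes()
# Pad the header to the fixed 11 bytes the server recv()s
sock.sendall("SIZE {}".format(len(payload)).encode().ljust(11))
if sock.recv(3) == b"yes":
    sock.sendall(payload)
sock.close()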
Example #5
def test_simple(args):
    """Function to predict for a single image or folder of images"""
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)

    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc["height"]
    feed_width = loaded_dict_enc["width"]
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = (os.path.dirname(args.image_path)
                            if not args.dump_path else args.dump_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, "*.{}".format(args.ext)))
        output_directory = args.image_path if not args.dump_path else args.dump_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        mse = 0
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert("RGB")
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp,
                (original_height, original_width),
                mode="bilinear",
                align_corners=False,
            )

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            vmin = disp_resized_np.min()
            normalizer = mpl.colors.Normalize(vmin=vmin, vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap="magma")
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            # Calc error
            correct_file = re.sub(r"\.\w+", "_depth.npy", image_path)
            if os.path.exists(correct_file):
                correct = np.load(correct_file)[:, :, 0]
                disp_np = disp_resized.cpu().detach().numpy()
                disp_np = disp_np[0, 0, :, :]

                correct = ((correct - correct.min()) /
                           (correct.max() - correct.min()) * 255)
                disp_np = ((disp_np - disp_np.min()) /
                           (disp_np.max() - disp_np.min()) * 255)

                # Accumulate the per-image RMSE of the min-max normalized maps
                mse = mse + ((correct - disp_np)**2).mean()**0.5 / 255

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    print(f"mse: {mse}")
    print("-> Done!")
Example #6
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(args.image_path))

    camera_intrinsics_px = [1242*0.58, 375*1.92, 1242*0.5, 375*0.5] # See datasets/kitti_dataset.py
    # TODO: improve loading intrinsics from file ?

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpeg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image_original = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image_original.size
            input_image = input_image_original.resize((feed_width, feed_height), pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear", align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name))
            scaled_disp, depth = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Save PLY pointcloud from depth map
            depth_resized = torch.nn.functional.interpolate(
                depth, (original_height, original_width), mode="nearest") # !! do not interpolate depth values
            depth_resized_np = depth_resized.cpu().numpy()[0][0]
            nbPts = 0
            plypoints = ""
            for v in range(0, original_height):
                for u in range(0, original_width):
                    d = depth_resized_np[v][u]
                    if d <= 0.0:
                        continue
                    r, g, b = input_image_original.getpixel((u, v))
                    x = d * (float(u) - camera_intrinsics_px[2]) / camera_intrinsics_px[0]
                    y = d * (float(v) - camera_intrinsics_px[3]) / camera_intrinsics_px[1]
                    z = d * 1.0
                    nbPts += 1
                    plypoints += "{} {} {} {} {} {}\n".format(x, y, z, r, g, b)
            plyhead = "ply\n"
            plyhead += "format ascii 1.0\n"
            plyhead += "element vertex " + str(nbPts) + "\n"
            plyhead += "property float x\n"
            plyhead += "property float y\n"
            plyhead += "property float z\n"
            plyhead += "property uchar red\n"
            plyhead += "property uchar green\n"
            plyhead += "property uchar blue\n"
            plyhead += "end_header\n"
            with open(os.path.join(output_directory, "{}_disp.ply".format(output_name)), "w+") as filePly:
                filePly.write(plyhead + plypoints + "\n")

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".format(
                idx + 1, len(paths), name_dest_im))

    print('-> Done!')
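
The per-pixel loop above is clear but slow in Python; a vectorized NumPy alternative using the same pinhole model (x = d*(u - cx)/fx, y = d*(v - cy)/fy, z = d), reusing the example's variable names as a sketch:

import numpy as np

fx, fy, cx, cy = camera_intrinsics_px
u, v = np.meshgrid(np.arange(original_width), np.arange(original_height))
d = depth_resized_np
valid = d > 0.0
x = d * (u - cx) / fx
y = d * (v - cy) / fy
points_xyz = np.stack([x[valid], y[valid], d[valid]], axis=1)
colors = np.asarray(input_image_original)[valid]  # per-point RGB values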
Example #7
def test_simple():
    ext = 'jpg'
    # model_name='mono_640x192'
    model_name = 'mono_1024x320'
    no_cuda = False

    if torch.cuda.is_available() and not no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(model_name)
    model_path = os.path.join("models", model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    run_glob = True  # TODO: remove this and declare the global variables at the top of the namespace
    # flight loop
    while run_glob:  # make this a global variable
        # do the imwrite here and set the path
        t = time.time()
        path_ = '/home/edern/Documents/TIPE/traitement/mesures/test_image_000062.jpg'
        output_directory = os.path.dirname(path_)

        with torch.no_grad():
            if path_.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(path_).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(path_))[0]  #MAP ?
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpg".format(output_name))
            im.save(name_dest_im)

            run_glob = False  # testing only
    print('-> Done in {}'.format(time.time() - t))
Example #8
from google.colab.patches import cv2_imshow
import os
import torch
import networks
from utils import download_model_if_doesnt_exist
import re
import cv2
import glob
from PIL import Image

model_name = "mono_640x192"

download_model_if_doesnt_exist(model_name)
encoder_path = os.path.join("models", model_name, "encoder.pth")
depth_decoder_path = os.path.join("models", model_name, "depth.pth")

# LOADING PRETRAINED MODEL
encoder = networks.ResnetEncoder(18, False)
depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc, scales=range(4))

loaded_dict_enc = torch.load(encoder_path, map_location='cpu')
filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
encoder.load_state_dict(filtered_dict_enc)

loaded_dict = torch.load(depth_decoder_path, map_location='cpu')
depth_decoder.load_state_dict(loaded_dict)

encoder.eval()
depth_decoder.eval()
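
The snippet ends after loading the weights; a minimal continuation (a sketch, not the original) that runs one prediction on an assumed test image, reusing the imports and models above, and shows the result with the cv2_imshow patch:

import numpy as np
import PIL.Image as pil
import matplotlib as mpl
import matplotlib.cm as cm
from torchvision import transforms

img = pil.open("assets/test_image.jpg").convert("RGB")  # path is an assumption
w, h = img.size
feed = img.resize((loaded_dict_enc["width"], loaded_dict_enc["height"]), pil.LANCZOS)
with torch.no_grad():
    disp = depth_decoder(encoder(transforms.ToTensor()(feed).unsqueeze(0)))[("disp", 0)]
disp_np = torch.nn.functional.interpolate(
    disp, (h, w), mode="bilinear", align_corners=False).squeeze().numpy()
vmax = np.percentile(disp_np, 95)
mapper = cm.ScalarMappable(norm=mpl.colors.Normalize(vmin=disp_np.min(), vmax=vmax),
                           cmap="magma")
cv2_imshow(cv2.cvtColor((mapper.to_rgba(disp_np)[:, :, :3] * 255).astype(np.uint8),
                        cv2.COLOR_RGB2BGR))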
Example #9
def test_cam(args):
    """Function to predict for a camera image stream
    """

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # Extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Loading complete, initializing the camera")

    # Initialize camera to capture image stream
    # Change the value to 0 when using default camera
    video_stream = WebcamVideoStream(src=args.webcam).start()

    if not args.no_display:
        # Object to display images
        image_display = DisplayImage(not args.no_process)

    # Flag that records when 'q' is pressed to break out of inference loop below
    quit_inference = False

    def on_release(key):
        if key == keyboard.KeyCode.from_char('q'):
            nonlocal quit_inference
            quit_inference = True
            return False

    keyboard.Listener(on_release=on_release).start()

    # Number of frames to capture to calculate fps
    num_frames = 5
    curr_time = np.zeros(num_frames)
    with torch.no_grad():
        while True:
            if quit_inference:
                if args.no_display:
                    print('-> Done')
                break

            # Capture frame-by-frame
            frame = video_stream.read()

            # Calculate the fps
            curr_time[1:] = curr_time[:-1]
            curr_time[0] = time.time()
            fps = num_frames / (curr_time[0] - curr_time[len(curr_time) - 1])

            # Our operations on the frame come here
            input_image = pil.fromarray(frame).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="nearest")

            # Get the predict depth
            scaled_disp, pred_depth = disp_to_depth(disp_resized, 0.1, 100)
            pred_depth_np = pred_depth.squeeze().cpu().detach().numpy()

            # Initialize a 3x4 depth map
            depth_map = np.zeros([3, 4])
            for i in range(len(depth_map)):
                for j in range(len(depth_map[0])):
                    # Cut and store the average value of depth information of 640x480 into 3x4 grid
                    depth_map[i][j] = get_avg_depth(pred_depth_np, 160 * i,
                                                    160 * j, 160 * i + 160,
                                                    160 * j + 160)

            # Giving a simple decision logic
            if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1 or depth_map[
                    0, 2] <= 1 or depth_map[1, 2] <= 1:
                if depth_map[1, 1] <= 1 and depth_map[1, 2] <= 1:
                    print("Dangerous!!! AHEAD")
                else:
                    if depth_map[0, 1] <= 1 or depth_map[1, 1] <= 1:
                        print("Dangerous!!! LEFT")
                    if depth_map[0, 2] <= 1 or depth_map[1, 2] <= 1:
                        print("Dangerous!!! RIGHT")
            elif np.sum(depth_map[0:2, 0:1]) <= 7 or np.sum(
                    depth_map[0:2, 2:3]) <= 7:
                if np.sum(depth_map[0:2, 0:1]) <= 7:
                    print("Careful!! LEFT")
                if np.sum(depth_map[0:2, 2:3]) <= 7:
                    print("Careful!! RIGHT")
            else:
                print("Clear")

            if not args.no_display:
                # DISPLAY
                # Generate color-mapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().detach().numpy()
                image_display.display(frame,
                                      disp_resized_np,
                                      fps,
                                      original_width,
                                      original_height,
                                      blended=not args.no_blend)
            else:
                print(f"FPS: {fps}")


    # When everything is done, stop camera stream
    video_stream.stop()
Example #10
def test_simple(args):
    """Function to predict for a camera or video stream
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()


    try:
        cap = cv2.VideoCapture(int(args.video_path))
        print(f"Loaded camera {int(args.video_path)}")
    except ValueError:
        cap = cv2.VideoCapture(args.video_path)
        print(f"Loaded video file {args.video_path}")

    # PREDICTING ON EACH IMAGE IN TURN
    try:
        with torch.no_grad():
            while True:
                ret, image = cap.read()
                if not ret:
                    break

                # Load image and preprocess (convert the OpenCV BGR frame to
                # RGB before feeding the network)
                input_image = pil.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width), mode="bilinear", align_corners=False)

                # Saving numpy file
                # output_name = os.path.splitext(os.path.basename(image_path))[0]
                # name_dest_npy = os.path.join(output_directory, "{}_disp.npy".format(output_name))
                # scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
                # np.save(name_dest_npy, scaled_disp.cpu().numpy())

                # Saving colormapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                vmax = np.percentile(disp_resized_np, 95)
                normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
                mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
                colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
                # Convert the RGB colormap to BGR for OpenCV display and show
                # it next to the input frame
                colormapped_bgr = cv2.cvtColor(colormapped_im, cv2.COLOR_RGB2BGR)
                cv2.imshow("", np.concatenate(
                    (colormapped_bgr,
                     cv2.resize(image, tuple(colormapped_im.shape[:2][::-1])))))
                key = cv2.waitKey(10)
                if key == ord('q'):
                    break
                elif key == ord('c'):
                    cv2.imwrite("assets/test_image2.jpeg", image)
                # im = pil.fromarray(colormapped_im)

                # name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name))
                # im.save(name_dest_im)

                # print("   Processed {:d} of {:d} images - saved prediction to {}".format(
                #     idx + 1, len(paths), name_dest_im))

        print('-> Done!')
    finally:
        cap.release()
        cv2.destroyAllWindows()
Example #11
    def __init__(self, _host_frame, _target_frame):
        '''
        initialize the randpattern based photometric residual wrapper
        :param _host_frame: numpy ndarray H x W x 3 image.
        :param _target_frame: numpy ndarray image, same dimension as above.
        '''
        # load options
        options = MonodepthOptions()
        opts = options.parse()
        self.opt = opts
        self.num_input_frames = len(self.opt.frame_ids)
        # init model
        self.model_name = "mono_1024x320"

        download_model_if_doesnt_exist(self.model_name)
        self.encoder_path = os.path.join("models", self.model_name,
                                         "encoder.pth")
        self.depth_decoder_path = os.path.join("models", self.model_name,
                                               "depth.pth")
        self.pose_encoder_path = os.path.join("models", self.model_name,
                                              "pose_encoder.pth")
        self.pose_decoder_path = os.path.join("models", self.model_name,
                                              "pose.pth")

        # LOADING PRETRAINED MODEL
        self.encoder = networks.ResnetEncoder(18, False)
        self.depth_decoder = networks.DepthDecoder(
            num_ch_enc=self.encoder.num_ch_enc, scales=range(4))
        self.pose_encoder = networks.ResnetEncoder(self.opt.num_layers, False,
                                                   2)
        # self.pose_encoder = networks.PoseCNN(self.num_input_frames if self.opt.pose_model_input == "all" else 2)
        self.pose_decoder = networks.PoseDecoder(self.pose_encoder.num_ch_enc,
                                                 1, 2)
        # self.pose_decoder = networks.PoseDecoder(self.pose_encoder.num_ch_enc, num_input_features=1,
        #                                          num_frames_to_predict_for=2)

        self.loaded_dict_enc = torch.load(self.encoder_path,
                                          map_location='cpu')
        self.filtered_dict_enc = {
            k: v
            for k, v in self.loaded_dict_enc.items()
            if k in self.encoder.state_dict()
        }
        self.encoder.load_state_dict(self.filtered_dict_enc)

        self.loaded_dict_pose_enc = torch.load(self.pose_encoder_path,
                                               map_location='cpu')
        self.filtered_dict_pose_enc = {
            k: v
            for k, v in self.loaded_dict_pose_enc.items()
            if k in self.pose_encoder.state_dict()
        }
        self.pose_encoder.load_state_dict(self.filtered_dict_pose_enc)

        self.loaded_dict = torch.load(self.depth_decoder_path,
                                      map_location='cpu')
        self.depth_decoder.load_state_dict(self.loaded_dict)

        self.loaded_dict_pose = torch.load(self.pose_decoder_path,
                                           map_location='cpu')
        self.pose_decoder.load_state_dict(self.loaded_dict_pose)

        self.encoder.eval()
        self.depth_decoder.eval()

        self.pose_encoder.eval()
        self.pose_decoder.eval()
        self.isgood = []

        # define frames
        self.host_frame = _host_frame
        self.target_frame = _target_frame
        self.host_frame_dx, self.host_frame_dy = image_gradients(
            self.host_frame)
        self.target_frame_dx, self.target_frame_dy = image_gradients(
            self.target_frame)

        # dso's pattern:
        self.residual_pattern = np.array([
            [0, 0],
            [-2, 0],
            [2, 0],
            [-1, -1],
            [1, 1],
            [-1, 1],
            [1, -1],
            [0, 2],
            [0, -2],
        ])
Example #12
def test_cam(args):
    """Function to predict for an image stream
    """

    # Determine where to run inference
    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # Download model given in args if it doesn't exist
    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # Extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)
    depth_decoder.to(device)
    depth_decoder.eval()

    print("-> Loading complete, initializing the camera")

    # Get coco labels
    ctypes.CDLL("../TRT_object_detection/lib/libflattenconcat.so")
    COCO_LABELS = coco.COCO_CLASSES_LIST

    # initialize
    TRT_LOGGER = trt.Logger(trt.Logger.INFO)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
    runtime = trt.Runtime(TRT_LOGGER)

    # compile model into TensorRT
    if not os.path.isfile(model.TRTbin):
        dynamic_graph = model.add_plugin(gs.DynamicGraph(model.path))
        uff_model = uff.from_tensorflow(dynamic_graph.as_graph_def(),
                                        model.output_name,
                                        output_filename='tmp.uff')

        with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
        ) as network, trt.UffParser() as parser:
            builder.max_workspace_size = 1 << 28
            builder.max_batch_size = 1
            builder.fp16_mode = True

            parser.register_input('Input', model.dims)
            parser.register_output('MarkOutput_0')
            parser.parse('tmp.uff', network)
            engine = builder.build_cuda_engine(network)

            buf = engine.serialize()
            with open(model.TRTbin, 'wb') as f:
                f.write(buf)

    # create engine
    with open(model.TRTbin, 'rb') as f:

        buf = f.read()
        engine = runtime.deserialize_cuda_engine(buf)

    # create buffer
    host_inputs = []
    cuda_inputs = []
    host_outputs = []
    cuda_outputs = []
    bindings = []
    stream = cuda.Stream()

    for binding in engine:
        size = trt.volume(
            engine.get_binding_shape(binding)) * engine.max_batch_size
        host_mem = cuda.pagelocked_empty(size, np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(cuda_mem))
        if engine.binding_is_input(binding):
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)
    context = engine.create_execution_context()

    if not args.no_display:
        # Object to display images
        image_display = DisplayImage(not args.no_process)

    # Flag that records when 'q' is pressed to break out of inference loop below
    quit_inference = False

    # Listener for key board presses and updates quit_inference
    def on_release(key):
        if key == keyboard.KeyCode.from_char('q'):
            nonlocal quit_inference
            quit_inference = True
            return False

    # Initialize listener
    keyboard.Listener(on_release=on_release).start()

    status_socket_thread = SocketStatusThread()
    status_socket_thread.start()

    image_stream_thread = ImageStreamThread()
    image_stream_thread.start()

    # Number of frames to capture to calculate fps
    num_frames = 5
    curr_time = np.zeros(num_frames)

    with torch.no_grad():
        while True:
            if quit_inference:
                if args.no_display:
                    image_stream_thread.stop()
                    print('-> Done')
                break
            frame = image_stream_thread.read_frame()

            # Calculate the fps
            curr_time[1:] = curr_time[:-1]
            curr_time[0] = time.time()
            fps = num_frames / (curr_time[0] - curr_time[len(curr_time) - 1])

            # Do depth inference
            disp_resized, danger_level, danger_side, original_width, original_height = predict_depth(
                frame, feed_width, feed_height, device, encoder, depth_decoder)

            # Only do object detection if danger level is above 0 (i.e. Careful or Dangerous)
            print(f"Danger level: {danger_level}")
            detections_str = ""
            if danger_level > 0:
                detections = detect_objects(frame, host_inputs, host_outputs,
                                            cuda_inputs, cuda_outputs,
                                            bindings, stream, context,
                                            COCO_LABELS)
                # Only sending back detections in region where depth seems close
                # detections = detections_dict[danger_side]
                detections_str = '\n' + '\n'.join('$'.join(map(str, obj))
                                                  for obj in detections)
                print(str(detections))
                print(f"Detections: {detections_str}")

            # Construct string with danger level and END signal
            # Separate each piece (i.e. danger level, each detection, END) with new line so client socket knows
            # where each item ends
            result = str(
                danger_level) + "\n" + danger_side + detections_str + "\nEND\n"
            print("Sending result...")
            image_stream_thread.send_result(result)

            if not args.no_display:
                # Generate color-mapped depth image and display alongside original frame and blended, if chosen
                disp_resized_np = disp_resized.squeeze().cpu().detach().numpy()
                image_display.display(frame,
                                      disp_resized_np,
                                      fps,
                                      original_width,
                                      original_height,
                                      blended=not args.no_blend)
                cv2.waitKey(1)
            else:
                print(f"FPS: {fps}")

    print("Outside of with statement")
    image_stream_thread.stop()
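
Examples #4 and #9 call get_avg_depth() without defining it. A plausible minimal implementation, inferred from how those examples index the grid (this is an assumption, not the original helper):

import numpy as np

def get_avg_depth(depth, y0, x0, y1, x1):
    """Mean predicted depth over the window depth[y0:y1, x0:x1]."""
    return float(np.mean(depth[y0:y1, x0:x1]))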
Example #13
def video_test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    vs = cv2.VideoCapture(args.video_path)
    writer = None

    try:
        prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() \
            else cv2.CAP_PROP_FRAME_COUNT
        total = int(vs.get(prop))
        print("   {} total frames in video".format(total))

    except Exception:
        print("   Could not determine # of frames in video")
        print("   No approx. completion time can be provided")
        total = -1

    # FINDING INPUT VIDEO
    if os.path.isfile(args.video_path):
        paths = [args.video_path]
    elif os.path.isdir(args.video_path):
        paths = glob.glob(
            os.path.join(args.video_path, '*.{}'.format(args.ext)))

    else:
        raise Exception("Can not find args.video_path: {}".format(
            args.video_path))

    # PREDICTING
    with torch.no_grad():
        while True:
            # Load frame and preprocess
            (grabbed, input_image) = vs.read()
            if not grabbed:
                break

            original_height, original_width, c = input_image.shape
            # Convert the OpenCV BGR frame to RGB before feeding the network
            input_image = cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB)
            input_image = cv2.resize(input_image, (feed_width, feed_height),
                                     interpolation=cv2.INTER_LANCZOS4)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            start = time.time()
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            colormapped_im = cv2.cvtColor(colormapped_im, cv2.COLOR_RGB2BGR)
            end = time.time()

            if writer is None:
                # Initialize our video writer
                fourcc = cv2.VideoWriter_fourcc(*'MJPG')
                writer = cv2.VideoWriter(
                    args.video_path_output, fourcc, 30,
                    (colormapped_im.shape[1], colormapped_im.shape[0]), True)
                if total > 0:
                    elap = (end - start)
                    print("   Single frame took {:.4f} seconds".format(elap))
                    print("   Estimated total time to finish: {:.4f}".format(
                        elap * total))
            # Write the output frame to disk
            writer.write(colormapped_im)

    # Release the writer and the stream so the output file is finalized
    if writer is not None:
        writer.release()
    vs.release()

    print('-> Done!')
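
The writer above hard-codes 30 fps; a small sketch that asks the capture for the source frame rate instead and falls back to 30 when the backend reports nothing (the path is an assumption):

import cv2

vs = cv2.VideoCapture("input.mp4")
fps = vs.get(cv2.CAP_PROP_FPS) or 30.0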
Example #14
def test_webcam(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # `webcam_index` is assumed to be defined at module level (0 selects the
    # default camera)
    video_capture = cv2.VideoCapture(webcam_index)

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        while True:
            ret, frame = video_capture.read()  # frame shape (480, 640, 3)
            if not ret:
                break

            # Preprocess: convert BGR to RGB and resize to the network's input size
            input_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            input_image = cv2.resize(input_image, (feed_width, feed_height))
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (frame.shape[0], frame.shape[1]),
                mode="bilinear",
                align_corners=False)

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            # print(disp_resized_np.shape)
            # vmax = np.percentile(disp_resized_np, 95)
            # normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
            mapper = cm.ScalarMappable(cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)

            # to_rgba returns RGB; convert to BGR for cv2.imshow
            cv2.imshow('out', cv2.cvtColor(colormapped_im, cv2.COLOR_RGB2BGR))

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

    video_capture.release()
    cv2.destroyAllWindows()
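
A note on the args object these functions receive: the attributes accessed above (model_name, no_cuda, image_path, ext) suggest an argparse setup along these lines. This is a sketch patterned on monodepth2's own test script, not the parser from the original source:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Monodepth2 prediction demos')
    parser.add_argument('--model_name', type=str,
                        help='pretrained model to use, e.g. "mono+stereo_640x192"')
    parser.add_argument('--image_path', type=str,
                        help='path to a test image or folder of images')
    parser.add_argument('--ext', type=str, default='jpg',
                        help='image extension to search for in a folder')
    parser.add_argument('--no_cuda', action='store_true',
                        help='disable CUDA even when a GPU is available')
    return parser.parse_args()
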
Example #15
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(
        num_ch_enc=encoder.num_ch_enc, scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    print("Loading pose networks")
    pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
    pose_decoder_path = os.path.join(model_path, "pose.pth")

    pose_encoder = networks.ResnetEncoder(18, False, 2)
    pose_encoder.load_state_dict(torch.load(pose_encoder_path))

    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    pose_decoder.load_state_dict(torch.load(pose_decoder_path))

    pose_encoder.cuda()
    pose_encoder.eval()
    pose_decoder.cuda()
    pose_decoder.eval()
    

    bag_name = '2019-12-17-13-24-03'
    map_name = "feature=base&ver=2019121700&base_pt=(32.75707,-111.55757)&end_pt=(32.092537212,-110.7892506)"
    begin = '0:36:00'
    end = '0:37:00'
    output_directory = "assets/"

    dataset = TSDataset(bag_name, begin, end)
    pred_depth = []
    pred_poses = []
    last_img = None

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, input_image in enumerate(dataset):

            # Load image and preprocess
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width), mode="bilinear", align_corners=False)

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)
            pred_depth.append(im)

            # Handle pose
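            # The pose network takes two frames stacked along the channel axis
            # (6 channels) and regresses their relative camera motion as an
            # axis-angle rotation plus a translation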
            if last_img is None:
                last_img = input_image
            all_color_aug = torch.cat([last_img, input_image], 1)
            last_img = input_image

            features = [pose_encoder(all_color_aug)]
            axisangle, translation = pose_decoder(features)
            pose = transformation_from_parameters(axisangle[:, 0], translation[:, 0]).cpu().numpy()
            pred_poses.append(pose)
            
            print("   Processed {:d} of {:d} images".format(
                idx + 1, len(dataset)))
    pred_poses = np.concatenate(pred_poses, axis=0)
    print(pred_poses.shape)
    np.save("poses.npy", pred_poses)

    # save_video(pred_depth)

    print('-> Done!')
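
Several of these examples call disp_to_depth(disp, 0.1, 100) to turn the network's sigmoid disparity output into depth. For reference, a sketch of that helper matching its definition in the monodepth2 repository; the 0.1/100 arguments used above are the min/max depth clamp:

def disp_to_depth(disp, min_depth, max_depth):
    """Map a sigmoid disparity into [1/max_depth, 1/min_depth] and invert it,
    so the returned depth lies in [min_depth, max_depth]."""
    min_disp = 1 / max_depth
    max_disp = 1 / min_depth
    scaled_disp = min_disp + (max_disp - min_disp) * disp
    depth = 1 / scaled_disp
    return scaled_disp, depth
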
Example #16
def test_depth_pose(args):
    """Function to predict depth and pose
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")
    pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
    pose_decoder_path = os.path.join(model_path, "pose.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained depth encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    print("   Loading pretrained pose encoder")
    pose_encoder = networks.ResnetEncoder(18, False, 2)
    loaded_dict_pose_enc = torch.load(pose_encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)

    pose_encoder.load_state_dict(loaded_dict_pose_enc)

    encoder.to(device)
    pose_encoder.to(device)
    encoder.eval()
    pose_encoder.eval()

    print("   Loading pretrained depth decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    print("   Loading pretrained pose decoder")
    pose_decoder = networks.PoseDecoder(pose_encoder.num_ch_enc, 1, 2)
    loaded_dict_pose = torch.load(pose_decoder_path, map_location=device)
    pose_decoder.load_state_dict(loaded_dict_pose)

    depth_decoder.to(device)
    pose_decoder.to(device)
    depth_decoder.eval()
    pose_decoder.eval()

    print("-> Predicting on test images")

    pred_depths = []
    pred_poses = []

    backproject_depth = BackprojectDepth(1, feed_height, feed_width)
    backproject_depth.to(device)
    project_3d = Project3D(1, feed_height, feed_width)
    project_3d.to(device)

    K = np.array(
        [[0.58, 0, 0.5, 0], [0, 1.92, 0.5, 0], [0, 0, 1, 0], [0, 0, 0, 1]],
        dtype=np.float32)
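    # Monodepth2 stores KITTI intrinsics in normalized image coordinates, so
    # the first two rows are scaled by the network's feed resolution to get
    # pixel-unit focal lengths and principal point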
    K[0, :] *= feed_width
    K[1, :] *= feed_height
    inv_K = np.linalg.pinv(K)

    K = torch.from_numpy(K)
    K = K.unsqueeze(0).to(device)
    inv_K = torch.from_numpy(inv_K)
    inv_K = inv_K.unsqueeze(0).to(device)

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():

        for i in range(107):

            # Load image and preprocess
            image_0_path = './kitti_data/01/{:010d}.jpg'.format(i)
            input_image_0 = Image.open(image_0_path).convert('RGB')
            original_width, original_height = input_image_0.size
            input_image_0 = input_image_0.resize((feed_width, feed_height),
                                                 Image.LANCZOS)
            input_image_0 = transforms.ToTensor()(input_image_0).unsqueeze(0)

            image_1_path = './kitti_data/01/{:010d}.jpg'.format(i + 1)
            input_image_1 = Image.open(image_1_path).convert('RGB')
            input_image_1 = input_image_1.resize((feed_width, feed_height),
                                                 Image.LANCZOS)
            input_image_1 = transforms.ToTensor()(input_image_1).unsqueeze(0)

            # PREDICTION for depth
            input_image_0 = input_image_0.to(device)
            features = encoder(input_image_0)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            #disp_resized = torch.nn.functional.interpolate(
            #    disp, (original_height, original_width), mode="bilinear", align_corners=False)

            _, pred_depth = disp_to_depth(disp, 0.1, 100)
            # Keep the tensor for the reconstruction step below; store a
            # numpy copy for saving
            pred_depth_np = pred_depth.cpu()[:, 0].numpy()

            pred_depths.append(pred_depth_np[0])

            print("   Predict Depth {:d}".format(i))

            # PREDICTION for pose
            input_image_1 = input_image_1.to(device)
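            # Concatenate the adjacent frames along channels; the pose decoder
            # predicts the motion from frame i to frame i+1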
            input_image_pose = torch.cat([input_image_0, input_image_1], 1)
            features_pose = pose_encoder(input_image_pose)
            features_pose = [features_pose]
            axisangle, translation = pose_decoder(features_pose)

            pred_pose = transformation_from_parameters(axisangle[:, 0],
                                                       translation[:, 0])

            pred_poses.append(pred_pose.cpu()[0].numpy())

            print("   Predict Pose {:d}".format(i))
            print(pred_pose)

            # WARPED image (RECONSTRUCTION is assumed to be a module-level flag)
            if RECONSTRUCTION:
                print("   Reconstruct image {:d}".format(i))
                cam_points = backproject_depth(pred_depth, inv_K)
                pix_coords = project_3d(cam_points, K, pred_pose)
                reconstruct_image_0 = torch.nn.functional.grid_sample(
                    input_image_1, pix_coords, padding_mode="border")
                print("   Saving resonstructed image...")

                reconstruct_image_0 = torch.nn.functional.interpolate(
                    reconstruct_image_0, (original_height, original_width),
                    mode="bilinear",
                    align_corners=False)
                reconstruct_image_0_np = reconstruct_image_0.squeeze().cpu().numpy()
                reconstruct_image_0_np = (reconstruct_image_0_np * 255).astype(
                    np.uint8)
                # (3, H, W) -> (H, W, 3) for PIL
                reconstruct_image_0_np = reconstruct_image_0_np.transpose(1, 2, 0)
                im = Image.fromarray(reconstruct_image_0_np, mode='RGB')
                name_dest_im = os.path.join("kitti_data/01", "warped",
                                            "{:010d}_warped.jpg".format(i))
                im.save(name_dest_im)
            print("...")

    np.save('kitti_data/pred_depth_01.npy', np.array(pred_depths))
    np.save('kitti_data/pred_pose_01.npy', np.array(pred_poses))
    print('-> Done!')
Example #17
def test_simple(args):
    """Function to predict depth for each image in a set of MOT17 sequences
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    datasetVOT = [
        'bag', 'ball1', 'ball2', 'basketball', 'birds1', 'birds2', 'blanket',
        'bmx', 'bolt1', 'bolt2', 'book', 'butterfly', 'car1', 'car2',
        'crossing', 'dinosaur', 'fernando', 'fish1', 'fish2', 'fish3', 'fish4',
        'girl', 'glove', 'godfather', 'graduate', 'gymnastics1', 'gymnastics2',
        'gymnastics3', 'gymnastics4', 'hand', 'handball1', 'handball2',
        'helicopter', 'iceskater1', 'iceskater2', 'leaves', 'marching',
        'matrix', 'motocross1', 'motocross2', 'nature', 'octopus',
        'pedestrian1', 'pedestrian2', 'rabbit', 'racing', 'road', 'shaking',
        'sheep', 'singer1', 'singer2', 'singer3', 'soccer1', 'soccer2',
        'soldier', 'sphere', 'tiger', 'traffic', 'tunnel', 'wiper'
    ]

    datasetMOT = [
        'MOT17-02', 'MOT17-04', 'MOT17-05', 'MOT17-09', 'MOT17-10', 'MOT17-11',
        'MOT17-13'
    ]

    for d in datasetMOT:

        new_path = os.path.join(args.image_path, d, "img1")
        print(new_path, d)
        if os.path.isdir(new_path):
            # Searching folder for images
            paths = glob.glob(os.path.join(new_path, '*.{}'.format(args.ext)))
            output_directory = new_path
        else:
            raise Exception("Cannot find args.image_path: {}".format(
                args.image_path))

        print("-> Predicting on {:d} test images".format(len(paths)))

        # PREDICTING ON EACH IMAGE IN TURN
        with torch.no_grad():
            for idx, image_path in enumerate(paths):

                if image_path.endswith("_disp.jpg"):
                    # don't try to predict disparity for a disparity image!
                    continue

                # Load image and preprocess
                input_image = pil.open(image_path).convert('RGB')
                original_width, original_height = input_image.size
                input_image = input_image.resize((feed_width, feed_height),
                                                 pil.LANCZOS)
                input_image = transforms.ToTensor()(input_image).unsqueeze(0)

                # PREDICTION
                input_image = input_image.to(device)
                features = encoder(input_image)
                outputs = depth_decoder(features)

                disp = outputs[("disp", 0)]
                disp_resized = torch.nn.functional.interpolate(
                    disp, (original_height, original_width),
                    mode="bilinear",
                    align_corners=False)

                # Saving numpy file
                output_name = os.path.splitext(os.path.basename(image_path))[0]
                name_dest_npy = os.path.join(output_directory,
                                             "{}_disp.npy".format(output_name))
                scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
                np.save(name_dest_npy, scaled_disp.cpu().numpy())

                # Saving colormapped depth image
                disp_resized_np = disp_resized.squeeze().cpu().numpy()
                vmax = np.percentile(disp_resized_np, 95)
                normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                                  vmax=vmax)
                mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
                colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                                  255).astype(np.uint8)
                im = pil.fromarray(colormapped_im)

                name_dest_im = os.path.join(new_path,
                                            "{}_disp.jpeg".format(output_name))
                im.save(name_dest_im)

                print(
                    "   Processed {:d} of {:d} images - saved prediction to {}"
                    .format(idx + 1, len(paths), name_dest_im))

    print('-> Done!')
Example #18
def image_demo(image_path, model_name):
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(model_name)
    model_path = os.path.join("models", model_name)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")
    print("-> Loading model from ", model_path)
    print("-> encoder_path = ", encoder_path)
    print("-> depth_decoder_path = ", depth_decoder_path)

    # LOADING PRETRAINED MODEL
    print("-> Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("-> Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    print("-> Predicting test image : ", image_path)

    with torch.no_grad():
        # Load image and preprocess
        input_image = pil.open(image_path).convert('RGB')
        print(input_image)
        original_width, original_height = input_image.size
        print("-> input image size ", input_image.size)
        input_image = input_image.resize((feed_width, feed_height),
                                         pil.LANCZOS)
        print("-> resize to ", input_image.size)
        input_image = transforms.ToTensor()(input_image).unsqueeze(0)
        print(input_image)

        # PREDICTION
        input_image = input_image.to(device)
        print(input_image)
        features = encoder(input_image)
        outputs = depth_decoder(features)

        disp = outputs[("disp", 0)]
        disp_resized = torch.nn.functional.interpolate(
            disp, (original_height, original_width),
            mode="bilinear",
            align_corners=False)

        # Saving colormapped depth image
        disp_resized_np = disp_resized.squeeze().cpu().numpy()
        vmax = np.percentile(disp_resized_np, 95)
        normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                          vmax=vmax)
        mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
        colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                          255).astype(np.uint8)

        # to_rgba returns RGB; convert to BGR for cv2.imshow
        cv2.imshow('img', cv2.cvtColor(colormapped_im, cv2.COLOR_RGB2BGR))
        cv2.waitKey(0)
Example #19
        disp, (frame.shape[0], frame.shape[1]), mode="bilinear", align_corners=False)
    return disp_resized.squeeze().cpu().numpy()
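
The Example #19 snippet is truncated at the top: only the tail of its depth helper survives. As a minimal sketch, the full helper plausibly looked like the following, reusing the module-level encoder, depth_decoder, device, feed_width and feed_height set up below (the name predict_depth and the exact preprocessing are assumptions, patterned on the other examples):

def predict_depth(frame):
    # Hypothetical reconstruction of the truncated helper above
    input_image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # cv2 frames are BGR
    input_image = cv2.resize(input_image, (feed_width, feed_height))
    input_image = transforms.ToTensor()(input_image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = encoder(input_image)
        outputs = depth_decoder(features)
    disp = outputs[("disp", 0)]
    disp_resized = torch.nn.functional.interpolate(
        disp, (frame.shape[0], frame.shape[1]),
        mode="bilinear", align_corners=False)
    return disp_resized.squeeze().cpu().numpy()
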

# Initialize SSD
net = build_ssd('test', 300, 21)
net.load_state_dict(torch.load('data/weights/ssd_300_VOC0712.pth'))
transform = BaseTransform(net.size, (104/256.0, 117/256.0, 123/256.0))

# Initialize Monodepth2
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU BOiii")
else:
    device = torch.device("cpu")

download_model_if_doesnt_exist(DEPTH_MODEL_NAME)
model_path = os.path.join("models", DEPTH_MODEL_NAME)
print("-> Loading model from ", model_path)
encoder_path = os.path.join(model_path, "encoder.pth")
depth_decoder_path = os.path.join(model_path, "depth.pth")

# LOADING PRETRAINED MODEL
print("   Loading pretrained encoder")
encoder = networks.ResnetEncoder(18, False)
loaded_dict_enc = torch.load(encoder_path, map_location=device)

# extract the height and width of image that this model was trained with
feed_height = loaded_dict_enc['height']
feed_width = loaded_dict_enc['width']
filtered_dict_enc = {k: v for k, v in loaded_dict_enc.items() if k in encoder.state_dict()}
encoder.load_state_dict(filtered_dict_enc)
Example #20
def video_demo(video_path, model_name):
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(model_name)
    model_path = os.path.join("models", model_name)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")
    print("-> Loading model from ", model_path)
    print("-> encoder_path = ", encoder_path)
    print("-> depth_decoder_path = ", depth_decoder_path)

    # LOADING PRETRAINED MODEL
    print("-> Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("-> Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # play video
    vid = cv2.VideoCapture(video_path)
    if not vid.isOpened():
        raise IOError("Couldn't open webcam or video")
    while True:
        ok, frame = vid.read()
        if not ok:
            break
        with torch.no_grad():
            # Load image and preprocess
            input_image = pil.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            # print(input_image)
            original_width, original_height = input_image.size
            #print("-> input image size ", input_image.size)
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            #print("-> resize to ", input_image.size)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)
            # print(input_image)

            # PREDICTION
            input_image = input_image.to(device)
            # print(input_image)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)

            # to_rgba returns RGB; convert to BGR for cv2.imshow
            cv2.imshow('img', cv2.cvtColor(colormapped_im, cv2.COLOR_RGB2BGR))
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

    cv2.destroyAllWindows()
Example #21
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")
    pose_encoder_path = os.path.join(model_path, "pose_encoder.pth")
    pose_decoder_path = os.path.join(model_path, "pose.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    # Stock monodepth2's ResnetEncoder only accepts depths {18, 34, 50, 101, 152}
    encoder = networks.ResnetEncoder(18, False)
    # encoder = networks.PackNeSt_encoder()  # REVIEW
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))
    # depth_decoder = networks.PackNeSt_decoder()  # REVIEW

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))
        output_directory = args.image_path
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())
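            # Note: the .npy stores scaled disparity at the network's feed
            # resolution (not the resized map); depth is its reciprocal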

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpeg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    print('-> Done!')
Example #22
def test_simple(args):
    """Function to predict for a single image or folder of images
    """
    assert args.model_name is not None, \
        "You must specify the --model_name parameter; see README.md for an example"

    if torch.cuda.is_available() and not args.no_cuda:
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    download_model_if_doesnt_exist(args.model_name)
    model_path = os.path.join("models", args.model_name)
    print("-> Loading model from ", model_path)
    encoder_path = os.path.join(model_path, "encoder.pth")
    depth_decoder_path = os.path.join(model_path, "depth.pth")

    # LOADING PRETRAINED MODEL
    print("   Loading pretrained encoder")
    encoder = networks.ResnetEncoder(18, False)
    loaded_dict_enc = torch.load(encoder_path, map_location=device)

    # extract the height and width of image that this model was trained with
    feed_height = loaded_dict_enc['height']
    feed_width = loaded_dict_enc['width']
    filtered_dict_enc = {
        k: v
        for k, v in loaded_dict_enc.items() if k in encoder.state_dict()
    }
    encoder.load_state_dict(filtered_dict_enc)
    encoder.to(device)
    encoder.eval()

    print("   Loading pretrained decoder")
    depth_decoder = networks.DepthDecoder(num_ch_enc=encoder.num_ch_enc,
                                          scales=range(4))

    loaded_dict = torch.load(depth_decoder_path, map_location=device)
    depth_decoder.load_state_dict(loaded_dict)

    depth_decoder.to(device)
    depth_decoder.eval()

    # FINDING INPUT IMAGES
    if os.path.isfile(args.image_path):
        # Only testing on a single image
        paths = [args.image_path]
        output_directory = os.path.dirname(args.image_path)
    elif os.path.isdir(args.image_path):
        # Searching folder for images
        paths = glob.glob(
            os.path.join(args.image_path, '*.{}'.format(args.ext)))

        output_directory = os.path.join(args.image_path, 'disp_images')
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        video_directory = os.path.join(args.image_path, 'videos')
        if not os.path.exists(video_directory):
            os.makedirs(video_directory)
    else:
        raise Exception("Can not find args.image_path: {}".format(
            args.image_path))

    print("-> Predicting on {:d} test images".format(len(paths)))

    # PREDICTING ON EACH IMAGE IN TURN
    with torch.no_grad():
        for idx, image_path in enumerate(paths):

            if image_path.endswith("_disp.jpg"):
                # don't try to predict disparity for a disparity image!
                continue

            # Load image and preprocess
            input_image = pil.open(image_path).convert('RGB')
            original_width, original_height = input_image.size
            input_image = input_image.resize((feed_width, feed_height),
                                             pil.LANCZOS)
            input_image = transforms.ToTensor()(input_image).unsqueeze(0)

            # PREDICTION
            input_image = input_image.to(device)
            features = encoder(input_image)
            outputs = depth_decoder(features)

            disp = outputs[("disp", 0)]
            disp_resized = torch.nn.functional.interpolate(
                disp, (original_height, original_width),
                mode="bilinear",
                align_corners=False)

            # Saving numpy file
            output_name = os.path.splitext(os.path.basename(image_path))[0]
            name_dest_npy = os.path.join(output_directory,
                                         "{}_disp.npy".format(output_name))
            scaled_disp, _ = disp_to_depth(disp, 0.1, 100)
            np.save(name_dest_npy, scaled_disp.cpu().numpy())

            # Saving colormapped depth image
            disp_resized_np = disp_resized.squeeze().cpu().numpy()
            vmax = np.percentile(disp_resized_np, 95)
            normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(),
                                              vmax=vmax)
            mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
            colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] *
                              255).astype(np.uint8)
            im = pil.fromarray(colormapped_im)

            name_dest_im = os.path.join(output_directory,
                                        "{}_disp.jpg".format(output_name))
            im.save(name_dest_im)

            print("   Processed {:d} of {:d} images - saved prediction to {}".
                  format(idx + 1, len(paths), name_dest_im))

    # Create videos if folder
    if os.path.isdir(args.image_path):

        print('-> Building the original video from the inputted images')

        # Sorting files
        files = [
            file for file in os.listdir(args.image_path)
            if os.path.isfile(os.path.join(args.image_path, file))
        ]
        nums = [int(re.findall(r'\d+', s)[0]) for s in files]
        # Order filenames by their leading frame number
        files = [f for _, f in sorted(zip(nums, files))]

        orig_imgs = []
        for file in files:
            if not file.endswith('.{}'.format(args.ext)):
                continue
            temp = cv2.imread(os.path.join(args.image_path, file))
            orig_imgs.append(temp)

        height, width = orig_imgs[0].shape[0:2]

        orig_video = cv2.VideoWriter(
            os.path.join(video_directory, 'orig_video.avi'),
            cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10, (width, height))

        for image in orig_imgs:
            orig_video.write(image)

        cv2.destroyAllWindows()
        orig_video.release()

        print('-> Building the depth video')

        # Sorting files
        outputs = [
            file for file in os.listdir(output_directory)
            if file.endswith('jpg')
        ]
        nums = [int(re.findall(r'\d+', s)[0]) for s in outputs]
        # Order depth images by their leading frame number
        outputs = [f for _, f in sorted(zip(nums, outputs))]

        depth_imgs = []
        for file in outputs:
            if file.endswith("_disp.npy"):
                continue
            temp_depth = cv2.imread(os.path.join(output_directory, file))
            depth_imgs.append(temp_depth)

        height, width = depth_imgs[0].shape[0:2]

        depth_video = cv2.VideoWriter(
            os.path.join(video_directory, 'depth_video.avi'),
            cv2.VideoWriter_fourcc('M', 'J', 'P', 'G'), 10, (width, height))

        for image in depth_imgs:
            depth_video.write(image)

        cv2.destroyAllWindows()
        depth_video.release()

    print('-> Done!')