def valid(datacfg, cfgfile, weightfile, outfile):
    """Run the detector over the validation list and write per-class results.

    One text file per class is written to ``results/<outfile><class>.txt``.
    Each line has the form ``<image id> <prob> <x1> <y1> <x2> <y2>`` with the
    box corners expressed in pixels of the original image.

    Args:
        datacfg: Path of the data config; its ``valid`` entry is the image
            list file and its ``names`` entry the class-name file.
        cfgfile: Path of the Darknet network config.
        weightfile: Path of the weights loaded into the network.
        outfile: Prefix prepended to every per-class result file name.
    """
    # Local import: only this function needs it.
    from contextlib import ExitStack

    cudnn.enabled = True
    cudnn.benchmark = True

    options = read_data_cfg(datacfg)
    valid_images = options['valid']
    name_list = options['names']
    prefix = 'results'
    names = load_class_names(name_list)

    with open(valid_images) as fp:
        valid_files = [line.rstrip() for line in fp]

    m = Darknet(cfgfile)
    m.print_network()
    m.load_weights(weightfile)
    m.cuda()
    m.eval()
    print('shape:', m.width, 'x', m.height)

    os.makedirs(prefix, exist_ok=True)

    conf_thresh = 0.005
    nms_thresh = 0.45

    # ExitStack closes every per-class file even when inference raises.
    with ExitStack() as stack:
        fps = [
            stack.enter_context(
                open('%s/%s%s.txt' % (prefix, outfile, names[i]), 'w'))
            for i in range(m.num_classes)
        ]

        for batch_idx, valid_file in enumerate(valid_files):
            image = cv2.imread(valid_file)
            assert image is not None, 'could not read image: %s' % valid_file
            image2 = letterbox_image(image, m.width, m.height)
            if batch_idx == 0:
                # Keep one letterboxed sample on disk for visual inspection.
                cv2.imwrite('letterbox_image.jpg', image2.astype(np.uint8))
            image_tensor = image_to_tensor(image2)
            data = image_tensor.cuda()
            with torch.no_grad():
                output = m(data)

            batch_boxes = get_region_boxes2(
                output, image.shape[1], image.shape[0], m.width, m.height,
                conf_thresh, m.num_classes, m.anchors, m.num_anchors, 1)

            fileId = os.path.basename(valid_file).split('.')[0]
            height, width = image.shape[:2]
            print('[{}/{}]: '.format(batch_idx, len(valid_files)),
                  valid_file, ' ', len(batch_boxes[0]))

            boxes = nms_class(batch_boxes[0], nms_thresh, m.num_classes)
            for box in boxes:
                # box holds (cx, cy, w, h, ...) scaled here by the image size;
                # presumably normalised to [0, 1] -- TODO confirm.
                x1 = (box[0] - box[2] / 2.0) * width
                y1 = (box[1] - box[3] / 2.0) * height
                x2 = (box[0] + box[2] / 2.0) * width
                y2 = (box[1] + box[3] / 2.0) * height
                # Clip the corners to the image.
                if x1 < 0:
                    x1 = 0
                if y1 < 0:
                    y1 = 0
                if x2 >= width:
                    x2 = width - 1
                if y2 >= height:
                    y2 = height - 1

                # Per-class scores start at index 5 of the box record.
                for j in range(m.num_classes):
                    prob = box[5 + j]
                    if prob >= conf_thresh:
                        fps[j].write('%s %f %f %f %f %f\n'
                                     % (fileId, prob, x1, y1, x2, y2))
def predict():
    """Save the uploaded image, run YOLOv3 on it and render the result page.

    Files posted under the ``file`` form field are stored in the
    application's ``static/`` directory. Detection then runs on
    ``static/<name of the last saved file>``; labelled boxes are drawn onto
    the image, which is written back under ``static/``, and ``results.html``
    is rendered with that file name.

    NOTE(review): the network and weights are loaded on every request, and
    ``exit()`` is still called when the image is missing or nothing is
    detected. Inside a web handler that raises ``SystemExit`` rather than
    returning an error page -- confirm the intended behaviour.
    """
    target = os.path.join(APP_ROOT, 'static/')
    print(target)
    # Create the upload directory when missing; an existing one is fine.
    os.makedirs(target, exist_ok=True)

    print(request.files.getlist("file"))
    for upload in request.files.getlist("file"):
        print(upload)
        print("{} is the file name".format(upload.filename))
        # The client controls the file name: keep only its final path
        # component so it cannot point outside the upload directory.
        filename = os.path.basename((upload.filename or "").replace("\\", "/"))
        destination = "/".join([target, filename])
        print("Accept incoming file:", filename)
        print("Save it to:", destination)
        upload.save(destination)

    print(filename)
    images = "static/" + str(filename)
    batch_size = 1
    confidence = 0.5
    nms_thesh = 0.4
    CUDA = torch.cuda.is_available()
    num_classes = 80
    classes = load_classes('data/coco.names')

    print("Loading network.....")
    model = Darknet("cfg/yolov3.cfg")
    model.load_weights("yolov3.weights")
    print("Network successfully loaded")
    model.net_info["height"] = "416"
    inp_dim = int(model.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32

    if CUDA:
        model.cuda()
    model.eval()

    # `images` is normally a single file, in which case os.listdir raises
    # NotADirectoryError and the list holds just that file.
    image_exts = ('.png', '.jpeg', '.jpg')
    try:
        imlist = [osp.join(osp.realpath('.'), images, img)
                  for img in os.listdir(images)
                  if os.path.splitext(img)[1] in image_exts]
    except NotADirectoryError:
        imlist = [osp.join(osp.realpath('.'), images)]
    except FileNotFoundError:
        print("No file or directory with the name {}".format(images))
        exit()

    batches = list(map(prep_image, imlist, [inp_dim for _ in imlist]))
    im_batches = [x[0] for x in batches]
    orig_ims = [x[1] for x in batches]
    im_dim_list = [x[2] for x in batches]
    im_dim_list = torch.FloatTensor(im_dim_list).repeat(1, 2)
    if CUDA:
        im_dim_list = im_dim_list.cuda()

    leftover = 0
    if len(im_dim_list) % batch_size:
        leftover = 1
    if batch_size != 1:
        num_batches = len(imlist) // batch_size + leftover
        im_batches = [
            torch.cat(im_batches[i * batch_size:
                                 min((i + 1) * batch_size, len(im_batches))])
            for i in range(num_batches)
        ]

    i = 0
    output = None  # accumulated detections over all batches
    # One forward pass on a test input before the detection loop.
    model(get_test_input(inp_dim, CUDA), CUDA)

    for batch in im_batches:
        start = time.time()
        if CUDA:
            batch = batch.cuda()
        with torch.no_grad():
            prediction = model(Variable(batch), CUDA)
        prediction = write_results(prediction, confidence, num_classes,
                                   nms=True, nms_conf=nms_thesh)
        # write_results hands back an int when the batch has no detections.
        if type(prediction) == int:
            i += 1
            continue
        end = time.time()

        # Turn the index within the batch into an index into imlist.
        prediction[:, 0] += i * batch_size
        if output is None:
            output = prediction
        else:
            output = torch.cat((output, prediction))

        batch_images = imlist[i * batch_size:
                              min((i + 1) * batch_size, len(imlist))]
        for im_num, image in enumerate(batch_images):
            im_id = i * batch_size + im_num
            objs = [classes[int(x[-1])] for x in output if int(x[0]) == im_id]
            print("{0:20s} predicted in {1:6.3f} seconds".format(
                image.split("/")[-1], (end - start) / batch_size))
            print("{0:20s} {1:s}".format("Objects Detected:", " ".join(objs)))
            print("----------------------------------------------------------")
        i += 1

        if CUDA:
            torch.cuda.synchronize()

    if output is None:
        print("No detections were made")
        exit()

    # Undo the resize/padding applied by prep_image so that the boxes are
    # expressed in the coordinates of the original images.
    im_dim_list = torch.index_select(im_dim_list, 0, output[:, 0].long())
    scaling_factor = torch.min(inp_dim / im_dim_list, 1)[0].view(-1, 1)
    output[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim_list[:, 0].view(-1, 1)) / 2
    output[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim_list[:, 1].view(-1, 1)) / 2
    output[:, 1:5] /= scaling_factor

    # Clip every box to its image.
    for row in range(output.shape[0]):
        output[row, [1, 3]] = torch.clamp(output[row, [1, 3]], 0.0, im_dim_list[row, 0])
        output[row, [2, 4]] = torch.clamp(output[row, [2, 4]], 0.0, im_dim_list[row, 1])

    # The palette is a local, trusted pickle shipped with the application.
    with open("pallete", "rb") as palette_file:
        colors = pkl.load(palette_file)

    def draw_detection(x, results):
        """Draw one detection row ``x`` onto its image in ``results``."""
        c1 = tuple(x[1:3].int())
        c2 = tuple(x[3:5].int())
        img = results[int(x[0])]
        cls = int(x[-1])
        label = "{0}".format(classes[cls])
        color = random.choice(colors)
        cv2.rectangle(img, c1, c2, color, 2)
        t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
        # Filled rectangle behind the label text.
        c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
        cv2.rectangle(img, c1, c2, color, -1)
        cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4),
                    cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
        return img

    for detection in output:
        draw_detection(detection, orig_ims)

    # Write the annotated images back under static/.
    det_names = ["{}/{}".format("static", path.split("/")[-1]) for path in imlist]
    for det_name, annotated in zip(det_names, orig_ims):
        cv2.imwrite(det_name, annotated)

    torch.cuda.empty_cache()
    return render_template("results.html", image_name=filename)
class Yolo6dDetector:
    """Single-object 6D pose detector built on a Darknet pose network.

    Typical call order: ``loadData`` (weights and mesh), ``setColorImg``
    (input frame), ``detection`` (pose estimate), ``getReuslt`` (print it).
    """

    def __init__(self):
        print('initialization........')
        self.cfgfile = 'cfg/yolo-pose.cfg'
        self.outfile = 'output_file'
        self.object_weight = 'backup/small_duck/model.weights'
        self.ply_model = './LINEMOD/small_duck/small_duck.ply'
        self.model = Darknet(self.cfgfile)
        self.num_classes = 1
        self.conf_thresh = 0.1
        self.test_width = 544
        self.test_height = 544
        self.R_pr = None
        self.r_pr = None  # never written by this class; kept for compatibility
        self.t_pr = None  # read by getReuslt, so it must exist before detection
        self.Rt_pr = None
        self.visualize = True
        self.data = None
        self.img_show = None

    def loadData(self, empty_one, empty_two):
        """Load the weights, the object mesh and the camera intrinsics.

        Both arguments are only printed.
        """
        print("in python detector loadData\n")
        print("%s\n" % empty_one)
        print("%s\n" % empty_two)
        self.model.load_weights(self.object_weight)
        self.model.eval()
        self.mesh = MeshPly(self.ply_model)
        # Mesh vertices with a trailing row of ones, one vertex per column.
        self.vertices = np.c_[np.array(self.mesh.vertices),
                              np.ones((len(self.mesh.vertices), 1))].transpose()
        self.corners3D = get_3D_corners(self.vertices)
        self.internal_calibration = get_camera_intrinsic()
        # Index pairs into the projected corners that are joined by a line
        # when visualising.
        self.edges_corners = [[0, 1], [0, 2], [0, 4], [1, 3], [1, 5], [2, 3],
                              [2, 6], [3, 7], [4, 5], [4, 6], [5, 7], [6, 7]]

    def setColorImg(self, img):
        """Convert a BGR image into the network input and keep a display copy."""
        print("in python detector setColorImg\n")
        img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        img = img.resize((self.test_width, self.test_height))
        transform = transforms.Compose([transforms.ToTensor(), ])
        # The tensor is reshaped to (1, 3, height, width); the configured
        # size is used instead of a hard-coded 544.
        self.data = Variable(
            transform(img).view(1, 3, self.test_height, self.test_width))
        img_show = self.data[0, :, :, :]
        img_show = img_show.numpy().squeeze()
        # Channels-last copy for matplotlib.
        self.img_show = np.transpose(img_show, (1, 2, 0))

    def detection(self):
        """Estimate the pose on the current image and optionally plot it.

        Stores the PnP result in ``R_pr``, ``t_pr`` and ``Rt_pr``. The image
        is skipped, leaving the previous pose untouched, when the network
        returns no box for it.
        """
        print("in python detector detection\n")
        # NOTE(review): the output is moved to CUDA although this class never
        # moves the model there -- confirm the intended device.
        output = self.model(self.data).cuda()
        all_boxes = get_region_boxes(output, self.conf_thresh, self.num_classes)

        for i in range(output.size(0)):
            boxes = all_boxes[i]

            # Keep the box with the highest value at index 18, which the
            # original comments call the confidence.
            best_conf_est = -1
            box_pr = None
            for box in boxes:
                if box[18] > best_conf_est:
                    box_pr = box
                    best_conf_est = box[18]
            if box_pr is None:
                # Nothing to estimate; do not reuse an earlier image's box.
                print("in python detector detection: no box for image %d\n" % i)
                continue

            # Scale the nine predicted 2D points by the hard-coded 1920x1080
            # image size.
            corners2D_pr = np.array(np.reshape(box_pr[:18], [9, 2]), dtype='float32')
            corners2D_pr[:, 0] = corners2D_pr[:, 0] * 1920
            corners2D_pr[:, 1] = corners2D_pr[:, 1] * 1080

            # Solve PnP against a zero column followed by the 3D corners.
            points_3d = np.array(
                np.transpose(np.concatenate(
                    (np.zeros((3, 1)), self.corners3D[:3, :]), axis=1)),
                dtype='float32')
            self.R_pr, self.t_pr = pnp(
                points_3d, corners2D_pr,
                np.array(self.internal_calibration, dtype='float32'))
            self.Rt_pr = np.concatenate((self.R_pr, self.t_pr), axis=1)
            proj_corners_pr = np.transpose(compute_projection(
                self.corners3D, self.Rt_pr, self.internal_calibration))

            if self.visualize:
                plt.xlim((0, 1920))
                plt.ylim((0, 1080))
                # NOTE(review): scipy.misc.imresize no longer exists in
                # recent SciPy releases -- confirm the pinned version.
                plt.imshow(scipy.misc.imresize(self.img_show, (1080, 1920)))
                for edge in self.edges_corners:
                    plt.plot(proj_corners_pr[edge, 0], proj_corners_pr[edge, 1],
                             color='b', linewidth=3.0)
                plt.gca().invert_yaxis()
                plt.show()

    def getReuslt(self):
        """Print the latest rotation and translation (``None`` before any)."""
        print("in python detector getReuslt\n")
        print(self.R_pr)
        print(self.t_pr)

    def setDepthImg(self, img):
        """Accept a depth image; it is currently ignored."""
        print("in python detector setDepthImg\n")
class Car_DC():
    """Vehicle detector plus attribute classifier run over image tracklets."""

    def __init__(self,
                 src_dir,
                 dst_dir,
                 car_cfg_path=local_car_cfg_path,
                 car_det_weights_path=local_car_det_weights_path,
                 inp_dim=768,
                 prob_th=0.2,
                 nms_th=0.4,
                 num_classes=1):
        """Load both models and collect the image paths under ``src_dir``.

        Args:
            src_dir: Root directory; entries named ``set*_image`` are
                searched three directory levels deep for .jpg/.png files.
            dst_dir: Output directory; created, or emptied of .jpg files.
            car_cfg_path: Darknet config of the vehicle detector.
            car_det_weights_path: Weights of the vehicle detector.
            inp_dim: Network input size.
            prob_th: Detection confidence threshold.
            nms_th: Non-maximum-suppression threshold.
            num_classes: Number of detector classes.
        """
        self.inp_dim = inp_dim
        self.prob_th = prob_th
        self.nms_th = nms_th
        self.num_classes = num_classes
        self.dst_dir = dst_dir

        # Start from a dst_dir without stale .jpg results.
        if os.path.exists(self.dst_dir):
            for x in os.listdir(self.dst_dir):
                if x.endswith('.jpg'):
                    os.remove(self.dst_dir + '/' + x)
        else:
            os.makedirs(self.dst_dir)

        # Vehicle detection model.
        self.detector = Darknet(car_cfg_path)
        self.detector.load_weights(car_det_weights_path)
        self.detector.net_info['height'] = self.inp_dim
        self.detector.to(device)
        self.detector.eval()
        print('=> car detection model initiated.')

        # Multi-label attribute classifier.
        self.classifier = Car_Classifier(num_cls=19, model_path=local_model_path)

        # Walk set*_image/<level 1>/<level 2>/<image>. Only the first level
        # is sorted, as in the original code.
        self.imgs_path = [
            os.path.join(src_dir, x) for x in os.listdir(src_dir)
            if x.startswith('set') and x.endswith('_image')
        ]
        self.imgs_path = [
            os.path.join(x, y) for x in self.imgs_path for y in os.listdir(x)
        ]
        self.imgs_path.sort()
        self.imgs_path = [
            os.path.join(x, y) for x in self.imgs_path for y in os.listdir(x)
        ]
        self.imgs_path = [
            os.path.join(x, y) for x in self.imgs_path for y in os.listdir(x)
            if y.endswith(('.jpg', '.png'))
        ]

    def cls_draw_bbox(self, output, orig_img):
        """Classify the first usable detection and draw it onto ``orig_img``.

        Rows of length 7 are skipped. The first remaining row is cropped,
        classified and drawn with its label; later rows are ignored.

        Returns:
            ``(car_color, car_direction, car_type)`` of the classified
            detection, or three ``None`` values when no row was usable.
        """
        car_color, car_direction, car_type = None, None, None
        color = (0, 215, 255)

        for det in output:
            if len(det) == 7:
                continue

            pt_1 = tuple(det[1:3].int())  # top-left corner
            pt_2 = tuple(det[3:5].int())  # bottom-right corner

            # Crop the box and reverse the channel order for the classifier.
            ROI = Image.fromarray(
                orig_img[pt_1[1]:pt_2[1], pt_1[0]:pt_2[0]][:, :, ::-1])
            car_color, car_direction, car_type = self.classifier.predict(ROI)
            label = str(car_color + ' ' + car_direction + ' ' + car_type)
            print('=> predicted label: ', label)

            # Box, filled label background above it, then the label text.
            # Drawing in the same pass keeps box and label from the same row
            # even when earlier rows were skipped.
            cv2.rectangle(orig_img, pt_1, pt_2, color, thickness=2)
            txt_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 2, 2)[0]
            txt_pt = pt_1[0] + txt_size[0] + 3, pt_1[1] - txt_size[1] - 5
            cv2.rectangle(orig_img, pt_1, txt_pt, color, thickness=-1)
            cv2.putText(orig_img, label, (pt_1[0], pt_1[1]),
                        cv2.FONT_HERSHEY_PLAIN, 2, [225, 255, 255], 2)
            break

        return car_color, car_direction, car_type

    def process_predict(self, prediction, prob_th, num_cls, nms_th, inp_dim,
                        orig_img_size):
        """Post-process raw predictions into boxes in original-image pixels.

        Returns whatever ``post_process`` returned; when that is not an
        ``int``, its columns 1-4 are rescaled to ``orig_img_size`` (width,
        height) and clamped to the image.
        """
        scaling_factor = min([inp_dim / float(x) for x in orig_img_size])
        output = post_process(prediction, prob_th, num_cls,
                              nms=True, nms_conf=nms_th, CUDA=True)

        if not isinstance(output, int):
            # Remove the padding, then undo the resize.
            output[:, [1, 3]] -= (inp_dim - scaling_factor * orig_img_size[0]) / 2.0
            output[:, [2, 4]] -= (inp_dim - scaling_factor * orig_img_size[1]) / 2.0
            output[:, 1:5] /= scaling_factor
            for i in range(output.shape[0]):
                output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, orig_img_size[0])
                output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, orig_img_size[1])
        return output

    def _finish_tracklet(self, query_pair, color_dict, type_dict,
                         index_list_per_camera, tracklet_i, pre_path):
        """Compare one finished tracklet's majority vote with the query."""
        if not color_dict or not type_dict:
            # No frame of this tracklet produced a classification.
            print("Tracklet %d detects " % tracklet_i, None, None)
            return
        start_length = os.path.split(os.path.split(pre_path)[0])[1]
        detect_color = max(color_dict, key=color_dict.get)
        detect_type = max(type_dict, key=type_dict.get)
        print("Tracklet %d detects " % tracklet_i, detect_color, detect_type)
        compare_query_append(query_pair, detect_color, detect_type,
                             index_list_per_camera, tracklet_i, start_length)

    def detect_classify(self, query_pair):
        """Detect and classify every image, voting once per tracklet.

        Images sharing a parent directory form a tracklet. Each frame with a
        classification adds one vote for its colour and one for its type; on
        a directory change the most frequent values are passed to
        ``compare_query_append``. The fourth ``/``-separated path component
        is treated as the camera/stream id.

        Returns:
            One list per stream, each a copy of the list that
            ``compare_query_append`` was given for that stream; an empty
            list when there are no images.
        """
        if not self.imgs_path:
            return []

        pre_path = ''
        color_dict = {}
        type_dict = {}
        index_list_all = []
        index_list_per_camera = []
        pre_camera_id = self.imgs_path[0].split('/')[3]
        stream_i = 0
        print("\n\nProcessing stream %d...\n" % stream_i)
        tracklet_i = 0

        for x in self.imgs_path:
            curr_path = os.path.split(x)[0]
            # Reset per frame so a frame without detections neither crashes
            # nor re-casts the previous frame's vote.
            car_color, car_type = None, None

            img = cv2.imread(x)
            img = cv2.copyMakeBorder(img, BORDER, BORDER, BORDER, BORDER,
                                     cv2.BORDER_CONSTANT, value=(100, 100, 100))
            img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
            img2det = process_img(img, self.inp_dim)
            img2det = img2det.to(device)

            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                prediction = self.detector.forward(img2det, CUDA=True)

            orig_img_size = list(img.size)
            output = self.process_predict(prediction, self.prob_th,
                                          self.num_classes, self.nms_th,
                                          self.inp_dim, orig_img_size)
            orig_img = cv2.cvtColor(np.asarray(img), cv2.COLOR_RGB2BGR)
            if not isinstance(output, int):
                car_color, _, car_type = self.cls_draw_bbox(output, orig_img)

            # A new parent directory closes the previous tracklet.
            if curr_path != pre_path and pre_path != '':
                self._finish_tracklet(query_pair, color_dict, type_dict,
                                      index_list_per_camera, tracklet_i,
                                      pre_path)
                tracklet_i += 1
                color_dict.clear()
                type_dict.clear()

                curr_camera_id = x.split('/')[3]
                if curr_camera_id != pre_camera_id:
                    print("The query result on stream %d:" % stream_i,
                          index_list_per_camera)
                    index_list_all.append(deepcopy(index_list_per_camera))
                    index_list_per_camera.clear()
                    pre_camera_id = curr_camera_id
                    stream_i += 1
                    tracklet_i = 0
                    print("\n\nProcessing stream %d...\n" % stream_i)

            if car_color is not None:
                color_dict[car_color] = color_dict.get(car_color, 0) + 1
            if car_type is not None:
                type_dict[car_type] = type_dict.get(car_type, 0) + 1
            pre_path = curr_path

        # Close the last tracklet and the last stream.
        if pre_path != '':
            self._finish_tracklet(query_pair, color_dict, type_dict,
                                  index_list_per_camera, tracklet_i, pre_path)
        color_dict.clear()
        type_dict.clear()
        print("The query result on stream %d:" % stream_i, index_list_per_camera)
        index_list_all.append(deepcopy(index_list_per_camera))
        return index_list_all
class ObjectDetection:
    """Webcam object detector that overlays a rough distance estimate."""

    def __init__(self, id):
        """Open camera ``id`` and load the YOLOv3 network.

        Args:
            id: Camera source handed to ``WebcamVideoStream``.
        """
        self.cap = WebcamVideoStream(src=id).start()
        self.cfgfile = "cfg/yolov3.cfg"
        self.weightsfile = "yolov3.weights"
        self.confidence = float(0.25)
        self.nms_thesh = float(0.4)
        self.num_classes = 80
        self.classes = load_classes('data/coco.names')
        # Close the palette file once it has been read.
        with open("pallete", "rb") as palette_file:
            self.colors = pkl.load(palette_file)
        self.model = Darknet(self.cfgfile)
        self.CUDA = torch.cuda.is_available()
        self.model.load_weights(self.weightsfile)
        self.model.net_info["height"] = 160
        self.inp_dim = int(self.model.net_info["height"])
        self.width = 640
        self.height = 480
        print("Loading network.....")
        if self.CUDA:
            self.model.cuda()
        print("Network successfully loaded")
        assert self.inp_dim % 32 == 0
        assert self.inp_dim > 32
        self.model.eval()

    def _read_frame(self):
        """Grab one frame from the camera, resized to the display size."""
        frame = self.cap.read()
        return cv2.resize(frame, (self.width, self.height))

    def main(self):
        """Detect objects frame by frame until ``q`` is pressed.

        Each frame is run through the network and the detections are drawn
        with the module-level ``write`` helper. A distance figure is derived
        from the module-level ``b_boxes``/``labels`` values (presumably
        filled by ``write`` -- verify), printed and overlaid on the frame.
        """
        while True:
            # Read directly: a thread that is joined straight away adds no
            # concurrency and hangs on the queue when the read fails.
            frame = self._read_frame()
            fps = FPS().start()

            # Reset per frame so a frame without detections neither crashes
            # below nor shows the previous frame's distance.
            distance = None
            x = y = None
            try:
                img, orig_im, dim = prep_image(frame, self.inp_dim)
                if self.CUDA:
                    img = img.cuda()

                # Inference only: no autograd graph is needed.
                with torch.no_grad():
                    output = self.model(Variable(img), self.CUDA)
                output = write_results(output, self.confidence,
                                       self.num_classes, nms=True,
                                       nms_conf=self.nms_thesh)

                # write_results may hand back an int instead of a tensor
                # (presumably "no detections" -- verify); skip such frames.
                if not isinstance(output, int):
                    output = output.type(torch.half)
                    if list(output.size()) != [1, 86]:
                        # Clamp to the network input and normalise, then
                        # scale to the displayed frame.
                        output[:, 1:5] = torch.clamp(
                            output[:, 1:5], 0.0, float(self.inp_dim)) / self.inp_dim
                        output[:, [1, 3]] *= frame.shape[1]
                        output[:, [2, 4]] *= frame.shape[0]
                        for detection in output:
                            write(detection, frame, self.classes, self.colors)

                        x, y, w, h = (b_boxes["bbox"][0], b_boxes["bbox"][1],
                                      b_boxes["bbox"][2], b_boxes["bbox"][3])
                        # Empirical distance formula, in inches.
                        distance = (2 * 3.14 * 180) / (w + h * 360) * 1000 + 3
                        feedback = ("{}".format(labels["Current Object"]) + " "
                                    + "is" + " at {} ".format(round(distance))
                                    + "Inches")
                        print(feedback)
            except Exception as exc:
                # Best effort per frame: report and keep the stream running.
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                distance = None
                print("[WARN] frame skipped: {}".format(exc))

            fps.update()
            fps.stop()
            print("[INFO] elasped time: {:.2f}".format(fps.elapsed()))
            print("[INFO] approx. FPS: {:.1f}".format(fps.fps()))

            if distance is not None:
                frame = cv2.putText(frame,
                                    str("{:.2f} Inches".format(distance)),
                                    (x, y), cv2.FONT_HERSHEY_DUPLEX, 0.6,
                                    (0, 0, 255), 1, cv2.LINE_AA)
            cv2.imshow("Object Detection Window", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
def valid(datacfg, modelcfg, weightfile):
    """Evaluate a 6D pose network on the validation set and log the metrics.

    For every ground-truth object the predicted and ground-truth 2D points
    are turned into poses with PnP. Translation, angle, 2D reprojection and
    3D vertex errors are accumulated, the summary is logged, and one row is
    appended to ``test_metrics.csv``.

    Args:
        datacfg: Path of the data config (``valid``, ``mesh``, ``backup``,
            ``name``, ``gpus``, camera intrinsics, image size, and
            optionally ``diam``).
        modelcfg: Path of the Darknet network config.
        weightfile: Path of the weights to evaluate.
    """
    def truths_length(truths, max_num_gt=50):
        """Count the leading label rows whose second entry is non-zero."""
        limit = min(max_num_gt, len(truths))
        for i in range(limit):
            if truths[i][1] == 0:
                return i
        # Every inspected row is a real object.
        return limit

    # Parse configuration files
    data_options = read_data_cfg(datacfg)
    valid_images = data_options['valid']
    meshname = data_options['mesh']
    backupdir = data_options['backup']
    name = data_options['name']
    gpus = data_options['gpus']
    fx = float(data_options['fx'])
    fy = float(data_options['fy'])
    u0 = float(data_options['u0'])
    v0 = float(data_options['v0'])
    im_width = int(data_options['width'])
    im_height = int(data_options['height'])
    if not os.path.exists(backupdir):
        makedirs(backupdir)

    # Parameters
    seed = int(time.time())
    os.environ['CUDA_VISIBLE_DEVICES'] = gpus
    torch.cuda.manual_seed(seed)
    save = False
    testtime = True
    num_classes = 1
    testing_samples = 0.0
    if save:
        makedirs(backupdir + '/test')
        makedirs(backupdir + '/test/gt')
        makedirs(backupdir + '/test/pr')

    # Accumulators
    testing_error_trans = 0.0
    testing_error_angle = 0.0
    testing_error_pixel = 0.0
    errs_2d = []
    errs_3d = []
    errs_trans = []
    errs_angle = []
    errs_corner2D = []
    preds_trans = []
    preds_rot = []
    preds_corners2D = []
    gts_trans = []
    gts_rot = []
    gts_corners2D = []

    # Read object model information, get 3D bounding box corners
    mesh = MeshPly(meshname)
    vertices = np.c_[np.array(mesh.vertices),
                     np.ones((len(mesh.vertices), 1))].transpose()
    corners3D = get_3D_corners(vertices)
    # Use the configured diameter when present, else compute it from the mesh.
    try:
        diam = float(data_options['diam'])
    except KeyError:
        diam = calc_pts_diameter(np.array(mesh.vertices))

    # Read intrinsic camera parameters
    intrinsic_calibration = get_camera_intrinsic(u0, v0, fx, fy)

    # Get validation file names
    with open(valid_images) as fp:
        valid_files = [line.rstrip() for line in fp]

    # Specify model, load weights, move to GPU, switch to evaluation mode
    model = Darknet(modelcfg)
    model.print_network()
    model.load_weights(weightfile)
    model.cuda()
    model.eval()
    test_width = model.test_width
    test_height = model.test_height
    num_keypoints = model.num_keypoints

    valid_dataset = dataset.listDataset(
        valid_images,
        shape=(test_width, test_height),
        shuffle=False,
        transform=transforms.Compose([transforms.ToTensor(), ]))

    kwargs = {'num_workers': 4, 'pin_memory': True}
    test_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=1, shuffle=False, **kwargs)

    logging(" Testing {}...".format(name))
    logging(" Number of test samples: %d" % len(test_loader.dataset))

    # The 3D points handed to PnP are the same for every sample.
    points_3d = np.array(
        np.transpose(np.concatenate((np.zeros((3, 1)), corners3D[:3, :]), axis=1)),
        dtype='float32')
    calibration = np.array(intrinsic_calibration, dtype='float32')

    # Iterate through test batches (batch size is 1)
    count = 0
    for batch_idx, (data, target) in enumerate(test_loader):
        t1 = time.time()
        data = data.cuda()
        target = target.cuda()
        t2 = time.time()

        # no_grad replaces the obsolete Variable(..., volatile=True).
        with torch.no_grad():
            output = model(data).data
        t3 = time.time()

        all_boxes = get_region_boxes(output, num_classes, num_keypoints)
        t4 = time.time()

        for box_pr, target in zip([all_boxes], [target[0]]):
            # One label row per object.
            truths = target.view(-1, num_keypoints * 2 + 3)
            num_gts = truths_length(truths)

            for k in range(num_gts):
                # Ground truth in the prediction's layout: the 2D points,
                # two filler values, then the class id.
                box_gt = [truths[k][j] for j in range(1, 2 * num_keypoints + 1)]
                box_gt.extend([1.0, 1.0])
                box_gt.append(truths[k][0])

                # Denormalize the 2D points
                corners2D_gt = np.array(np.reshape(box_gt[:18], [9, 2]), dtype='float32')
                corners2D_pr = np.array(np.reshape(box_pr[:18], [9, 2]), dtype='float32')
                corners2D_gt[:, 0] = corners2D_gt[:, 0] * im_width
                corners2D_gt[:, 1] = corners2D_gt[:, 1] * im_height
                corners2D_pr[:, 0] = corners2D_pr[:, 0] * im_width
                corners2D_pr[:, 1] = corners2D_pr[:, 1] * im_height
                preds_corners2D.append(corners2D_pr)
                gts_corners2D.append(corners2D_gt)

                # Corner prediction error
                corner_norm = np.linalg.norm(corners2D_gt - corners2D_pr, axis=1)
                corner_dist = np.mean(corner_norm)
                errs_corner2D.append(corner_dist)

                # [R|t] by PnP
                R_gt, t_gt = pnp(points_3d, corners2D_gt, calibration)
                R_pr, t_pr = pnp(points_3d, corners2D_pr, calibration)

                # Translation error
                trans_dist = np.sqrt(np.sum(np.square(t_gt - t_pr)))
                errs_trans.append(trans_dist)

                # Angle error
                angle_dist = calcAngularDistance(R_gt, R_pr)
                errs_angle.append(angle_dist)

                # Pixel (reprojection) error
                Rt_gt = np.concatenate((R_gt, t_gt), axis=1)
                Rt_pr = np.concatenate((R_pr, t_pr), axis=1)
                proj_2d_gt = compute_projection(vertices, Rt_gt, intrinsic_calibration)
                proj_2d_pred = compute_projection(vertices, Rt_pr, intrinsic_calibration)
                norm = np.linalg.norm(proj_2d_gt - proj_2d_pred, axis=0)
                pixel_dist = np.mean(norm)
                errs_2d.append(pixel_dist)

                # 3D vertex distances
                transform_3d_gt = compute_transformation(vertices, Rt_gt)
                transform_3d_pred = compute_transformation(vertices, Rt_pr)
                norm3d = np.linalg.norm(transform_3d_gt - transform_3d_pred, axis=0)
                vertex_dist = np.mean(norm3d)
                errs_3d.append(vertex_dist)

                # Sum errors
                testing_error_trans += trans_dist
                testing_error_angle += angle_dist
                testing_error_pixel += pixel_dist
                testing_samples += 1

                if save:
                    preds_trans.append(t_pr)
                    gts_trans.append(t_gt)
                    preds_rot.append(R_pr)
                    gts_rot.append(R_gt)

                    # Name the dumps after the current sample: index before
                    # the counter is advanced.
                    stem = valid_files[count][-8:-3]
                    np.savetxt(backupdir + '/test/gt/R_' + stem + 'txt', np.array(R_gt, dtype='float32'))
                    np.savetxt(backupdir + '/test/gt/t_' + stem + 'txt', np.array(t_gt, dtype='float32'))
                    np.savetxt(backupdir + '/test/pr/R_' + stem + 'txt', np.array(R_pr, dtype='float32'))
                    np.savetxt(backupdir + '/test/pr/t_' + stem + 'txt', np.array(t_pr, dtype='float32'))
                    np.savetxt(backupdir + '/test/gt/corners_' + stem + 'txt', np.array(corners2D_gt, dtype='float32'))
                    np.savetxt(backupdir + '/test/pr/corners_' + stem + 'txt', np.array(corners2D_pr, dtype='float32'))

                count = count + 1

        t5 = time.time()

    if testing_samples == 0:
        # Nothing was evaluated: no timings to print, no means to compute.
        logging('Results of {}'.format(name))
        logging(' No test samples were evaluated')
        return

    # Compute 2D projection error, 6D pose error, 5cm5degree error
    px_threshold = 5  # pixel threshold for the 2D reprojection accuracy
    eps = 1e-5
    acc = len(np.where(np.array(errs_2d) <= px_threshold)[0]) * 100. / (len(errs_2d) + eps)
    acc5cm5deg = len(np.where((np.array(errs_trans) <= 0.05) & (np.array(errs_angle) <= 5))[0]) * 100. / (len(errs_trans) + eps)
    acc3d10 = len(np.where(np.array(errs_3d) <= diam * 0.1)[0]) * 100. / (len(errs_3d) + eps)
    corner_acc = len(np.where(np.array(errs_corner2D) <= px_threshold)[0]) * 100. / (len(errs_corner2D) + eps)
    mean_err_2d = np.mean(errs_2d)
    mean_corner_err_2d = np.mean(errs_corner2D)
    nts = float(testing_samples)

    if testtime:
        # Timings of the last batch.
        print('-----------------------------------')
        print(' tensor to cuda : %f' % (t2 - t1))
        print(' forward pass : %f' % (t3 - t2))
        print('get_region_boxes : %f' % (t4 - t3))
        print(' prediction time : %f' % (t4 - t1))
        print(' eval : %f' % (t5 - t4))
        print('-----------------------------------')

    # Print test statistics
    logging('Results of {}'.format(name))
    logging(' Acc using {} px 2D Projection = {:.2f}%'.format(px_threshold, acc))
    logging(' Acc using 10% threshold - {} vx 3D Transformation = {:.2f}%'.format(diam * 0.1, acc3d10))
    logging(' Acc using 5 cm 5 degree metric = {:.2f}%'.format(acc5cm5deg))
    logging(" Mean 2D pixel error is %f, Mean vertex error is %f, mean corner error is %f" % (mean_err_2d, np.mean(errs_3d), mean_corner_err_2d))
    logging(' Translation error: %f m, angle error: %f degree, pixel error: % f pix' % (testing_error_trans / nts, testing_error_angle / nts, testing_error_pixel / nts))

    result_data = {
        'model': modelcfg,
        'acc': acc,
        'acc3d10': acc3d10,
        'acc5cm5deg': acc5cm5deg,
        'mean_err_2d': mean_err_2d,
        'errs_3d': np.mean(errs_3d),
        'mean_corner_err_2d': mean_corner_err_2d,
        'translation_err': testing_error_trans / nts,
        'angle_err': testing_error_angle / nts,
        'px_err': testing_error_pixel / nts
    }
    print(result_data)

    # Append to the metrics file, creating it only when it is missing or
    # empty; other read errors propagate instead of overwriting earlier rows.
    new_row = pd.DataFrame.from_records([result_data])
    try:
        df = pd.read_csv('test_metrics.csv')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        df = new_row
    else:
        df = pd.concat([df, new_row], ignore_index=True)
    df.to_csv('test_metrics.csv', index=False)

    if save:
        predfile = backupdir + '/predictions_linemod_' + name + '.mat'
        scipy.io.savemat(predfile, {'R_gts': gts_rot, 't_gts': gts_trans, 'corner_gts': gts_corners2D,
                                    'R_prs': preds_rot, 't_prs': preds_trans, 'corner_prs': preds_corners2D})
def valid(datacfg, cfgfile, weightfile, outfile):
    """Run the detector over the validation list and write per-class results.

    Results go to ``results/<weights dir>/e<checkpoint>/<outfile><class>.txt``;
    each line is ``<image id> <prob> <x1> <y1> <x2> <y2>`` with the box scaled
    by the size of the original image.

    Args:
        datacfg: Path of the data config; its ``valid`` entry is the image
            list file.
        cfgfile: Path of the Darknet network config.
        weightfile: Path of the weights; its parent directory and file stem
            name the output directory, so it must contain at least one ``/``.
        outfile: Prefix prepended to every per-class result file name.
    """
    # Local import: only this function needs it.
    from contextlib import ExitStack

    options = read_data_cfg(datacfg)
    valid_images = options['valid']
    backup = weightfile.split('/')[-2]
    ckpt = weightfile.split('/')[-1].split('.')[0]
    prefix = 'results/' + backup + '/e' + ckpt
    print('saving to: ' + prefix)
    names = cfg.classes

    with open(valid_images) as fp:
        valid_files = [line.rstrip() for line in fp]

    m = Darknet(cfgfile)
    m.print_network()
    m.load_weights(weightfile)
    m.cuda()
    m.eval()

    valid_dataset = dataset.listDataset(
        valid_images,
        shape=(m.width, m.height),
        shuffle=False,
        transform=transforms.Compose([transforms.ToTensor(), ]))
    valid_batchsize = 2
    assert (valid_batchsize > 1)

    kwargs = {'num_workers': 4, 'pin_memory': True}
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=valid_batchsize, shuffle=False, **kwargs)

    os.makedirs(prefix, exist_ok=True)

    conf_thresh = 0.005
    nms_thresh = 0.45

    # ExitStack closes every per-class file even when inference raises.
    with ExitStack() as stack:
        fps = [
            stack.enter_context(
                open('%s/%s%s.txt' % (prefix, outfile, names[i]), 'w'))
            for i in range(m.num_classes)
        ]

        # Running index into valid_files; the loader is not shuffled, so
        # batch order follows the list order.
        lineId = -1
        for batch_idx, (data, target) in enumerate(valid_loader):
            data = data.cuda()
            # no_grad replaces the obsolete Variable(..., volatile=True).
            with torch.no_grad():
                output = m(data).data
            batch_boxes = get_region_boxes(output, conf_thresh, m.num_classes,
                                           m.anchors, m.num_anchors, 0, 1)

            for i in range(output.size(0)):
                lineId = lineId + 1
                fileId = os.path.basename(valid_files[lineId]).split('.')[0]
                width, height = get_image_size(valid_files[lineId])
                print(valid_files[lineId])

                boxes = nms(batch_boxes[i], nms_thresh)
                for box in boxes:
                    x1 = (box[0] - box[2] / 2.0) * width
                    y1 = (box[1] - box[3] / 2.0) * height
                    x2 = (box[0] + box[2] / 2.0) * width
                    y2 = (box[1] + box[3] / 2.0) * height

                    det_conf = box[4]
                    # After the five leading entries the record holds
                    # (class confidence, class id) pairs; integer division
                    # keeps range() valid on Python 3.
                    for j in range((len(box) - 5) // 2):
                        cls_conf = box[5 + 2 * j]
                        cls_id = int(box[6 + 2 * j])
                        prob = det_conf * cls_conf
                        fps[cls_id].write('%s %f %f %f %f %f\n'
                                          % (fileId, prob, x1, y1, x2, y2))
# Build the network and load its weights.
model = Darknet(args.cfgfile)
model.load_weights(args.weightsfile)
print("Network successfully loaded")

# The requested resolution becomes the network input size.
model.net_info["height"] = args.reso
inp_dim = int(model.net_info["height"])
assert inp_dim % 32 == 0, "input resolution must be a multiple of 32"
assert inp_dim > 32, "input resolution must be greater than 32"

# If there's a GPU available, put the model on GPU
if CUDA:
    model.cuda()

# Set the model in evaluation mode
model.eval()

read_dir = time.time()

# Detection phase: `images` is either a directory of images or one image.
try:
    imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images)]
except NotADirectoryError:
    # A single file was given: the list holds just that file.
    imlist = []
    imlist.append(osp.join(osp.realpath('.'), images))
except FileNotFoundError:
    print ("No file or directory with the name {}".format(images))
    exit()

# Create the output directory; exist_ok avoids the check-then-create race.
os.makedirs(args.det, exist_ok=True)
def eval_list(cfgfile, weightfile, imglist):
    """Evaluate a detector on an image list and print IOU/recall/precision.

    For every image, ground-truth boxes are read from the matching label file
    (path derived by replacing ``images``/``JPEGImages`` with ``labels`` and
    the image extension with ``.txt``). A ground-truth box counts as correct
    when its best IOU with any detection exceeds 0.5.

    Args:
        cfgfile: Darknet network definition.
        weightfile: Weights to load.
        imglist: Text file with one image path per line. Blank lines and
            lines starting with ``#`` are skipped.
    """
    m = Darknet(cfgfile)
    m.eval()
    m.load_weights(weightfile)
    eval_wid = m.width
    eval_hei = m.height

    use_cuda = True
    if use_cuda:
        m.cuda()

    conf_thresh = 0.25
    nms_thresh = 0.4
    iou_thresh = 0.5
    # Ground-truth boxes smaller than 8 pixels (relative to net width) are
    # filtered out by read_truths_args.
    min_box_scale = 8. / m.width

    with open(imglist) as fp:
        lines = fp.readlines()

    total = 0.0
    proposals = 0.0
    correct = 0.0
    lineId = 0
    avg_iou = 0.0
    for line in lines:
        img_path = line.rstrip()
        # Skip blank lines (previously an IndexError) and commented entries.
        if not img_path or img_path[0] == '#':
            continue
        lineId = lineId + 1

        lab_path = img_path.replace('images', 'labels')
        lab_path = lab_path.replace('JPEGImages', 'labels')
        lab_path = lab_path.replace('.jpg', '.txt').replace('.png', '.txt')
        truths = read_truths_args(lab_path, min_box_scale)

        img = Image.open(img_path).convert('RGB').resize((eval_wid, eval_hei))
        boxes = do_detect(m, img, conf_thresh, nms_thresh, use_cuda)

        total = total + truths.shape[0]
        proposals += sum(1 for box in boxes if box[4] > conf_thresh)

        for i in range(truths.shape[0]):
            # Label rows are (class, cx, cy, w, h); confidence is fixed at 1.
            box_gt = [truths[i][1], truths[i][2], truths[i][3], truths[i][4], 1.0]
            best_iou = 0
            for box in boxes:
                iou = bbox_iou(box_gt, box, x1y1x2y2=False)
                best_iou = max(iou, best_iou)
            if best_iou > iou_thresh:
                avg_iou += best_iou
                correct = correct + 1

    # Guard every ratio: an empty list, or a model that detects nothing,
    # used to crash here with ZeroDivisionError.
    precision = 1.0 * correct / proposals if proposals else 0.0
    recall = 1.0 * correct / total if total else 0.0
    pr_sum = precision + recall
    fscore = 2.0 * precision * recall / pr_sum if pr_sum else 0.0
    mean_iou = avg_iou / correct if correct else 0.0
    print("%d IOU: %f, Recal: %f, Precision: %f, Fscore: %f\n" %
          (lineId - 1, mean_iou, recall, precision, fscore))
# 初始化网络并载入权重 print("载入神经网络...") model = Darknet(args.cfgfile) # Darknet类中初始化时得到了网络结构和网络的参数信息,保存在其参数net_info,module_list中 model.load_weights(args.weightsfile) # 将权重文件载入,并复制给对应的网络结构model中 print("模型加载成功.") # 网络输入数据大小 model.net_info["height"] = args.reso # model类中net_info是一个字典。’’height’’是图片的宽高,因为图片缩放到416x416,所以宽高一样大 inp_dim = int(model.net_info["height"]) # inp_dim是网络输入图片尺寸(如416*416) assert inp_dim % 32 == 0 # 如果设定的输入图片的尺寸不是32的位数或者不大于32,抛出异常 assert inp_dim > 32 # 如果GPU可用, 模型切换到cuda中运行 if CUDA: model.cuda() model.eval() # 变成测试模式,这主要是对dropout和batch normalization的操作在训练和测试的时候是不一样的 read_dir = time.time() # read_dir 是一个用于测量时间的检查点,开始计时 # 加载待检测图像列表 try: # 从磁盘读取图像或从目录读取多张图像。图像的路径存储在一个名为 imlist 的列表中,imlist列表保存了images文件中所有图片的完整路径,一张图片路径对应一个元素。 # osp.realpath('.')得到了图片所在文件夹的绝对路径,images是测试图片文件夹,listdir(images)得到了images文件夹下面所有图片的名字。 # 通过join()把目录(文件夹)的绝对路径和图片名结合起来,就得到了一张图片的完整路径 imlist = [osp.join(osp.realpath('.'), images, img) for img in os.listdir(images)] # 值:'D:\\PyCharm Professional\\projects\\YOLO_tutorial\\imgs\\dog-cycle-car.png' except NotADirectoryError: # 如果上面的路径有错,只得到images文件夹绝对路径即可 imlist = [] imlist.append(osp.join(osp.realpath('.'), images)) except FileNotFoundError: print("No file or directory with the name {}".format(images)) exit() # 存储结果目录
def main():
    """Detect objects in the images given on the command line.

    Loads the network, batches the input images, runs the detector, writes
    the annotated images into the ``det`` directory and prints a timing
    summary.
    """
    # Command-line options.
    args = ArgumentsParser().parse_arguments()
    images = args.images
    batch_size = int(args.bs)
    confidence = float(args.confidence)
    nms_thresh = float(args.nms_thresh)

    # Network set-up.
    print("Loading network.....")
    model = Darknet(args.cfgfile)
    model.load_weights(args.weightsfile)
    print("Network successfully loaded")

    model.net_info["height"] = args.reso
    inp_dim = int(model.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32

    # Use the GPU when one is available, and switch to inference mode.
    if CUDA:
        model.cuda()
    model.eval()

    read_dir = time.time()

    # Read the images and turn each one into a network input.
    load_batch = time.time()
    image_manager = Cv2ImageManager()
    loaded_images, list_of_images = image_manager.read_images(images)
    im_batches = [
        prep_image(picture, inp_dim)
        for picture, _ in zip(loaded_images, list_of_images)
    ]
    # (width, height) per image, repeated so it lines up with x1, y1, x2, y2.
    sizes = [(picture.shape[1], picture.shape[0]) for picture in loaded_images]
    im_dim_list = torch.FloatTensor(sizes).repeat(1, 2)

    leftover = 1 if len(im_dim_list) % batch_size else 0

    if batch_size != 1:
        num_batches = len(list_of_images) // batch_size + leftover
        grouped = []
        for batch_no in range(num_batches):
            first = batch_no * batch_size
            last = min(first + batch_size, len(im_batches))
            grouped.append(torch.cat(im_batches[first:last]))
        im_batches = grouped

    if CUDA:
        im_dim_list = im_dim_list.cuda()

    start_det_loop = time.time()
    detector = Detector(model, im_batches, batch_size, inp_dim, confidence,
                        nms_thresh, CLASSES, NUMBER_OF_CLASSES, CUDA)
    output = detector.detect(list_of_images, im_dim_list)

    output_recast = time.time()
    class_load = time.time()
    draw = time.time()

    # Draw every detection and derive the output file names.
    det_images = [
        image_manager.draw_bounding_boxes(detection, loaded_images, CLASSES)
        for detection in output
    ]
    base_names = [osp.basename(image_name) for image_name in list_of_images]
    det_names = [
        "{det}/{x}".format(det=args.det, x=base_name)
        for base_name in base_names
    ]
    image_manager.write_images(det_names, det_images)

    end = time.time()

    def report(task, seconds):
        # One row of the timing table.
        print("{:25s}: {:2.3f}".format(task, seconds))

    print("SUMMARY")
    print("----------------------------------------------------------")
    print("{:25s}: {}".format("Task", "Time Taken (in seconds)"))
    print()
    report("Reading addresses", load_batch - read_dir)
    report("Loading batch", start_det_loop - load_batch)
    report("Detection (" + str(len(list_of_images)) + " images)",
           output_recast - start_det_loop)
    report("Output Processing", class_load - output_recast)
    report("Drawing Boxes", end - draw)
    report("Average time_per_img", (end - load_batch) / len(list_of_images))
    print("----------------------------------------------------------")

    torch.cuda.empty_cache()
# Two object classes; each predicted box carries 5 leading attributes plus
# one score per class.
num_classes = 2
CUDA = torch.cuda.is_available()
bbox_attrs = 5 + num_classes

print("Loading network.....")
model = Darknet(args.cfgfile)

# The file extension selects the loader: ".weights" goes through the darknet
# loader, anything else is loaded as a state dict via torch.load.
if args.weights_path.endswith(".weights"):
    # Load darknet weights
    model.load_darknet_weights(args.weights_path)
else:
    # Load checkpoint weights
    model.load_state_dict(torch.load(args.weights_path))

model.eval()  # Set in evaluation mode
print("Network successfully loaded")

# The requested resolution is stored in net_info and read back as the integer
# input dimension; it must be a multiple of 32 and larger than 32.
model.net_info["height"] = args.reso
inp_dim = int(model.net_info["height"])
assert inp_dim % 32 == 0
assert inp_dim > 32

if CUDA:
    model.cuda()

# One forward pass on the test input before real data arrives.
# NOTE(review): presumably a warm-up / sanity check; the result is discarded.
model(get_test_input(inp_dim, CUDA), CUDA)

# NOTE(review): eval() was already called above; this second call is redundant.
model.eval()

videofile = args.video
def run():
    """Step through the validation set and display pose predictions.

    Parses the command line, loads the network named there, then shows the
    visualised prediction for every sample in an OpenCV window.

    Keys while the window has focus:
        ``p`` toggles between paused (wait for a key) and running.
        ``q`` quits.

    Only the configuration keys this viewer actually uses are read
    (``valid`` from the data config; ``num_keypoints``, ``classes`` and
    ``num`` from the model config), so config files without training-only
    entries are accepted.
    """
    logger = logging.getLogger()

    # Parse command window input
    parser = argparse.ArgumentParser(description='SingleShotPose')
    parser.add_argument('--datacfg', type=str,
                        default='cfg/ape.data')  # data config
    parser.add_argument('--modelcfg', type=str,
                        default='cfg/yolo-pose.cfg')  # network config
    parser.add_argument('--initweightfile', type=str,
                        default='backup/init.weights')  # initialization weights
    # Accepted for command-line compatibility; not used by the viewer.
    parser.add_argument('--pretrain_num_epochs', type=int,
                        default=0)  # how many epoch to pretrain
    args = parser.parse_args()
    print("ARGS: ", args)

    # Parse data configuration file
    data_options = read_data_cfg(args.datacfg)
    trainlist = data_options['valid']
    print("DATA OPTIONS: ", data_options)

    # Parse the model config once; the first block holds the network
    # options, the last one the loss options.
    cfg_blocks = parse_cfg(args.modelcfg)
    net_options = cfg_blocks[0]
    loss_options = cfg_blocks[-1]
    num_keypoints = int(net_options['num_keypoints'])
    num_classes = int(loss_options['classes'])
    num_anchors = int(loss_options['num'])
    print("NET OPTIONS: ", net_options)
    print("LOSS OPTIONS: ", loss_options)

    # Specify the model
    model = Darknet(args.modelcfg)
    model.load_weights(args.initweightfile)
    model.print_network()

    # Fixed viewer settings: 416x416 input, one sample at a time, loading in
    # the main process.
    init_width = 416
    init_height = 416
    batch_size = 1
    num_workers = 0
    max_targets = 50  # target rows per sample, as laid out by the dataset

    bg_file_names = get_all_files('../VOCdevkit/VOC2012/JPEGImages')

    use_cuda = True
    kwargs = {
        'num_workers': num_workers,
        'pin_memory': True
    } if use_cuda else {}

    logger.info("Loading data")
    dataloader = torch.utils.data.DataLoader(
        dataset.listDataset(trainlist,
                            shape=(init_width, init_height),
                            shuffle=False,
                            transform=transforms.Compose([
                                transforms.ToTensor(),
                            ]),
                            train=False,
                            seen=0,
                            batch_size=batch_size,
                            num_workers=num_workers,
                            bg_file_names=bg_file_names),
        batch_size=batch_size,
        shuffle=False,
        **kwargs)

    model.cuda()
    model.eval()

    # waitKey(0) blocks until a key is pressed; waitKey(1) keeps playing.
    delay = {True: 0, False: 1}
    paused = True

    print("Batches in dataloader: ", len(dataloader))
    tbar = tqdm(dataloader, ascii=True, dynamic_ncols=True)
    try:
        for ii, s in enumerate(tbar):
            images, targets = s
            bs = images.shape[0]
            t = targets.cpu().numpy().reshape(bs, max_targets, -1)

            # Inference only: do not build an autograd graph.
            with torch.no_grad():
                model_out = model(images.cuda())

            all_boxes = np.array(
                get_region_boxes(model_out,
                                 num_classes,
                                 num_keypoints,
                                 anchor_dim=num_anchors)).reshape(bs, 1, -1)

            # Re-order into the layout visualize_results is given here: the
            # last field moves to the front, followed by all but the last
            # three fields.
            pred = np.zeros_like(all_boxes)
            pred[:, 0, 0] = all_boxes[:, 0, -1]
            pred[:, 0, 1:-2] = all_boxes[:, 0, :-3]

            viz = visualize_results(images, t, pred, img_size=init_width,
                                    show_3d=True)
            cv2.imshow("Res ", viz)
            k = cv2.waitKey(delay[paused])
            if k & 0xFF == ord('q'):
                break
            if k & 0xFF == ord('p'):
                paused = not paused
    finally:
        # Close the display window on every exit path.
        cv2.destroyAllWindows()
def demo():
    """Run YOLOv3 on the default camera and display the annotated frames.

    The loop ends when the capture stops delivering frames or when ``q`` is
    pressed in the display window. The capture device and the window are
    released on every exit path.
    """
    params = {
        "video": "video.avi",  # Video to run detection upon
        "dataset": "pasacal",  # Dataset on which the network has been trained
        "confidence": 0.5,  # Object Confidence to filter predictions
        "nms_thresh": 0.4,  # NMS Threshold
        "cfgfile": "cfg/yolov3.cfg",  # Config file
        "weightsfile": "yolov3.weights",  # Weightsfile
        "repo": 416  # Input resolution of the network. Increase to increase accuracy. Decrease to increase speed
    }
    confidence = float(params["confidence"])
    nms_thesh = float(params["nms_thresh"])
    CUDA = torch.cuda.is_available()
    num_classes = 80

    print("Loading network.....")
    model = Darknet(params["cfgfile"])
    model.load_weights(params["weightsfile"])
    print("Network successfully loaded")

    model.net_info["height"] = params["repo"]
    inp_dim = int(model.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32

    if CUDA:
        model.cuda()
    model.eval()

    # Class names and box colours never change, so load them once instead of
    # on every frame.
    classes = load_classes('data/coco.names')
    # SECURITY: pickle executes code from the file it loads; "pallete" must
    # be a trusted local file.
    with open("pallete", "rb") as palette_file:
        colors = pkl.load(palette_file)

    # Device 0 (default camera); params["video"] is not used here.
    cap = cv2.VideoCapture(0)
    assert cap.isOpened(), 'Cannot capture source'

    frames = 0
    start = time.time()
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            print("ret: ", ret)
            if not ret:
                # No frame delivered (end of stream or device failure);
                # `frame` is None here and must not be touched.
                break
            print("frame: ", frame.shape)

            img, orig_im, dim = prep_image(frame, inp_dim)
            im_dim = torch.FloatTensor(dim).repeat(1, 2)

            if CUDA:
                im_dim = im_dim.cuda()
                img = img.cuda()

            with torch.no_grad():
                output = model(img, CUDA)
            output = write_results(output, confidence, num_classes,
                                   nms=True, nms_conf=nms_thesh)

            # write_results hands back an int instead of a tensor when there
            # is nothing to draw: show the raw frame.
            if isinstance(output, int):
                frames += 1
                print("FPS of the video is {:5.2f}".format(
                    frames / (time.time() - start)))
                cv2.imshow("frame", orig_im)
                key = cv2.waitKey(1)
                if key & 0xFF == ord('q'):
                    break
                continue

            # Map boxes from the padded network input back to the frame.
            im_dim = im_dim.repeat(output.size(0), 1)
            scaling_factor = torch.min(inp_dim / im_dim, 1)[0].view(-1, 1)

            output[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim[:, 0].view(-1, 1)) / 2
            output[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim[:, 1].view(-1, 1)) / 2
            output[:, 1:5] /= scaling_factor

            # Clip every box to the frame.
            for i in range(output.shape[0]):
                output[i, [1, 3]] = torch.clamp(output[i, [1, 3]], 0.0, im_dim[i, 0])
                output[i, [2, 4]] = torch.clamp(output[i, [2, 4]], 0.0, im_dim[i, 1])

            print("output: ", output)
            print("output: ", output.shape)

            # Log each box as corner and as (x, y, w, h) coordinates.
            for det in output:
                x0 = det[1].int()
                y0 = det[2].int()
                x1 = det[3].int()
                y1 = det[4].int()
                print((x0, y0, x1, y1))
                print(x0, y0, x1 - x0, y1 - y0)

            # Draw the boxes onto the frame.
            for det in output:
                write(det, orig_im, classes, colors)

            cv2.imshow("frame", orig_im)
            key = cv2.waitKey(1)
            if key & 0xFF == ord('q'):
                break
            frames += 1
            print("FPS of the video is {:5.2f}".format(
                frames / (time.time() - start)))
    finally:
        cap.release()
        cv2.destroyAllWindows()
class SegPoseNet(nn.Module):
    """Darknet backbone with segmentation, 2D-pose and discriminator heads.

    ``forward`` returns five outputs: the segmentation head applied to
    backbone output 0, the regression head applied to backbone output 1, the
    discriminator applied to backbone output 2, and backbone outputs 3 and 4
    unchanged.
    """

    def __init__(self, data_options):
        """Build the network from ``data_options``.

        Reads the keys ``pose_arch_cfg``, ``width``, ``height``, ``channels``
        and ``domains``; the whole mapping is also passed on to the
        segmentation and regression layers. The model starts in evaluation
        mode.
        """
        super(SegPoseNet, self).__init__()

        pose_arch_cfg = data_options['pose_arch_cfg']
        self.width = int(data_options['width'])
        self.height = int(data_options['height'])
        self.channels = int(data_options['channels'])
        self.domains = int(data_options['domains'])

        # note you need to change this after modifying the network
        self.output_h = 76
        self.output_w = 76

        self.coreModel = Darknet(pose_arch_cfg, self.width, self.height,
                                 self.channels, self.domains)
        self.segLayer = PoseSegLayer(data_options)
        self.regLayer = Pose2DLayer(data_options)
        self.discLayer = Discriminator()
        self.training = False

    def forward(self, x, y = None, adapt = False, domains = None):
        """Run the backbone and all heads.

        Args:
            x: Network input, passed to the backbone.
            y: Unused.
            adapt: When true and the model is in training mode, backbone
                outputs 0 and 1 are filtered through ``source_only`` before
                they reach the segmentation and regression heads.
            domains: Passed to the backbone and to ``source_only``.

        Returns:
            ``[seg, reg, disc, outlayers[3], outlayers[4]]``.
        """
        outlayers = self.coreModel(x, domains=domains)

        if self.training and adapt:
            in1 = source_only(outlayers[0], domains)
            in2 = source_only(outlayers[1], domains)
        else:
            in1 = outlayers[0]
            in2 = outlayers[1]

        out3 = self.discLayer(outlayers[2])
        out4 = outlayers[3]
        out5 = outlayers[4]

        out1 = self.segLayer(in1)
        out2 = self.regLayer(in2)

        out_preds = [out1, out2, out3, out4, out5]
        return out_preds

    def train(self, mode=True):
        """Put the model and its four sub-networks into training mode.

        Mirrors ``nn.Module.train``: accepts ``mode`` (``train(False)`` is
        the same as ``eval()``) and returns ``self`` so calls can be chained.
        """
        mode = bool(mode)
        for layer in (self.coreModel, self.segLayer, self.regLayer,
                      self.discLayer):
            # Call the argument-free forms so sub-layers that define their
            # own train()/eval() without a mode parameter keep working.
            if mode:
                layer.train()
            else:
                layer.eval()
        self.training = mode
        return self

    def eval(self):
        """Put the model and its sub-networks into evaluation mode; returns ``self``."""
        return self.train(False)

    def print_network(self):
        """Print the backbone's layer summary."""
        self.coreModel.print_network()

    def load_weights(self, weightfile):
        """Load the backbone's state dict from ``weightfile`` (heads are not touched)."""
        self.coreModel.load_state_dict(torch.load(weightfile))

    def save_weights(self, weightfile):
        """Save the backbone's state dict to ``weightfile`` (heads are not saved)."""
        torch.save(self.coreModel.state_dict(), weightfile)
class Patch():
    """Optimises an adversarial texture patch on a 3D mesh against a detector.

    The patch is a tensor of per-face colours written into the mesh's texture
    atlas at the face indices ``self.idx``. Meshes are rendered from several
    camera angles, composited onto background images and fed to a detector;
    the patch is updated by SGD on a detection loss plus a total-variation
    term.
    """

    def __init__(self, config, device):
        """Create the renderer, datasets and YOLO detector.

        When ``config.patch_dir`` is set, a previously saved patch and face
        index are loaded from it; otherwise both stay ``None`` until
        ``initialize_patch`` runs.
        """
        self.config = config
        self.device = device

        # Create pytorch3D renderer
        self.renderer = self.create_renderer()

        # Datasets
        self.mesh_dataset = MeshDataset(config.mesh_dir, device, max_num=config.num_meshes)
        self.bg_dataset = BackgroundDataset(config.bg_dir, config.img_size, max_num=config.num_bgs)
        self.test_bg_dataset = BackgroundDataset(config.test_bg_dir, config.img_size, max_num=config.num_test_bgs)

        # Initialize adversarial patch
        self.patch = None
        self.idx = None

        # Yolo model:
        self.dnet = Darknet(self.config.cfgfile)
        self.dnet.load_weights(self.config.weightfile)
        self.dnet = self.dnet.eval()
        self.dnet = self.dnet.to(self.device)

        # Resume from a saved patch when a directory is configured.
        if self.config.patch_dir is not None:
            self.patch = torch.load(self.config.patch_dir + '/patch_save.pt').to(self.device)
            self.idx = torch.load(self.config.patch_dir + '/idx_save.pt').to(self.device)

        # Test backgrounds are served one at a time.
        self.test_bgs = DataLoader(
            self.test_bg_dataset,
            batch_size=1,
            shuffle=True,
            num_workers=1)

        # Ranges for the random patch augmentation used during training.
        self.min_contrast = 0.8
        self.max_contrast = 1.2
        self.min_brightness = -0.1
        self.max_brightness = 0.1
        self.noise_factor = 0.10

    def attack_faster_rcnn(self):
        """Train the patch against a Faster R-CNN detector.

        Checkpoint, dataset, backbone and probability threshold are
        hard-coded. The loss is the mean probability of kept detections whose
        class equals 1, plus 2.5 times the patch total variation.
        """
        path_to_checkpoint='model-180000.pth'
        dataset_name="coco2017"
        backbone_name="resnet101"
        prob_thresh=0.6

        dataset_class = DatasetBase.from_name(dataset_name)
        backbone = BackboneBase.from_name(backbone_name)(pretrained=False)
        model = FasterRCNN(backbone, dataset_class.num_classes(), pooler_mode=Config.POOLER_MODE,
                           anchor_ratios=Config.ANCHOR_RATIOS, anchor_sizes=Config.ANCHOR_SIZES,
                           rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N).cuda()
        model.load(path_to_checkpoint)

        train_bgs = DataLoader(
            self.bg_dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=1)

        if self.patch is None or self.idx is None:
            self.initialize_patch()

        mesh = self.mesh_dataset.meshes[0]
        total_variation = TotalVariation_3d(mesh, self.idx).to(self.device)

        # The patch tensor is the only optimised parameter.
        optimizer = torch.optim.SGD([self.patch], lr=1e-1, momentum=0.9)

        for epoch in range(self.config.epochs):
            ep_loss = 0.0
            ep_acc = 0.0  # NOTE(review): never updated in this method
            n = 0.0

            for mesh in self.mesh_dataset:
                # Copy mesh for each camera angle
                mesh = mesh.extend(self.num_angles_train)

                for bg_batch in train_bgs:
                    bg_batch = bg_batch.to(self.device)

                    optimizer.zero_grad()

                    # NOTE(review): the return value is unused; presumably
                    # called so that _atlas_padded is populated — confirm.
                    texture_image = mesh.textures.atlas_padded()

                    # Random patch augmentation
                    contrast = torch.FloatTensor(1).uniform_(self.min_contrast, self.max_contrast).to(self.device)
                    brightness = torch.FloatTensor(1).uniform_(self.min_brightness, self.max_brightness).to(self.device)
                    noise = torch.FloatTensor(self.patch.shape).uniform_(-1, 1) * self.noise_factor
                    noise = noise.to(self.device)
                    augmented_patch = (self.patch * contrast) + brightness + noise

                    # Clamp patch to avoid PyTorch3D issues
                    clamped_patch = augmented_patch.clone().clamp(min=1e-6, max=0.99999)

                    # Write the patch into the selected faces of the atlas
                    # and drop the cached list form.
                    mesh.textures._atlas_padded[:,self.idx,:,:,:] = clamped_patch
                    mesh.textures.atlas = mesh.textures._atlas_padded
                    mesh.textures._atlas_list = None

                    # Render mesh onto background image
                    rand_translation = torch.randint(
                        -self.config.rand_translation,
                        self.config.rand_translation,
                        (2,)
                    )

                    images = self.render_mesh_on_bg_batch(mesh, bg_batch, self.num_angles_train, x_translation=rand_translation[0].item(),
                                                          y_translation=rand_translation[1].item())

                    # Keep the first three channels and move channels in
                    # front of the spatial axes.
                    reshape_img = images[:,:,:,:3].permute(0, 3, 1, 2)
                    reshape_img = reshape_img.to(self.device)

                    # image_tensor, scale = dataset_class.preprocess(reshape_img, Config.IMAGE_MIN_SIDE, Config.IMAGE_MAX_SIDE)
                    detection_bboxes, detection_classes, detection_probs, _ = \
                        model.eval().forward(reshape_img.cuda())

                    # detection_bboxes /= scale
                    kept_indices = detection_probs > prob_thresh
                    detection_bboxes = detection_bboxes[kept_indices]
                    detection_classes = detection_classes[kept_indices]
                    detection_probs = detection_probs[kept_indices]

                    # 1 where the detected class is 1, else 0.
                    # NOTE(review): the ones/zeros operands are created on
                    # the CPU — confirm this matches detection_classes' device.
                    human_dets = torch.where(detection_classes == 1, torch.ones(1), torch.zeros(1)).cuda()
                    disap_loss = torch.mean(human_dets * detection_probs)

                    tv = total_variation(self.patch)
                    tv_loss = tv * 2.5

                    loss = disap_loss + tv_loss

                    n += bg_batch.shape[0]

                    # Skip the update when the loss is NaN.
                    if torch.isnan(loss).item():
                        continue

                    ep_loss += loss.item()

                    loss.backward(retain_graph=True)
                    optimizer.step()

            # Save image and print performance statistics
            print('tv={}, dis={}'.format(tv_loss, disap_loss))
            patch_save = self.patch.cpu().detach().clone()
            idx_save = self.idx.cpu().detach().clone()
            torch.save(patch_save, 'patch_save.pt')
            torch.save(idx_save, 'idx_save.pt')

            print('epoch={} loss={}'.format(
                epoch,
                (ep_loss / n)
                )
            )

            # Evaluate every fifth epoch, then switch back to the training
            # cameras (test_patch selects the test cameras).
            if epoch % 5 == 0:
                self.test_patch()
                self.change_cameras('train')

    def attack(self):
        """Train the patch against the YOLO detector ``self.dnet``.

        The loss is ``dis_loss`` on the detector output plus 2.5 times the
        patch total variation. After every epoch the patch and face index are
        saved to ``patch_save.pt`` / ``idx_save.pt`` and one rendered sample
        to ``TEST_RENDER.png``.
        """
        train_bgs = DataLoader(
            self.bg_dataset,
            batch_size=self.config.batch_size,
            shuffle=True,
            num_workers=1)

        if self.patch is None or self.idx is None:
            self.initialize_patch()

        mesh = self.mesh_dataset.meshes[0]
        total_variation = TotalVariation_3d(mesh, self.idx).to(self.device)

        # The patch tensor is the only optimised parameter.
        optimizer = torch.optim.SGD([self.patch], lr=1e-1, momentum=0.9)

        for epoch in range(self.config.epochs):
            ep_loss = 0.0
            ep_acc = 0.0
            n = 0.0

            for mesh in self.mesh_dataset:
                # Copy mesh for each camera angle
                mesh = mesh.extend(self.num_angles_train)

                for bg_batch in train_bgs:
                    bg_batch = bg_batch.to(self.device)

                    # To enable random camera distance training, uncomment this line:
                    # self.change_cameras('train', camera_dist=random.uniform(1.4, 3.0))

                    optimizer.zero_grad()

                    # NOTE(review): the return value is unused; presumably
                    # called so that _atlas_padded is populated — confirm.
                    texture_image = mesh.textures.atlas_padded()

                    # Random patch augmentation
                    contrast = torch.FloatTensor(1).uniform_(self.min_contrast, self.max_contrast).to(self.device)
                    brightness = torch.FloatTensor(1).uniform_(self.min_brightness, self.max_brightness).to(self.device)
                    noise = torch.FloatTensor(self.patch.shape).uniform_(-1, 1) * self.noise_factor
                    noise = noise.to(self.device)
                    augmented_patch = (self.patch * contrast) + brightness + noise

                    # Clamp patch to avoid PyTorch3D issues
                    clamped_patch = augmented_patch.clone().clamp(min=1e-6, max=0.99999)

                    # Write the patch into the selected faces of the atlas
                    # and drop the cached list form.
                    mesh.textures._atlas_padded[:,self.idx,:,:,:] = clamped_patch
                    mesh.textures.atlas = mesh.textures._atlas_padded
                    mesh.textures._atlas_list = None

                    # Render mesh onto background image
                    rand_translation = torch.randint(
                        -self.config.rand_translation,
                        self.config.rand_translation,
                        (2,)
                    )

                    images = self.render_mesh_on_bg_batch(mesh, bg_batch, self.num_angles_train, x_translation=rand_translation[0].item(),
                                                          y_translation=rand_translation[1].item())

                    # Keep the first three channels and move channels in
                    # front of the spatial axes.
                    reshape_img = images[:,:,:,:3].permute(0, 3, 1, 2)
                    reshape_img = reshape_img.to(self.device)

                    # Run detection model on images
                    output = self.dnet(reshape_img)

                    d_loss = dis_loss(output, self.dnet.num_classes, self.dnet.anchors, self.dnet.num_anchors, 0)
                    acc_loss = calc_acc(output, self.dnet.num_classes, self.dnet.num_anchors, 0)

                    tv = total_variation(self.patch)
                    tv_loss = tv * 2.5

                    loss = d_loss + tv_loss

                    ep_loss += loss.item()
                    ep_acc += acc_loss.item()

                    n += bg_batch.shape[0]

                    loss.backward(retain_graph=True)
                    optimizer.step()

            # Save image and print performance statistics
            patch_save = self.patch.cpu().detach().clone()
            idx_save = self.idx.cpu().detach().clone()
            torch.save(patch_save, 'patch_save.pt')
            torch.save(idx_save, 'idx_save.pt')

            save_image(reshape_img[0].cpu().detach(), "TEST_RENDER.png")

            print('epoch={} loss={} success_rate={}'.format(
                epoch,
                (ep_loss / n),
                (ep_acc / n) / self.num_angles_train)
            )

            # Evaluate every fifth epoch, then switch back to the training
            # cameras (test_patch selects the test cameras).
            if epoch % 5 == 0:
                self.test_patch()
                self.change_cameras('train')

    def test_patch(self):
        """Evaluate the current patch against YOLO on the test backgrounds.

        Switches the renderer to the test cameras, accumulates ``calc_acc``
        per camera angle, saves one rendered sample to ``TEST.png`` and
        prints per-angle and overall success rates.
        """
        self.change_cameras('test')
        angle_success = torch.zeros(self.num_angles_test)
        total_loss = 0.0  # NOTE(review): never updated or read
        n = 0.0
        for mesh in self.mesh_dataset:
            mesh = mesh.extend(self.num_angles_test)
            for bg_batch in self.test_bgs:
                bg_batch = bg_batch.to(self.device)

                texture_image=mesh.textures.atlas_padded()

                # No augmentation at test time: the clamped patch is used
                # as-is.
                clamped_patch = self.patch.clone().clamp(min=1e-6, max=0.99999)
                mesh.textures._atlas_padded[:,self.idx,:,:,:] = clamped_patch

                mesh.textures.atlas = mesh.textures._atlas_padded
                mesh.textures._atlas_list = None

                rand_translation = torch.randint(
                    -self.config.rand_translation,
                    self.config.rand_translation,
                    (2,)
                )

                images = self.render_mesh_on_bg_batch(mesh, bg_batch, self.num_angles_test, x_translation=rand_translation[0].item(),
                                                      y_translation=rand_translation[1].item())

                reshape_img = images[:,:,:,:3].permute(0, 3, 1, 2)
                reshape_img = reshape_img.to(self.device)
                output = self.dnet(reshape_img)

                # One accuracy value per camera angle.
                for angle in range(self.num_angles_test):
                    acc_loss = calc_acc(output[angle], self.dnet.num_classes, self.dnet.num_anchors, 0)
                    angle_success[angle] += acc_loss.item()

                n += bg_batch.shape[0]

        save_image(reshape_img[0].cpu().detach(), "TEST.png")
        unseen_success_rate = torch.sum(angle_success) / (n * self.num_angles_test)
        print('Angle success rates: ', angle_success / n)
        print('Unseen bg success rate: ', unseen_success_rate.item())

    def test_patch_faster_rcnn(self,
                               path_to_checkpoint: str,
                               dataset_name: str,
                               backbone_name: str,
                               prob_thresh: float):
        """Render the patched mesh and draw Faster R-CNN detections on it.

        For every test background, each camera angle is run through the
        detector and detections above ``prob_thresh`` are drawn on the
        rendered image. The image for angle 0 is saved to
        ``out/images/test_<n>.png``.
        """
        dataset_class = DatasetBase.from_name(dataset_name)
        backbone = BackboneBase.from_name(backbone_name)(pretrained=False)
        model = FasterRCNN(backbone, dataset_class.num_classes(), pooler_mode=Config.POOLER_MODE,
                           anchor_ratios=Config.ANCHOR_RATIOS, anchor_sizes=Config.ANCHOR_SIZES,
                           rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N).cuda()
        model.load(path_to_checkpoint)

        angle_success = torch.zeros(self.num_angles_test)  # NOTE(review): never updated or read
        total_loss = 0.0  # NOTE(review): never updated or read
        n = 0.0
        with torch.no_grad():
            for mesh in self.mesh_dataset:
                mesh = mesh.extend(self.num_angles_test)
                for bg_batch in self.test_bgs:
                    bg_batch = bg_batch.to(self.device)

                    texture_image=mesh.textures.atlas_padded()

                    clamped_patch = self.patch.clone().clamp(min=1e-6, max=0.99999)
                    mesh.textures._atlas_padded[:,self.idx,:,:,:] = clamped_patch

                    mesh.textures.atlas = mesh.textures._atlas_padded
                    mesh.textures._atlas_list = None

                    rand_translation = torch.randint(
                        -self.config.rand_translation,
                        self.config.rand_translation,
                        (2,)
                    )

                    images = self.render_mesh_on_bg_batch(
                        mesh,
                        bg_batch,
                        self.num_angles_test,
                        x_translation=rand_translation[0].item(),
                        y_translation=rand_translation[1].item()
                    )

                    reshape_img = images[:,:,:,:3].permute(0, 3, 1, 2)
                    save_image(reshape_img[0].cpu().detach(), "TEST_PRE.png")

                    for angle in range(self.num_angles_test):
                        # PIL copy of this view; detections are drawn on it.
                        image = torchvision.transforms.ToPILImage()(reshape_img[angle,:,:,:].cpu())
                        # image_tensor, scale = dataset_class.preprocess(image, Config.IMAGE_MIN_SIDE, Config.IMAGE_MAX_SIDE)
                        image_tensor = reshape_img[angle, ..., :]
                        scale = 1.0
                        save_image(image_tensor.cpu().detach(), "TEST_POST.png")

                        # NOTE(review): the image re-read from disk is
                        # overwritten on the next line by the tensor of the
                        # in-memory `image` — confirm which one was intended.
                        img = Image.open('TEST_POST.png').convert('RGB')
                        img = torchvision.transforms.ToTensor()(image)
                        image_tensor = img.cuda()

                        detection_bboxes, detection_classes, detection_probs, _ = \
                            model.eval().forward(image_tensor.unsqueeze(dim=0).cuda())

                        detection_bboxes /= scale

                        # Keep detections above the probability threshold.
                        kept_indices = detection_probs > prob_thresh
                        detection_bboxes = detection_bboxes[kept_indices]
                        detection_classes = detection_classes[kept_indices]
                        detection_probs = detection_probs[kept_indices]

                        draw = ImageDraw.Draw(image)

                        for bbox, cls, prob in zip(detection_bboxes.tolist(), detection_classes.tolist(), detection_probs.tolist()):
                            color = random.choice(['red', 'green', 'blue', 'yellow', 'purple', 'white'])
                            bbox = BBox(left=bbox[0], top=bbox[1], right=bbox[2], bottom=bbox[3])
                            category = dataset_class.LABEL_TO_CATEGORY_DICT[cls]

                            draw.rectangle(((bbox.left, bbox.top), (bbox.right, bbox.bottom)), outline=color, width=3)
                            draw.text((bbox.left, bbox.top), text=f'{category:s} {prob:.3f}', fill=color)

                        # Only the first angle of each background is saved.
                        if angle==0:
                            image.save("out/images/test_%d.png" % n)

                    n += 1.0

    def initialize_patch(self):
        """Create a random patch for the faces listed in ``config.idx``.

        Sets ``self.idx`` (long tensor of face indices) and ``self.patch``
        (uniform random tensor of shape ``(num_faces, 1, 1, 3)`` with
        ``requires_grad=True``).
        """
        print('Initializing patch...')
        # An earlier, disabled variant selected faces in code: it marked the
        # vertices inside a sub-box of the mesh bounding box (separate
        # thresholds for front, back and leg regions) and kept every face
        # touching a marked vertex. Faces now come from the index file below.

        # Sample faces from index file:
        sampled_planes = np.load(self.config.idx).tolist()
        idx = torch.Tensor(sampled_planes).long().to(self.device)
        self.idx = idx
        patch = torch.rand(len(sampled_planes), 1, 1, 3, device=(self.device), requires_grad=True)
        self.patch = patch

    def create_renderer(self):
        """Build the mesh renderer and the train/test camera sets.

        Cameras are spread evenly in azimuth over
        ``[-angle_range, +angle_range]`` at distance 2.2 and elevation 6. The
        returned renderer starts out on the training cameras.
        """
        self.num_angles_train = self.config.num_angles_train
        self.num_angles_test = self.config.num_angles_test

        azim_train = torch.linspace(-1 * self.config.angle_range_train, self.config.angle_range_train, self.num_angles_train)
        azim_test = torch.linspace(-1 * self.config.angle_range_test, self.config.angle_range_test, self.num_angles_test)

        # Cameras for SMPL meshes:
        camera_dist = 2.2
        R, T = look_at_view_transform(camera_dist, 6, azim_train)
        train_cameras = FoVPerspectiveCameras(device=self.device, R=R, T=T)
        self.train_cameras = train_cameras

        R, T = look_at_view_transform(camera_dist, 6, azim_test)
        test_cameras = FoVPerspectiveCameras(device=self.device, R=R, T=T)
        self.test_cameras = test_cameras

        raster_settings = RasterizationSettings(
            image_size=self.config.img_size,
            blur_radius=0.0,
            faces_per_pixel=1,
        )

        lights = PointLights(device=self.device, location=[[0.0, 85, 100.0]])

        renderer = MeshRenderer(
            rasterizer=MeshRasterizer(
                cameras=train_cameras,
                raster_settings=raster_settings
            ),
            shader=HardPhongShader(
                device=self.device,
                cameras=train_cameras,
                lights=lights
            )
        )

        return renderer

    def change_cameras(self, mode, camera_dist=2.2):
        """Rebuild both camera sets at ``camera_dist`` and select one.

        ``mode`` is ``'train'`` or ``'test'``; any other value rebuilds the
        cameras but leaves the renderer's selection unchanged.
        """
        azim_train = torch.linspace(-1 * self.config.angle_range_train, self.config.angle_range_train, self.num_angles_train)
        azim_test = torch.linspace(-1 * self.config.angle_range_test, self.config.angle_range_test, self.num_angles_test)

        R, T = look_at_view_transform(camera_dist, 6, azim_train)
        train_cameras = FoVPerspectiveCameras(device=self.device, R=R, T=T)
        self.train_cameras = train_cameras

        R, T = look_at_view_transform(camera_dist, 6, azim_test)
        test_cameras = FoVPerspectiveCameras(device=self.device, R=R, T=T)
        self.test_cameras = test_cameras

        # Rasterizer and shader must always use the same camera set.
        if mode == 'train':
            self.renderer.rasterizer.cameras=self.train_cameras
            self.renderer.shader.cameras=self.train_cameras
        elif mode == 'test':
            self.renderer.rasterizer.cameras=self.test_cameras
            self.renderer.shader.cameras=self.test_cameras

    def render_mesh_on_bg(self, mesh, bg_img, num_angles, location=None, x_translation=0, y_translation=0):
        """Render ``mesh`` and composite it onto a single background (on CPU).

        Pixels of the render whose value equals 1 are replaced by the
        background; all other pixels keep the rendered colour. Returns a
        ``(num_angles, H, W, 3)`` tensor where H and W are the background
        size.

        ``location`` is the 4-tuple handed to ``F.pad`` to place the render
        inside the background; by default the render is centred and shifted
        by the translations.
        """
        images = self.renderer(mesh)
        bg = bg_img.unsqueeze(0)
        bg_shape = bg.shape

        # Channel-first background -> channel-last.
        new_bg = torch.zeros(bg_shape[2], bg_shape[3], 3)
        new_bg[:,:,0] = bg[0,0,:,:]
        new_bg[:,:,1] = bg[0,1,:,:]
        new_bg[:,:,2] = bg[0,2,:,:]

        human = images[:, ..., :3]

        human_size = self.renderer.rasterizer.raster_settings.image_size

        if location is None:
            dH = bg_shape[2] - human_size
            dW = bg_shape[3] - human_size
            location = (
                dW // 2 + x_translation,
                dW - (dW // 2) - x_translation,
                dH // 2 + y_translation,
                dH - (dH // 2) - y_translation
            )

        # Mask: 0 where the render equals 1, 1 elsewhere.
        # NOTE(review): presumably value 1 is the empty (white) render
        # background — confirm.
        contour = torch.where((human == 1).cpu(), torch.zeros(1).cpu(), torch.ones(1).cpu())

        # Pad mask and render up to the background size, channel by channel.
        new_contour = torch.zeros(num_angles, bg_shape[2], bg_shape[3], 3)
        new_contour[:,:,:,0] = F.pad(contour[:,:,:,0], location, "constant", value=0)
        new_contour[:,:,:,1] = F.pad(contour[:,:,:,1], location, "constant", value=0)
        new_contour[:,:,:,2] = F.pad(contour[:,:,:,2], location, "constant", value=0)

        new_human = torch.zeros(num_angles, bg_shape[2], bg_shape[3], 3)
        new_human[:,:,:,0] = F.pad(human[:,:,:,0], location, "constant", value=0)
        new_human[:,:,:,1] = F.pad(human[:,:,:,1], location, "constant", value=0)
        new_human[:,:,:,2] = F.pad(human[:,:,:,2], location, "constant", value=0)

        # Background where the mask is 0, render elsewhere.
        final = torch.where((new_contour == 0).cpu(), new_bg.cpu(), new_human.cpu())
        return final

    def render_mesh_on_bg_batch(self, mesh, bg_imgs, num_angles, location=None, x_translation=0, y_translation=0):
        """Render ``mesh`` and composite it onto a batch of backgrounds.

        Same compositing rule as ``render_mesh_on_bg`` but kept on
        ``self.device``. Returns a ``(num_angles * num_bgs, H, W, 3)``
        tensor.
        """
        num_bgs = bg_imgs.shape[0]

        images = self.renderer(mesh)  # (num_angles, 416, 416, 4)
        images = torch.cat(num_bgs*[images], dim=0)  # (num_angles * num_bgs, 416, 416, 4)

        bg_shape = bg_imgs.shape

        # bg_imgs: (num_bgs, 3, 416, 416) -> (num_bgs, 416, 416, 3)
        bg_imgs = bg_imgs.permute(0, 2, 3, 1)

        # bg_imgs: (num_bgs, 416, 416, 3) -> (num_bgs * num_angles, 416, 416, 3)
        bg_imgs = bg_imgs.repeat_interleave(repeats=num_angles, dim=0)
        # NOTE(review): the renders are tiled with cat (angles cycle fastest)
        # while the backgrounds use repeat_interleave (each background
        # repeated consecutively) — confirm this pairing is intended.

        # human: RGB channels of render (num_angles * num_bgs, 416, 416, 3)
        human = images[:, ..., :3]
        human_size = self.renderer.rasterizer.raster_settings.image_size

        if location is None:
            dH = bg_shape[2] - human_size
            dW = bg_shape[3] - human_size
            location = (
                dW // 2 + x_translation,
                dW - (dW // 2) - x_translation,
                dH // 2 + y_translation,
                dH - (dH // 2) - y_translation
            )

        # Mask: 0 where the render equals 1, 1 elsewhere.
        contour = torch.where((human == 1), torch.zeros(1).to(self.device), torch.ones(1).to(self.device))

        # Pad mask and render up to the background size, channel by channel.
        new_contour = torch.zeros(num_angles * num_bgs, bg_shape[2], bg_shape[3], 3, device=self.device)
        new_contour[:,:,:,0] = F.pad(contour[:,:,:,0], location, "constant", value=0)
        new_contour[:,:,:,1] = F.pad(contour[:,:,:,1], location, "constant", value=0)
        new_contour[:,:,:,2] = F.pad(contour[:,:,:,2], location, "constant", value=0)

        new_human = torch.zeros(num_angles * num_bgs, bg_shape[2], bg_shape[3], 3, device=self.device)
        new_human[:,:,:,0] = F.pad(human[:,:,:,0], location, "constant", value=0)
        new_human[:,:,:,1] = F.pad(human[:,:,:,1], location, "constant", value=0)
        new_human[:,:,:,2] = F.pad(human[:,:,:,2], location, "constant", value=0)

        # output: (num_angles * num_bgs, 416, 416, 3)
        final = torch.where((new_contour == 0), bg_imgs, new_human)
        return final
def valid(datacfg, cfgfile, weightfile, outfile):
    """Run the detector over the validation list and write per-class results.

    One text file per class is written to ``results/<outfile><class>.txt``.
    Each line has the form ``<image id> <score> <x1> <y1> <x2> <y2>`` with the
    box corners scaled to the size of the original image.

    Args:
        datacfg: Path of the data config; its ``valid`` and ``names`` entries
            are read.
        cfgfile: Network configuration file passed to ``Darknet``.
        weightfile: Weights file loaded into the network.
        outfile: Prefix used for every per-class result file name.
    """
    options = read_data_cfg(datacfg)
    valid_images = options['valid']
    name_list = options['names']

    prefix = 'results'
    names = load_class_names(name_list)
    print(names)

    with open(valid_images) as fp:
        valid_files = [line.rstrip() for line in fp]

    m = Darknet(cfgfile)
    m.print_network()
    m.load_weights(weightfile)
    m.cuda()
    m.eval()

    valid_dataset = dataset.listDataset(
        valid_images,
        shape=(m.width, m.height),
        shuffle=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
        ]))
    valid_batchsize = 2
    # NOTE(review): the original author requires a batch larger than one;
    # the reason is not visible in this function -- confirm before changing.
    assert (valid_batchsize > 1)

    kwargs = {'num_workers': 4, 'pin_memory': True}
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=valid_batchsize, shuffle=False, **kwargs)

    os.makedirs(prefix, exist_ok=True)
    print(len(names), m.num_classes)

    conf_thresh = 0.005
    nms_thresh = 0.45

    fps = []
    try:
        for i in range(m.num_classes):
            fps.append(open('%s/%s%s.txt' % (prefix, outfile, names[i]), 'w'))

        # The loader is not shuffled, so the running sample index lines up
        # with the order of ``valid_files``.
        files_iter = iter(valid_files)
        for data, _target in tqdm(valid_loader):
            data = data.cuda()
            # Inference only: do not build an autograd graph.
            with torch.no_grad():
                output = m(data)
            batch_boxes = get_all_boxes(output, conf_thresh, m.num_classes,
                                        only_objectness=0, validation=True)

            for i in range(data.size(0)):
                image_path = next(files_iter)
                fileId = os.path.basename(image_path).split('.')[0]
                width, height = get_image_size(image_path)
                boxes = nms(batch_boxes[i], nms_thresh)

                for box in boxes:
                    # Boxes are (cx, cy, w, h) relative to the image size.
                    x1 = (box[0] - box[2] / 2.0) * width
                    y1 = (box[1] - box[3] / 2.0) * height
                    x2 = (box[0] + box[2] / 2.0) * width
                    y2 = (box[1] + box[3] / 2.0) * height

                    det_conf = box[4]
                    # After the first five entries a box carries
                    # (class confidence, class id) pairs.
                    for j in range((len(box) - 5) // 2):
                        cls_conf = box[5 + 2 * j]
                        cls_id = box[6 + 2 * j]
                        prob = det_conf * cls_conf
                        fps[cls_id].write('%s %f %f %f %f %f\n' %
                                          (fileId, prob, x1, y1, x2, y2))
    finally:
        # Close every file that was opened, even if evaluation failed.
        for fp in fps:
            fp.close()
class Darknet_Detector():
    """Thin wrapper that runs a Darknet YOLO model on single images."""

    def __init__(self, id_num, cfg_file, wt_file, class_file, pallete_file,
                 nms_threshold=.3, conf=0.7, resolution=1024, num_classes=80,
                 nms_classwise=True):
        """Load the network and its auxiliary files.

        Args:
            id_num: CUDA device index; 0 or 1 selects that GPU, any other
                value keeps the model on the CPU.
            cfg_file: Darknet configuration file.
            wt_file: Darknet weights file.
            class_file: File passed to ``load_classes``.
            pallete_file: Pickle file holding the drawing colors.
            nms_threshold: Value passed to ``write_results`` as ``nms_conf``.
            conf: Confidence threshold passed to ``write_results``.
            resolution: Square network input size; must be a multiple of 32
                and larger than 32.
            num_classes: Number of classes passed to ``write_results``.
            nms_classwise: Stored on the instance; not used in this class.
        """
        # Set up the neural network
        print("Loading network.....")
        self.model = Darknet(cfg_file)
        self.model.load_weights(wt_file)
        print("Network successfully loaded")

        self.nms = nms_threshold
        self.conf = conf
        self.nms_classwise = nms_classwise
        self.resolution = resolution  # network input size (square)

        if id_num == 0 or id_num == 1:
            self.CUDA = True
            torch.cuda.set_device(int(id_num))
            torch.cuda.empty_cache()
        else:
            self.CUDA = False

        # NOTE: unpickling executes arbitrary code; only load palette files
        # from a trusted source.
        with open(pallete_file, "rb") as palette_fp:
            self.colors = pkl.load(palette_fp)

        self.num_classes = num_classes
        self.classes = load_classes(class_file)

        self.model.net_info["height"] = self.resolution
        inp_dim = int(self.model.net_info["height"])
        assert inp_dim % 32 == 0
        assert inp_dim > 32

        # If a GPU was selected, put the model on it
        if self.CUDA:
            self.model.cuda()

        # Set the model in evaluation mode
        self.model.eval()

    def prep_image(self, img, inp_dim):
        """Prepare an image for the network.

        The image is resized to ``inp_dim x inp_dim`` (aspect ratio is not
        preserved), its channel order is reversed, and it is converted to a
        float tensor in [0, 1] with a leading batch dimension.

        Returns:
            Tuple ``(tensor, original image, (width, height))``.
        """
        orig_im = img
        dim = orig_im.shape[1], orig_im.shape[0]
        img = cv2.resize(orig_im, (inp_dim, inp_dim))
        img_ = img[:, :, ::-1].transpose((2, 0, 1)).copy()
        img_ = torch.from_numpy(img_).float().div(255.0).unsqueeze(0)
        return img_, orig_im, dim

    def write(self, x, img):
        """Draw one detection row ``x`` (box corners in x[1:5], class in x[-1]) on ``img``."""
        # cv2 expects plain Python ints for point coordinates.
        c1 = tuple(x[1:3].int().tolist())
        c2 = tuple(x[3:5].int().tolist())
        cls = int(x[-1])
        label = "{0}".format(self.classes[cls])
        color = random.choice(self.colors)
        cv2.rectangle(img, c1, c2, color, 1)
        t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
        c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
        cv2.rectangle(img, c1, c2, color, -1)
        cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4),
                    cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
        return img

    def detect(self, image, show=False, verbose=False, save_file=None):
        """Run detection on ``image`` and draw the results onto it.

        Args:
            image: Either an already loaded image array or a file path that
                ``cv2.imread`` can open.
            show: If true, display the annotated image and wait for a key.
            verbose: If true, print the achieved frames per second.
            save_file: Optional path the annotated image is written to.

        Returns:
            Tuple ``(output, annotated image)``. ``output`` is whatever
            ``write_results`` returned; when it is a tensor its box columns
            1..4 have been rescaled to original-image pixels.
        """
        start = time.time()

        try:
            # image is already loaded
            img, orig_im, dim = self.prep_image(image, self.resolution)
        except AttributeError:
            # image has no ``shape``: treat it as a file path
            image = cv2.imread(image)
            img, orig_im, dim = self.prep_image(image, self.resolution)

        if self.CUDA:
            img = img.cuda()

        # Inference only: do not build an autograd graph.
        with torch.no_grad():
            output = self.model(Variable(img), self.CUDA)
        output = write_results(output, self.conf, self.num_classes,
                               nms=True, nms_conf=self.nms)

        # NOTE(review): other callers of write_results in this code base
        # treat an int return value as "no detections" -- confirm; in that
        # case there is nothing to rescale or draw.
        if not isinstance(output, int):
            # Normalise box corners to [0, 1] and scale to the original size.
            output[:, 1:5] = torch.clamp(
                output[:, 1:5], 0.0, float(self.resolution)) / self.resolution
            output[:, [1, 3]] *= orig_im.shape[1]
            output[:, [2, 4]] *= orig_im.shape[0]

            for detection in output:
                self.write(detection, orig_im)

        if verbose:
            print("FPS of the video is {:5.2f}".format(1.0 / (time.time() - start)))
        if save_file is not None:
            cv2.imwrite(save_file, orig_im)
        if show:
            cv2.imshow("frame", orig_im)
            cv2.waitKey(0)
        return output, orig_im
# Feed one random batch through three builds of the re-id network.
import torch
from torch.autograd import Variable
from PIL import Image

from darknet import Darknet
from caffenet import CaffeNet
from utils import image2torch, convert2cpu

cfgfile1 = 'reid.cfg'
weightfile1 = 'reid.weights'
cfgfile2 = 'reid_nbn.cfg'
weightfile2 = 'reid_nbn.weights'
cfgfile3 = 'reid_nbn.prototxt'
weightfile3 = 'reid_nbn.caffemodel'

m1 = Darknet(cfgfile1)
m1.load_weights(weightfile1)
m1.eval()

m2 = Darknet(cfgfile2)
m2.load_weights(weightfile2)
m2.eval()

m3 = CaffeNet(cfgfile3)
m3.load_weights(weightfile3)
m3.eval()

# ``torch`` must be imported explicitly: torch.rand is used below.
img = torch.rand(8, 3, 128, 64)
img = Variable(img)

output1 = m1(img).clone()
output2 = m2(img).clone()
output3 = m3(img).clone()
def valid(datacfg, cfgfile, weightfile, outfile):
    """Evaluate a 6D pose network on the validation list and log the metrics.

    For every ground-truth object the highest-confidence prediction is taken,
    both corner sets are scaled to a 1280x720 image, poses are recovered with
    ``pnp`` and the 2D projection, 3D vertex, translation, angle and corner
    errors are accumulated. Per-sample poses and corners are written below
    ``<backup>/test`` and a summary ``.mat`` file is saved to ``<backup>``.

    Args:
        datacfg: Data config; its ``valid``, ``mesh``, ``backup``, ``name``
            and ``diam`` entries are read.
        cfgfile: Network configuration file passed to ``Darknet``.
        weightfile: Weights file loaded into the network.
        outfile: Unused; kept for interface compatibility.
    """

    def truths_length(truths):
        """Count the leading target rows whose second entry is non-zero."""
        max_rows = min(50, len(truths))
        for i in range(max_rows):
            if truths[i][1] == 0:
                return i
        # No terminator row found: every inspected row is a real target.
        return max_rows

    # Parse configuration files
    options = read_data_cfg(datacfg)
    valid_images = options['valid']
    meshname = options['mesh']
    backupdir = options['backup']
    name = options['name']
    if not os.path.exists(backupdir):
        makedirs(backupdir)

    # Parameters
    seed = int(time.time())
    gpus = '0'  # Specify which gpus to use
    test_width = 544
    test_height = 544
    im_width = 1280   # size the normalised corners are scaled to
    im_height = 720
    torch.manual_seed(seed)
    use_cuda = True
    if use_cuda:
        os.environ['CUDA_VISIBLE_DEVICES'] = gpus
        torch.cuda.manual_seed(seed)
    save = True
    testtime = True
    num_classes = 1
    testing_samples = 0.0
    eps = 1e-5
    notpredicted = 0
    conf_thresh = 0.1
    if save:
        makedirs(backupdir + '/test')
        makedirs(backupdir + '/test/gt')
        makedirs(backupdir + '/test/pr')

    # To save
    testing_error_trans = 0.0
    testing_error_angle = 0.0
    testing_error_pixel = 0.0
    errs_2d = []
    errs_3d = []
    errs_trans = []
    errs_angle = []
    errs_corner2D = []
    preds_trans = []
    preds_rot = []
    preds_corners2D = []
    gts_trans = []
    gts_rot = []
    gts_corners2D = []

    # Read object model information, get 3D bounding box corners
    mesh = MeshPly(meshname)
    vertices = np.c_[np.array(mesh.vertices),
                     np.ones((len(mesh.vertices), 1))].transpose()
    print('vertices', vertices)
    corners3D = get_3D_corners(vertices)
    print('corners3D', corners3D)
    diam = float(options['diam'])

    # 3D points handed to PnP: the origin followed by the box corners.
    # They do not change between samples, so build them once.
    points3D = np.array(
        np.transpose(
            np.concatenate((np.zeros((3, 1)), corners3D[:3, :]), axis=1)),
        dtype='float32')

    # Read intrinsic camera parameters
    internal_calibration = get_camera_intrinsic()
    camera_matrix = np.array(internal_calibration, dtype='float32')

    # Get validation file names
    with open(valid_images) as fp:
        valid_files = [line.rstrip() for line in fp]

    # Specify model, load pretrained weights, pass to GPU and set the module
    # in evaluation mode
    model = Darknet(cfgfile)
    model.print_network()
    model.load_weights(weightfile)
    model.cuda()
    model.eval()

    # Get the parser for the test dataset
    valid_dataset = dataset.listDataset(
        valid_images,
        shape=(test_width, test_height),
        shuffle=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
        ]))
    valid_batchsize = 1

    # Specify the number of workers for multiple processing, get the
    # dataloader for the test dataset
    kwargs = {'num_workers': 4, 'pin_memory': True}
    test_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=valid_batchsize, shuffle=False, **kwargs)

    logging(" Testing {}...".format(name))
    logging(" Number of test samples: %d" % len(test_loader.dataset))

    # Iterate through test batches (Batch size for test data is 1)
    count = 0
    # Defined up front so the timing report works even for an empty loader.
    t1 = t2 = t3 = t4 = t5 = time.time()
    for batch_idx, (data, target) in enumerate(test_loader):
        t1 = time.time()
        # Pass data to GPU
        if use_cuda:
            data = data.cuda()
            target = target.cuda()
        t2 = time.time()

        # Forward pass without autograd bookkeeping (replaces the removed
        # ``volatile=True`` flag).
        with torch.no_grad():
            output = model(data).data
        t3 = time.time()

        # Using confidence threshold, eliminate low-confidence predictions
        all_boxes = get_region_boxes(output, conf_thresh, num_classes)
        t4 = time.time()

        # Iterate through all images in the batch
        for i in range(output.size(0)):
            print('output.size(0) is ', output.size(0))

            # For each image, get all the predictions
            boxes = all_boxes[i]

            # For each image, get all the targets (for multiple object pose
            # estimation, there might be more than 1 target per image)
            truths = target[i].view(-1, 21)

            # Get how many objects are present in the scene
            num_gts = truths_length(truths)

            # Iterate through each ground-truth object
            for k in range(num_gts):
                # 18 corner coordinates, two unit confidences, class id.
                # float() keeps NumPy usable when the target lives on the GPU.
                box_gt = [float(truths[k][c]) for c in range(1, 19)]
                box_gt += [1.0, 1.0, float(truths[k][0])]

                # Pick the prediction with the highest confidence as our
                # prediction for single object pose estimation
                best_conf_est = -1
                box_pr = None
                for j in range(len(boxes)):
                    if boxes[j][18] > best_conf_est:
                        match = corner_confidence9(
                            box_gt[:18], torch.FloatTensor(boxes[j][:18]))
                        box_pr = boxes[j]
                        best_conf_est = boxes[j][18]

                if box_pr is None:
                    # Nothing predicted for this object: do not reuse the
                    # previous sample's box. Still advance ``count`` so the
                    # saved file names stay aligned with ``valid_files``.
                    notpredicted += 1
                    count = count + 1
                    continue

                # Denormalize the corner predictions
                corners2D_gt = np.array(np.reshape(box_gt[:18], [9, 2]),
                                        dtype='float32')
                corners2D_pr = np.array(np.reshape(box_pr[:18], [9, 2]),
                                        dtype='float32')
                corners2D_gt[:, 0] = corners2D_gt[:, 0] * im_width
                corners2D_gt[:, 1] = corners2D_gt[:, 1] * im_height
                corners2D_pr[:, 0] = corners2D_pr[:, 0] * im_width
                corners2D_pr[:, 1] = corners2D_pr[:, 1] * im_height
                preds_corners2D.append(corners2D_pr)
                gts_corners2D.append(corners2D_gt)

                # Compute corner prediction error
                corner_norm = np.linalg.norm(corners2D_gt - corners2D_pr, axis=1)
                corner_dist = np.mean(corner_norm)
                errs_corner2D.append(corner_dist)

                # Compute [R|t] by pnp
                _, R_gt, t_gt = pnp(points3D, corners2D_gt, camera_matrix)
                _, R_pr, t_pr = pnp(points3D, corners2D_pr, camera_matrix)

                if save:
                    preds_trans.append(t_pr)
                    gts_trans.append(t_gt)
                    preds_rot.append(R_pr)
                    gts_rot.append(R_gt)

                    # File stem taken from the validation file name.
                    stem = valid_files[count][-8:-3]
                    np.savetxt(backupdir + '/test/gt/R_' + stem + 'txt',
                               np.array(R_gt, dtype='float32'))
                    np.savetxt(backupdir + '/test/gt/t_' + stem + 'txt',
                               np.array(t_gt, dtype='float32'))
                    np.savetxt(backupdir + '/test/pr/R_' + stem + 'txt',
                               np.array(R_pr, dtype='float32'))
                    np.savetxt(backupdir + '/test/pr/t_' + stem + 'txt',
                               np.array(t_pr, dtype='float32'))
                    np.savetxt(backupdir + '/test/gt/corners_' + stem + 'txt',
                               np.array(corners2D_gt, dtype='float32'))
                    np.savetxt(backupdir + '/test/pr/corners_' + stem + 'txt',
                               np.array(corners2D_pr, dtype='float32'))

                # Compute translation error
                trans_dist = np.sqrt(np.sum(np.square(t_gt - t_pr)))
                errs_trans.append(trans_dist)

                # Compute angle error
                angle_dist = calcAngularDistance(R_gt, R_pr)
                errs_angle.append(angle_dist)

                # Compute pixel error
                Rt_gt = np.concatenate((R_gt, t_gt), axis=1)
                Rt_pr = np.concatenate((R_pr, t_pr), axis=1)
                proj_2d_gt = compute_projection(vertices, Rt_gt,
                                                internal_calibration)
                proj_2d_pred = compute_projection(vertices, Rt_pr,
                                                  internal_calibration)
                norm = np.linalg.norm(proj_2d_gt - proj_2d_pred, axis=0)
                pixel_dist = np.mean(norm)
                errs_2d.append(pixel_dist)

                # Compute 3D distances
                transform_3d_gt = compute_transformation(vertices, Rt_gt)
                transform_3d_pred = compute_transformation(vertices, Rt_pr)
                norm3d = np.linalg.norm(transform_3d_gt - transform_3d_pred,
                                        axis=0)
                vertex_dist = np.mean(norm3d)
                errs_3d.append(vertex_dist)

                # Sum errors
                testing_error_trans += trans_dist
                testing_error_angle += angle_dist
                testing_error_pixel += pixel_dist
                testing_samples += 1
                count = count + 1

        t5 = time.time()

    # Compute 2D projection error, 6D pose error, 5cm5degree error
    px_threshold = 5
    acc = len(np.where(
        np.array(errs_2d) <= px_threshold)[0]) * 100. / (len(errs_2d) + eps)
    acc5cm5deg = len(
        np.where((np.array(errs_trans) <= 0.05) &
                 (np.array(errs_angle) <= 5))[0]) * 100. / (len(errs_trans) + eps)
    acc3d10 = len(np.where(
        np.array(errs_3d) <= diam * 0.1)[0]) * 100. / (len(errs_3d) + eps)
    corner_acc = len(np.where(
        np.array(errs_corner2D) <= px_threshold)[0]) * 100. / (len(errs_corner2D) + eps)
    mean_err_2d = np.mean(errs_2d)
    mean_corner_err_2d = np.mean(errs_corner2D)
    # Guard the averages below against an empty evaluation.
    nts = max(float(testing_samples), eps)

    if testtime:
        # Timings refer to the last processed batch.
        print('-----------------------------------')
        print(' tensor to cuda : %f' % (t2 - t1))
        print(' predict : %f' % (t3 - t2))
        print('get_region_boxes : %f' % (t4 - t3))
        print(' eval : %f' % (t5 - t4))
        print(' total : %f' % (t5 - t1))
        print('-----------------------------------')

    # Print test statistics
    logging('Results of {}'.format(name))
    if notpredicted:
        logging(' Objects without any prediction: %d' % notpredicted)
    logging(' Acc using {} px 2D Projection = {:.2f}%'.format(
        px_threshold, acc))
    logging(' Acc using 10% threshold - {} vx 3D Transformation = {:.2f}%'.
            format(diam * 0.1, acc3d10))
    logging(' Acc using 5 cm 5 degree metric = {:.2f}%'.format(acc5cm5deg))
    logging(
        " Mean 2D pixel error is %f, Mean vertex error is %f, mean corner error is %f"
        % (mean_err_2d, np.mean(errs_3d), mean_corner_err_2d))
    logging(
        ' Translation error: %f m, angle error: %f degree, pixel error: % f pix'
        % (testing_error_trans / nts, testing_error_angle / nts,
           testing_error_pixel / nts))

    if save:
        predfile = backupdir + '/predictions_linemod_' + name + '.mat'
        scipy.io.savemat(
            predfile, {
                'R_gts': gts_rot,
                't_gts': gts_trans,
                'corner_gts': gts_corners2D,
                'R_prs': preds_rot,
                't_prs': preds_trans,
                'corner_prs': preds_corners2D
            })
class YOLO_detection:
    """ROS node wrapper: runs YOLO on incoming images and publishes results."""

    def __init__(self):
        """Create publishers, load the model, then subscribe to the camera."""
        self.boxes = BoundingBoxes()
        self.box = BoundingBox()
        self.image_pub = rospy.Publisher("YOLO_detect_result", Image, queue_size=1)
        self.boxes_pub = rospy.Publisher("YOLO_detect_result_boxes", BoundingBoxes, queue_size=1)
        self.bridge = CvBridge()

        self.batch_size = 1
        self.reso = 416
        self.confidence = 0.5
        self.nms_thesh = 0.4
        self.CUDA = torch.cuda.is_available()
        self.num_classes = 80
        self.colors = random_color()

        # Model files are configured through ROS parameters.
        self.classes = load_classes(rospy.get_param("yolo_classname"))
        self.cfg_file = rospy.get_param("yolo_cfg")
        self.weights_file = rospy.get_param("yolo_weight")

        self.model = Darknet(self.cfg_file)
        self.model.load_weights(self.weights_file)
        self.model.net_info["height"] = self.reso
        if self.CUDA:
            self.model.cuda()
        self.model.eval()

        self.send_by_UDP = False
        self.draw_res = True

        # Subscribe last: the callback uses the attributes set above and may
        # be invoked as soon as the subscription exists.
        self.image_sub = rospy.Subscriber("/wideangle/image_color", Image, self.callback)

    def transform_input(self, img):
        """Convert a BGR image into the network input via ``prep_image``."""
        return prep_image(img, self.reso)

    def yolo_detection(self, input):
        """Run the network on ``input`` and return the ``write_results`` output."""
        if self.CUDA:
            input = input.cuda()
        with torch.no_grad():
            prediction = self.model(Variable(input), self.CUDA)
        prediction = write_results(prediction, self.confidence,
                                   self.num_classes, nms_conf=self.nms_thesh)
        return prediction

    def write(self, output, img):
        """Draw every detection row of ``output`` onto ``img`` and return it."""
        for x in output:
            # cv2 expects plain Python ints for point coordinates.
            c1 = tuple(x[1:3].int().tolist())
            c2 = tuple(x[3:5].int().tolist())
            cls = int(x[-1])
            color = (0, 255, 0)
            label = "{0}".format(self.classes[cls])
            print(label)
            cv2.rectangle(img, c1, c2, color, 4)
            t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
            c2 = c1[0] + t_size[0] + 3, c1[1] + t_size[1] + 4
            cv2.rectangle(img, c1, c2, color, -1)
            cv2.putText(img, label, (c1[0], c1[1] + t_size[1] + 4),
                        cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
        return img

    def callback(self, data):
        """Detect objects in one image message and publish image and boxes.

        A ``CvBridgeError`` raised during conversion or publishing is printed
        and the frame is dropped.
        """
        startt = time.time()
        try:
            start_time = rospy.Time.now()
            start_time_second = start_time.to_sec()
            timeArray = time.localtime(start_time_second)
            timeArray_H_M_S = time.strftime("%H_%M_%S", timeArray)
            nano_seconds = str(
                int(start_time.to_nsec() - int(start_time_second) * 1e9)).zfill(9)
            timeArray_H_M_S_MS = timeArray_H_M_S + "_" + nano_seconds[:3]
            print(timeArray_H_M_S_MS)

            # YOLO detect
            cv_image = self.bridge.imgmsg_to_cv2(data, "bgr8")
            input_image = self.transform_input(cv_image)
            prediction = self.yolo_detection(input_image)

            # An int return value means there is nothing to rescale or draw.
            has_detections = not isinstance(prediction, int)

            # Published image defaults to the unmodified frame.
            result = cv_image
            if has_detections:
                # Map boxes from network-input coordinates back to the frame.
                # Image size should be the same as the size used for calibration.
                im_dim_list_list = [(cv_image.shape[1], cv_image.shape[0])]
                im_dim_list = torch.FloatTensor(im_dim_list_list).repeat(1, 2)
                if self.CUDA:
                    im_dim_list = im_dim_list.cuda()
                scaling_factor = torch.min(self.reso / im_dim_list, 1)[0].view(-1, 1)
                prediction[:, [1, 3]] -= (self.reso - scaling_factor * im_dim_list[:, 0]) / 2
                prediction[:, [2, 4]] -= (self.reso - scaling_factor * im_dim_list[:, 1]) / 2
                prediction[:, 1:5] /= scaling_factor
                prediction[:, [1, 3]] = torch.clamp(
                    prediction[:, [1, 3]], 0.0, im_dim_list_list[0][0])
                prediction[:, [2, 4]] = torch.clamp(
                    prediction[:, [2, 4]], 0.0, im_dim_list_list[0][1])

                if self.draw_res:
                    result = self.write(prediction, cv_image)

            boxes = BoundingBoxes()
            detec_len = len(prediction) if has_detections else 0
            for i in range(detec_len):
                box = BoundingBox()
                box.num = prediction[i][0]
                box.xmin = prediction[i][1]
                box.ymin = prediction[i][2]
                box.xmax = prediction[i][3]
                box.ymax = prediction[i][4]
                box.probability = prediction[i][6]
                box.id = "{0}".format(self.classes[int(prediction[i][7])])
                boxes.bounding_boxes.append(box)
            boxes.objNum = detec_len
            boxes.header.stamp = rospy.Time.now()

            self.image_pub.publish(self.bridge.cv2_to_imgmsg(result, "bgr8"))
            self.boxes_pub.publish(boxes)
        except CvBridgeError as e:
            print(e)
        print('yolo use:', time.time() - startt)
class YOLO3(object):
    """Callable YOLOv3 detector built on a Darknet network."""

    def __init__(self, cfgfile, weightfile, namesfile, use_cuda=True,
                 is_plot=False, is_xywh=False):
        """Load the network and class names.

        Args:
            cfgfile: Darknet configuration file.
            weightfile: Darknet weights file.
            namesfile: Text file with one class name per line.
            use_cuda: Run on ``"cuda"`` when true, otherwise on ``"cpu"``.
            is_plot: When true, ``__call__`` returns the annotated image.
            is_xywh: When true, boxes are returned as (x, y, w, h) instead of
                (xmin, ymin, xmax, ymax).
        """
        # net definition
        self.net = Darknet(cfgfile)
        self.net.load_weights(weightfile)
        print('Loading weights from %s... Done!' % (weightfile))
        self.device = "cuda" if use_cuda else "cpu"
        self.net.eval()
        self.net.to(self.device)

        # constants
        self.size = self.net.width, self.net.height
        self.conf_thresh = 0.5
        self.nms_thresh = 0.4
        self.use_cuda = use_cuda
        self.is_plot = is_plot
        self.is_xywh = is_xywh
        self.class_names = self.load_class_names(namesfile)

    def __call__(self, ori_img):
        """Detect objects in ``ori_img``.

        Returns:
            The annotated image when ``is_plot`` is set; ``(None, None, None)``
            when nothing was detected; otherwise ``(bbox, cls_conf, cls_ids)``
            with ``bbox`` scaled to the size of ``ori_img``.
        """
        # img to tensor
        assert isinstance(ori_img, np.ndarray), "input must be a numpy array!"
        # ``np.float`` was only an alias of the builtin float (float64) and
        # has been removed from NumPy; name the dtype explicitly.
        img = ori_img.astype(np.float64) / 255.
        img = cv2.resize(img, self.size)
        img = torch.from_numpy(img).float().permute(2, 0, 1).unsqueeze(0)

        # forward
        with torch.no_grad():
            img = img.to(self.device)
            out_boxes = self.net(img)
            boxes = get_all_boxes(out_boxes, self.conf_thresh,
                                  self.net.num_classes, self.use_cuda)[0]
            boxes = nms(boxes, self.nms_thresh)

        # plot boxes
        if self.is_plot:
            return self.plot_bbox(ori_img, boxes)
        if len(boxes) == 0:
            return None, None, None

        height, width = ori_img.shape[:2]
        boxes = np.vstack(boxes)
        bbox = np.empty_like(boxes[:, :4])
        if self.is_xywh:
            # bbox x y w h
            bbox[:, 0] = boxes[:, 0] * width
            bbox[:, 1] = boxes[:, 1] * height
            bbox[:, 2] = boxes[:, 2] * width
            bbox[:, 3] = boxes[:, 3] * height
        else:
            # bbox xmin ymin xmax ymax
            bbox[:, 0] = (boxes[:, 0] - boxes[:, 2] / 2.0) * width
            bbox[:, 1] = (boxes[:, 1] - boxes[:, 3] / 2.0) * height
            bbox[:, 2] = (boxes[:, 0] + boxes[:, 2] / 2.0) * width
            bbox[:, 3] = (boxes[:, 1] + boxes[:, 3] / 2.0) * height
        cls_conf = boxes[:, 5]
        cls_ids = boxes[:, 6]
        return bbox, cls_conf, cls_ids

    def load_class_names(self, namesfile):
        """Return the stripped lines of ``namesfile`` as a list."""
        with open(namesfile, 'r', encoding='utf8') as fp:
            class_names = [line.strip() for line in fp]
        return class_names

    def plot_bbox(self, ori_img, boxes):
        """Draw ``boxes`` with their class names on ``ori_img`` and return it."""
        img = ori_img
        height, width = img.shape[:2]
        for box in boxes:
            # corner points of the box in pixels
            x1 = int(round(((box[0] - box[2] / 2.0) * width).item()))
            y1 = int(round(((box[1] - box[3] / 2.0) * height).item()))
            x2 = int(round(((box[0] + box[2] / 2.0) * width).item()))
            y2 = int(round(((box[1] + box[3] / 2.0) * height).item()))
            cls_id = box[6]
            color = [int(x) for x in np.random.randint(256, size=3)]
            # put texts and rectangles
            img = cv2.putText(img, self.class_names[cls_id], (x1, y1),
                              cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2)
            img = cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)
        return img
class YOLOv3(object):
    """Callable YOLOv3 detector that returns NumPy boxes, scores and class ids."""

    def __init__(self, cfgfile, weightfile, namesfile, score_thresh=0.7,
                 conf_thresh=0.01, nms_thresh=0.45, is_xywh=False,
                 use_cuda=True):
        """Load the network and class names.

        Args:
            cfgfile: Darknet configuration file.
            weightfile: Darknet weights file.
            namesfile: Text file with one class name per line.
            score_thresh: Detections whose second-to-last column is not above
                this value are dropped after post-processing.
            conf_thresh: Threshold passed to ``get_all_boxes`` and
                ``post_process``.
            nms_thresh: NMS threshold passed to ``post_process``.
            is_xywh: When true, boxes are converted with ``xyxy_to_xywh``.
            use_cuda: Run on ``"cuda"`` when true, otherwise on ``"cpu"``.
        """
        # net definition
        self.net = Darknet(cfgfile)
        self.net.load_weights(weightfile)
        logger = logging.getLogger("root.detector")
        logger.info('Loading weights from %s... Done!' % (weightfile))
        self.device = "cuda" if use_cuda else "cpu"
        self.net.eval()
        self.net.to(self.device)

        # constants
        self.size = self.net.width, self.net.height
        self.score_thresh = score_thresh
        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh
        self.use_cuda = use_cuda
        self.is_xywh = is_xywh
        self.num_classes = self.net.num_classes
        self.class_names = self.load_class_names(namesfile)

    def __call__(self, ori_img):
        """Detect objects in ``ori_img``.

        Returns:
            ``(bbox, cls_conf, cls_ids)`` as NumPy arrays; ``bbox`` has shape
            (0, 4) and the other two are empty when nothing passes the score
            threshold.
        """
        # img to tensor
        assert isinstance(ori_img, np.ndarray), "input must be a numpy array!"
        # ``np.float`` was only an alias of the builtin float (float64) and
        # has been removed from NumPy; name the dtype explicitly.
        img = ori_img.astype(np.float64) / 255.
        img = cv2.resize(img, self.size)
        img = torch.from_numpy(img).float().permute(2, 0, 1).unsqueeze(0)

        # forward
        with torch.no_grad():
            img = img.to(self.device)
            out_boxes = self.net(img)
            boxes = get_all_boxes(out_boxes, self.conf_thresh,
                                  self.num_classes,
                                  use_cuda=self.use_cuda)  # batch size is 1
            boxes = post_process(boxes, self.net.num_classes,
                                 self.conf_thresh, self.nms_thresh)[0].cpu()
            boxes = boxes[boxes[:, -2] > self.score_thresh, :]

        if len(boxes) == 0:
            bbox = torch.FloatTensor([]).reshape([0, 4])
            cls_conf = torch.FloatTensor([])
            cls_ids = torch.LongTensor([])
        else:
            height, width = ori_img.shape[:2]
            # bbox xmin ymin xmax ymax
            bbox = boxes[:, :4]
            if self.is_xywh:
                # bbox x y w h
                bbox = xyxy_to_xywh(bbox)
            # Scale the normalised coordinates to original-image pixels.
            bbox = bbox * torch.FloatTensor([[width, height, width, height]])
            cls_conf = boxes[:, 5]
            cls_ids = boxes[:, 6].long()
        return bbox.numpy(), cls_conf.numpy(), cls_ids.numpy()

    def load_class_names(self, namesfile):
        """Return the stripped lines of ``namesfile`` as a list."""
        with open(namesfile, 'r', encoding='utf8') as fp:
            class_names = [line.strip() for line in fp]
        return class_names
def valid(datacfg, modelcfg, weightfile):
    """Run the 6D pose network on one test image and show the projected boxes.

    The third entry of ``<dataDir>/test.txt`` selects the image. Every
    prediction except the first is turned into a pose with ``pnp`` and the 3D
    bounding box is projected and drawn onto the image.

    Args:
        datacfg: Data config; ``dataDir``, ``mesh``, ``name``, ``rgbfileType``,
            ``fx``, ``fy``, ``u0``, ``v0`` and ``gpus`` are read.
        modelcfg: Network configuration file passed to ``Darknet``.
        weightfile: Weights file loaded into the network.
    """
    # Parameters
    options = read_data_cfg(datacfg)
    dataDir = options['dataDir']
    meshname = options['mesh']
    name = options['name']
    filetype = options['rgbfileType']
    fx = float(options['fx'])
    fy = float(options['fy'])
    u0 = float(options['u0'])
    v0 = float(options['v0'])

    seed = int(time.time())
    gpus = options['gpus']
    img_width = 640
    img_height = 480
    torch.manual_seed(seed)
    use_cuda = True
    if use_cuda:
        os.environ['CUDA_VISIBLE_DEVICES'] = gpus
        torch.cuda.manual_seed(seed)
    visualize = True
    num_classes = 1
    conf_thresh = 0.5

    # Read object model information, get 3D bounding box corners
    mesh = MeshPly(meshname)
    vertices = np.c_[np.array(mesh.vertices),
                     np.ones((len(mesh.vertices), 1))].transpose()
    corners3D = get_3D_corners(vertices)

    # 3D points handed to PnP: the origin followed by the box corners.
    points3D = np.array(
        np.transpose(
            np.concatenate((np.zeros((3, 1)), corners3D[:3, :]), axis=1)),
        dtype='float32')

    # Read intrinsic camera parameters
    internal_calibration = get_camera_intrinsic(u0, v0, fx, fy)
    camera_matrix = np.array(internal_calibration, dtype='float32')

    # Specify model, load pretrained weights, pass to GPU and set the module
    # in evaluation mode
    model = Darknet(modelcfg)
    model.load_weights(weightfile)
    model.cuda()
    model.eval()

    # apply transformation on the input images
    transform = transforms.Compose([
        transforms.ToTensor(),
    ])

    # read still images as per the test set
    with open(os.path.join(dataDir, 'test.txt'), 'r') as file:
        lines = file.readlines()
    imgindex = lines[2].rstrip()
    imgpath = os.path.join(dataDir, 'rgb', str(imgindex) + filetype)

    # read image for visualization
    img = cv2.imread(imgpath)

    # read images using PIL
    img_ = Image.open(imgpath).convert('RGB')
    img_ = img_.resize((img_width, img_height))

    # transform into Tensor
    img_ = transform(img_)
    data = Variable(img_).cuda().unsqueeze(0)

    # Forward pass; inference only, so no autograd graph is needed.
    with torch.no_grad():
        output = model(data).data

    # Using confidence threshold, eliminate low-confidence predictions
    all_boxes = get_region_boxes2(output, conf_thresh, num_classes)

    # For each image, get all the predictions
    allBoxes = []
    boxes = all_boxes[0]
    print(len(boxes) - 1, 'onigiri(s) found')

    # The first box is skipped on purpose (original author's note: not sure
    # why it is incorrect).
    for box_pr in boxes[1:]:
        # Denormalize the corner predictions
        corners2D_pr = np.array(np.reshape(box_pr[:18], [9, 2]), dtype='float32')
        corners2D_pr[:, 0] = corners2D_pr[:, 0] * img_width
        corners2D_pr[:, 1] = corners2D_pr[:, 1] * img_height

        # Compute [R|t] by PnP
        R_pr, t_pr = pnp(points3D, corners2D_pr, camera_matrix)
        Rt_pr = np.concatenate((R_pr, t_pr), axis=1)
        proj_corners_pr = np.transpose(
            compute_projection(corners3D, Rt_pr, internal_calibration))
        allBoxes.append(proj_corners_pr)

    # Visualize
    if visualize:
        # Pairs of corner indices joined by a line.
        edges = ((0, 1), (0, 2), (0, 4), (1, 3), (1, 5), (2, 3),
                 (2, 6), (3, 7), (4, 5), (4, 6), (5, 7), (6, 7))
        color = (0, 0, 255)
        linewidth = 2

        # Projections
        for corner in allBoxes:
            # cv2.line needs integer pixel coordinates.
            pts = np.rint(corner).astype(int)
            for a, b in edges:
                pt_a = (int(pts[a][0]), int(pts[a][1]))
                pt_b = (int(pts[b][0]), int(pts[b][1]))
                img = cv2.line(img, pt_a, pt_b, color, linewidth)

        cv2.imshow('yolo6d pose', img)
        key = cv2.waitKey(10000) & 0xFF
        if key == 27:
            print('stopping, keyboard interrupt')
            sys.exit()
class Patch():
    """Adversarial texture patch optimised against a YOLO detector.

    A patch tensor is written into selected faces of a mesh texture atlas,
    the mesh is rendered from several azimuth angles, composited onto
    background images and fed to the detector.  ``attack`` optimises the
    patch; ``test_patch`` and ``test_patch_faster_rcnn`` evaluate it.

    Attributes set in ``__init__``:
        config: configuration object (mesh/background dirs, sizes, epochs...).
        device: torch device used for the renderer, patch and detector.
        renderer: pytorch3d ``MeshRenderer`` built by ``create_renderer``.
        num_angles: number of camera angles (set by ``create_renderer``).
        patch / idx: patch values and the atlas face indices they are
            written to, both loaded from ``data/*_save_2.pt``.
        dnet: Darknet detector in eval mode.
    """

    def __init__(self, config, device):
        self.config = config
        self.device = device

        # Create pytorch3D renderer (this also sets self.num_angles)
        self.renderer = self.create_renderer()

        # Datasets
        self.mesh_dataset = MeshDataset(config.mesh_dir, device)
        self.bg_dataset = BackgroundDataset(config.bg_dir,
                                            config.img_size,
                                            max_num=config.num_bgs)
        self.test_bg_dataset = BackgroundDataset(config.test_bg_dir,
                                                 config.img_size,
                                                 max_num=config.num_test_bgs)

        # Initialize adversarial patch, and TV loss.
        # The patch and its face indices are restored from a previous run
        # rather than randomly initialised.
        self.total_variation = TotalVariation().to(device)
        self.patch = torch.load("data/patch_save_2.pt").to(device)
        self.idx = torch.load("data/idx_save_2.pt").to(device)

        # Yolo model:
        self.dnet = Darknet(self.config.cfgfile)
        self.dnet.load_weights(self.config.weightfile)
        self.dnet = self.dnet.eval()
        self.dnet = self.dnet.to(self.device)

    def attack(self):
        """Optimise ``self.patch`` with SGD against the YOLO detector.

        For every epoch, every mesh and every background batch the patch is
        pasted into the texture atlas, the mesh is rendered with a random
        translation, and the detection loss plus a floored total-variation
        term is minimised.  After each epoch the patch is evaluated with
        ``test_patch`` and, every 10th epoch, with ``test_patch_faster_rcnn``.
        """
        train_bgs = DataLoader(self.bg_dataset,
                               batch_size=self.config.batch_size,
                               shuffle=True,
                               num_workers=1)
        print(self.patch.shape)

        optimizer = torch.optim.SGD([self.patch], lr=1.0, momentum=0.9)
        # Lower bound applied to the TV term; constant, so built once.
        tv_floor = torch.tensor(0.1).to(self.device)

        for epoch in range(self.config.epochs):
            ep_loss = 0.0
            ep_acc = 0.0
            n = 0.0

            for mesh in self.mesh_dataset:
                # Copy mesh for each camera angle
                mesh = mesh.extend(self.num_angles)

                for bg_batch in train_bgs:
                    bg_batch = bg_batch.to(self.device)
                    optimizer.zero_grad()

                    # Apply patch to mesh texture.
                    # The return value is unused; the call is kept because
                    # the next line reads `_atlas_padded` (presumably
                    # populated by this call -- TODO confirm).
                    mesh.textures.atlas_padded()
                    mesh.textures._atlas_padded[0, self.idx, :, :, :] = self.patch
                    mesh.textures.atlas = mesh.textures._atlas_padded
                    mesh.textures._atlas_list = None

                    # Render mesh onto background images at a random offset
                    rand_translation = torch.randint(-100, 100, (2, ))
                    images = self.render_mesh_on_bg_batch(
                        mesh,
                        bg_batch,
                        x_translation=rand_translation[0].item(),
                        y_translation=rand_translation[1].item())

                    # NHWC -> NCHW, RGB channels only
                    reshape_img = images[:, :, :, :3].permute(0, 3, 1, 2)
                    reshape_img = reshape_img.to(self.device)

                    # Run detection model on images
                    output = self.dnet(reshape_img)

                    # Compute losses:
                    d_loss = dis_loss(output, self.dnet.num_classes,
                                      self.dnet.anchors,
                                      self.dnet.num_anchors, 0)
                    acc_loss = calc_acc(output, self.dnet.num_classes,
                                        self.dnet.num_anchors, 0)

                    tv = self.total_variation(self.patch)
                    tv_loss = tv * 2.5

                    loss = d_loss + torch.sum(torch.max(tv_loss, tv_floor))

                    ep_loss += loss.item()
                    ep_acc += acc_loss.item()
                    n += bg_batch.shape[0]

                    # TODO: Remove Retain Graph
                    loss.backward(retain_graph=True)
                    optimizer.step()

            # Print performance statistics
            print('epoch={} loss={} success_rate={}'.format(
                epoch, (ep_loss / n), (ep_acc / n) / self.num_angles))

            self.test_patch()
            # TODO: Pass the variable value
            if epoch % 10 == 0:
                self.test_patch_faster_rcnn(
                    path_to_checkpoint="faster_rcnn/model-180000.pth",
                    dataset_name="coco2017",
                    backbone_name="resnet101",
                    prob_thresh=0.6)

    def test_patch(self):
        """Evaluate the current patch on the held-out backgrounds with YOLO.

        Prints the mean per-angle accumulated ``calc_acc`` value divided by
        the number of test backgrounds.  Runs under ``torch.no_grad()`` since
        no gradients are needed for evaluation.
        """
        angle_success = torch.zeros(self.num_angles)
        total_loss = 0.0  # accumulated but not reported
        tv_floor = torch.tensor(0.1).to(self.device)

        with torch.no_grad():
            for mesh in self.mesh_dataset:
                mesh = mesh.extend(self.num_angles)
                for bg in self.test_bg_dataset:
                    # See attack(): call kept for its effect on the texture
                    # object (presumably fills `_atlas_padded`).
                    mesh.textures.atlas_padded()
                    mesh.textures._atlas_padded[0, self.idx, :, :, :] = self.patch
                    mesh.textures.atlas = mesh.textures._atlas_padded
                    mesh.textures._atlas_list = None

                    rand_translation = torch.randint(-100, 100, (2, ))
                    images = self.render_mesh_on_bg(
                        mesh,
                        bg,
                        x_translation=rand_translation[0].item(),
                        y_translation=rand_translation[1].item())

                    reshape_img = images[:, :, :, :3].permute(0, 3, 1, 2)
                    reshape_img = reshape_img.to(self.device)
                    output = self.dnet(reshape_img)

                    d_loss = dis_loss(output, self.dnet.num_classes,
                                      self.dnet.anchors,
                                      self.dnet.num_anchors, 0)

                    for angle in range(self.num_angles):
                        acc_loss = calc_acc(output[angle],
                                            self.dnet.num_classes,
                                            self.dnet.num_anchors, 0)
                        angle_success[angle] += acc_loss.item()

                    tv = self.total_variation(self.patch)
                    tv_loss = tv * 2.5
                    loss = d_loss + torch.sum(torch.max(tv_loss, tv_floor))
                    total_loss += loss.item()

        unseen_success_rate = angle_success.mean() / len(self.test_bg_dataset)
        print('Unseen bg success rate: ', unseen_success_rate.item())

    def test_patch_faster_rcnn(self, path_to_checkpoint: str,
                               dataset_name: str, backbone_name: str,
                               prob_thresh: float):
        """Run a Faster R-CNN model on renders of the patched mesh.

        Args:
            path_to_checkpoint: checkpoint file passed to ``model.load``.
            dataset_name: name resolved through ``DatasetBase.from_name``.
            backbone_name: name resolved through ``BackboneBase.from_name``.
            prob_thresh: detections with probability <= this are dropped.

        Detections are drawn on each render; the angle-0 render of every
        background is saved under ``out/images``.
        """
        # TODO: Make it for general model (even though this might be difficult)
        dataset_class = DatasetBase.from_name(dataset_name)
        backbone = BackboneBase.from_name(backbone_name)(pretrained=False)
        model = FasterRCNN(
            backbone,
            dataset_class.num_classes(),
            pooler_mode=Config.POOLER_MODE,
            anchor_ratios=Config.ANCHOR_RATIOS,
            anchor_sizes=Config.ANCHOR_SIZES,
            rpn_pre_nms_top_n=Config.RPN_PRE_NMS_TOP_N,
            rpn_post_nms_top_n=Config.RPN_POST_NMS_TOP_N).cuda()
        model.load(path_to_checkpoint)

        # NOTE(review): angle_success is never incremented below, so the
        # printed success rate is always 0 until a success criterion is added.
        angle_success = torch.zeros(self.num_angles)

        with torch.no_grad():
            for mesh in self.mesh_dataset:
                mesh = mesh.extend(self.num_angles)
                # NOTE(review): n restarts at 0 for every mesh, so saved
                # images of later meshes overwrite earlier ones.
                n = 0.0
                for bg in self.test_bg_dataset:
                    mesh.textures.atlas_padded()
                    mesh.textures._atlas_padded[0, self.idx, :, :, :] = self.patch
                    mesh.textures.atlas = mesh.textures._atlas_padded
                    mesh.textures._atlas_list = None

                    images = self.render_mesh_on_bg(mesh, bg)
                    reshape_img = images[:, :, :, :3].permute(0, 3, 1, 2)

                    for angle in range(self.num_angles):
                        # Round-trip through a PNG so the model sees exactly
                        # what its own preprocessing pipeline would load.
                        save_image(reshape_img[angle].cpu().detach(),
                                   "out/tmp.png")
                        image = T.transforms.Image.open("out/tmp.png")
                        image_tensor, scale = dataset_class.preprocess(
                            image, Config.IMAGE_MIN_SIDE,
                            Config.IMAGE_MAX_SIDE)

                        detection_bboxes, detection_classes, detection_probs, _ = \
                            model.eval().forward(
                                image_tensor.unsqueeze(dim=0).cuda())
                        # Map boxes back to the un-scaled image
                        detection_bboxes /= scale

                        kept_indices = detection_probs > prob_thresh
                        detection_bboxes = detection_bboxes[kept_indices]
                        detection_classes = detection_classes[kept_indices]
                        detection_probs = detection_probs[kept_indices]

                        draw = ImageDraw.Draw(image)
                        for bbox, cls, prob in zip(detection_bboxes.tolist(),
                                                   detection_classes.tolist(),
                                                   detection_probs.tolist()):
                            color = random.choice([
                                'red', 'green', 'blue', 'yellow', 'purple',
                                'white'
                            ])
                            bbox = BBox(left=bbox[0],
                                        top=bbox[1],
                                        right=bbox[2],
                                        bottom=bbox[3])
                            category = dataset_class.LABEL_TO_CATEGORY_DICT[cls]
                            draw.rectangle(((bbox.left, bbox.top),
                                            (bbox.right, bbox.bottom)),
                                           outline=color)
                            draw.text((bbox.left, bbox.top),
                                      text=f'{category:s} {prob:.3f}',
                                      fill=color)

                        if angle == 0:
                            image.save("out/images/test_%d.png" % n)
                        # TODO: angle_success[angle] += success

                    save_image(reshape_img[0].cpu().detach(),
                               "rendered_output.png")
                    n += 1.0

        unseen_success_rate = angle_success.mean() / len(self.test_bg_dataset)
        print('Unseen model (faster_rcnn) success rate: ',
              unseen_success_rate.item())

    def create_renderer(self):
        """Build the pytorch3d renderer and set ``self.num_angles``.

        Cameras are spread over ``[-angle_range, angle_range]`` azimuth with
        a fixed translation; a single point light and hard Phong shading are
        used.

        Returns:
            The configured ``MeshRenderer``.
        """
        self.num_angles = self.config.num_angles
        azim = torch.linspace(-1 * self.config.angle_range,
                              self.config.angle_range, self.num_angles)

        R, T = look_at_view_transform(dist=1.0, elev=0, azim=azim)

        # Override the camera translation (y and z) for all views
        T[:, 1] = -85
        T[:, 2] = 200

        cameras = FoVPerspectiveCameras(device=self.device, R=R, T=T)

        raster_settings = RasterizationSettings(
            image_size=self.config.img_size,
            blur_radius=0.0,
            faces_per_pixel=1,
        )

        lights = PointLights(device=self.device, location=[[0.0, 85, 100.0]])

        renderer = MeshRenderer(rasterizer=MeshRasterizer(
            cameras=cameras, raster_settings=raster_settings),
                                shader=HardPhongShader(device=self.device,
                                                       cameras=cameras,
                                                       lights=lights))

        return renderer

    def render_mesh_on_bg(self,
                          mesh,
                          bg_img,
                          location=None,
                          x_translation=0,
                          y_translation=0):
        """Render ``mesh`` and composite it onto a single background image.

        Args:
            mesh: mesh batch passed to the renderer (one entry per angle).
            bg_img: background indexed as (channel, height, width).
            location: optional 4-tuple of paddings handed to ``F.pad``
                (left, right, top, bottom); computed to centre the render
                when omitted.
            x_translation: horizontal shift applied to the centred position.
            y_translation: vertical shift applied to the centred position.

        Returns:
            CPU tensor (num_angles, H, W, 3): background wherever the padded
            render channel equals exactly 1 (or lies in the padding), the
            render elsewhere.
        """
        images = self.renderer(mesh)
        bg = bg_img.unsqueeze(0)
        bg_shape = bg.shape

        # Channels-first background -> channels-last (H, W, 3) on the CPU
        new_bg = torch.zeros(bg_shape[2], bg_shape[3], 3)
        new_bg[:, :, 0] = bg[0, 0, :, :]
        new_bg[:, :, 1] = bg[0, 1, :, :]
        new_bg[:, :, 2] = bg[0, 2, :, :]

        human = images[:, ..., :3]
        human_size = self.renderer.rasterizer.raster_settings.image_size

        if location is None:
            dH = bg_shape[2] - human_size
            dW = bg_shape[3] - human_size
            location = (dW // 2 + x_translation,
                        dW - (dW // 2) - x_translation,
                        dH // 2 + y_translation,
                        dH - (dH // 2) - y_translation)

        # Mask: 0 where the render value is exactly 1, 1 elsewhere
        contour = torch.where((human == 1).cpu(),
                              torch.zeros(1).cpu(),
                              torch.ones(1).cpu())

        # Pad mask and render channel by channel to the background size
        new_contour = torch.zeros(self.num_angles, bg_shape[2], bg_shape[3], 3)
        new_human = torch.zeros(self.num_angles, bg_shape[2], bg_shape[3], 3)
        for channel in range(3):
            new_contour[:, :, :, channel] = F.pad(contour[:, :, :, channel],
                                                  location,
                                                  "constant",
                                                  value=0)
            new_human[:, :, :, channel] = F.pad(human[:, :, :, channel],
                                                location,
                                                "constant",
                                                value=0)

        final = torch.where((new_contour == 0).cpu(), new_bg.cpu(),
                            new_human.cpu())
        return final

    def render_mesh_on_bg_batch(self,
                                mesh,
                                bg_imgs,
                                location=None,
                                x_translation=0,
                                y_translation=0):
        """Render ``mesh`` and composite it onto a batch of backgrounds.

        Args:
            mesh: mesh batch passed to the renderer (one entry per angle).
            bg_imgs: backgrounds indexed as (num_bgs, channel, H, W).
            location: optional ``F.pad`` paddings (left, right, top, bottom);
                computed to centre the render when omitted.
            x_translation: horizontal shift applied to the centred position.
            y_translation: vertical shift applied to the centred position.

        Returns:
            CPU tensor (num_angles * num_bgs, H, W, 3).  Also writes the
            first render to ``rendered_output_here.png`` as a debug aid.
        """
        num_bgs = bg_imgs.shape[0]

        images = self.renderer(mesh)  # (num_angles, 416, 416, 4)
        save_image(images[0, ..., :3].cpu().detach().permute(2, 0, 1),
                   "rendered_output_here.png")
        # (num_angles * num_bgs, 416, 416, 4)
        images = torch.cat(num_bgs * [images], dim=0)

        bg_shape = bg_imgs.shape

        # bg_imgs: (num_bgs, 3, 416, 416) -> (num_bgs, 416, 416, 3)
        bg_imgs = bg_imgs.permute(0, 2, 3, 1)

        # bg_imgs: (num_bgs, 416, 416, 3) -> (num_bgs * num_angles, 416, 416, 3)
        bg_imgs = bg_imgs.repeat_interleave(repeats=self.num_angles, dim=0)

        # human: RGB channels of render (num_angles * num_bgs, 416, 416, 3)
        human = images[:, ..., :3]
        human_size = self.renderer.rasterizer.raster_settings.image_size

        if location is None:
            dH = bg_shape[2] - human_size
            dW = bg_shape[3] - human_size
            location = (dW // 2 + x_translation,
                        dW - (dW // 2) - x_translation,
                        dH // 2 + y_translation,
                        dH - (dH // 2) - y_translation)

        # Mask: 0 where the render value is exactly 1, 1 elsewhere
        contour = torch.where((human == 1),
                              torch.zeros(1).to(self.device),
                              torch.ones(1).to(self.device))

        new_contour = torch.zeros(self.num_angles * num_bgs,
                                  bg_shape[2],
                                  bg_shape[3],
                                  3,
                                  device=self.device)
        new_human = torch.zeros(self.num_angles * num_bgs,
                                bg_shape[2],
                                bg_shape[3],
                                3,
                                device=self.device)
        for channel in range(3):
            new_contour[:, :, :, channel] = F.pad(contour[:, :, :, channel],
                                                  location,
                                                  "constant",
                                                  value=0)
            new_human[:, :, :, channel] = F.pad(human[:, :, :, channel],
                                                location,
                                                "constant",
                                                value=0)

        # output: (num_angles * num_bgs, 416, 416, 3)
        final = torch.where((new_contour == 0).cpu(), bg_imgs.cpu(),
                            new_human.cpu())
        return final
def main(args):
    """Run the live webcam demo: YOLOv3 detection + clothing-attribute captions.

    Frames are read from camera 0, detections are produced by YOLOv3,
    detections covering <= 5% of the frame are dropped, RoI features of the
    remaining boxes are fed to the attribute encoder, and the resulting
    attribute sentence is drawn on the frame and in a separate text window.
    Press ``q`` to quit.

    Args:
        args: namespace providing ``cfg_file``, ``weights_file``, ``reso``,
            ``embed_size``, ``roi_size``, ``encoder_path``, ``confidence``
            and ``nms_thresh``.

    Relies on names defined elsewhere in the module (``device``,
    ``attribute_pool``, ``coco_classes``, ``colors``, ``write``, ``wait``,
    ``prep_image2``, ``write_results``).
    """
    num_classes = 80
    yolov3 = Darknet(args.cfg_file)
    yolov3.load_weights(args.weights_file)
    yolov3.net_info["height"] = args.reso
    inp_dim = int(yolov3.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32
    print("yolo-v3 network successfully loaded")

    attribute_size = [15, 7, 3, 5, 8, 4, 15, 7, 3, 5, 3, 3, 4]
    encoder = EncoderClothing(args.embed_size, device, args.roi_size,
                              attribute_size)

    yolov3.to(device)
    encoder.to(device)
    yolov3.eval()
    encoder.eval()
    encoder.load_state_dict(torch.load(args.encoder_path))

    # Built once: it does not depend on the frame.
    roi_align = RoIAlign(args.roi_size, args.roi_size,
                         transform_fpcoor=True).to(device)

    cap = cv2.VideoCapture(0)
    assert cap.isOpened(), "Cannot capture source"

    try:
        # Warm-up on the first frame and lay out the two windows.
        ret, frame = cap.read()
        if ret:
            image, orig_img, dim = prep_image2(frame, inp_dim)
            image_tensor = image.to(device)
            yolov3(image_tensor, device, True)

            os.system("clear")
            cv2.imshow("frame", orig_img)
            cv2.moveWindow("frame", 50, 50)
            text_img = np.zeros((200, 1750, 3))
            cv2.imshow("text", text_img)
            cv2.moveWindow("text", 50, dim[1] + 110)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            image, orig_img, dim = prep_image2(frame, inp_dim)
            # original image dimension --> im_dim
            im_dim = torch.FloatTensor(dim).repeat(1, 2)
            image_tensor = image.to(device)
            im_dim = im_dim.to(device)

            # prediction mode for yolo-v3
            detections = yolov3(image_tensor, device, True)
            detections = write_results(
                detections,
                args.confidence,
                device,
                num_classes,
                nms=True,
                nms_conf=args.nms_thresh,
            )

            text_img = np.zeros((200, 1750, 3))

            # write_results yields an int when there is nothing to report
            if not isinstance(detections, int) and detections.shape[0]:
                # Boxes in network-input coordinates, used for RoI pooling
                bboxs = detections[:, 1:5].clone()

                # Undo letterboxing: map boxes back to the original frame
                im_dim = im_dim.repeat(detections.shape[0], 1)
                scaling_factor = torch.min(inp_dim / im_dim, 1)[0].view(-1, 1)
                detections[:, [1, 3]] -= (
                    inp_dim - scaling_factor * im_dim[:, 0].view(-1, 1)) / 2
                detections[:, [2, 4]] -= (
                    inp_dim - scaling_factor * im_dim[:, 1].view(-1, 1)) / 2
                detections[:, 1:5] /= scaling_factor

                small_object_ratio = torch.FloatTensor(detections.shape[0])
                for i in range(detections.shape[0]):
                    detections[i, [1, 3]] = torch.clamp(
                        detections[i, [1, 3]], 0.0, im_dim[i, 0])
                    detections[i, [2, 4]] = torch.clamp(
                        detections[i, [2, 4]], 0.0, im_dim[i, 1])

                    object_area = (detections[i, 3] - detections[i, 1]) * (
                        detections[i, 4] - detections[i, 2])
                    orig_img_area = im_dim[i, 0] * im_dim[i, 1]
                    small_object_ratio[i] = object_area / orig_img_area

                # Drop small objects.  The RoI boxes must be filtered with
                # the same mask, otherwise they no longer line up with
                # `detections` (wrong captions / out-of-range batch index).
                keep = small_object_ratio > 0.05
                detections = detections[keep]
                im_dim = im_dim[keep]
                bboxs = bboxs[keep]

                if detections.size(0) > 0:
                    feature = yolov3.get_feature()
                    feature = feature.repeat(detections.size(0), 1, 1, 1)

                    # Scale boxes from input pixels to feature-map cells
                    scaling_val = 16
                    bboxs /= scaling_val
                    bboxs = bboxs.round()
                    bboxs_index = torch.arange(bboxs.size(0), dtype=torch.int)
                    bboxs_index = bboxs_index.to(device)
                    bboxs = bboxs.to(device)

                    roi_features = roi_align(feature, bboxs, bboxs_index)
                    outputs = encoder(roi_features)

                    for i in range(detections.shape[0]):
                        # One word per attribute head: the arg-max class
                        sampled_caption = []
                        for j in range(len(outputs)):
                            max_index = torch.max(outputs[j][i].data, 0)[1]
                            word = attribute_pool[j][max_index]
                            sampled_caption.append(word)

                        sentence = " ".join(sampled_caption)

                        sys.stdout.write(" " + "\r")
                        sys.stdout.write(sentence + " " + "\r")
                        sys.stdout.flush()

                        write(
                            detections[i],
                            orig_img,
                            sentence,
                            i + 1,
                            coco_classes,
                            colors,
                        )
                        cv2.putText(
                            text_img,
                            sentence,
                            (0, i * 40 + 35),
                            cv2.FONT_HERSHEY_PLAIN,
                            2,
                            [255, 255, 255],
                            1,
                        )

            cv2.imshow("frame", orig_img)
            cv2.imshow("text", text_img)

            key = cv2.waitKey(1) & 0xFF
            if key == ord("q"):
                break
            if key == ord("w"):
                # NOTE(review): `wait` is not defined in this block;
                # presumably a module-level pause helper -- confirm.
                wait(0)
    finally:
        # Always give the camera and the windows back, even on errors.
        cap.release()
        cv2.destroyAllWindows()
class Car_DC():
    """Vehicle detection (Darknet) followed by attribute classification.

    Every ``.jpg`` in ``src_dir`` is run through the detector; each detected
    vehicle is cropped, classified (colour / direction / type) and drawn on
    the image, which is then written to ``dst_dir``.
    """

    def __init__(self,
                 src_dir,
                 dst_dir,
                 car_cfg_path=local_car_cfg_path,
                 car_det_weights_path=local_car_det_weights_path,
                 inp_dim=768,
                 prob_th=0.2,
                 nms_th=0.4,
                 num_classes=1):
        """Initialise the detector, the classifier and the list of images.

        Args:
            src_dir: directory scanned for ``.jpg`` input images.
            dst_dir: output directory; existing ``.jpg`` files in it are
                deleted, and it is created when missing.
            car_cfg_path: Darknet cfg file of the vehicle detector.
            car_det_weights_path: weights file of the vehicle detector.
            inp_dim: network input size.
            prob_th: detection confidence threshold.
            nms_th: NMS threshold.
            num_classes: number of detector classes.
        """
        # super parameters
        self.inp_dim = inp_dim
        self.prob_th = prob_th
        self.nms_th = nms_th
        self.num_classes = num_classes
        self.dst_dir = dst_dir

        # clear dst_dir
        if os.path.exists(self.dst_dir):
            for x in os.listdir(self.dst_dir):
                if x.endswith('.jpg'):
                    os.remove(os.path.join(self.dst_dir, x))
        else:
            os.makedirs(self.dst_dir)

        # initialize vehicle detection model
        self.detector = Darknet(car_cfg_path)
        self.detector.load_weights(car_det_weights_path)
        # set input dimension of image
        self.detector.net_info['height'] = self.inp_dim
        self.detector.to(device)
        self.detector.eval()  # evaluation mode
        print('=> car detection model initiated.')

        # initiate multilabel classifier
        self.classifier = Car_Classifier(num_cls=19,
                                         model_path=local_model_path)

        # initiate imgs_path
        self.imgs_path = [
            os.path.join(src_dir, x) for x in os.listdir(src_dir)
            if x.endswith('.jpg')
        ]

    def cls_draw_bbox(self, output, orig_img):
        """Classify every detected vehicle and draw box + label in place.

        Args:
            output: detections; columns 1:3 are read as the top-left corner
                and 3:5 as the bottom-right corner of each box.
            orig_img: BGR image array that is cropped and drawn on in place.

        Works in two passes on purpose: all crops are classified before
        anything is drawn, so no crop contains another box's drawing.
        """
        labels = []
        pt_1s = []
        pt_2s = []

        # 1: predict vehicle attributes from each box
        for det in output:
            # rectangle points as plain Python ints (what cv2 expects)
            pt_1 = tuple(int(v) for v in det[1:3])  # the left-up point
            pt_2 = tuple(int(v) for v in det[3:5])  # the right down point
            pt_1s.append(pt_1)
            pt_2s.append(pt_2)

            # turn BGR back to RGB
            ROI = Image.fromarray(
                orig_img[pt_1[1]:pt_2[1], pt_1[0]:pt_2[0]][:, :, ::-1])

            # call classifier to predict
            car_color, car_direction, car_type = self.classifier.predict(ROI)
            label = str(car_color + ' ' + car_direction + ' ' + car_type)
            labels.append(label)
            print('=> predicted label: ', label)

        # 2: draw boxes and labels
        color = (0, 215, 255)
        for pt_1, pt_2, label in zip(pt_1s, pt_2s, labels):
            # draw bounding box
            cv2.rectangle(orig_img, pt_1, pt_2, color, thickness=2)

            # Size the text background from THIS box's label (not from the
            # last label produced by the classification pass).
            txt_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 2, 2)[0]
            txt_corner = (pt_1[0] + txt_size[0] + 3,
                          pt_1[1] - txt_size[1] - 5)

            # draw text background rect (filled)
            cv2.rectangle(orig_img, pt_1, txt_corner, color, thickness=-1)

            # draw text
            cv2.putText(orig_img, label, (pt_1[0], pt_1[1]),
                        cv2.FONT_HERSHEY_PLAIN, 2, [225, 255, 255], 2)

    def process_predict(self, prediction, prob_th, num_cls, nms_th, inp_dim,
                        orig_img_size):
        """Post-process raw predictions and map boxes to the original image.

        Args:
            prediction: raw detector output.
            prob_th: confidence threshold passed to ``post_process``.
            num_cls: number of classes passed to ``post_process``.
            nms_th: NMS threshold passed to ``post_process``.
            inp_dim: network input size.
            orig_img_size: (width, height) of the original image.

        Returns:
            Whatever ``post_process`` returns; when that is not an ``int``,
            columns 1:5 are un-letterboxed, rescaled and clamped to the
            original image size.
        """
        scaling_factor = min([inp_dim / float(x)
                              for x in orig_img_size])  # W, H scaling factor
        output = post_process(prediction,
                              prob_th,
                              num_cls,
                              nms=True,
                              nms_conf=nms_th,
                              CUDA=True)  # post-process such as nms

        if not isinstance(output, int):
            # remove the letterbox padding, then undo the resize
            output[:, [1, 3]] -= (inp_dim -
                                  scaling_factor * orig_img_size[0]) / 2.0
            output[:, [2, 4]] -= (inp_dim -
                                  scaling_factor * orig_img_size[1]) / 2.0
            output[:, 1:5] /= scaling_factor

            # clamp all boxes to the image at once
            output[:, [1, 3]] = torch.clamp(output[:, [1, 3]], 0.0,
                                            orig_img_size[0])
            output[:, [2, 4]] = torch.clamp(output[:, [2, 4]], 0.0,
                                            orig_img_size[1])
        return output

    def detect_classify(self):
        """Detect and classify vehicles in every source image.

        The annotated image is written to ``dst_dir`` under the source file
        name unless a file with that name already exists there.
        """
        for x in self.imgs_path:
            # read image data
            img = Image.open(x)
            img2det = process_img(img, self.inp_dim)
            img2det = img2det.to(device)  # put image data to device

            # vehicle detection
            prediction = self.detector.forward(img2det, CUDA=True)

            # calculating scaling factor
            orig_img_size = list(img.size)
            output = self.process_predict(prediction, self.prob_th,
                                          self.num_classes, self.nms_th,
                                          self.inp_dim, orig_img_size)

            orig_img = cv2.cvtColor(np.asarray(img),
                                    cv2.COLOR_RGB2BGR)  # RGB => BGR
            if not isinstance(output, int):
                self.cls_draw_bbox(output, orig_img)

            dst_path = os.path.join(self.dst_dir, os.path.basename(x))
            if not os.path.exists(dst_path):
                cv2.imwrite(dst_path, orig_img)
# start Flask application app = Flask(__name__) CORS(app) if METHOD is 'yolo_608_coco': MODEL = Darknet(YOLOV3_608_CFG_PATH) elif METHOD is 'yolo_416_coco': MODEL = Darknet(YOLOV3_416_CFG_PATH) else: raise Exception(f'Undefined method: "{METHOD}"') MODEL.load_weights(YOLOV3_WEIGHTS_PATH) MODEL.eval() assert os.path.exists( PROJECT_PATH ), f'{PROJECT_PATH} does not exist. Consider to git clone the repo.' # if there is no folder for archiving, create if not os.path.exists(ARCHIVE_PATH): os.makedirs(ARCHIVE_PATH) def show_image_w_bboxes_for_server(img_path, model, orientation): ''' Reads an image from the disk and applies a detection algorithm specified in model. Arguments
def selection(x, rec, privacy, detected_obj):
    """Record a detection when its label is in the privacy list.

    Args:
        x: detection indexed as ``(top_left, bottom_right, label)``; the two
            corners are indexed as ``(x, y)`` and their elements expose
            ``.item()``.
        rec: list that receives ``[up, left, height, width]`` entries.
        privacy: collection of labels to select.
        detected_obj: list that receives the selected labels.

    Returns:
        The (possibly extended) ``rec`` and ``detected_obj``.
    """
    label = x[2]
    # Guard clause: labels outside the privacy list are left untouched.
    if label not in privacy:
        return rec, detected_obj

    print('[DETECT] {}'.format(label))
    detected_obj.append(label)

    top_left, bottom_right = x[0], x[1]
    up = top_left[1].item()
    left = top_left[0].item()
    height = (bottom_right[1] - up).item()
    width = (bottom_right[0] - left).item()
    rec.append([up, left, height, width])

    return rec, detected_obj


if __name__ == '__main__':
    # Demo: run the detector on a sample image and print the rectangles.
    from utils.util import load_classes, write_results
    from darknet import Darknet
    from utils.preprocess import prep_image, inp_to_image

    conf = 0.5
    nms = 0.4
    rec = []
    image = cv2.imread('imgs/dog.jpg')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    detecter = Darknet('cfgs/yolov3.cfg')
    detecter.load_weights('weights/yolov3.weights')
    detecter.to(device)
    detecter.eval()

    rec = yolo_detecter(image, detecter, conf, nms, rec, device)
    print(rec)
# Build the network from the cfg file and load the trained weights.
print("Loading network.....")
model = Darknet(args.cfgfile)
model.load_weights(args.weightsfile)
print("Network successfully loaded")

# Input resolution: must be a multiple of 32 and larger than 32.
model.net_info["height"] = args.reso
inp_dim = int(model.net_info["height"])
assert inp_dim % 32 == 0
assert inp_dim > 32

# If there's a GPU available, put the model on GPU
if CUDA:
    model.cuda()

# Set the model in evaluation mode
model.eval()

read_dir = time.time()
# Detection phase
# `images` may name a directory (every entry is listed) or a single file
# (os.listdir raises NotADirectoryError and the path itself is used).
try:
    imlist = [
        osp.join(osp.realpath('.'), images, img) for img in os.listdir(images)
    ]
except NotADirectoryError:
    imlist = []
    imlist.append(osp.join(osp.realpath('.'), images))
except FileNotFoundError:
    print("No file or directory with the name {}".format(images))
    exit()

if not os.path.exists(args.det):
def test(datacfg, cfgfile, weightfile, imgfile):
    """Estimate and display the 6D pose of one object in a single image.

    The image is resized to 544x544 and run through the network; the box
    with the highest confidence (index 18) is selected, its nine predicted
    2D points are scaled to the original image size, the pose [R|t] is
    computed with PnP, and the projected bounding cube is drawn in a window.

    Args:
        datacfg: data configuration file; its ``mesh`` entry is read.
        cfgfile: Darknet network configuration file.
        weightfile: trained weights file.
        imgfile: path of the image to process.

    Returns:
        None.  Prints the pose and timing; returns early with a message when
        the network yields no box above the confidence threshold.
    """
    # ---------------- parameters ----------------
    options = read_data_cfg(datacfg)
    meshname = options['mesh']

    seed = int(time.time())
    gpus = '0'  # define gpus to use
    test_width = 544  # define test image size
    test_height = 544
    torch.manual_seed(seed)  # seed torch random
    use_cuda = True
    if use_cuda:
        os.environ['CUDA_VISIBLE_DEVICES'] = gpus
        torch.cuda.manual_seed(seed)  # seed cuda random
    conf_thresh = 0.1
    num_classes = 1

    # Read object 3D model, get 3D Bounding box corners
    mesh = MeshPly(meshname)
    vertices = np.c_[np.array(mesh.vertices),
                     np.ones((len(mesh.vertices), 1))].transpose()
    corners3D = get_3D_corners(vertices)

    # now configure camera intrinsics
    internal_calibration = get_camera_intrinsic()

    # ---------------- network ----------------
    model = Darknet(cfgfile)
    model.print_network()
    model.load_weights(weightfile)
    model.cuda()
    model.eval()

    # ---------------- input image ----------------
    # convert to RGB, resize, transform to Tensor, move to the GPU
    img = Image.open(imgfile).convert('RGB')
    ori_size = img.size  # store original size (width, height)
    img = img.resize((test_width, test_height))
    t1 = time.time()
    img = transforms.Compose([
        transforms.ToTensor(),
    ])(img)
    img = img.unsqueeze(0)  # add a fake batch dimension
    img = img.cuda()

    # ---------------- forward pass ----------------
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(img)
    t2 = time.time()

    # ---------------- extract predictions ----------------
    # keep only boxes over the confidence threshold
    all_boxes = get_region_boxes(output, conf_thresh, num_classes)
    boxes = all_boxes[0]
    if len(boxes) == 0:
        # Without this guard the code below fails on an undefined box.
        print("No object detected above confidence {}".format(conf_thresh))
        return

    # the confidence is in index = 18; pick the most confident box
    box_pr = max(boxes, key=lambda box: box[18])

    # Denormalize the corner predictions: these are the predicted 2D points
    # with which a bounding cube can be drawn.
    corners2D_pr = np.array(np.reshape(box_pr[:18], [9, 2]), dtype='float32')
    corners2D_pr[:, 0] = corners2D_pr[:, 0] * ori_size[0]  # Width
    corners2D_pr[:, 1] = corners2D_pr[:, 1] * ori_size[1]  # Height
    t3 = time.time()

    # ---------------- pose estimation ----------------
    # The 2D projections of the cube are known; PnP recovers the rotation
    # matrix (orientation) and translation vector (position) of the object
    # with respect to the camera frame.
    R_pr, t_pr = pnp(
        np.array(np.transpose(
            np.concatenate((np.zeros((3, 1)), corners3D[:3, :]), axis=1)),
                 dtype='float32'), corners2D_pr,
        np.array(internal_calibration, dtype='float32'))
    t4 = time.time()

    # ---------------- display ----------------
    # Reload original img and create a window to display it
    img = cv2.imread(imgfile)
    wname = "Prediction"
    cv2.namedWindow(wname)

    # OpenCV drawing functions need integer pixel coordinates.
    points = [(int(x), int(y)) for x, y in corners2D_pr]

    # draw each predicted 2D point with its index
    for i, (x, y) in enumerate(points):
        col1 = 28 * i
        col2 = 255 - (28 * i)
        col3 = np.random.randint(0, 256)
        cv2.circle(img, (x, y), 3, (col1, col2, col3), -1)
        cv2.putText(img, str(i), (x + 5, y + 5), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, (col1, col2, col3), 1)

    # Cube edges as index pairs into `points` (index 0 is the centroid).
    cube_edges = (
        (1, 2), (2, 4), (4, 3), (3, 1),  # front face
        (5, 6), (7, 8), (6, 8), (5, 7),  # back face
        (2, 6), (1, 5),                  # right face
        (3, 7), (4, 8),                  # left face
    )
    line_point = 3
    for start, end in cube_edges:
        cv2.line(img, points[start], points[end], (0, 255, 0), line_point)

    # Show the image, tell the user how to continue, then wait for a key
    cv2.imshow(wname, img)
    print("Press any key to close.")
    cv2.waitKey()
    cv2.destroyWindow(wname)

    print("Rotation: {}".format(R_pr))
    print("Translation: {}".format(t_pr))
    print(" Predict time: {}".format(t2 - t1))
    print(" 2D Points extraction time: {}".format(t3 - t2))
    print(" Pose calculation time: {}:".format(t4 - t3))
    print(" Total time: {}".format(t4 - t1))