def main():
    # webcam demo: run YOLOv3 on every captured frame and draw the detections
    detector = pyyolo.YOLO("./models/yolov3.cfg",
                           "./models/yolov3.weights",
                           "./models/coco.data",
                           detection_threshold=0.5,
                           hier_threshold=0.5,
                           nms_threshold=0.45)

    cap = cv2.VideoCapture(0)
    while True:
        ret, frame = cap.read()
        if not ret:  # stop if the camera did not return a frame
            break
        dets = detector.detect(frame, rgb=False)  # OpenCV frames are BGR
        for i, det in enumerate(dets):
            print(f'Detection: {i}, {det}')
            xmin, ymin, xmax, ymax = det.to_xyxy()
            cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 0, 255))
        cv2.imshow('cvwindow', frame)
        if cv2.waitKey(1) == 27:  # ESC quits
            break
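# A minimal entry point for the webcam demo above, with window cleanup. This guard is not part
# of the original snippet and assumes cv2 and pyyolo are imported at module level:
if __name__ == '__main__':
    try:
        main()
    finally:
        cv2.destroyAllWindows()  # close the preview window even if main() exits early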
def __init__(self):
    '''Initialize ROS publisher and ROS subscriber.'''
    # topic where we publish
    self.image_pub = rospy.Publisher("/output/image_raw/compressed",
                                     CompressedImage, queue_size=1)
    # self.bridge = CvBridge()

    # subscribed topic
    self.subscriber = rospy.Subscriber("/camera/image/compressed",
                                       CompressedImage, self.callback,
                                       queue_size=1)

    # the attributes below were added later; 'self.' is used as with the other attributes

    # initializing YOLO detector
    self.detector = pyyolo.YOLO("/opt/darknet/cfg/yolov3.cfg",
                                "/opt/darknet/cfg/yolov3.weights",
                                "/opt/darknet/cfg/coco.data",
                                detection_threshold=0.5,
                                hier_threshold=0.5,
                                nms_threshold=0.45)

    # initializing OpenPose
    self.opWrapper = op.WrapperPython()
    params = dict()
    params["model_folder"] = "/opt/openpose/models"
    self.opWrapper.configure(params)
    self.opWrapper.start()

    # loading the classifier
    self.clf = joblib.load(sys.argv[1])

    if VERBOSE:
        print("subscribed to /camera/image/compressed")


def strToBool(string):
    # converts the strings 'True'/'False' to the corresponding booleans
    if string == 'True':
        return True
    elif string == 'False':
        return False


def paintImage(quadrant, img):
    # highlights one of the 16 quadrants of *img* with a semi-transparent purple overlay
    overlay = img.copy()  # safe copy of the image to apply alpha (only if requested)
    h, w = img.shape[:2]  # gets the image shape (height, width)
    h = int(h)            # converts parameters to integers
    w = int(w)

    # The x and y coordinates are compared against references taken from the original image
    # (w/4, w/2, 3w/4 and h/4, h/2, 3h/4). The image is divided into 16 rectangles: first into
    # four quadrants, and each of these divided into four again:
    #
    #  ____________________________  0
    # |  0  |  1  |  4  |  5  |
    # |-----|-----|-----|-----|      h/4
    # |  2  |  3  |  6  |  7  |
    # |_____|_____|_____|_____|      h/2
    # |  8  |  9  | 12  | 13  |
    # |-----|-----|-----|-----|      (3h)/4
    # | 10  | 11  | 14  | 15  |
    # |_____|_____|_____|_____|      h
    # 0    w/4   w/2  3w/4    w

    # corners of each quadrant, expressed in quarters of the image width/height:
    # quadrant index -> (x0, y0, x1, y1)
    corners = {
        0:  (0, 0, 1, 1),   1: (1, 0, 2, 1),   4: (2, 0, 3, 1),   5: (3, 0, 4, 1),
        2:  (0, 1, 1, 2),   3: (1, 1, 2, 2),   6: (2, 1, 3, 2),   7: (3, 1, 4, 2),
        8:  (0, 2, 1, 3),   9: (1, 2, 2, 3),  12: (2, 2, 3, 3),  13: (3, 2, 4, 3),
        10: (0, 3, 1, 4),  11: (1, 3, 2, 4),  14: (2, 3, 3, 4),  15: (3, 3, 4, 4),
    }
    x0, y0, x1, y1 = corners[quadrant]
    cv2.rectangle(overlay,
                  (int(x0 * w / 4), int(y0 * h / 4)),
                  (int(x1 * w / 4), int(y1 * h / 4)),
                  (229, 88, 191), thickness=-1)

    new = cv2.addWeighted(overlay, 0.4, img, 1 - 0.4, 0, img)  # purple overlay
    return new  # returns a new picture with the chosen quadrant highlighted


def short_long(entity):
    # short_long() classifies the distance of the interaction based on the kind of object
    # (if you hold it = short)
    distance = 'null'
    if entity in ['tv', 'pottedplant', 'vase', 'cat', 'bowl', 'clock', 'toilet',
                  'chair', 'bench', 'couch']:
        distance = 'long'
    if entity in ['dining table', 'refrigerator', 'microwave', 'sink', 'apple', 'banana',
                  'laptop', 'keyboard', 'mouse', 'knife', 'fork', 'backpack', 'oven', 'toaster']:
        distance = 'short'
    return distance
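# The dataset-generation code further down calls coords2Quadrante((x, y), frame, paint)[0] to map
# a pixel to one of the 16 quadrants drawn by paintImage(). That helper is defined elsewhere in
# the project; the sketch below is only an assumed reconstruction, consistent with the quadrant
# layout above (the real signature and return value may differ):
def coords2Quadrante(point, img, paint=False):
    # ASSUMED reconstruction: map a pixel (x, y) to one of the 16 quadrants used by paintImage()
    x, y = point
    h, w = img.shape[:2]
    col = min(int(x / (w / 4)), 3)  # 0..3, left to right
    row = min(int(y / (h / 4)), 3)  # 0..3, top to bottom
    # convert (row, col) to the quadrant numbering of paintImage():
    # big quadrants 0-3 / 4-7 / 8-11 / 12-15, each split into four sub-quadrants
    quadrant = 8 * (row // 2) + 4 * (col // 2) + 2 * (row % 2) + (col % 2)
    if paint:
        img = paintImage(quadrant, img)
    return quadrant, img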
def main():
    # configure and start the RealSense pipeline (depth + color at 1280x720, 30 fps)
    pipeline = rs.pipeline()
    config = rs.config()
    config.enable_stream(rs.stream.depth, 1280, 720, rs.format.z16, 30)
    config.enable_stream(rs.stream.color, 1280, 720, rs.format.bgr8, 30)
    pipeline.start(config)

    profile = pipeline.get_active_profile()
    depth_profile = rs.video_stream_profile(profile.get_stream(rs.stream.depth))
    intr = depth_profile.get_intrinsics()

    detector = pyyolo.YOLO("./models/" + MODEL + ".cfg",
                           "./models/" + MODEL + ".weights",
                           "./models/" + DATA + ".data",
                           detection_threshold=0.5,
                           hier_threshold=0.5,
                           nms_threshold=0.45)

    while True:
        # Get the RealSense frame first so we can guarantee we have one
        frames = pipeline.wait_for_frames()  # frames are 1280 width x 720 height
        # get_distance(x: int, y: int) -> float
        depth_frame = frames.get_depth_frame()
        color_frame = frames.get_color_frame()
        if not depth_frame or not color_frame:
            continue

        # Convert images to numpy arrays
        depth_image = np.asanyarray(depth_frame.get_data())
        color_image = np.asanyarray(color_frame.get_data())

        dets = detector.detect(color_image, rgb=False)
        for i, det in enumerate(dets):
            if det.name != TARGET_OBJECT:
                continue
            '''
            TODOs here:
            Select the target object based on how many frames it shows up in during one
            second with high enough confidence.
            From there, take the last frame it was found in and calculate depth based on
            that frame.
            '''
            xmin, ymin, xmax, ymax = det.to_xyxy()
            cv2.rectangle(color_image, (xmin, ymin), (xmax, ymax), (0, 0, 255))
            cv2.putText(color_image, det.name + "," + str(det.prob),
                        (xmin, ymin - 10), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (107, 168, 50), 1)

            # probe the depth map outwards from the bounding-box center (the center pixel may
            # have no depth reading), stepping 10 px at a time in the four cardinal directions
            found = False
            xcenter = int((xmin + xmax) / 2)
            ycenter = int((ymin + ymax) / 2)
            checkNorth = checkSouth = ycenter
            checkEast = checkWest = xcenter
            float_distance = 0
            while (checkNorth >= ymin and checkEast <= xmax
                   and checkSouth <= ymax and checkWest >= xmin):
                float_distance = depth_frame.get_distance(xcenter, checkNorth)
                if float_distance != 0:
                    found = True
                    break
                checkNorth -= 10

                float_distance = depth_frame.get_distance(checkEast, ycenter)
                if float_distance != 0:
                    found = True
                    break
                checkEast += 10

                float_distance = depth_frame.get_distance(xcenter, checkSouth)
                if float_distance != 0:
                    found = True
                    break
                checkSouth += 10

                float_distance = depth_frame.get_distance(checkWest, ycenter)
                if float_distance != 0:
                    found = True
                    break
                checkWest -= 10

            cv2.circle(color_image, (xcenter, ycenter), 10, (87, 134, 255), 3)
            cv2.putText(color_image,
                        (str(float_distance) + "m") if found else "Not Available",
                        (xcenter - 20, ycenter - 20), cv2.FONT_HERSHEY_SIMPLEX, 1,
                        (87, 134, 255), 1)

            # deproject the pixel + distance into a 3D point in the camera (body) frame
            point3D = rs.rs2_deproject_pixel_to_point(intr, [xcenter, ycenter],
                                                      float_distance)
            print("Body Frame: " + str(point3D))
            print("Inertial Frame: " + str(bodyToInertialFrame(point3D)))

        cv2.imshow("color_image preview", color_image)
        if cv2.waitKey(1) == 27:  # ESC quits
            break

    pipeline.stop()
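# The TODO inside the detection loop above (accept the target only after it has appeared in
# enough frames within one second with high enough confidence, then use its last detection for
# the depth estimate) is not implemented in this snippet. Below is a minimal sketch of one way
# to do it; the names and thresholds (MIN_CONFIDENCE, MIN_HITS, FPS, update_target) are
# illustrative assumptions, not part of the original code:
from collections import deque

MIN_CONFIDENCE = 0.6   # assumed confidence threshold
MIN_HITS = 15          # assumed minimum detections within the 1-second window
FPS = 30               # stream is configured at 30 fps above

hit_window = deque(maxlen=FPS)   # 1 = target seen in that frame, 0 = not seen
last_target_det = None           # last detection of the target, used for the depth estimate

def update_target(dets):
    """Update the sliding window and return the last detection once the target is stable enough."""
    global last_target_det
    hits = [d for d in dets if d.name == TARGET_OBJECT and d.prob >= MIN_CONFIDENCE]
    hit_window.append(1 if hits else 0)
    if hits:
        last_target_det = hits[-1]
    # only accept the target after enough high-confidence hits in the last second
    return last_target_det if sum(hit_window) >= MIN_HITS else None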
def main(video, VFOA, visualFeedback):
    # Main function: processes a video *video* with a ground-truth visual focus of attention
    # *VFOA*; *visualFeedback* lets the user choose whether to see a visual representation
    # (X11 - Xming used).

    # create the output dataset folder (../dataset<today>) if it does not exist yet
    if not os.path.exists(
            os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)),
                         'dataset' + str(date.today()))):
        os.makedirs(
            os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)),
                         'dataset' + str(date.today())))

    videoname = video.rsplit('/', 1)[-1][:-4]

    # coordinates of the head ((x,y)*5 keypoints) and hands ((x,y)*2 keypoints),
    # context (16 quadrants * 2 conditions (short or long)) and quadrant (1) = 47
    resultingVector = [0] * 47

    # initializing YOLO detector
    detector = pyyolo.YOLO("/opt/darknet/cfg/yolov3.cfg",
                           "/opt/darknet/cfg/yolov3.weights",
                           "/opt/darknet/cfg/coco.data",
                           detection_threshold=0.5,
                           hier_threshold=0.5,
                           nms_threshold=0.45)

    # initializing OpenPose
    opWrapper = op.WrapperPython()
    params = dict()
    params["model_folder"] = "/opt/openpose/models"
    opWrapper.configure(params)
    opWrapper.start()

    # inputting the video to OpenCV
    cap = cv2.VideoCapture(video)

    # initializing the frame counter and calculating the total number of frames
    j = 0
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # used frames
    usedFrames = 0

    # buffer - only one frame in every *buffer* is used, to make the data more varied
    buffer = 4

    # cycling through the video
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # resizing the frame
        frame = cv2.resize(frame, (640, 352))
        if j == 1:
            h, w = frame.shape[:2]

        # in order to have a higher variance in the data, only 1 out of every *buffer* frames is accepted
        if j % buffer != 0:
            j += 1
        else:
            # initializing the VFOA (visual focus of attention) coordinates; note that (-1, -1)
            # is an impossible VFOA, so it will trigger an error if needed
            xVFOA, yVFOA = -1, -1

            # initializing a variable that counts how many short/long range objects are in each quadrant
            objectDistance = [[0] * 16, [0] * 16]

            # approximation: if there are several objects of the same type as the VFOA, the biggest
            # one (closest to the camera) is assumed to be the right one
            maxDiagonal = -1

            print('Frame: ' + str(j) + '/' + str(total_frames))  # sense of progress (how many frames left)

            # executing the YOLO detector
            dets = detector.detect(frame)

            # executing OpenPose
            datum = op.Datum()
            imageToProcess = frame
            datum.cvInputData = imageToProcess
            opWrapper.emplaceAndPop([datum])
            frame = datum.cvOutputData

            # people counter
            if np.cumsum(datum.poseKeypoints)[0] == 255:
                # if there are no people, OpenPose outputs an array containing only 255
                peopleCounter = 0
            else:
                # otherwise, count how many people there are; if there were none, this len()
                # would raise an error - this way it won't, and we can still count the people
                peopleCounter = len(datum.poseKeypoints)

            # only interested in scenes with exactly 1 subject; ignore all others
            if peopleCounter != 1:
                print('There are ' + str(peopleCounter) +
                      ' people in this frame, thus it will be ignored.')
            else:
                keypoints = datum.poseKeypoints[0]  # list of keypoints

                # YOLO detector, one iteration per detected object
                for i, det in enumerate(dets):
                    print(f'Detection: {i}, {det}')
                    xmin, ymin, xmax, ymax = det.to_xyxy()
                    x, y = int((xmin + xmax) / 2), int((ymin + ymax) / 2)  # center of the object
                    quadrant = coords2Quadrante((x, y), frame, False)[0]  # detect which quadrant it is in
                    distance = short_long(det.name)  # classify the type of interaction

                    # counting the number of objects in each quadrant
                    if distance == 'short':
                        objectDistance[0][quadrant] += 1
                    elif distance == 'long':
                        objectDistance[1][quadrant] += 1

                    # visual aid
                    if visualFeedback:
                        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 0, 255))
                        cv2.putText(frame, str(det.name), (x, y),
                                    cv2.FONT_HERSHEY_COMPLEX, 0.6,
                                    (255, 255, 0))  # name over the object
                        cv2.putText(frame, str(j) + '/' + str(total_frames), (20, 20),
                                    cv2.FONT_HERSHEY_COMPLEX, 0.4,
                                    (255, 255, 0))  # frame number

                    # if there is more than one object of the VFOA kind, the right one is assumed to be
                    # the one closest to the camera and, therefore, the one with the biggest diagonal
                    if det.name == VFOA:
                        thisDiagonal = np.sqrt((xmax - xmin)**2 + (ymax - ymin)**2)
                        if thisDiagonal > maxDiagonal:
                            xVFOA, yVFOA = x, y
                            maxDiagonal = thisDiagonal

                # if the targeted object was not found, xVFOA and yVFOA kept their original values (-1 and -1)
                if xVFOA + yVFOA == -2:
                    print('The requested target (' + VFOA +
                          ') was not found on this frame, thus it will be ignored.')
                else:
                    # the VFOA was found, proceed
                    if visualFeedback:
                        cv2.arrowedLine(frame,
                                        (int(keypoints[0][0]), int(keypoints[0][1])),
                                        (int(xVFOA), int(yVFOA)), (0, 255, 221),
                                        thickness=1)

                    quadrantVFOA = coords2Quadrante((xVFOA, yVFOA), frame, True)[0]  # quadrant of the VFOA

                    # building the result, starting with the keypoints - see
                    # https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/doc/output.md
                    # - Pose Output Format (BODY_25)
                    resultingVector = [keypoints[0][0], keypoints[0][1],
                                       keypoints[1][0], keypoints[1][1],
                                       keypoints[17][0], keypoints[17][1],
                                       keypoints[18][0], keypoints[18][1],
                                       keypoints[15][0], keypoints[15][1],
                                       keypoints[16][0], keypoints[16][1],
                                       keypoints[4][0], keypoints[4][1],
                                       keypoints[7][0], keypoints[7][1]]
                    # nose, neck, right ear, left ear, right eye, left eye, right hand, left hand

                    # increasing the strangeness of undetected points so that it's more evident
                    # for the SVM to understand
                    resultingVector = [-1 if x == 0 else x for x in resultingVector]

                    # counting the keypoints that weren't detected; if more than one eye coordinate is
                    # missing, or at least 3 keypoints (3*(x,y) = 6 values) overall, the data is discarded
                    if (resultingVector[8:12].count(-1) > 1) or resultingVector.count(-1) >= 6:
                        # this tolerance value can be changed
                        print('There were a total of ' +
                              str(int(resultingVector.count(-1)) / 2) +
                              ' keypoints missing. Thus, this frame will be ignored.')
                    else:
                        # adding the context and the quadrant to the vector
                        resultingVector += objectDistance[0] + objectDistance[1] + [
                            str(date.today()) + '_' + videoname + '_' + sys.argv[2] + '_' + str(j)
                        ] + [quadrantVFOA]

                        # outputting to a text file; note that the chosen name is extremely tailored,
                        # manipulating expected inputs - in particular, videos found in the
                        # ~/source_videos/ folder
                        f = open(
                            os.path.join(
                                os.path.abspath(os.path.join(os.getcwd(), os.pardir)),
                                'dataset' + str(date.today()),
                                str(date.today()) + '_' + videoname + '_' + sys.argv[2] +
                                '_' + str(j) + '.txt'), "w+")
                        f.writelines(str(resultingVector))
                        f.close()

                        if visualFeedback:
                            cv2.imwrite(
                                os.path.join(
                                    os.path.abspath(os.path.join(os.getcwd(), os.pardir)),
                                    'dataset' + str(date.today()),
                                    str(date.today()) + '_' + videoname + '_' + sys.argv[2] +
                                    '_' + str(j) + '.png'), frame)

                        usedFrames += 1
                        print('This frame has generated data successfully. The ' + VFOA +
                              ' can be found in quadrant ' + str(quadrantVFOA) + '.')

            if visualFeedback:
                cv2.imshow('cvwindow', frame)  # showing the frame
            if visualFeedback:
                cv2.waitKey(10)  # waiting - this value can be decreased to shorten generation times

            print('\n')  # visual shell organization
            j += 1  # next frame

        if cv2.waitKey(1) == 27:  # ESC aborts processing
            break

    print('The images were analysed in a: ' + str(w) + 'x' + str(h) + ' resolution.')
    print('A total of ' + str(usedFrames) + ' were used, out of ' + str(total_frames) +
          ' frames. (' + str(usedFrames / (total_frames / buffer))[0:4] + '%)')

    cap.release()
    cv2.destroyAllWindows()
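# Each sample above is written as str(resultingVector) into its own .txt file, with the sample
# identifier as the second-to-last element and the VFOA quadrant as the last one. Below is a
# minimal sketch of how such a file could be parsed back into a feature vector and a label for
# training (e.g. the SVM loaded via joblib in the ROS node above); the helper name and parsing
# details are assumptions, not part of the original code:
from ast import literal_eval

def load_sample(path):
    """Read one generated .txt sample and split it into features, identifier and quadrant label."""
    with open(path) as f:
        vector = literal_eval(f.read())          # the file holds the Python repr of a list
    features = [float(v) for v in vector[:-2]]   # 16 keypoint coords + 32 context counts
    identifier = vector[-2]                      # '<date>_<videoname>_<argv2>_<frame>'
    quadrant = int(vector[-1])                   # VFOA quadrant (0-15), used as the class label
    return features, identifier, quadrant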