def frameMABO(dname, redo=False):
    """Compute and print per-class ABO and the MABO (Mean Average Best Overlap)
    of the per-frame tubelet detections for dataset *dname*.

    args:
        - dname: dataset name (example: 'JHMDB')
        - redo: recompute even if a cached frameMABO.pkl exists

    The per-class best-overlap lists are cached in
    ../results/ACT-detector/<dname>/frameMABO.pkl.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameMABO.pkl")
    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            BO = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        BO = {l: [] for l in d.labels}  # best overlap per class
        for v in vlist:
            gt = d.gttubes(v)
            h, w = d.resolution(v)
            # load per-frame detections (4 box coordinates per row)
            vdets = {i: np.empty((0, 4), dtype=np.float32) for i in range(1, 1 + d.nframes(v))}
            # load results for each chunk; each chunk covers K consecutive frames
            # BUGFIX: xrange -> range for Python 3 compatibility
            for i in range(1, 1 + d.nframes(v) - K + 1):
                resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')
                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()
                with open(resname, 'rb') as fid:
                    dets, _ = pickle.load(fid)
                for k in range(K):
                    # columns 2+4k..5+4k hold the box for the k-th frame of the tubelet
                    vdets[i + k] = np.concatenate((vdets[i + k], dets[:, 2 + 4 * k:6 + 4 * k]), axis=0)
            # for each frame, record the best IoU against every gt tube covering it
            for i in range(1, 1 + d.nframes(v)):
                for ilabel in gt:
                    label = d.labels[ilabel]
                    for t in gt[ilabel]:
                        # the gt tube does not cover frame i
                        if not i in t[:, 0]:
                            continue
                        gtbox = t[t[:, 0] == i, 1:5]  # box of gt tube at frame i
                        if vdets[i].size == 0:  # we missed it
                            BO[label].append(0)
                            continue
                        ious = iou2d(vdets[i], gtbox)
                        BO[label].append(np.max(ious))
        # save file
        with open(eval_file, 'wb') as fid:
            pickle.dump(BO, fid)
    # print MABO results
    ABO = {la: 100 * np.mean(np.array(BO[la])) for la in d.labels}  # average best overlap
    for la in d.labels:
        print("{:20s} {:6.2f}".format(la, ABO[la]))
    # BUGFIX: on Python 3 dict.values() is a view; np.array() of it is a 0-d
    # object array and np.mean fails — materialize as a list first
    print("{:20s} {:6.2f}".format("MABO", np.mean(np.array(list(ABO.values())))))
def load_frame_detections(d, vlist, dirname, nms):
    """Load the extracted tubelets for the videos in *vlist*, aggregate them into
    per-frame detections and apply per-class 2D NMS in every frame.

    args:
        - d: dataset object, or dataset name (resolved via GetDataset)
        - vlist: list of video identifiers
        - dirname: directory containing the per-frame tubelet pickle files
        - nms: IoU threshold for the per-frame per-class NMS

    returns a numpy array whose rows are
    <video_index> <frame_index> <ilabel> <score> <x1> <y1> <x2> <y2>.
    """
    if isinstance(d, str):
        d = GetDataset(d)
    alldets = []  # list of per-frame detection arrays (see docstring for columns)
    for iv, v in enumerate(vlist):
        h, w = d.resolution(v)
        # aggregate the results for each frame: x1, y1, x2, y2, score, ilabel
        vdets = {i: np.empty((0, 6), dtype=np.float32) for i in range(1, 1 + d.nframes(v))}
        # load results for each starting frame
        # BUGFIX: xrange -> range for Python 3 compatibility
        for i in range(1, 1 + d.nframes(v) - K + 1):
            resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')
            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()
            with open(resname, 'rb') as fid:
                dets, _ = pickle.load(fid)
            if dets.size == 0:
                continue
            for k in range(K):
                # box of frame i+k (columns 2+4k..5+4k), then score (col 1) and label (col 0)
                vdets[i + k] = np.concatenate(
                    (vdets[i + k], dets[:, np.array([2 + 4 * k, 3 + 4 * k, 4 + 4 * k, 5 + 4 * k, 1, 0])]),
                    axis=0)
        # Perform NMS in each frame
        for i in vdets:
            idx = np.empty((0,), dtype=np.int32)
            for ilabel in range(d.nlabels):
                a = np.where(vdets[i][:, 5] == ilabel)[0]
                if a.size == 0:
                    continue
                # reuse the index array `a` instead of recomputing the boolean mask
                idx = np.concatenate((idx, a[nms2d(vdets[i][a, :5], nms)]), axis=0)
            if idx.size == 0:
                continue
            alldets.append(np.concatenate(
                (iv * np.ones((idx.size, 1), dtype=np.float32),
                 i * np.ones((idx.size, 1), dtype=np.float32),
                 vdets[i][idx, :][:, np.array([5, 4, 0, 1, 2, 3], dtype=np.int32)]),
                axis=1))
    return np.concatenate(alldets, axis=0)
def frameMABO(dname, redo=False):
    """Compute and print the per-class ABO and the MABO (Mean Average Best
    Overlap) of the per-frame tubelet detections for dataset *dname*.

    args:
        - dname: dataset name (example: 'JHMDB')
        - redo: recompute even if a cached frameMABO.pkl exists

    Results (lists of best IoU per gt box, keyed by class label) are cached in
    ../results/ACT-detector/<dname>/frameMABO.pkl.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameMABO.pkl")
    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            BO = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        BO = {l: [] for l in d.labels}  # best overlap per class
        for v in vlist:
            gt = d.gttubes(v)
            h, w = d.resolution(v)
            # load per-frame detections (4 box coordinates per row)
            vdets = {
                i: np.empty((0, 4), dtype=np.float32)
                for i in range(1, 1 + d.nframes(v))
            }
            # load results for each chunk; each chunk covers K consecutive frames
            for i in range(1, 1 + d.nframes(v) - K + 1):
                resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')
                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()
                with open(resname, 'rb') as fid:
                    dets, _ = pickle.load(fid)
                for k in range(K):
                    # columns 2+4k..5+4k hold the box for the k-th frame of the tubelet
                    vdets[i + k] = np.concatenate(
                        (vdets[i + k], dets[:, 2 + 4 * k:6 + 4 * k]), axis=0)
            # for each frame, record the best IoU against every gt tube covering it
            for i in range(1, 1 + d.nframes(v)):
                for ilabel in gt:
                    label = d.labels[ilabel]
                    for t in gt[ilabel]:
                        # the gt tube does not cover frame i
                        if not i in t[:, 0]:
                            continue
                        gtbox = t[t[:, 0] == i, 1:5]  # box of gt tube at frame i
                        if vdets[i].size == 0:  # we missed it
                            BO[label].append(0)
                            continue
                        ious = iou2d(vdets[i], gtbox)
                        BO[label].append(np.max(ious))
        # save file
        with open(eval_file, 'wb') as fid:
            pickle.dump(BO, fid)
    # print MABO results
    ABO = {la: 100 * np.mean(np.array(BO[la])) for la in d.labels}  # average best overlap
    for la in d.labels:
        print("{:20s} {:6.2f}".format(la, ABO[la]))
    # BUGFIX: on Python 3 dict.values() is a view; np.array() of it builds a
    # 0-d object array and np.mean raises — materialize as a list first
    print("{:20s} {:6.2f}".format("MABO", np.mean(np.array(list(ABO.values())))))
def extract_tubelets(dname, gpu=-1, redo=False):
    """Extract the tubelets for a given dataset

    args:
        - dname: dataset name (example: 'JHMDB')
        - gpu (default -1): use gpu given in argument, or use cpu if -1
        - redo: whether or not to recompute already computed files

    save a pickle file for each frame; the file contains a tuple (dets, dets_all)
        - dets is a numpy array with 2+4*K columns containing the tubelets starting at
          this frame after per-class nms at 0.45 and thresholding the scores at 0.01;
          the columns are <label> <score> and then <x1> <y1> <x2> <y2> for each of the
          frames in the tubelet
        - dets_all contains the tubelets obtained after a global nms at 0.7 and
          thresholding the scores at 0.01; it is a numpy array with 4*K + L + 1 columns
          containing the coordinates of the tubelets and the scores for all labels

    note: this version is inefficient: it is better to estimate the per-frame features once
    """
    d = GetDataset(dname)
    if gpu >= 0:
        caffe.set_mode_gpu()
        caffe.set_device(gpu)
    model_dir = os.path.join(os.path.dirname(__file__), '../models/ACT-detector/', dname)
    output_dir = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)
    # load the RGB network
    rgb_proto = os.path.join(model_dir, "deploy_RGB.prototxt")
    rgb_model = os.path.join(model_dir, "RGB.caffemodel")
    net_rgb = caffe.Net(rgb_proto, caffe.TEST, weights=rgb_model)
    # load the FLOW5 network
    flo_proto = os.path.join(model_dir, "deploy_FLOW5.prototxt")
    flo_model = os.path.join(model_dir, "FLOW5.caffemodel")
    net_flo = caffe.Net(flo_proto, caffe.TEST, weights=flo_model)
    vlist = d.test_vlist()
    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv + 1, len(vlist), v))
        h, w = d.resolution(v)
        # network output is normalized between 0,1 ; so we will multiply it by the following array
        resolution_array = np.array([w, h, w, h] * K, dtype=np.float32)
        # now process each frame
        for i in range(1, 1 + d.nframes(v) - K + 1):
            outfile = os.path.join(output_dir, d.frame_format(v, i) + ".pkl")
            # skip if already computed
            if os.path.isfile(outfile) and not redo:
                continue
            # read the frames for the forward
            kwargs_rgb = {}
            kwargs_flo = {}
            for j in range(K):
                im = cv2.imread(d.imfile(v, i + j))
                if im is None:
                    print("Image {:s} does not exist".format(d.imfile(v, i + j)))
                    return
                imscale = cv2.resize(im, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR)
                kwargs_rgb['data_stream' + str(j)] = np.transpose(
                    imscale - MEAN, (2, 0, 1))[None, :, :, :]
                imf = [cv2.imread(d.flowfile(v, min(d.nframes(v), i + j + iflow)))
                       for iflow in range(NFLOWS)]
                # BUGFIX: `np.any(imf) is None` was always False (np.any returns a
                # np.bool_, never None), so missing flow frames were never detected
                if any(flow_im is None for flow_im in imf):
                    print("Flow image {:s} does not exist".format(d.flowfile(v, i + j)))
                    return
                imscalef = [cv2.resize(flow_im, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR)
                            for flow_im in imf]
                timscale = [np.transpose(flow_im - MEAN, (2, 0, 1))[None, :, :, :]
                            for flow_im in imscalef]
                kwargs_flo['data_stream' + str(j) + 'flow'] = np.concatenate(timscale, axis=1)
            # compute rgb and flow scores
            # two forward passes: one for the rgb and one for the flow
            net_rgb.forward(end="mbox_conf_flatten", **kwargs_rgb)  # forward of rgb with confidence and regression
            net_flo.forward(end="mbox_conf_flatten", **kwargs_flo)  # forward of flow5 with confidence and regression
            # compute late fusion of rgb and flow scores (keep regression from rgb)
            # use net_rgb for standard detections, net_flo for having all boxes
            scores = 0.5 * (net_rgb.blobs['mbox_conf_flatten'].data +
                            net_flo.blobs['mbox_conf_flatten'].data)
            net_rgb.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_loc'].data[...] = net_rgb.blobs['mbox_loc'].data
            # two forward passes, only for the last layer
            # dets is the detections after per-class NMS and thresholding (standard)
            # dets_all contains all the scores and regressions for all tubelets
            dets = net_rgb.forward(start='detection_out')['detection_out'][0, 0, :, 1:]
            dets_all = net_flo.forward(start='detection_out_full')['detection_out_full'][0, 0, :, 1:]
            # parse detections with per-class NMS
            if dets.shape[0] == 1 and np.all(dets == -1):
                # the detector signals "no detection" with a single all -1 row
                dets = np.empty((0, dets.shape[1]), dtype=np.float32)
            dets[:, 2:] *= resolution_array  # network output was normalized in [0..1]
            dets[:, 0] -= 1  # label 0 was background, come back to label in [0..nlabels-1]
            dets[:, 2::2] = np.maximum(0, np.minimum(w, dets[:, 2::2]))
            dets[:, 3::2] = np.maximum(0, np.minimum(h, dets[:, 3::2]))
            # parse detections with global NMS at 0.7 (top 300)
            # coordinates were normalized in [0..1]
            dets_all[:, 0:4 * K] *= resolution_array
            dets_all[:, 0:4 * K:2] = np.maximum(0, np.minimum(w, dets_all[:, 0:4 * K:2]))
            dets_all[:, 1:4 * K:2] = np.maximum(0, np.minimum(h, dets_all[:, 1:4 * K:2]))
            idx = nms_tubelets(
                np.concatenate(
                    (dets_all[:, :4 * K],
                     np.max(dets_all[:, 4 * K + 1:], axis=1)[:, None]),
                    axis=1), 0.7, 300)
            dets_all = dets_all[idx, :]
            # save file; makedirs is safer than shelling out to `mkdir -p`
            os.makedirs(os.path.dirname(outfile), exist_ok=True)
            with open(outfile, 'wb') as fid:
                pickle.dump((dets, dets_all), fid)
def load_frame_detections(d, vlist, dirname, nms):
    """Gather per-frame detections for every video in *vlist*.

    Loads the extracted tubelet pickles, spreads each tubelet over the K frames
    it covers, runs per-class 2D NMS in every frame, and returns one numpy array
    whose rows are
    <video_index> <frame_index> <ilabel> <score> <x1> <y1> <x2> <y2>.
    """
    if isinstance(d, str):
        d = GetDataset(d)
    collected = []
    for iv, v in enumerate(vlist):
        h, w = d.resolution(v)
        nframes = d.nframes(v)
        # per-frame accumulator; rows are x1, y1, x2, y2, score, ilabel
        frame_boxes = {}
        for frame in range(1, 1 + nframes):
            frame_boxes[frame] = np.empty((0, 6), dtype=np.float32)
        # fold each tubelet chunk into the frames it covers
        for start in range(1, 1 + nframes - K + 1):
            resname = os.path.join(dirname, d.frame_format(v, start) + '.pkl')
            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()
            with open(resname, 'rb') as fid:
                dets, _ = pickle.load(fid)
            if dets.size == 0:
                continue
            for k in range(K):
                # box of frame start+k, then score (col 1) and label (col 0)
                cols = np.array([2 + 4 * k, 3 + 4 * k, 4 + 4 * k, 5 + 4 * k, 1, 0])
                frame_boxes[start + k] = np.concatenate(
                    (frame_boxes[start + k], dets[:, cols]), axis=0)
        # per-class NMS within each frame
        for frame in frame_boxes:
            boxes = frame_boxes[frame]
            keep = np.empty((0,), dtype=np.int32)
            for ilabel in range(d.nlabels):
                a = np.where(boxes[:, 5] == ilabel)[0]
                if a.size == 0:
                    continue
                survivors = nms2d(boxes[boxes[:, 5] == ilabel, :5], nms)
                keep = np.concatenate((keep, a[survivors]), axis=0)
            if keep.size == 0:
                continue
            vid_col = iv * np.ones((keep.size, 1), dtype=np.float32)
            frm_col = frame * np.ones((keep.size, 1), dtype=np.float32)
            # reorder surviving rows to <ilabel> <score> <x1> <y1> <x2> <y2>
            reordered = boxes[keep, :][:, np.array([5, 4, 0, 1, 2, 3], dtype=np.int32)]
            collected.append(np.concatenate((vid_col, frm_col, reordered), axis=1))
    return np.concatenate(collected, axis=0)
def extract_tubelets(dname, gpu=-1, redo=False):
    """Extract the tubelets for a given dataset (frames are read from the video file)

    args:
        - dname: dataset name (example: 'JHMDB')
        - gpu (default -1): use gpu given in argument, or use cpu if -1
        - redo: whether or not to recompute already computed files

    save a pickle file for each frame; the file contains a tuple (dets, dets_all)
        - dets is a numpy array with 2+4*K columns containing the tubelets starting at
          this frame after per-class nms at 0.45 and thresholding the scores at 0.01;
          the columns are <label> <score> and then <x1> <y1> <x2> <y2> for each of the
          frames in the tubelet
        - dets_all contains the tubelets obtained after a global nms at 0.7 and
          thresholding the scores at 0.01; it is a numpy array with 4*K + L + 1 columns
          containing the coordinates of the tubelets and the scores for all labels

    note: this version is inefficient: it is better to estimate the per-frame features once
    """
    d = GetDataset(dname)
    if gpu >= 0:
        caffe.set_mode_gpu()
        caffe.set_device(gpu)
    model_dir = os.path.join(os.path.dirname(__file__), '../models/ACT-detector/', dname)
    output_dir = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)
    # load the RGB network (AVA-finetuned snapshot stored one level above model_dir)
    rgb_proto = os.path.join(model_dir, "deploy_RGB.prototxt")
    rgb_model = os.path.join(model_dir, "../generated_AVA_iter_118662.caffemodel")
    net_rgb = caffe.Net(rgb_proto, caffe.TEST, weights=rgb_model)
    # load the FLOW5 network (AVA-finetuned snapshot)
    flo_proto = os.path.join(model_dir, "deploy_FLOW5.prototxt")
    flo_model = os.path.join(model_dir, "../generated_AVA_iter_59463.caffemodel")
    net_flo = caffe.Net(flo_proto, caffe.TEST, weights=flo_model)
    vlist = d.test_vlist()
    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv + 1, len(vlist), v))
        h, w = d.resolution(v)
        # network output is normalized between 0,1 ; so we will multiply it by the following array
        resolution_array = np.array([w, h, w, h] * K, dtype=np.float32)
        # now process each frame
        # BUGFIX: xrange -> range and print statements -> print() for Python 3
        for i in range(1, 1 + d.nframes(v) - K + 1):
            outfile = os.path.join(output_dir, d.frame_format(v, i) + ".pkl")
            # skip if already computed
            if os.path.isfile(outfile) and not redo:
                continue
            # read the frames for the forward
            kwargs_rgb = {}
            kwargs_flo = {}
            for j in range(K):
                # decode frame i+j directly from the video file
                # NOTE(review): re-opening the capture for every frame is wasteful;
                # kept as-is to preserve the exact seek behavior
                cap = cv2.VideoCapture(d.vidfile(v, 0))
                cap.set(1, i + j - 1)  # property 1 == CAP_PROP_POS_FRAMES (0-based)
                im = cap.read()[1]
                cap.release()
                if im is None:
                    print("Image {:s} does not exist".format(d.imfile(v, i + j)))
                    return
                imscale = cv2.resize(im, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR)
                kwargs_rgb['data_stream' + str(j)] = np.transpose(
                    imscale - MEAN, (2, 0, 1))[None, :, :, :]
                imf = [cv2.imread(d.flowfile(v.split(".")[0], min(d.nframes(v), i + j + iflow)))
                       for iflow in range(NFLOWS)]
                # BUGFIX: `np.any(imf) is None` was always False (np.any returns a
                # np.bool_, never None), so missing flow frames were never detected
                if any(flow_im is None for flow_im in imf):
                    print("Flow image {:s} does not exist".format(d.flowfile(v, i + j)))
                    return
                imscalef = [cv2.resize(flow_im, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR)
                            for flow_im in imf]
                timscale = [np.transpose(flow_im - MEAN, (2, 0, 1))[None, :, :, :]
                            for flow_im in imscalef]
                kwargs_flo['data_stream' + str(j) + 'flow'] = np.concatenate(timscale, axis=1)
            # compute rgb and flow scores
            # two forward passes: one for the rgb and one for the flow
            net_rgb.forward(end="mbox_conf_flatten", **kwargs_rgb)  # forward of rgb with confidence and regression
            net_flo.forward(end="mbox_conf_flatten", **kwargs_flo)  # forward of flow5 with confidence and regression
            # compute late fusion of rgb and flow scores (keep regression from rgb)
            # use net_rgb for standard detections, net_flo for having all boxes
            scores = 0.5 * (net_rgb.blobs['mbox_conf_flatten'].data +
                            net_flo.blobs['mbox_conf_flatten'].data)
            net_rgb.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_loc'].data[...] = net_rgb.blobs['mbox_loc'].data
            # two forward passes, only for the last layer
            # dets is the detections after per-class NMS and thresholding (standard)
            # dets_all contains all the scores and regressions for all tubelets
            dets = net_rgb.forward(start='detection_out')['detection_out'][0, 0, :, 1:]
            dets_all = net_flo.forward(start='detection_out_full')['detection_out_full'][0, 0, :, 1:]
            # parse detections with per-class NMS
            if dets.shape[0] == 1 and np.all(dets == -1):
                # the detector signals "no detection" with a single all -1 row
                dets = np.empty((0, dets.shape[1]), dtype=np.float32)
            dets[:, 2:] *= resolution_array  # network output was normalized in [0..1]
            dets[:, 0] -= 1  # label 0 was background, come back to label in [0..nlabels-1]
            dets[:, 2::2] = np.maximum(0, np.minimum(w, dets[:, 2::2]))
            dets[:, 3::2] = np.maximum(0, np.minimum(h, dets[:, 3::2]))
            # parse detections with global NMS at 0.7 (top 300)
            # coordinates were normalized in [0..1]
            dets_all[:, 0:4 * K] *= resolution_array
            dets_all[:, 0:4 * K:2] = np.maximum(0, np.minimum(w, dets_all[:, 0:4 * K:2]))
            dets_all[:, 1:4 * K:2] = np.maximum(0, np.minimum(h, dets_all[:, 1:4 * K:2]))
            idx = nms_tubelets(
                np.concatenate(
                    (dets_all[:, :4 * K],
                     np.max(dets_all[:, 4 * K + 1:], axis=1)[:, None]),
                    axis=1), 0.7, 300)
            dets_all = dets_all[idx, :]
            # save file; makedirs is safer than shelling out to `mkdir -p`
            os.makedirs(os.path.dirname(outfile), exist_ok=True)
            with open(outfile, 'wb') as fid:
                pickle.dump((dets, dets_all), fid)