def frameMABO(dname, redo=False):
    """Compute and print the frame-level Mean Average Best Overlap (MABO).

    For every ground-truth box of every test video, the best IoU with any
    detected box of the same frame is recorded per label. The per-label
    average (ABO) and the mean over labels (MABO) are printed as percentages.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        redo: when False and ``frameMABO.pkl`` already exists in the results
            directory, the cached best overlaps are loaded instead of being
            recomputed.

    Exits the process (``sys.exit``) if a tubelet result file is missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameMABO.pkl")

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            BO = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        BO = {l: [] for l in d.labels}  # best overlap, one list per label

        for v in vlist:
            gt = d.gttubes(v)
            nframes = d.nframes(v)

            # per-frame pool of detected boxes (x1, y1, x2, y2)
            vdets = {
                i: np.empty((0, 4), dtype=np.float32)
                for i in range(1, 1 + nframes)
            }

            # a tubelet starting at frame i contributes one box to each of
            # the frames i .. i+K-1
            for i in range(1, 1 + nframes - K + 1):
                resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')
                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()

                with open(resname, 'rb') as fid:
                    dets, _ = pickle.load(fid)

                for k in range(K):
                    # columns 0-1 are skipped; box k sits at 2+4k .. 5+4k
                    vdets[i + k] = np.concatenate(
                        (vdets[i + k], dets[:, 2 + 4 * k:6 + 4 * k]), axis=0)

            # for each frame, score every ground-truth box that covers it
            for i in range(1, 1 + nframes):
                for ilabel in gt:
                    label = d.labels[ilabel]
                    for t in gt[ilabel]:
                        # the gt tube does not cover frame i
                        if i not in t[:, 0]:
                            continue

                        gtbox = t[t[:, 0] == i, 1:5]  # gt box at frame i

                        if vdets[i].size == 0:  # nothing detected: missed
                            BO[label].append(0)
                            continue

                        ious = iou2d(vdets[i], gtbox)
                        BO[label].append(np.max(ious))

        # save file
        with open(eval_file, 'wb') as fid:
            pickle.dump(BO, fid)

    # print MABO results
    ABO = {la: 100 * np.mean(np.array(BO[la])) for la in d.labels}  # average best overlap

    for la in d.labels:
        print("{:20s} {:6.2f}".format(la, ABO[la]))

    # list(...) is required: on Python 3 dict.values() is a view, which
    # np.array would wrap as a 0-d object array and np.mean would reject.
    print("{:20s} {:6.2f}".format("MABO", np.mean(np.array(list(ABO.values())))))
def load_frame_detections(d, vlist, dirname, nms):
    """Load tubelet detections and turn them into per-frame detections.

    Every tubelet stored for a starting frame contributes one box to each of
    the K frames it spans; per-label NMS is then applied inside each frame.

    Args:
        d: dataset object, or a dataset name (``str``) passed to ``GetDataset``.
        vlist: list of videos; the position in this list is used as the
            video index in the output.
        dirname: directory containing the per-frame ``.pkl`` tubelet files.
        nms: threshold forwarded to ``nms2d``.

    Returns:
        A 2-D array with one row per kept detection and the columns
        <video_index> <frame_index> <ilabel> <score> <x1> <y1> <x2> <y2>.
        An empty ``(0, 8)`` float32 array is returned when nothing was kept.

    Exits the process (``sys.exit``) if a tubelet result file is missing.
    """
    if isinstance(d, str):
        d = GetDataset(d)

    alldets = []  # one array per (video, frame) that has detections

    for iv, v in enumerate(vlist):
        nframes = d.nframes(v)

        # aggregate the results for each frame
        # row layout: x1, y1, x2, y2, score, ilabel
        vdets = {
            i: np.empty((0, 6), dtype=np.float32)
            for i in range(1, 1 + nframes)
        }

        # load results for each starting frame
        for i in range(1, 1 + nframes - K + 1):
            resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')
            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()

            with open(resname, 'rb') as fid:
                dets, _ = pickle.load(fid)

            if dets.size == 0:
                continue

            for k in range(K):
                # box k of the tubelet, followed by score (col 1) and label (col 0)
                columns = np.array(
                    [2 + 4 * k, 3 + 4 * k, 4 + 4 * k, 5 + 4 * k, 1, 0])
                vdets[i + k] = np.concatenate(
                    (vdets[i + k], dets[:, columns]), axis=0)

        # Perform NMS in each frame (sorted so the output order is deterministic)
        for i in sorted(vdets):
            frame_dets = vdets[i]
            idx = np.empty((0,), dtype=np.int32)
            for ilabel in range(d.nlabels):
                rows = np.where(frame_dets[:, 5] == ilabel)[0]
                if rows.size == 0:
                    continue
                keep = nms2d(frame_dets[rows, :5], nms)
                idx = np.concatenate((idx, rows[keep]), axis=0)

            if idx.size == 0:
                continue

            alldets.append(np.concatenate(
                (iv * np.ones((idx.size, 1), dtype=np.float32),
                 i * np.ones((idx.size, 1), dtype=np.float32),
                 frame_dets[idx, :][:, np.array([5, 4, 0, 1, 2, 3], dtype=np.int32)]),
                axis=1))

    if not alldets:
        # np.concatenate raises ValueError on an empty list; hand back an
        # empty table with the documented column count instead.
        return np.empty((0, 8), dtype=np.float32)

    return np.concatenate(alldets, axis=0)
def BuildTubes(dname, redo=False):
    """Link per-frame tubelets into action tubes and save them per video.

    For each test video and each label, tubelets are linked greedily over
    time: at every frame the current tubes (sorted by mean score) try to
    extend themselves with the best-scoring tubelet whose overlap is at
    least 0.2; a tube that found no match for 5 frames is closed. Tubes
    with mean score below 0.01 or shorter than 15 frames are dropped, and
    the remaining ones are built by averaging the overlapping tubelets.

    The result, a dict ``{ilabel: [(tube, score), ...]}`` where ``tube`` has
    the columns <frame> <x1> <y1> <x2> <y2> <score>, is pickled to
    ``<video>_tubes.pkl`` in the results directory.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        redo: when False, videos whose output file already exists are skipped.

    Exits the process (``sys.exit``) if a tubelet result file is missing.
    """
    def tubescore(tt):
        # mean of the label score (last column) over the tubelets of a tube
        return np.mean(np.array([tubelet[-1] for _, tubelet in tt]))

    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    vlist = d.test_vlist()

    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv + 1, len(vlist), v))

        outfile = os.path.join(dirname, v + "_tubes.pkl")
        if os.path.isfile(outfile) and not redo:
            continue

        RES = {}
        nframes = d.nframes(v)

        # load detected tubelets
        VDets = {}
        for startframe in range(1, nframes + 2 - K):
            resname = os.path.join(dirname, d.frame_format(v, startframe) + '.pkl')
            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()

            with open(resname, 'rb') as fid:
                _, VDets[startframe] = pickle.load(fid)

        for ilabel in range(d.nlabels):
            FINISHED_TUBES = []
            CURRENT_TUBES = []  # each tube is a list of tuples (frame, tubelet)

            # 4K coordinates followed by the score column of this label.
            # list(...) is required: on Python 3 a range object cannot be
            # concatenated with a list.
            columns = list(range(4 * K)) + [4 * K + 1 + ilabel]

            for frame in range(1, nframes + 2 - K):
                # load boxes of the new frame and do nms while keeping the
                # highest scored ones
                ltubelets = VDets[frame][:, columns]  # Nx(4K+1) with (x1 y1 x2 y2)*K ilabel-score
                idx = nms_tubelets(ltubelets, 0.3, top_k=10)
                ltubelets = ltubelets[idx, :]

                # first frame: every tubelet starts a new tube
                if frame == 1:
                    for i in range(ltubelets.shape[0]):
                        CURRENT_TUBES.append([(1, ltubelets[i, :])])
                    continue

                # sort current tubes according to average score, so that the
                # best tubes pick their continuation first
                avgscore = [tubescore(t) for t in CURRENT_TUBES]
                argsort = np.argsort(-np.array(avgscore))
                CURRENT_TUBES = [CURRENT_TUBES[i] for i in argsort]

                # loop over tubes
                finished = []
                for it, t in enumerate(CURRENT_TUBES):
                    # compute ious between the last tubelet of t and ltubelets
                    last_frame, last_tubelet = t[-1]
                    offset = frame - last_frame
                    if offset < K:
                        # average the IoU over the frames both tubelets share
                        nov = K - offset
                        ious = sum([
                            iou2d(ltubelets[:, 4 * iov:4 * iov + 4],
                                  last_tubelet[4 * (iov + offset):4 * (iov + offset + 1)])
                            for iov in range(nov)
                        ]) / float(nov)
                    else:
                        # no shared frame: compare first new box with last old box
                        ious = iou2d(ltubelets[:, :4], last_tubelet[4 * K - 4:4 * K])

                    valid = np.where(ious >= 0.2)[0]

                    if valid.size > 0:
                        # take the one with maximum score and remove it from
                        # the pool so that no other tube can claim it
                        idx = valid[np.argmax(ltubelets[valid, -1])]
                        CURRENT_TUBES[it].append((frame, ltubelets[idx, :]))
                        ltubelets = np.delete(ltubelets, idx, axis=0)
                    elif offset >= 5:
                        # unmatched for too long: close the tube
                        finished.append(it)

                # move finished tubes; reverse order keeps the indices valid
                for it in finished[::-1]:
                    FINISHED_TUBES.append(CURRENT_TUBES[it][:])
                    del CURRENT_TUBES[it]

                # unclaimed tubelets start new tubes
                for i in range(ltubelets.shape[0]):
                    CURRENT_TUBES.append([(frame, ltubelets[i, :])])

            # tubes still open at the end of the video are kept as well
            FINISHED_TUBES += CURRENT_TUBES

            # build real tubes
            output = []
            for t in FINISHED_TUBES:
                score = tubescore(t)

                # drop low-scoring tubes
                if score < 0.01:
                    continue

                beginframe = t[0][0]
                endframe = t[-1][0] + K - 1
                length = endframe + 1 - beginframe

                # delete tubes with short duration
                if length < 15:
                    continue

                # build final tubes by averaging the tubelets
                out = np.zeros((length, 6), dtype=np.float32)
                out[:, 0] = np.arange(beginframe, endframe + 1)
                n_per_frame = np.zeros((length, 1), dtype=np.int32)
                for start, box in t:
                    for k in range(K):
                        row = start - beginframe + k
                        out[row, 1:5] += box[4 * k:4 * k + 4]
                        out[row, -1] += box[-1]
                        n_per_frame[row, 0] += 1
                out[:, 1:] /= n_per_frame
                output.append((out, score))

            RES[ilabel] = output

        with open(outfile, 'wb') as fid:
            pickle.dump(RES, fid)
def frameCLASSIF(dname, redo=False):
    """Compute and print the frame-level classification accuracy.

    For every ground-truth box, the class scores of all tubelets whose box
    at that frame overlaps the ground truth with IoU >= 0.7 are summed; the
    box counts as correct when the highest summed score belongs to the
    ground-truth label.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        redo: when False and ``frameCLASSIF.pkl`` already exists in the
            results directory, the cached accuracies are loaded.

    The cached/saved value is a list with one accuracy per label (same order
    as ``d.labels``); labels without any ground-truth box get NaN and are
    ignored in the printed mean.

    Exits the process (``sys.exit``) if a tubelet result file is missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameCLASSIF.pkl")

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            CLASSIF = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        CORRECT = [0] * d.nlabels
        TOTAL = [0] * d.nlabels

        for v in vlist:
            nframes = d.nframes(v)

            # load all tubelets
            VDets = {}
            for startframe in range(1, nframes + 2 - K):
                resname = os.path.join(dirname, d.frame_format(v, startframe) + '.pkl')
                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()

                with open(resname, 'rb') as fid:
                    _, VDets[startframe] = pickle.load(fid)

            # iterate over ground-truth
            tubes = d.gttubes(v)
            for ilabel in tubes:
                for g in tubes[ilabel]:
                    for i in range(g.shape[0]):
                        frame = int(g[i, 0])

                        # just in case a tube is longer than the video
                        if frame > nframes:
                            continue

                        gtbox = g[i, 1:5]
                        scores = np.zeros((d.nlabels,), dtype=np.float32)

                        # accumulate the scores of every tubelet that covers
                        # this frame (start frames frame-K+1 .. frame)
                        first_sf = max(1, frame - K + 1)
                        last_sf = min(nframes - K + 1, frame)
                        for sf in range(first_sf, last_sf + 1):
                            pos = 4 * (frame - sf)  # box of this frame inside the tubelet
                            overlaps = iou2d(VDets[sf][:, pos:pos + 4], gtbox)
                            scores += np.sum(VDets[sf][overlaps >= 0.7, 4 * K + 1:], axis=0)

                        # check classif
                        if np.argmax(scores) == ilabel:
                            CORRECT[ilabel] += 1

                        TOTAL[ilabel] += 1

        # A label without ground truth has no defined accuracy; NaN avoids
        # the ZeroDivisionError while keeping one entry per label.
        CLASSIF = [
            float(CORRECT[ilabel]) / float(TOTAL[ilabel])
            if TOTAL[ilabel] else float('nan')
            for ilabel in range(d.nlabels)
        ]

        with open(eval_file, 'wb') as fid:
            pickle.dump(CLASSIF, fid)

    # print classif results
    for il, la in enumerate(d.labels):
        print("{:20s} {:6.2f}".format(la, 100 * CLASSIF[il]))

    # nanmean: average over the labels that actually have ground truth
    print("{:20s} {:6.2f}".format("CLASSIF", 100 * np.nanmean(np.array(CLASSIF))))
def frameMABO(dname, redo=False):
    """Compute and print the frame-level Mean Average Best Overlap (MABO).

    For every ground-truth box of every test video, the best IoU with any
    detected box of the same frame is recorded per label. The per-label
    average (ABO) and the mean over labels (MABO) are printed as percentages.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        redo: when False and ``frameMABO.pkl`` already exists in the results
            directory, the cached best overlaps are loaded instead of being
            recomputed.

    Exits the process (``sys.exit``) if a tubelet result file is missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameMABO.pkl")

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            BO = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        BO = {l: [] for l in d.labels}  # best overlap, one list per label

        for v in vlist:
            gt = d.gttubes(v)
            nframes = d.nframes(v)

            # load per-frame detections: pool of boxes (x1, y1, x2, y2)
            vdets = {
                i: np.empty((0, 4), dtype=np.float32)
                for i in range(1, 1 + nframes)
            }

            # load results for each chunk: the tubelets starting at frame i
            # contribute one box to each of the frames i .. i+K-1
            for i in range(1, 1 + nframes - K + 1):
                resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')
                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()

                with open(resname, 'rb') as fid:
                    dets, _ = pickle.load(fid)

                for k in range(K):
                    # columns 0-1 are skipped; box k sits at 2+4k .. 5+4k
                    vdets[i + k] = np.concatenate(
                        (vdets[i + k], dets[:, 2 + 4 * k:6 + 4 * k]), axis=0)

            # for each frame
            for i in range(1, 1 + nframes):
                for ilabel in gt:
                    label = d.labels[ilabel]
                    for t in gt[ilabel]:
                        # the gt tube does not cover frame i
                        if i not in t[:, 0]:
                            continue

                        gtbox = t[t[:, 0] == i, 1:5]  # box of gt tube at frame i

                        if vdets[i].size == 0:  # we missed it
                            BO[label].append(0)
                            continue

                        ious = iou2d(vdets[i], gtbox)
                        BO[label].append(np.max(ious))

        # save file
        with open(eval_file, 'wb') as fid:
            pickle.dump(BO, fid)

    # print MABO results
    ABO = {la: 100 * np.mean(np.array(BO[la])) for la in d.labels}  # average best overlap

    for la in d.labels:
        print("{:20s} {:6.2f}".format(la, ABO[la]))

    # dict.values() is a view on Python 3: np.array(view) is a 0-d object
    # array and np.mean fails on it, hence the explicit list.
    print("{:20s} {:6.2f}".format("MABO", np.mean(np.array(list(ABO.values())))))
def frameAP_error(dname, th=0.5, redo=False):
    """Compute frame-AP together with a breakdown of the false positives.

    Detections are processed per class by decreasing score. A false positive
    is counted as one of:
      - localization (EL): the frame has ground truth of this class but the
        detection does not match an unmatched box with IoU >= th;
      - classification (EC): the frame only has ground truth of other
        classes and the detection overlaps one of them with IoU >= th;
      - timing (ET): the frame has no ground truth but the video contains
        the class;
      - other (EO): everything else.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        th: IoU threshold for a detection to match a ground-truth box.
        redo: when False and the cached result file exists, it is loaded.

    The result (``{label: array}`` with columns precision, recall, EL, EC,
    ET, EO, the error columns being fractions of the detections seen so far)
    is pickled to ``frameAP<th>ErrorAnalysis.pkl`` and a summary is printed.
    """
    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameAP{:g}ErrorAnalysis.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()

        # load per-frame detections
        alldets = load_frame_detections(d, vlist, dirname, 0.3)
        res = {}

        # compute AP for each class
        for ilabel, label in enumerate(d.labels):
            # detections of this class
            detections = alldets[alldets[:, 2] == ilabel, :]

            gt = {}         # (video index, frame) -> boxes of this class
            othergt = {}    # (video index, frame) -> boxes of other classes
            labellist = {}  # video index -> labels present in the video

            for iv, v in enumerate(vlist):
                tubes = d.gttubes(v)
                # Keyed by the video *index*: it is looked up below with
                # k[0], which comes from the detections' video-index column.
                labellist[iv] = set(tubes.keys())

                for il in tubes:
                    target = gt if il == ilabel else othergt
                    for tube in tubes[il]:
                        for i in range(tube.shape[0]):
                            k = (iv, int(tube[i, 0]))
                            target.setdefault(k, []).append(tube[i, 1:5].tolist())

            for k in gt:
                gt[k] = np.array(gt[k])
            for k in othergt:
                othergt[k] = np.array(othergt[k])

            # frames that had ground truth of this class before any matching;
            # gt itself shrinks as boxes get matched
            frames_with_gt = set(gt)

            # pr will be an array containing precision-recall values and 4
            # types of errors: localization, classification, timing, others
            pr = np.empty((detections.shape[0] + 1, 6), dtype=np.float32)
            pr[0, 0] = 1.0
            pr[0, 1:] = 0.0

            fn = sum([g.shape[0] for g in gt.values()])  # false negatives
            npos = fn  # tp + fn stays constant during the sweep
            fp = 0  # false positives
            tp = 0  # true positives
            EL = 0  # localization errors
            EC = 0  # classification error: overlap >= th with another object
            EO = 0  # other errors
            ET = 0  # timing error: the video contains the action but not at this frame

            for i, j in enumerate(np.argsort(-detections[:, 3])):
                k = (int(detections[j, 0]), int(detections[j, 1]))
                box = detections[j, 4:8]
                ispositive = False

                if k in frames_with_gt:
                    matched = False
                    if k in gt:
                        ious = iou2d(gt[k], box)
                        amax = np.argmax(ious)
                        matched = ious[amax] >= th

                    if matched:
                        ispositive = True
                        # a ground-truth box can be matched only once
                        gt[k] = np.delete(gt[k], amax, 0)
                        if gt[k].size == 0:
                            del gt[k]
                    else:
                        EL += 1
                elif k in othergt:
                    ious = iou2d(othergt[k], box)
                    if np.max(ious) >= th:
                        EC += 1
                    else:
                        EO += 1
                elif ilabel in labellist[k[0]]:
                    ET += 1
                else:
                    EO += 1

                if ispositive:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                ndet = float(tp + fp)
                pr[i + 1, 0] = float(tp) / ndet
                # a class without any ground truth has zero recall rather
                # than raising ZeroDivisionError
                pr[i + 1, 1] = float(tp) / float(npos) if npos > 0 else 0.0
                pr[i + 1, 2] = float(EL) / ndet
                pr[i + 1, 3] = float(EC) / ndet
                pr[i + 1, 4] = float(ET) / ndet
                pr[i + 1, 5] = float(EO) / ndet

            res[label] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    AP = 100 * np.array(
        [pr_to_ap(res[label][:, [0, 1]]) for label in d.labels])
    othersap = [
        100 * np.array([pr_to_ap(res[label][:, [j, 1]]) for label in d.labels])
        for j in range(2, 6)
    ]

    EL = othersap[0]
    EC = othersap[1]
    ET = othersap[2]
    EO = othersap[3]
    # missed detections = 1 - recall
    EM = 100 - 100 * np.array([res[label][-1, 1] for label in d.labels])

    LIST = [AP, EL, EC, ET, EO, EM]

    print("Error Analysis")
    print("")
    print("{:20s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s}".format(
        'label', ' AP ', ' Loc. ', ' Cls. ', ' Time ', ' Other ', ' missed '))
    print("")
    for il, label in enumerate(d.labels):
        print("{:20s} ".format(label) +
              " ".join(["{:8.2f}".format(L[il]) for L in LIST]))
    print("")
    print("{:20s} ".format("mean") +
          " ".join(["{:8.2f}".format(np.mean(L)) for L in LIST]))
    print("")
def frameAP(dname, th=0.5, redo=False):
    """Compute and print the frame-level average precision of each class.

    Detections of a class are processed by decreasing score; a detection is
    a true positive when it matches a not-yet-matched ground-truth box of the
    same frame with IoU >= th.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        th: IoU threshold for a detection to match a ground-truth box.
        redo: when False and ``frameAP<th>.pkl`` already exists in the
            results directory, the cached precision-recall arrays are loaded.

    The result (``{label: array}`` with columns precision, recall) is pickled
    and the per-class AP and the mAP are printed as percentages.
    """
    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameAP{:g}.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()

        # load per-frame detections
        alldets = load_frame_detections(d, vlist, dirname, 0.3)
        res = {}

        # compute AP for each class
        for ilabel, label in enumerate(d.labels):
            # detections of this class
            detections = alldets[alldets[:, 2] == ilabel, :]

            # load ground-truth of this class: (video index, frame) -> boxes
            gt = {}
            for iv, v in enumerate(vlist):
                tubes = d.gttubes(v)

                if ilabel not in tubes:
                    continue

                for tube in tubes[ilabel]:
                    for i in range(tube.shape[0]):
                        k = (iv, int(tube[i, 0]))
                        gt.setdefault(k, []).append(tube[i, 1:5].tolist())

            for k in gt:
                gt[k] = np.array(gt[k])

            # pr will be an array containing precision-recall values
            pr = np.empty((detections.shape[0] + 1, 2), dtype=np.float32)  # precision, recall
            pr[0, 0] = 1.0
            pr[0, 1] = 0.0
            fn = sum([g.shape[0] for g in gt.values()])  # false negatives
            npos = fn  # tp + fn stays constant during the sweep
            fp = 0  # false positives
            tp = 0  # true positives

            for i, j in enumerate(np.argsort(-detections[:, 3])):
                k = (int(detections[j, 0]), int(detections[j, 1]))
                box = detections[j, 4:8]
                ispositive = False

                if k in gt:
                    ious = iou2d(gt[k], box)
                    amax = np.argmax(ious)

                    if ious[amax] >= th:
                        ispositive = True
                        # a ground-truth box can be matched only once
                        gt[k] = np.delete(gt[k], amax, 0)

                        if gt[k].size == 0:
                            del gt[k]

                if ispositive:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                pr[i + 1, 0] = float(tp) / float(tp + fp)
                # a class without any ground truth has zero recall rather
                # than raising ZeroDivisionError
                pr[i + 1, 1] = float(tp) / float(npos) if npos > 0 else 0.0

            res[label] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    ap = 100 * np.array([pr_to_ap(res[label]) for label in d.labels])
    print("frameAP")

    for il, _ in enumerate(d.labels):
        print("{:20s} {:8.2f}".format('', ap[il]))

    print("{:20s} {:8.2f}".format("mAP", np.mean(ap)))
    print("")
def extract_tubelets(dname, gpu=-1, redo=False):
    """Extract the tubelets for a given dataset.

    args:
        - dname: dataset name (example: 'JHMDB')
        - gpu (default -1): use gpu given in argument, or use cpu if -1
        - redo: whether or not to recompute already computed files

    save a pickle file for each frame
    the file contains a tuple (dets, dets_all)
        - dets is a numpy array with 2+4*K columns containing the tubelets
          starting at this frame after per-class nms at 0.45 and thresholding
          the scores at 0.01; the columns are <label> <score> and then
          <x1> <y1> <x2> <y2> for each of the frames in the tubelet
        - dets_all contains the tubelets obtained after a global nms at 0.7
          and thresholding the scores at 0.01; it is a numpy array with
          4*K + L + 1 columns containing the coordinates of the tubelets and
          the scores for all labels

    Processing stops (the function returns None) as soon as an RGB or flow
    image cannot be read.

    note: this version is inefficient: it is better to estimate the per-frame
    features once
    """
    d = GetDataset(dname)

    if gpu >= 0:
        caffe.set_mode_gpu()
        caffe.set_device(gpu)

    model_dir = os.path.join(
        os.path.dirname(__file__), '../models/ACT-detector/', dname)
    output_dir = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)

    # load the RGB network
    rgb_proto = os.path.join(model_dir, "deploy_RGB.prototxt")
    rgb_model = os.path.join(model_dir, "RGB.caffemodel")
    net_rgb = caffe.Net(rgb_proto, caffe.TEST, weights=rgb_model)

    # load the FLOW5 network
    flo_proto = os.path.join(model_dir, "deploy_FLOW5.prototxt")
    flo_model = os.path.join(model_dir, "FLOW5.caffemodel")
    net_flo = caffe.Net(flo_proto, caffe.TEST, weights=flo_model)

    vlist = d.test_vlist()
    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv + 1, len(vlist), v))
        h, w = d.resolution(v)
        nframes = d.nframes(v)

        # network output is normalized between 0,1 ; so we will multiply it
        # by the following array
        resolution_array = np.array([w, h, w, h] * K, dtype=np.float32)

        # now process each frame
        for i in range(1, 1 + nframes - K + 1):
            outfile = os.path.join(output_dir, d.frame_format(v, i) + ".pkl")

            # skip if already computed
            if os.path.isfile(outfile) and not redo:
                continue

            # read the frames for the forward
            kwargs_rgb = {}
            kwargs_flo = {}
            for j in range(K):
                im = cv2.imread(d.imfile(v, i + j))
                if im is None:
                    print("Image {:s} does not exist".format(d.imfile(v, i + j)))
                    return
                imscale = cv2.resize(
                    im, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR)
                kwargs_rgb['data_stream' + str(j)] = np.transpose(
                    imscale - MEAN, (2, 0, 1))[None, :, :, :]

                # flow frames past the end of the video are clamped to the
                # last frame
                flow_paths = [
                    d.flowfile(v, min(nframes, i + j + iflow))
                    for iflow in range(NFLOWS)
                ]
                imf = [cv2.imread(path) for path in flow_paths]

                # cv2.imread returns None for an unreadable file; each image
                # has to be tested individually (np.any(...) is never None)
                missing = [p for p, f in zip(flow_paths, imf) if f is None]
                if missing:
                    print("Flow image {:s} does not exist".format(missing[0]))
                    return

                imscalef = [
                    cv2.resize(f, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR)
                    for f in imf
                ]
                timscale = [
                    np.transpose(f - MEAN, (2, 0, 1))[None, :, :, :]
                    for f in imscalef
                ]
                kwargs_flo['data_stream' + str(j) + 'flow'] = np.concatenate(
                    timscale, axis=1)

            # compute rgb and flow scores
            # two forward passes: one for the rgb and one for the flow
            net_rgb.forward(end="mbox_conf_flatten", **kwargs_rgb)  # forward of rgb with confidence and regression
            net_flo.forward(end="mbox_conf_flatten", **kwargs_flo)  # forward of flow5 with confidence and regression

            # compute late fusion of rgb and flow scores (keep regression from rgb)
            # use net_rgb for standard detections, net_flo for having all boxes
            scores = 0.5 * (net_rgb.blobs['mbox_conf_flatten'].data +
                            net_flo.blobs['mbox_conf_flatten'].data)
            net_rgb.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_loc'].data[...] = net_rgb.blobs['mbox_loc'].data

            # two forward passes, only for the last layer
            # dets is the detections after per-class NMS and thresholding (standard)
            # dets_all contains all the scores and regressions for all tubelets
            dets = net_rgb.forward(
                start='detection_out')['detection_out'][0, 0, :, 1:]
            dets_all = net_flo.forward(
                start='detection_out_full')['detection_out_full'][0, 0, :, 1:]

            # parse detections with per-class NMS
            # a single row filled with -1 is treated as "no detection"
            if dets.shape[0] == 1 and np.all(dets == -1):
                dets = np.empty((0, dets.shape[1]), dtype=np.float32)

            dets[:, 2:] *= resolution_array  # network output was normalized in [0..1]
            dets[:, 0] -= 1  # label 0 was background, come back to label in [0..nlabels-1]
            dets[:, 2::2] = np.maximum(0, np.minimum(w, dets[:, 2::2]))
            dets[:, 3::2] = np.maximum(0, np.minimum(h, dets[:, 3::2]))

            # parse detections with global NMS at 0.7 (top 300)
            # coordinates were normalized in [0..1]
            dets_all[:, 0:4 * K] *= resolution_array
            dets_all[:, 0:4 * K:2] = np.maximum(
                0, np.minimum(w, dets_all[:, 0:4 * K:2]))
            dets_all[:, 1:4 * K:2] = np.maximum(
                0, np.minimum(h, dets_all[:, 1:4 * K:2]))
            idx = nms_tubelets(
                np.concatenate(
                    (dets_all[:, :4 * K],
                     np.max(dets_all[:, 4 * K + 1:], axis=1)[:, None]),
                    axis=1), 0.7, 300)
            dets_all = dets_all[idx, :]

            # save file; os.makedirs replaces the former `mkdir -p` shell
            # call, which was not portable and broke on paths that need
            # shell quoting
            outdir = os.path.dirname(outfile)
            if not os.path.isdir(outdir):
                os.makedirs(outdir)

            with open(outfile, 'wb') as fid:
                pickle.dump((dets, dets_all), fid)
# Builds a TensorFlow graph that maps token sequences to a single sigmoid
# output: frozen pre-trained embeddings -> LSTM(50) -> Dense(1).
import tensorflow as tf
import numpy as np
from Dataset import GetDataset
from Embeddings import GetEmbeddings
from datetime import datetime

# Train / test datasets.
# NOTE(review): the meaning of the argument 10000 is defined by
# Dataset.GetDataset and is not visible here - confirm.
trainDataset, testDataset = GetDataset(10000)

# Feedable iterator: the string handle fed at run time selects which
# dataset iterator the graph reads from.
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(
    handle, trainDataset.output_types, trainDataset.output_shapes)
text, label, tokens = iterator.get_next()

# Concrete iterators; both have to be initialized before use.
trainIterator = trainDataset.make_initializable_iterator()
testIterator = testDataset.make_initializable_iterator()

# Pre-trained embedding matrix, one row per vocabulary entry.
embeddings = GetEmbeddings()
vocabSize, numberOfEmbeddings = embeddings.shape

# Frozen embedding lookup; mask_zero=True makes token id 0 a masked value.
# NOTE(review): the layer is declared with vocabSize + 1 rows while the
# initializer constant has vocabSize rows - confirm this is intended.
embeddedTokens = tf.keras.layers.Embedding(
    vocabSize + 1,
    numberOfEmbeddings,
    embeddings_initializer=tf.keras.initializers.Constant(embeddings),
    mask_zero=True,
    trainable=False)(tokens)

# Recurrent encoder: a single LSTM cell with 50 units wrapped in an RNN layer.
cell = tf.keras.layers.LSTMCell(50)
rnn = tf.keras.layers.RNN(cell)
semantics = rnn(embeddedTokens)

# One sigmoid unit on top of the encoder output.
prediction = tf.keras.layers.Dense(1, activation='sigmoid')(semantics)
def videoAP(dname, th=0.5, redo=False):
    """Compute and print the video-level average precision of each class.

    The tubes saved as ``<video>_tubes.pkl`` are filtered with a
    spatio-temporal NMS (0.3), then processed per class by decreasing score;
    a tube is a true positive when it matches a not-yet-matched ground-truth
    tube of the same video with spatio-temporal IoU >= th.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        th: IoU threshold for a tube to match a ground-truth tube.
        redo: when False and ``videoAP<th>.pkl`` already exists in the
            results directory, the cached precision-recall arrays are loaded.

    The result (``{label: array}`` with columns precision, recall) is pickled
    and the per-class AP and the mAP are printed as percentages.

    Exits the process (``sys.exit``) if a tube file is missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "videoAP{:g}.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()

        # load detections
        # alldets = for each label, list of tuples (v, score, tube)
        alldets = {ilabel: [] for ilabel in range(d.nlabels)}
        for v in vlist:
            tubename = os.path.join(dirname, v + '_tubes.pkl')
            if not os.path.isfile(tubename):
                print("ERROR: Missing extracted tubes " + tubename)
                sys.exit()

            with open(tubename, 'rb') as fid:
                tubes = pickle.load(fid)

            for ilabel in range(d.nlabels):
                ltubes = tubes[ilabel]  # list of (tube, score)
                idx = nms3dt(ltubes, 0.3)
                alldets[ilabel] += [(v, ltubes[i][1], ltubes[i][0]) for i in idx]

        # compute AP for each class
        res = {}
        for ilabel in range(d.nlabels):
            detections = alldets[ilabel]

            # load ground-truth
            gt = {}
            for v in vlist:
                tubes = d.gttubes(v)

                if ilabel not in tubes:
                    continue

                # Copy the list: matched tubes are deleted from it below, and
                # the object returned by d.gttubes must not be altered (it
                # may be shared with later calls).
                vtubes = list(tubes[ilabel])
                if vtubes:
                    gt[v] = vtubes

            # precision, recall
            pr = np.empty((len(detections) + 1, 2), dtype=np.float32)
            pr[0, 0] = 1.0
            pr[0, 1] = 0.0

            fn = sum([len(g) for g in gt.values()])  # false negatives
            npos = fn  # tp + fn stays constant during the sweep
            fp = 0  # false positives
            tp = 0  # true positives

            order = np.argsort(-np.array([dd[1] for dd in detections]))
            for i, j in enumerate(order):
                v, score, tube = detections[j]
                ispositive = False

                if v in gt:
                    ious = [iou3dt(g, tube) for g in gt[v]]
                    amax = np.argmax(ious)
                    if ious[amax] >= th:
                        ispositive = True
                        # a ground-truth tube can be matched only once
                        del gt[v][amax]
                        if len(gt[v]) == 0:
                            del gt[v]

                if ispositive:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                pr[i + 1, 0] = float(tp) / float(tp + fp)
                # a class without any ground truth has zero recall rather
                # than raising ZeroDivisionError
                pr[i + 1, 1] = float(tp) / float(npos) if npos > 0 else 0.0

            res[d.labels[ilabel]] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    ap = 100 * np.array([pr_to_ap(res[label]) for label in d.labels])
    # the header used to read "frameAP" although this is the video-level metric
    print("videoAP")
    for il, _ in enumerate(d.labels):
        print("{:20s} {:8.2f}".format('', ap[il]))
    print("{:20s} {:8.2f}".format("mAP", np.mean(ap)))
    print("")
def BuildTubes(dname, redo=False):
    """Link per-frame tubelets into action tubes and save them per video.

    For each test video and each label, tubelets are linked greedily over
    time: at every frame the current tubes (sorted by mean score) try to
    extend themselves with the best-scoring tubelet whose overlap is at
    least 0.2; a tube that found no match for 5 frames is closed. Tubes
    with mean score below 0.01 or shorter than 15 frames are dropped, and
    the remaining ones are built by averaging the overlapping tubelets.

    The result, a dict ``{ilabel: [(tube, score), ...]}`` where ``tube`` has
    the columns <frame> <x1> <y1> <x2> <y2> <score>, is pickled to
    ``<video>_tubes.pkl`` in the results directory.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        redo: when False, videos whose output file already exists are skipped.

    Exits the process (``sys.exit``) if a tubelet result file is missing.
    """
    def tubescore(tt):
        # mean of the label score (last column) over the tubelets of a tube
        return np.mean(np.array([tubelet[-1] for _, tubelet in tt]))

    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    vlist = d.test_vlist()

    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv + 1, len(vlist), v))

        outfile = os.path.join(dirname, v + "_tubes.pkl")
        if os.path.isfile(outfile) and not redo:
            continue

        RES = {}
        nframes = d.nframes(v)

        # load detected tubelets
        VDets = {}
        for startframe in range(1, nframes + 2 - K):
            resname = os.path.join(dirname, d.frame_format(v, startframe) + '.pkl')
            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()

            with open(resname, 'rb') as fid:
                _, VDets[startframe] = pickle.load(fid)

        for ilabel in range(d.nlabels):
            FINISHED_TUBES = []
            CURRENT_TUBES = []  # each tube is a list of tuples (frame, tubelet)

            # 4K coordinates followed by the score column of this label;
            # built with list(range(...)) so that the concatenation works on
            # both Python 2 and Python 3
            columns = list(range(4 * K)) + [4 * K + 1 + ilabel]

            for frame in range(1, nframes + 2 - K):
                # load boxes of the new frame and do nms while keeping the
                # highest scored ones
                ltubelets = VDets[frame][:, columns]  # Nx(4K+1) with (x1 y1 x2 y2)*K ilabel-score
                idx = nms_tubelets(ltubelets, 0.3, top_k=10)
                ltubelets = ltubelets[idx, :]

                # first frame: every tubelet starts a new tube
                if frame == 1:
                    for i in range(ltubelets.shape[0]):
                        CURRENT_TUBES.append([(1, ltubelets[i, :])])
                    continue

                # sort current tubes according to average score, so that the
                # best tubes pick their continuation first
                avgscore = [tubescore(t) for t in CURRENT_TUBES]
                argsort = np.argsort(-np.array(avgscore))
                CURRENT_TUBES = [CURRENT_TUBES[i] for i in argsort]

                # loop over tubes
                finished = []
                for it, t in enumerate(CURRENT_TUBES):
                    # compute ious between the last tubelet of t and ltubelets
                    last_frame, last_tubelet = t[-1]
                    offset = frame - last_frame
                    if offset < K:
                        # average the IoU over the frames both tubelets share
                        nov = K - offset
                        ious = sum([
                            iou2d(ltubelets[:, 4 * iov:4 * iov + 4],
                                  last_tubelet[4 * (iov + offset):4 * (iov + offset + 1)])
                            for iov in range(nov)
                        ]) / float(nov)
                    else:
                        # no shared frame: compare first new box with last old box
                        ious = iou2d(ltubelets[:, :4], last_tubelet[4 * K - 4:4 * K])

                    valid = np.where(ious >= 0.2)[0]

                    if valid.size > 0:
                        # take the one with maximum score and remove it from
                        # the pool so that no other tube can claim it
                        idx = valid[np.argmax(ltubelets[valid, -1])]
                        CURRENT_TUBES[it].append((frame, ltubelets[idx, :]))
                        ltubelets = np.delete(ltubelets, idx, axis=0)
                    elif offset >= 5:
                        # unmatched for too long: close the tube
                        finished.append(it)

                # move finished tubes; reverse order keeps the indices valid
                for it in finished[::-1]:
                    FINISHED_TUBES.append(CURRENT_TUBES[it][:])
                    del CURRENT_TUBES[it]

                # unclaimed tubelets start new tubes
                for i in range(ltubelets.shape[0]):
                    CURRENT_TUBES.append([(frame, ltubelets[i, :])])

            # tubes still open at the end of the video are kept as well
            FINISHED_TUBES += CURRENT_TUBES

            # build real tubes
            output = []
            for t in FINISHED_TUBES:
                score = tubescore(t)

                # drop low-scoring tubes
                if score < 0.01:
                    continue

                beginframe = t[0][0]
                endframe = t[-1][0] + K - 1
                length = endframe + 1 - beginframe

                # delete tubes with short duration
                if length < 15:
                    continue

                # build final tubes by averaging the tubelets
                out = np.zeros((length, 6), dtype=np.float32)
                out[:, 0] = np.arange(beginframe, endframe + 1)
                n_per_frame = np.zeros((length, 1), dtype=np.int32)
                for start, box in t:
                    for k in range(K):
                        row = start - beginframe + k
                        out[row, 1:5] += box[4 * k:4 * k + 4]
                        out[row, -1] += box[-1]
                        n_per_frame[row, 0] += 1
                out[:, 1:] /= n_per_frame
                output.append((out, score))

            RES[ilabel] = output

        with open(outfile, 'wb') as fid:
            pickle.dump(RES, fid)
def frameCLASSIF(dname, redo=False):
    """Compute and print the frame-level classification accuracy.

    For every ground-truth box, the class scores of all tubelets whose box
    at that frame overlaps the ground truth with IoU >= 0.7 are summed; the
    box counts as correct when the highest summed score belongs to the
    ground-truth label.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        redo: when False and ``frameCLASSIF.pkl`` already exists in the
            results directory, the cached accuracies are loaded.

    The saved value is a list with exactly one accuracy per label, in the
    order of ``d.labels``; labels without any ground-truth box get NaN and
    are ignored in the printed mean.

    Exits the process (``sys.exit``) if a tubelet result file is missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameCLASSIF.pkl")

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            CLASSIF = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        CORRECT = [0] * d.nlabels
        TOTAL = [0] * d.nlabels

        for v in vlist:
            nframes = d.nframes(v)

            # load all tubelets
            VDets = {}
            for startframe in range(1, nframes + 2 - K):
                resname = os.path.join(dirname, d.frame_format(v, startframe) + '.pkl')
                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()

                with open(resname, 'rb') as fid:
                    _, VDets[startframe] = pickle.load(fid)

            # iterate over ground-truth
            tubes = d.gttubes(v)
            for ilabel in tubes:
                for g in tubes[ilabel]:
                    for i in range(g.shape[0]):
                        frame = int(g[i, 0])

                        # just in case a tube is longer than the video
                        if frame > nframes:
                            continue

                        gtbox = g[i, 1:5]
                        scores = np.zeros((d.nlabels,), dtype=np.float32)

                        # accumulate the scores of every tubelet that covers
                        # this frame (start frames frame-K+1 .. frame)
                        first_sf = max(1, frame - K + 1)
                        last_sf = min(nframes - K + 1, frame)
                        for sf in range(first_sf, last_sf + 1):
                            pos = 4 * (frame - sf)  # box of this frame inside the tubelet
                            overlaps = iou2d(VDets[sf][:, pos:pos + 4], gtbox)
                            scores += np.sum(VDets[sf][overlaps >= 0.7, 4 * K + 1:], axis=0)

                        # check classif
                        if np.argmax(scores) == ilabel:
                            CORRECT[ilabel] += 1

                        TOTAL[ilabel] += 1

        print(TOTAL)
        print(CORRECT)

        # Keep one entry per label. Dropping the labels without ground truth
        # (as was done before) shifts the remaining entries, so CLASSIF[il]
        # no longer corresponds to d.labels[il] when printing below.
        CLASSIF = [
            float(CORRECT[ilabel]) / float(TOTAL[ilabel])
            if TOTAL[ilabel] != 0 else float('nan')
            for ilabel in range(d.nlabels)
        ]

        with open(eval_file, 'wb') as fid:
            pickle.dump(CLASSIF, fid)

    # print classif results
    for il, la in enumerate(d.labels):
        print("{:20s} {:6.2f}".format(la, 100 * CLASSIF[il]))

    # nanmean: average over the labels that actually have ground truth
    print("{:20s} {:6.2f}".format("CLASSIF", 100 * np.nanmean(np.array(CLASSIF))))
def frameAP(dname, th=0.5, redo=False):
    """Compute and print the frame-level average precision of each class.

    Detections of a class are processed by decreasing score; a detection is
    a true positive when it matches a not-yet-matched ground-truth box of the
    same frame with IoU >= th. For a class without any ground-truth box both
    precision and recall are set to 0.

    Args:
        dname: dataset name, forwarded to ``GetDataset``.
        th: IoU threshold for a detection to match a ground-truth box.
        redo: when False and ``frameAP<th>.pkl`` already exists in the
            results directory, the cached precision-recall arrays are loaded.

    The result (``{label: array}`` with columns precision, recall) is pickled
    and the per-class AP and the mAP are printed as percentages.
    """
    d = GetDataset(dname)
    dirname = os.path.join(
        os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameAP{:g}.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        print(vlist)

        # load per-frame detections
        alldets = load_frame_detections(d, vlist, dirname, 0.3)
        res = {}

        # compute AP for each class
        for ilabel, label in enumerate(d.labels):
            # detections of this class
            detections = alldets[alldets[:, 2] == ilabel, :]

            # load ground-truth of this class: (video index, frame) -> boxes
            gt = {}
            for iv, v in enumerate(vlist):
                tubes = d.gttubes(v)

                if ilabel not in tubes:
                    continue

                for tube in tubes[ilabel]:
                    for i in range(tube.shape[0]):
                        k = (iv, int(tube[i, 0]))
                        gt.setdefault(k, []).append(tube[i, 1:5].tolist())

            for k in gt:
                gt[k] = np.array(gt[k])

            # pr will be an array containing precision-recall values
            pr = np.empty((detections.shape[0] + 1, 2), dtype=np.float32)  # precision, recall
            pr[0, 0] = 1.0
            pr[0, 1] = 0.0
            fn = sum([g.shape[0] for g in gt.values()])  # false negatives
            fp = 0  # false positives
            tp = 0  # true positives

            for i, j in enumerate(np.argsort(-detections[:, 3])):
                k = (int(detections[j, 0]), int(detections[j, 1]))
                box = detections[j, 4:8]
                ispositive = False

                if k in gt:
                    ious = iou2d(gt[k], box)
                    amax = np.argmax(ious)

                    if ious[amax] >= th:
                        ispositive = True
                        # a ground-truth box can be matched only once
                        gt[k] = np.delete(gt[k], amax, 0)

                        if gt[k].size == 0:
                            del gt[k]

                if ispositive:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                # tp + fn is the number of ground-truth boxes of the class;
                # without any, recall is undefined and both values are zeroed
                if (tp + fn) > 0:
                    pr[i + 1, 0] = float(tp) / float(tp + fp)
                    pr[i + 1, 1] = float(tp) / float(tp + fn)
                else:
                    pr[i + 1, 0] = 0
                    pr[i + 1, 1] = 0

            res[label] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    ap = 100 * np.array([pr_to_ap(res[label]) for label in d.labels])
    # print() call instead of the print statement, which is a SyntaxError on
    # Python 3; the output is unchanged on Python 2
    print("frameAP")

    for il, _ in enumerate(d.labels):
        print("{:20s} {:8.2f}".format('', ap[il]))

    print("{:20s} {:8.2f}".format("mAP", np.mean(ap)))
    print("")
def extract_tubelets(dname, gpu=-1, redo=False):
    """Extract the tubelets for a given dataset.

    args:
        - dname: dataset name (example: 'JHMDB')
        - gpu (default -1): use gpu given in argument, or use cpu if -1
        - redo: whether or not to recompute already computed files

    Saves a pickle file for each starting frame. The file contains a tuple
    (dets, dets_all):
        - dets is a numpy array with 2+4*K columns containing the tubelets
          starting at this frame after per-class nms at 0.45 and thresholding
          the scores at 0.01; the columns are <label> <score> and then
          <x1> <y1> <x2> <y2> for each of the frames in the tubelet
        - dets_all contains the tubelets obtained after a global nms at 0.7
          and thresholding the scores at 0.01; it is a numpy array with
          4*K + L + 1 columns containing the coordinates of the tubelets and
          the scores for all labels

    The function prints a message and returns early when an RGB frame or a
    flow image cannot be read.

    note: this version is inefficient: it is better to estimate the per-frame
    features once
    """
    d = GetDataset(dname)

    if gpu >= 0:
        caffe.set_mode_gpu()
        caffe.set_device(gpu)

    model_dir = os.path.join(os.path.dirname(__file__), '../models/ACT-detector/', dname)
    output_dir = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)

    # load the RGB network
    rgb_proto = os.path.join(model_dir, "deploy_RGB.prototxt")
    rgb_model = os.path.join(model_dir, "../generated_AVA_iter_118662.caffemodel")
    net_rgb = caffe.Net(rgb_proto, caffe.TEST, weights=rgb_model)

    # load the FLOW5 network
    flo_proto = os.path.join(model_dir, "deploy_FLOW5.prototxt")
    flo_model = os.path.join(model_dir, "../generated_AVA_iter_59463.caffemodel")
    net_flo = caffe.Net(flo_proto, caffe.TEST, weights=flo_model)

    vlist = d.test_vlist()
    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv + 1, len(vlist), v))
        h, w = d.resolution(v)
        nframes = d.nframes(v)
        # flow images are looked up under the video name without extension
        flow_name = v.split(".")[0]

        # network output is normalized between 0,1 ; so we will multiply it by the following array
        resolution_array = np.array([w, h, w, h] * K, dtype=np.float32)

        # now process each starting frame
        for i in range(1, 1 + nframes - K + 1):
            outfile = os.path.join(output_dir, d.frame_format(v, i) + ".pkl")

            # skip if already computed
            if os.path.isfile(outfile) and not redo:
                continue

            # read the frames for the forward
            kwargs_rgb = {}
            kwargs_flo = {}

            # One capture per tubelet instead of one per frame; the finally
            # clause also releases it on the early-return paths below.
            cap = cv2.VideoCapture(d.vidfile(v, 0))
            try:
                for j in range(K):
                    # property id 1 selects the 0-based index of the next
                    # frame to decode (CAP_PROP_POS_FRAMES)
                    cap.set(1, i + j - 1)
                    im = cap.read()[1]
                    if im is None:
                        print("Image {:s} does not exist".format(d.imfile(v, i + j)))
                        return

                    imscale = cv2.resize(im, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR)
                    kwargs_rgb['data_stream' + str(j)] = np.transpose(imscale - MEAN, (2, 0, 1))[None, :, :, :]

                    # NFLOWS consecutive flow images, clamped to the last frame
                    flow_paths = [d.flowfile(flow_name, min(nframes, i + j + iflow))
                                  for iflow in range(NFLOWS)]
                    imf = [cv2.imread(path) for path in flow_paths]
                    # cv2.imread returns None for an unreadable file; each
                    # image has to be checked individually
                    missing = [path for path, flow in zip(flow_paths, imf) if flow is None]
                    if missing:
                        print("Flow image {:s} does not exist".format(missing[0]))
                        return

                    imscalef = [cv2.resize(flow, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR)
                                for flow in imf]
                    timscale = [np.transpose(flow - MEAN, (2, 0, 1))[None, :, :, :]
                                for flow in imscalef]
                    kwargs_flo['data_stream' + str(j) + 'flow'] = np.concatenate(timscale, axis=1)
            finally:
                cap.release()

            # compute rgb and flow scores
            # two forward passes: one for the rgb and one for the flow
            net_rgb.forward(end="mbox_conf_flatten", **kwargs_rgb)  # forward of rgb with confidence and regression
            net_flo.forward(end="mbox_conf_flatten", **kwargs_flo)  # forward of flow5 with confidence and regression

            # compute late fusion of rgb and flow scores (keep regression from rgb)
            # use net_rgb for standard detections, net_flo for having all boxes
            scores = 0.5 * (net_rgb.blobs['mbox_conf_flatten'].data + net_flo.blobs['mbox_conf_flatten'].data)
            net_rgb.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_loc'].data[...] = net_rgb.blobs['mbox_loc'].data

            # two forward passes, only for the last layer
            # dets is the detections after per-class NMS and thresholding (standard)
            # dets_all contains all the scores and regressions for all tubelets
            dets = net_rgb.forward(start='detection_out')['detection_out'][0, 0, :, 1:]
            dets_all = net_flo.forward(start='detection_out_full')['detection_out_full'][0, 0, :, 1:]

            # parse detections with per-class NMS
            # a single row filled with -1 is treated as "no detection"
            if dets.shape[0] == 1 and np.all(dets == -1):
                dets = np.empty((0, dets.shape[1]), dtype=np.float32)

            dets[:, 2:] *= resolution_array  # network output was normalized in [0..1]
            dets[:, 0] -= 1  # label 0 was background, come back to label in [0..nlabels-1]
            dets[:, 2::2] = np.maximum(0, np.minimum(w, dets[:, 2::2]))
            dets[:, 3::2] = np.maximum(0, np.minimum(h, dets[:, 3::2]))

            # parse detections with global NMS at 0.7 (top 300)
            # coordinates were normalized in [0..1]
            dets_all[:, 0:4*K] *= resolution_array
            dets_all[:, 0:4*K:2] = np.maximum(0, np.minimum(w, dets_all[:, 0:4*K:2]))
            dets_all[:, 1:4*K:2] = np.maximum(0, np.minimum(h, dets_all[:, 1:4*K:2]))
            idx = nms_tubelets(
                np.concatenate((dets_all[:, :4*K], np.max(dets_all[:, 4*K+1:], axis=1)[:, None]), axis=1),
                0.7, 300)
            dets_all = dets_all[idx, :]

            # save file
            outdir = os.path.dirname(outfile)
            if not os.path.isdir(outdir):
                os.makedirs(outdir)

            with open(outfile, 'wb') as fid:
                pickle.dump((dets, dets_all), fid)
def videoAP(dname, th=0.5, redo=False):
    """Compute and print the video-level average precision for each label.

    Detected tubes are read from ``<video>_tubes.pkl`` files in the dataset's
    result directory and filtered per label with ``nms3dt`` at 0.3. For each
    label, tubes are visited by decreasing score; a tube is a true positive
    when its ``iou3dt`` overlap with a not-yet-matched ground-truth tube of
    the same video is at least ``th``. The precision/recall curves are cached
    in ``videoAP<th>.pkl``.

    Args:
        dname: dataset name, passed to ``GetDataset``.
        th: overlap threshold for a tube to match a ground-truth tube.
        redo: when true, recompute even if a cached result exists.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "videoAP{:g}.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()

        # load detections
        # alldets = for each label index in 0..nlabels-1, list of tuples (v, score, tube)
        alldets = {ilabel: [] for ilabel in range(d.nlabels)}
        for v in vlist:
            tubename = os.path.join(dirname, v + '_tubes.pkl')
            if not os.path.isfile(tubename):
                print("ERROR: Missing extracted tubes " + tubename)
                sys.exit()

            with open(tubename, 'rb') as fid:
                det_tubes = pickle.load(fid)

            for ilabel in range(d.nlabels):
                ltubes = det_tubes[ilabel]
                idx = nms3dt(ltubes, 0.3)
                alldets[ilabel] += [(v, ltubes[i][1], ltubes[i][0]) for i in idx]

        # ground-truth tubes do not depend on the label: fetch them once
        video_gt = [(v, d.gttubes(v)) for v in vlist]

        # compute AP for each class
        res = {}
        for ilabel in range(d.nlabels):
            detections = alldets[ilabel]

            # ground-truth tubes of this class, per video
            gt = {}
            for v, gt_tubes in video_gt:
                if ilabel not in gt_tubes:
                    continue
                if len(gt_tubes[ilabel]) > 0:
                    # copy: matched tubes are deleted below and that must not
                    # alter the list object handed out by the dataset
                    gt[v] = list(gt_tubes[ilabel])

            # pr[i] = (precision, recall) after the i highest-scored tubes
            pr = np.empty((len(detections) + 1, 2), dtype=np.float32)
            pr[0, 0] = 1.0
            pr[0, 1] = 0.0

            fn = sum([len(g) for g in gt.values()])  # false negatives
            fp = 0  # false positives
            tp = 0  # true positives

            order = np.argsort(-np.array([dd[1] for dd in detections]))
            for i, j in enumerate(order):
                v, score, tube = detections[j]
                ispositive = False

                if v in gt:
                    ious = [iou3dt(g, tube) for g in gt[v]]
                    amax = np.argmax(ious)
                    if ious[amax] >= th:
                        ispositive = True
                        # a ground-truth tube can be matched only once
                        del gt[v][amax]
                        if len(gt[v]) == 0:
                            del gt[v]

                if ispositive:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                if (tp + fn) > 0:
                    pr[i + 1, 0] = float(tp) / float(tp + fp)
                    pr[i + 1, 1] = float(tp) / float(tp + fn)
                else:
                    # no ground truth at all for this class: recall is
                    # undefined, use the same convention as frameAP
                    pr[i + 1, 0] = 0
                    pr[i + 1, 1] = 0

            res[d.labels[ilabel]] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    ap = 100 * np.array([pr_to_ap(res[label]) for label in d.labels])
    print("videoAP")
    for il, la in enumerate(d.labels):
        print("{:20s} {:8.2f}".format(la, ap[il]))
    print("{:20s} {:8.2f}".format("mAP", np.mean(ap)))
    print("")
def load_frame_detections(d, vlist, dirname, nms):
    """Gather per-frame detections from the stored tubelets of several videos.

    Every tubelet file of every video in ``vlist`` is read, each tubelet is
    split into its K per-frame boxes, and a per-label 2D NMS (``nms2d`` with
    threshold ``nms``) is applied in each frame.

    Args:
        d: dataset object, or a dataset name that is passed to ``GetDataset``.
        vlist: list of videos to load.
        dirname: directory containing the per-start-frame tubelet pickles.
        nms: threshold handed to ``nms2d``.

    Returns:
        A 2D numpy array with one row per kept detection and the columns
        <video_index> <frame_index> <ilabel> <score> <x1> <y1> <x2> <y2>.
        When nothing is kept (or ``vlist`` is empty) an empty float32 array
        of shape (0, 8) is returned.
    """
    if isinstance(d, str):
        d = GetDataset(d)

    alldets = []  # one array per (video, frame) that has surviving detections

    for iv, v in enumerate(vlist):
        nframes = d.nframes(v)

        # aggregate the results for each frame
        # columns: x1, y1, x2, y2, score, ilabel
        vdets = {i: np.empty((0, 6), dtype=np.float32) for i in range(1, 1 + nframes)}

        # load results for each starting frame
        for i in range(1, 1 + nframes - K + 1):
            resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')
            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()

            with open(resname, 'rb') as fid:
                dets, _ = pickle.load(fid)

            if dets.size == 0:
                continue

            for k in range(K):
                # box of the k-th frame of the tubelet, then score and label
                cols = np.array([2 + 4 * k, 3 + 4 * k, 4 + 4 * k, 5 + 4 * k, 1, 0])
                vdets[i + k] = np.concatenate((vdets[i + k], dets[:, cols]), axis=0)

        # Perform NMS in each frame
        for i in vdets:
            frame_dets = vdets[i]
            idx = np.empty((0,), dtype=np.int32)

            for ilabel in range(d.nlabels):
                a = np.where(frame_dets[:, 5] == ilabel)[0]
                if a.size == 0:
                    continue
                # nms2d returns positions within the rows of this label
                idx = np.concatenate((idx, a[nms2d(frame_dets[a, :5], nms)]), axis=0)

            if idx.size == 0:
                continue

            alldets.append(np.concatenate((
                iv * np.ones((idx.size, 1), dtype=np.float32),
                i * np.ones((idx.size, 1), dtype=np.float32),
                frame_dets[idx, :][:, np.array([5, 4, 0, 1, 2, 3], dtype=np.int32)],
            ), axis=1))

    if not alldets:
        # np.concatenate refuses an empty sequence
        return np.empty((0, 8), dtype=np.float32)
    return np.concatenate(alldets, axis=0)
def setup(self, bottom, top):
    """Configure the layer from ``param_str`` and shape its top blobs.

    ``param_str`` must evaluate to a mapping holding at least
    'dataset_name' and 'K' (K > 0). Every optional key listed below is stored
    on the instance as ``_<key>``, falling back to its default when absent.
    The method then builds the list of valid (video, start frame) training
    tubelets, shuffles it, optionally fast-forwards to ``restart_iter`` and
    reshapes the K data tops plus one extra top of shape (1, 1, 1, 8).
    """
    # NOTE(review): eval executes whatever expression param_str contains;
    # only safe as long as the prototxt comes from a trusted source.
    params = eval(self.param_str)

    assert 'dataset_name' in params
    self._dataset = GetDataset(params['dataset_name'])

    assert 'K' in params
    self._K = params['K']
    assert self._K > 0

    # optional arguments and their defaults
    optional_defaults = {
        'rand_seed': 0,
        'shuffle': True,
        'batch_size': 32 // self._K,
        'mean_values': [104, 117, 123],
        'resize_height': 300,
        'resize_width': 300,
        'restart_iter': 0,
        'flow': False,
        'ninput': 1,
    }
    for key, fallback in optional_defaults.items():
        setattr(self, '_' + key, params.get(key, fallback))

    if self._ninput > 1 and not self._flow:
        raise NotImplementedError("ACT-detector: Not implemented: ninput > 1 with rgb frames")

    dataset = self._dataset
    seq_len = self._K

    # index of (video, start frame) pairs accepted by both tubelet filters
    self._indices = []
    for video in dataset.train_vlist():
        # flatten the per-label tube lists of this video into a single list
        video_tubes = sum(dataset.gttubes(video).values(), [])
        for start in range(1, dataset.nframes(video) + 2 - seq_len):
            if tubelet_in_out_tubes(video_tubes, start, seq_len) and tubelet_has_gt(video_tubes, start, seq_len):
                self._indices.append((video, start))

    self._nseqs = len(self._indices)
    self._iter = 0
    self._nshuffles = 0
    self.shuffle()

    if self._restart_iter > 0:
        # shuffle() is expected to leave the cursor at the start
        assert self._next == 0
        self._iter = self._restart_iter
        consumed = self._restart_iter * self._batch_size
        # replay one shuffle per full pass over the index
        while consumed > self._nseqs:
            self.shuffle()
            consumed -= self._nseqs
        self._next = consumed

    data_shape = (self._batch_size, 3 * self._ninput, self._resize_height, self._resize_width)
    for stream in range(seq_len):
        top[stream].reshape(*data_shape)
    # last top; presumably the ground-truth/label blob -- TODO confirm
    top[seq_len].reshape(1, 1, 1, 8)
def ACT_generate_prototxt(dname, K=6, flow=False):
    """Generate the train, deploy and solver prototxts for ACT-detector.

    The three files are written to ``models/ACT-detector/generated_<NAME>``
    (relative to the parent of this file's directory, ``<NAME>`` being the
    dataset's ``NAME`` attribute) as ``train_<MODE>.prototxt``,
    ``deploy_<MODE>.prototxt`` and ``solver_<MODE>.prototxt``, where
    ``<MODE>`` is 'FLOW5' or 'RGB'. The directory is created when missing.

    Args:
        dname: 'UCFSports', 'JHMDB', 'JHMDB2', 'JHMDB3', 'UCF101',
            'UCF101v2' or 'AVA'.
        K: length of the tubelet and input sequence. In ACT-detector K=6.
        flow: if true, then use modality = FLOW5; if false, then
            modality = RGB.

    Raises:
        Exception: when ``dname`` is not one of the datasets listed above.
    """
    ######################### Frame PARAMS #########################
    IMGSIZE = 300

    ######################### General PARAMS #########################
    modality_str = 'flow' if flow else ''
    mode_str = 'FLOW5' if flow else 'RGB'

    ######################### Dataset PARAMS #########################
    dd = GetDataset(dname)
    num_classes = dd.nlabels + 1  # +1 for background

    if dname == 'UCFSports':
        niter = 60000
        lr_steps = [40000, 55000]
    elif dname in ['JHMDB', 'JHMDB2', 'JHMDB3']:
        niter = 240000
        lr_steps = [160000, 220000]
    elif dname in ['UCF101', 'UCF101v2']:
        niter = 600000
        lr_steps = [400000, 550000]
    elif dname == 'AVA':
        niter = 240000
        lr_steps = [160000, 220000]
    else:
        raise Exception("Unknown dataset " + dname)

    ######################### Model PATHS #########################
    dirname = os.path.abspath(os.path.join(
        os.path.dirname(__file__), "..", "models", "ACT-detector", 'generated_' + dd.NAME))
    if not os.path.isdir(dirname):
        # no shell involved, so the path may contain spaces or metacharacters
        os.makedirs(dirname)

    deploy_net_file = "{}/deploy_{}.prototxt".format(dirname, mode_str)
    train_net_file = "{}/train_{}.prototxt".format(dirname, mode_str)
    solver_file = "{}/solver_{}.prototxt".format(dirname, mode_str)

    model_name = "ACTdetector_{}_{}".format(dname, mode_str)

    # The pretrained model.
    # NOTE(review): this path is computed but not used anywhere below.
    pretrain_model_file = os.path.join(
        dirname, "..", 'initialization_VGG_ILSVRC16_K{}_{}.caffemodel'.format(K, mode_str))

    ############ BATCH NORM PARAMS ######################
    # If true, use batch norm for all newly added layers.
    # Currently only the non batch norm version has been tested.
    use_batchnorm = False
    lr_mult = 1
    # Use different initial learning rate.
    if use_batchnorm:
        base_lr = 0.0004
    else:
        # A learning rate for batch_size = 1, num_gpus = 1.
        base_lr = 0.00004

    ############ MultiBoxLoss PARAMS ######################
    share_location = True
    background_label_id = 0
    train_on_diff_gt = True
    normalization_mode = P.Loss.VALID
    code_type = P.PriorBox.CENTER_SIZE
    ignore_cross_boundary_bbox = False
    mining_type = P.MultiBoxLoss.MAX_NEGATIVE
    neg_pos_ratio = 3.
    loc_weight = (neg_pos_ratio + 1.) / 4.
    multibox_loss_param = {
        'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1,
        'conf_loss_type': P.MultiBoxLoss.SOFTMAX,
        'loc_weight': loc_weight / float(K),
        'num_classes': num_classes,
        'share_location': share_location,
        'match_type': P.MultiBoxLoss.PER_PREDICTION,
        'overlap_threshold': 0.5,
        'use_prior_for_matching': True,
        'background_label_id': background_label_id,
        'use_difficult_gt': train_on_diff_gt,
        'neg_pos_ratio': neg_pos_ratio,
        'neg_overlap': 0.5,
        'code_type': code_type,
    }
    act_cuboid_loss_param = {
        'sequence_length': K,
    }

    multibox_loss_param['ignore_cross_boundary_bbox'] = ignore_cross_boundary_bbox
    multibox_loss_param['mining_type'] = mining_type

    loss_param = {
        'normalization': normalization_mode,
    }

    ############ PARAMS for generating PRIORS ######################
    # minimum dimension of input image
    min_dim = IMGSIZE
    mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']
    # in percent %
    min_ratio = 20
    max_ratio = 90
    step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2)))
    min_sizes = []
    max_sizes = []
    for ratio in range(min_ratio, max_ratio + 1, step):
        min_sizes.append(min_dim * ratio / 100.)
        max_sizes.append(min_dim * (ratio + step) / 100.)
    # extra, smaller prior scale prepended for the first source layer
    min_sizes = ([min_dim * 10 / 100.] + min_sizes)
    max_sizes = ([min_dim * 20 / 100.] + max_sizes)
    aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
    normalizations = [20, -1, -1, -1, -1, -1]
    steps = [8, 16, 32, 64, 100, 300]
    # variance used to encode/decode prior bboxes.
    if code_type == P.PriorBox.CENTER_SIZE:
        prior_variance = [0.1, 0.1, 0.2, 0.2]
    else:
        prior_variance = [0.1]
    flip = True
    clip = False

    ############# GPU & SOLVER PARAMS ######################
    # Defining which GPUs to use.
    gpulist = [0]
    num_gpus = len(gpulist)

    # Divide the mini-batch to different GPUs.
    batch_size = int(32 / K)
    accum_batch_size = batch_size
    iter_size = accum_batch_size / batch_size
    solver_mode = P.Solver.CPU
    device_id = 0
    batch_size_per_device = batch_size
    if num_gpus > 0:
        batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus))
        iter_size = int(math.ceil(float(accum_batch_size) / (batch_size_per_device * num_gpus)))
        solver_mode = P.Solver.GPU
        device_id = int(gpulist[0])

    if normalization_mode == P.Loss.NONE:
        base_lr /= batch_size_per_device
    elif normalization_mode == P.Loss.VALID:
        base_lr *= 25. / loc_weight
    elif normalization_mode == P.Loss.FULL:
        # Roughly there are 2000 prior bboxes per image.
        # TODO(weiliu89): Estimate the exact # of priors.
        base_lr *= 2000.

    # Which layers to freeze (no backward) during training.
    freeze_layers = []

    # NOTE(review): the base_lr and iter_size computed above are not used;
    # the solver below hard-codes 0.0001 and 1. Left untouched so that the
    # generated solver stays identical -- confirm which values are intended.
    solver_param = {
        # Train parameters
        'base_lr': 0.0001,
        'weight_decay': 0.0005,
        'lr_policy': "multistep",
        'stepvalue': lr_steps,
        'gamma': 0.1,
        'momentum': 0.9,
        'max_iter': niter,
        'snapshot': 10000,
        'display': 10,
        'average_loss': 10,
        'type': "SGD",
        'solver_mode': solver_mode,
        'device_id': device_id,
        'debug_info': False,
        'snapshot_after_train': True,
        'iter_size': 1,
    }

    # parameters for generating detection output.
    det_out_param = {
        'num_classes': num_classes,
        'share_location': share_location,
        'background_label_id': background_label_id,
        'nms_param': {'nms_threshold': 0.45, 'top_k': 400},
        'keep_top_k': 200,
        'confidence_threshold': 0.01,
        'code_type': code_type,
    }

    ######################### TRAIN PROTOTXT #########################
    net = caffe.NetSpec()

    top_datalayer = ACT_DataLayer(dname, K, batch_size, resize_height=IMGSIZE, resize_width=IMGSIZE,
                                  restart_iter=0, flow=flow, ninput=5 if flow else 1)
    # K data streams followed by the label blob
    assert len(top_datalayer) == K + 1
    for i in range(K):
        net['data_stream' + str(i) + modality_str] = top_datalayer[i]
    net['label'] = top_datalayer[K]

    ACT_VGGNetBody(net, from_layer='data', K=K, fully_conv=True, reduced=True, dilated=True,
                   dropout=False, freeze_layers=freeze_layers, m=modality_str,
                   lr_mult=1.0 / float(K))

    ACT_AddExtraLayers300(net, K, use_batchnorm, m=modality_str, lr_mult=lr_mult / float(K))

    mbox_layers = ACT_CreateCuboidHead(net, K, data_layer='data_stream0' + modality_str,
                                       from_layers=mbox_source_layers,
                                       use_batchnorm=use_batchnorm, min_sizes=min_sizes,
                                       max_sizes=max_sizes, aspect_ratios=aspect_ratios,
                                       steps=steps, normalizations=normalizations,
                                       num_classes=num_classes, share_location=share_location,
                                       flip=flip, clip=clip, prior_variance=prior_variance,
                                       kernel_size=3, pad=1, lr_mult=lr_mult, m=modality_str)

    name = "mbox_loss"
    mbox_layers.append(net.label)

    # CUBOID loss
    net[name] = L.ACTCuboidLoss(*mbox_layers, multibox_loss_param=multibox_loss_param,
                                act_cuboid_loss_param=act_cuboid_loss_param,
                                loss_param=loss_param,
                                include=dict(phase=caffe_pb2.Phase.Value('TRAIN')),
                                propagate_down=[True, True, False, False])

    # Saving ..
    with open(train_net_file, 'w') as f:
        print('name: "{}_train"'.format(model_name), file=f)
        print(net.to_proto(), file=f)

    ######################### DEPLOY PROTOTXT #########################
    net = caffe.NetSpec()
    # Fake data layer that we delete later, just to have the output existing as top
    top_datalayer = ACT_DataLayer(dname, K, batch_size, resize_height=IMGSIZE, resize_width=IMGSIZE,
                                  restart_iter=0, flow=flow, ninput=5 if flow else 1)
    assert len(top_datalayer) == K + 1
    for i in range(K):
        net['data_stream' + str(i) + modality_str] = top_datalayer[i]

    ACT_VGGNetBody(net, from_layer='data', K=K, fully_conv=True, reduced=True, dilated=True,
                   dropout=False, freeze_layers=freeze_layers, m=modality_str,
                   lr_mult=1.0 / float(K))

    ACT_AddExtraLayers300(net, K, use_batchnorm, m=modality_str, lr_mult=lr_mult / float(K))

    mbox_layers = ACT_CreateCuboidHead(net, K, data_layer='data_stream0' + modality_str,
                                       from_layers=mbox_source_layers,
                                       use_batchnorm=use_batchnorm, min_sizes=min_sizes,
                                       max_sizes=max_sizes, aspect_ratios=aspect_ratios,
                                       steps=steps, normalizations=normalizations,
                                       num_classes=num_classes, share_location=share_location,
                                       flip=flip, clip=clip, prior_variance=prior_variance,
                                       kernel_size=3, pad=1, lr_mult=lr_mult, m=modality_str)

    # turn the raw confidences into probabilities before detection output
    conf_name = "mbox_conf"
    if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX:
        reshape_name = "{}_reshape".format(conf_name)
        net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes]))
        softmax_name = "{}_softmax".format(conf_name)
        net[softmax_name] = L.Softmax(net[reshape_name], axis=2)
        flatten_name = "{}_flatten".format(conf_name)
        net[flatten_name] = L.Flatten(net[softmax_name], axis=1)
        mbox_layers[1] = net[flatten_name]
    elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC:
        sigmoid_name = "{}_sigmoid".format(conf_name)
        net[sigmoid_name] = L.Sigmoid(net[conf_name])
        mbox_layers[1] = net[sigmoid_name]

    # Detection output layer:
    # Saving detections for ACT-detector
    # -- The RGB stream saves boxes after per-class nms at 0.45 and thresholding scores
    # -- The flow stream saves all the regressed cuboids (with their scores)
    if modality_str == "":
        net.detection_out = L.ACTDetectionOutput(
            *mbox_layers,
            detection_output_param=det_out_param,
            act_detection_output_param={'sequence_length': K},
            include=dict(phase=caffe_pb2.Phase.Value('TEST')))
    else:
        net.detection_out_full = L.ACTDetectionOutput(
            *mbox_layers,
            detection_output_param=det_out_param,
            act_detection_output_param={'sequence_length': K, 'save_full': True},
            include=dict(phase=caffe_pb2.Phase.Value('TEST')))

    net_param = net.to_proto()
    # drop the fake data layer; the inputs are declared explicitly below
    del net_param.layer[0]
    net_param.name = '{}_deploy'.format(model_name)
    for stream in range(K):
        net_param.input.extend(['data_stream' + str(stream) + modality_str])
        net_param.input_shape.extend([
            caffe_pb2.BlobShape(dim=[1, 3 * (5 if flow else 1), IMGSIZE, IMGSIZE])])

    # Saving ..
    with open(deploy_net_file, 'w') as f:
        print(net_param, file=f)

    ######################### SOLVER PROTOTXT #########################
    solver = caffe_pb2.SolverParameter(
        train_net=train_net_file,
        snapshot_prefix=dirname,
        **solver_param)

    # Saving ..
    with open(solver_file, 'w') as f:
        print(solver, file=f)