def frameMABO(dname, redo=False):
    """Print the per-class and mean average best overlap (MABO) of the detections.

    For every ground-truth box of every test video, the best IoU with any box
    detected in the same frame (all labels, all scores) is recorded. The lists
    of best overlaps are cached in ``frameMABO.pkl`` inside the results
    directory of the dataset.

    args:
        - dname: dataset name, passed to GetDataset
        - redo: whether or not to recompute the overlaps when the cache exists

    Exits the process (sys.exit) when a tubelet file is missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameMABO.pkl")

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            BO = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        BO = {l: [] for l in d.labels}  # best overlap

        for v in vlist:
            gt = d.gttubes(v)
            nframes = d.nframes(v)

            # per-frame detections: the boxes of every tubelet covering the frame
            vdets = {i: np.empty((0, 4), dtype=np.float32) for i in range(1, 1 + nframes)}

            # load results for each chunk (one file per tubelet starting frame)
            for i in range(1, 1 + nframes - K + 1):
                resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')
                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()

                with open(resname, 'rb') as fid:
                    dets, _ = pickle.load(fid)

                # columns 0 and 1 are skipped; the k-th box of a tubelet is in
                # columns 2+4k .. 5+4k and belongs to frame i+k
                for k in range(K):
                    vdets[i + k] = np.concatenate((vdets[i + k], dets[:, 2 + 4 * k:6 + 4 * k]), axis=0)

            # for each frame
            for i in range(1, 1 + nframes):
                for ilabel in gt:
                    label = d.labels[ilabel]
                    for t in gt[ilabel]:
                        # the gt tube does not cover frame i
                        if i not in t[:, 0]:
                            continue

                        gtbox = t[t[:, 0] == i, 1:5]  # box of gt tube at frame i

                        if vdets[i].size == 0:  # we missed it
                            BO[label].append(0)
                            continue

                        ious = iou2d(vdets[i], gtbox)
                        BO[label].append(np.max(ious))

        # save file once, after all videos: writing it inside the loop would
        # leave a partial cache behind if the run is interrupted
        with open(eval_file, 'wb') as fid:
            pickle.dump(BO, fid)

    # print MABO results
    ABO = {la: 100 * np.mean(np.array(BO[la])) for la in d.labels}  # average best overlap

    for la in d.labels:
        print("{:20s} {:6.2f}".format(la, ABO[la]))

    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy would wrap as a single object instead of an array of numbers
    print("{:20s} {:6.2f}".format("MABO", np.mean(np.array(list(ABO.values())))))
def load_frame_detections(d, vlist, dirname, nms):
    """Turn the saved tubelet detections of some videos into per-frame detections.

    Every tubelet starting at frame i contributes one box to each of the frames
    i .. i+K-1. The boxes gathered in a frame are then filtered per label with
    nms2d.

    args:
        - d: dataset object, or a dataset name that is passed to GetDataset
        - vlist: list of videos to load
        - dirname: directory containing one pickle file per tubelet starting frame
        - nms: threshold forwarded to nms2d

    returns a numpy array with one row per kept detection and the columns
        <video_index> <frame_index> <ilabel> <score> <x1> <y1> <x2> <y2>
    where video_index is the position of the video in vlist. The array has
    shape (0, 8) when no detection is kept at all.

    Exits the process (sys.exit) when a tubelet file is missing.
    """
    if isinstance(d, str):
        d = GetDataset(d)

    alldets = []  # list of numpy array with <video_index> <frame_index> <ilabel> <score> <x1> <y1> <x2> <y2>
    for iv, v in enumerate(vlist):
        nframes = d.nframes(v)

        # aggregate the results for each frame
        vdets = {i: np.empty((0, 6), dtype=np.float32) for i in range(1, 1 + nframes)}  # x1, y1, x2, y2, score, ilabel

        # load results for each starting frame
        for i in range(1, 1 + nframes - K + 1):
            resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')

            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()

            with open(resname, 'rb') as fid:
                dets, _ = pickle.load(fid)

            if dets.size == 0:
                continue

            for k in range(K):
                # box k of the tubelet, followed by its score (column 1) and label (column 0)
                columns = np.array([2 + 4 * k, 3 + 4 * k, 4 + 4 * k, 5 + 4 * k, 1, 0])
                vdets[i + k] = np.concatenate((vdets[i + k], dets[:, columns]), axis=0)

        # Perform NMS in each frame
        for i in vdets:
            idx = np.empty((0,), dtype=np.int32)
            for ilabel in range(d.nlabels):
                a = np.where(vdets[i][:, 5] == ilabel)[0]

                if a.size == 0:
                    continue

                # nms2d is given the rows of this label only; its result is used
                # to index `a`, which maps back to rows of vdets[i]
                idx = np.concatenate((idx, a[nms2d(vdets[i][a, :5], nms)]), axis=0)

            if idx.size == 0:
                continue

            alldets.append(np.concatenate((iv * np.ones((idx.size, 1), dtype=np.float32), i * np.ones((idx.size, 1), dtype=np.float32), vdets[i][idx, :][:, np.array([5, 4, 0, 1, 2, 3], dtype=np.int32)]), axis=1))

    if not alldets:
        # np.concatenate raises on an empty list; return an empty result with
        # the documented column layout instead
        return np.empty((0, 8), dtype=np.float32)

    return np.concatenate(alldets, axis=0)
Example #3
0
def BuildTubes(dname, redo=False):
    """Link the extracted tubelets of every test video into action tubes.

    For each video and each label, tubelets of consecutive starting frames are
    greedily linked into tubes; the final boxes of a tube are the average of
    the tubelet boxes covering each frame. The result of a video is saved in
    ``<video>_tubes.pkl`` as a dictionary mapping each label index to a list of
    tuples (tube, score), where tube is an array with the columns
    <frame> <x1> <y1> <x2> <y2> <score>.

    args:
        - dname: dataset name, passed to GetDataset
        - redo: whether or not to recompute the videos that already have a file

    Exits the process (sys.exit) when a tubelet file is missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__),
                           '../results/ACT-detector/', dname)
    vlist = d.test_vlist()

    def tubescore(tt):
        # score of a tube: mean of the scores (last column) of its tubelets
        return np.mean(np.array([tubelet[-1] for _, tubelet in tt]))

    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv + 1, len(vlist), v))

        outfile = os.path.join(dirname, v + "_tubes.pkl")

        if os.path.isfile(outfile) and not redo:
            continue

        RES = {}
        nframes = d.nframes(v)

        # load detected tubelets (second element of each saved tuple)
        VDets = {}
        for startframe in range(1, nframes + 2 - K):
            resname = os.path.join(dirname,
                                   d.frame_format(v, startframe) + '.pkl')

            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()

            with open(resname, 'rb') as fid:
                _, VDets[startframe] = pickle.load(fid)

        for ilabel in range(d.nlabels):
            FINISHED_TUBES = []
            CURRENT_TUBES = []  # tubes is a list of tuple (frame, lstubelets)

            # the 4*K coordinates followed by the score of this label;
            # list(...) is needed because a Python 3 range cannot be
            # concatenated with a list
            columns = list(range(4 * K)) + [4 * K + 1 + ilabel]

            for frame in range(1, nframes + 2 - K):
                # load boxes of the new frame and do nms while keeping Nkeep highest scored
                ltubelets = VDets[frame][:, columns]  # Nx(4K+1) with (x1 y1 x2 y2)*K ilabel-score
                idx = nms_tubelets(ltubelets, 0.3, top_k=10)
                ltubelets = ltubelets[idx, :]

                # just start new tubes
                if frame == 1:
                    for i in range(ltubelets.shape[0]):
                        CURRENT_TUBES.append([(1, ltubelets[i, :])])
                    continue

                # sort current tubes according to average score
                avgscore = [tubescore(t) for t in CURRENT_TUBES]
                argsort = np.argsort(-np.array(avgscore))
                CURRENT_TUBES = [CURRENT_TUBES[i] for i in argsort]

                # loop over tubes
                finished = []
                for it, t in enumerate(CURRENT_TUBES):
                    # compute ious between the last box of t and ltubelets
                    last_frame, last_tubelet = t[-1]
                    offset = frame - last_frame
                    if offset < K:
                        # the two tubelets share nov frames: average the iou over them
                        nov = K - offset
                        ious = sum([
                            iou2d(
                                ltubelets[:, 4 * iov:4 * iov + 4],
                                last_tubelet[4 * (iov + offset):4 *
                                             (iov + offset + 1)])
                            for iov in range(nov)
                        ]) / float(nov)
                    else:
                        # no shared frame: compare the first box of the
                        # candidates with the last box of the tube
                        ious = iou2d(ltubelets[:, :4],
                                     last_tubelet[4 * K - 4:4 * K])

                    valid = np.where(ious >= 0.2)[0]

                    if valid.size > 0:
                        # take the one with maximum score
                        idx = valid[np.argmax(ltubelets[valid, -1])]
                        CURRENT_TUBES[it].append((frame, ltubelets[idx, :]))
                        # a tubelet can extend one tube only
                        ltubelets = np.delete(ltubelets, idx, axis=0)
                    else:
                        # no match: the tube is finished after 5 frames without extension
                        if offset >= 5:
                            finished.append(it)

                # finished tubes that are done
                # process in reverse order to delete them with the right index
                for it in reversed(finished):
                    FINISHED_TUBES.append(CURRENT_TUBES[it][:])
                    del CURRENT_TUBES[it]

                # start new tubes
                for i in range(ltubelets.shape[0]):
                    CURRENT_TUBES.append([(frame, ltubelets[i, :])])

            # all tubes are not finished
            FINISHED_TUBES += CURRENT_TUBES

            # build real tubes
            output = []
            for t in FINISHED_TUBES:
                score = tubescore(t)

                # delete tubes with a low score
                if score < 0.01:
                    continue

                beginframe = t[0][0]
                endframe = t[-1][0] + K - 1
                length = endframe + 1 - beginframe

                # delete tubes with short duration
                if length < 15:
                    continue

                # build final tubes by average the tubelets
                out = np.zeros((length, 6), dtype=np.float32)
                out[:, 0] = np.arange(beginframe, endframe + 1)
                n_per_frame = np.zeros((length, 1), dtype=np.int32)
                for frame, box in t:
                    for k in range(K):
                        out[frame - beginframe + k,
                            1:5] += box[4 * k:4 * k + 4]
                        out[frame - beginframe + k, -1] += box[-1]
                        n_per_frame[frame - beginframe + k, 0] += 1
                out[:, 1:] /= n_per_frame
                output.append((out, score))

            RES[ilabel] = output

        with open(outfile, 'wb') as fid:
            pickle.dump(RES, fid)
Example #4
0
def frameCLASSIF(dname, redo=False):
    """Print the per-class and mean frame classification accuracy.

    For every ground-truth box, the class scores of all tubelet boxes of the
    same frame that overlap it with IoU >= 0.7 are summed; the box counts as
    correctly classified when the highest summed score is the one of its
    ground-truth label. The per-class accuracies are cached in
    ``frameCLASSIF.pkl`` inside the results directory of the dataset.

    args:
        - dname: dataset name, passed to GetDataset
        - redo: whether or not to recompute the accuracies when the cache exists

    Exits the process (sys.exit) when a tubelet file is missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__),
                           '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameCLASSIF.pkl")

    cached = os.path.isfile(eval_file) and not redo
    if cached:
        with open(eval_file, 'rb') as fid:
            CLASSIF = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        n_correct = [0] * d.nlabels
        n_total = [0] * d.nlabels

        for v in vlist:
            nframes = d.nframes(v)
            last_start = nframes - K + 1  # last frame at which a tubelet starts

            # tubelets of the video, indexed by their starting frame
            tubelets = {}
            for startframe in range(1, last_start + 1):
                resname = os.path.join(dirname,
                                       d.frame_format(v, startframe) + '.pkl')

                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()

                with open(resname, 'rb') as fid:
                    _, tubelets[startframe] = pickle.load(fid)

            # go through every ground-truth box of the video
            gttubes = d.gttubes(v)
            for ilabel in gttubes:
                for g in gttubes[ilabel]:
                    for row in range(g.shape[0]):
                        frame = int(g[row, 0])

                        # a ground-truth tube may run past the end of the video
                        if frame > nframes:
                            continue

                        gtbox = g[row, 1:5]
                        scores = np.zeros((d.nlabels, ), dtype=np.float32)

                        # accumulate the scores of all tubelets covering this frame
                        first_sf = max(1, frame - K + 1)
                        final_sf = min(last_start, frame)
                        for sf in range(first_sf, final_sf + 1):
                            col = 4 * (frame - sf)  # box of this frame inside the tubelet
                            candidates = tubelets[sf]
                            overlaps = iou2d(candidates[:, col:col + 4], gtbox)
                            scores += np.sum(candidates[overlaps >= 0.7,
                                                        4 * K + 1:],
                                             axis=0)

                        # is the best scoring label the ground-truth one?
                        if np.argmax(scores) == ilabel:
                            n_correct[ilabel] += 1

                        n_total[ilabel] += 1

        CLASSIF = [
            float(good) / float(seen)
            for good, seen in zip(n_correct, n_total)
        ]

        with open(eval_file, 'wb') as fid:
            pickle.dump(CLASSIF, fid)

    # print classif results
    for il, la in enumerate(d.labels):
        print("{:20s} {:6.2f}".format(la, 100 * CLASSIF[il]))

    print("{:20s} {:6.2f}".format("CLASSIF", 100 * np.mean(np.array(CLASSIF))))
Example #5
0
def frameMABO(dname, redo=False):
    """Print the per-class and mean average best overlap (MABO) of the detections.

    For every ground-truth box of every test video, the best IoU with any box
    detected in the same frame (all labels, all scores) is recorded. The lists
    of best overlaps are cached in ``frameMABO.pkl`` inside the results
    directory of the dataset.

    args:
        - dname: dataset name, passed to GetDataset
        - redo: whether or not to recompute the overlaps when the cache exists

    Exits the process (sys.exit) when a tubelet file is missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__),
                           '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameMABO.pkl")

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            BO = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        BO = {l: [] for l in d.labels}  # best overlap

        for v in vlist:
            gt = d.gttubes(v)
            nframes = d.nframes(v)

            # per-frame detections: the boxes of every tubelet covering the frame
            vdets = {
                i: np.empty((0, 4), dtype=np.float32)
                for i in range(1, 1 + nframes)
            }

            # load results for each chunk (one file per tubelet starting frame)
            for i in range(1, 1 + nframes - K + 1):
                resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')
                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()

                with open(resname, 'rb') as fid:
                    dets, _ = pickle.load(fid)

                # columns 0 and 1 are skipped; the k-th box of a tubelet is in
                # columns 2+4k .. 5+4k and belongs to frame i+k
                for k in range(K):
                    vdets[i + k] = np.concatenate(
                        (vdets[i + k], dets[:, 2 + 4 * k:6 + 4 * k]), axis=0)

            # for each frame
            for i in range(1, 1 + nframes):
                for ilabel in gt:
                    label = d.labels[ilabel]
                    for t in gt[ilabel]:
                        # the gt tube does not cover frame i
                        if i not in t[:, 0]:
                            continue

                        gtbox = t[t[:, 0] == i,
                                  1:5]  # box of gt tube at frame i

                        if vdets[i].size == 0:  # we missed it
                            BO[label].append(0)
                            continue

                        ious = iou2d(vdets[i], gtbox)
                        BO[label].append(np.max(ious))

        # save file once, after all videos: writing it inside the loop would
        # leave a partial cache behind if the run is interrupted
        with open(eval_file, 'wb') as fid:
            pickle.dump(BO, fid)

    # print MABO results
    ABO = {la: 100 * np.mean(np.array(BO[la]))
           for la in d.labels}  # average best overlap

    for la in d.labels:
        print("{:20s} {:6.2f}".format(la, ABO[la]))

    # list(...) is required on Python 3, where dict.values() is a view that
    # numpy would wrap as a single object instead of an array of numbers
    print("{:20s} {:6.2f}".format("MABO",
                                  np.mean(np.array(list(ABO.values())))))
Example #6
0
def frameAP_error(dname, th=0.5, redo=False):
    """Print the frame-AP of each class together with a breakdown of the errors.

    Every false positive is assigned to one of four categories:
        - localization: the frame contains a ground-truth box of the class,
          but the detection does not match a remaining one with IoU >= th
        - classification: the frame only contains ground-truth of other
          classes and the detection overlaps one of them with IoU >= th
        - timing: the frame has no ground-truth, but the video contains the class
        - other: everything else
    The precision-recall and error curves are cached in
    ``frameAP<th>ErrorAnalysis.pkl`` inside the results directory.

    args:
        - dname: dataset name, passed to GetDataset
        - th: IoU threshold for a detection to match a ground-truth box
        - redo: whether or not to recompute the curves when the cache exists
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__),
                           '../results/ACT-detector/', dname)

    eval_file = os.path.join(dirname,
                             "frameAP{:g}ErrorAnalysis.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        # load per-frame detections
        alldets = load_frame_detections(d, vlist, dirname, 0.3)
        res = {}

        # compute AP for each class
        for ilabel, label in enumerate(d.labels):
            # detections of this class
            detections = alldets[alldets[:, 2] == ilabel, :]

            gt = {}  # (video index, frame) -> boxes of this class
            othergt = {}  # (video index, frame) -> boxes of the other classes
            labellist = {}  # video index -> labels present in the video

            for iv, v in enumerate(vlist):
                tubes = d.gttubes(v)
                # keyed by the video index, because it is looked up below with
                # the video index stored in the detections (keying it by the
                # video name made that lookup fail)
                labellist[iv] = set(tubes)

                for il in tubes:
                    for tube in tubes[il]:
                        for i in range(tube.shape[0]):
                            k = (iv, int(tube[i, 0]))
                            target = gt if il == ilabel else othergt
                            target.setdefault(k, []).append(
                                tube[i, 1:5].tolist())

            for k in gt:
                gt[k] = np.array(gt[k])
            for k in othergt:
                othergt[k] = np.array(othergt[k])

            # frames that contained ground-truth of this class before any
            # matching: boxes are removed from gt as they get matched
            frames_with_gt = frozenset(gt)

            # pr will be an array containing precision-recall values and 4 types of errors:
            # localization, classification, timing, others
            pr = np.empty((detections.shape[0] + 1, 6),
                          dtype=np.float32)  # precision, recall
            pr[0, 0] = 1.0
            pr[0, 1:] = 0.0
            fn = sum([g.shape[0] for g in gt.values()])  # false negatives
            fp = 0  # false positives
            tp = 0  # true positives
            EL = 0  # localization errors
            EC = 0  # classification error: overlap >=0.5 with an another object
            EO = 0  # other errors
            ET = 0  # timing error: the video contains the action but not at this frame

            # go through the detections by decreasing score
            for i, j in enumerate(np.argsort(-detections[:, 3])):
                k = (int(detections[j, 0]), int(detections[j, 1]))
                box = detections[j, 4:8]
                ispositive = False

                if k in frames_with_gt:
                    if k in gt:
                        ious = iou2d(gt[k], box)
                        amax = np.argmax(ious)

                    if k in gt and ious[amax] >= th:
                        ispositive = True
                        # a ground-truth box can be matched only once
                        gt[k] = np.delete(gt[k], amax, 0)
                        if gt[k].size == 0:
                            del gt[k]
                    else:
                        EL += 1
                elif k in othergt:
                    ious = iou2d(othergt[k], box)
                    if np.max(ious) >= th:
                        EC += 1
                    else:
                        EO += 1
                elif ilabel in labellist[k[0]]:
                    ET += 1
                else:
                    EO += 1

                if ispositive:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                pr[i + 1, 0] = float(tp) / float(tp + fp)
                pr[i + 1, 1] = float(tp) / float(tp + fn)
                pr[i + 1, 2] = float(EL) / float(tp + fp)
                pr[i + 1, 3] = float(EC) / float(tp + fp)
                pr[i + 1, 4] = float(ET) / float(tp + fp)
                pr[i + 1, 5] = float(EO) / float(tp + fp)

            res[label] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    AP = 100 * np.array(
        [pr_to_ap(res[label][:, [0, 1]]) for label in d.labels])
    # same computation with each error ratio taking the place of the precision
    othersap = [
        100 * np.array([pr_to_ap(res[label][:, [j, 1]]) for label in d.labels])
        for j in range(2, 6)
    ]

    EL = othersap[0]
    EC = othersap[1]
    ET = othersap[2]
    EO = othersap[3]
    EM = 100 - 100 * np.array([res[label][-1, 1] for label in d.labels
                               ])  # missed detections = 1 - recall

    LIST = [AP, EL, EC, ET, EO, EM]

    print("Error Analysis")

    print("")
    print("{:20s} {:8s} {:8s} {:8s} {:8s} {:8s} {:8s}".format(
        'label', '   AP   ', '  Loc.  ', '  Cls.  ', '  Time  ', ' Other ',
        ' missed '))
    print("")
    for il, label in enumerate(d.labels):
        print("{:20s} ".format(label) +
              " ".join(["{:8.2f}".format(L[il]) for L in LIST]))

    print("")
    print("{:20s} ".format("mean") +
          " ".join(["{:8.2f}".format(np.mean(L)) for L in LIST]))
    print("")
Example #7
0
def frameAP(dname, th=0.5, redo=False):
    """Print the frame-level average precision of each class and their mean.

    Detections are visited by decreasing score; a detection is a true positive
    when its IoU with a not yet matched ground-truth box of the same class and
    frame is at least th. The precision-recall curves are cached in
    ``frameAP<th>.pkl`` inside the results directory of the dataset.

    args:
        - dname: dataset name, passed to GetDataset
        - th: IoU threshold for a detection to match a ground-truth box
        - redo: whether or not to recompute the curves when the cache exists
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__),
                           '../results/ACT-detector/', dname)

    eval_file = os.path.join(dirname, "frameAP{:g}.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        # per-frame detections of all the videos
        alldets = load_frame_detections(d, vlist, dirname, 0.3)
        res = {}

        # one precision-recall curve per class
        for ilabel, label in enumerate(d.labels):
            class_dets = alldets[alldets[:, 2] == ilabel, :]

            # ground-truth boxes of this class, grouped by (video index, frame)
            boxes_by_frame = {}
            for iv, v in enumerate(vlist):
                tubes = d.gttubes(v)

                if ilabel in tubes:
                    for tube in tubes[ilabel]:
                        for row in range(tube.shape[0]):
                            key = (iv, int(tube[row, 0]))
                            boxes_by_frame.setdefault(key, []).append(
                                tube[row, 1:5].tolist())

            gt = {key: np.array(boxes)
                  for key, boxes in boxes_by_frame.items()}

            # row 0 is the starting point of the curve: precision 1, recall 0
            pr = np.empty((class_dets.shape[0] + 1, 2),
                          dtype=np.float32)  # precision,recall
            pr[0, 0] = 1.0
            pr[0, 1] = 0.0
            fn = sum(boxes.shape[0] for boxes in gt.values())  # false negatives
            fp = 0  # false positives
            tp = 0  # true positives

            ranking = np.argsort(-class_dets[:, 3])
            for rank, j in enumerate(ranking, 1):
                key = (int(class_dets[j, 0]), int(class_dets[j, 1]))
                box = class_dets[j, 4:8]
                matched = False

                if key in gt:
                    ious = iou2d(gt[key], box)
                    best = np.argmax(ious)

                    if ious[best] >= th:
                        matched = True
                        # a ground-truth box can be matched only once
                        remaining = np.delete(gt[key], best, 0)

                        if remaining.size == 0:
                            del gt[key]
                        else:
                            gt[key] = remaining

                if matched:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                pr[rank, 0] = float(tp) / float(tp + fp)
                pr[rank, 1] = float(tp) / float(tp + fn)

            res[label] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    ap = 100 * np.array([pr_to_ap(res[label]) for label in d.labels])
    print("frameAP")

    # one line per class, in the order of d.labels; the label column is left blank
    for class_ap in ap:
        print("{:20s} {:8.2f}".format('', class_ap))

    print("{:20s} {:8.2f}".format("mAP", np.mean(ap)))
    print("")
Example #8
0
def extract_tubelets(dname, gpu=-1, redo=False):
    """Extract the tubelets for a given dataset

    args:
        - dname: dataset name (example: 'JHMDB')
        - gpu (default -1): use gpu given in argument, or use cpu if -1
        - redo: whether or not to recompute already computed files

    save a pickle file for each frame
    the file contains a tuple (dets, dets_all)
        - dets is a numpy array with 2+4*K columns containing the tubelets starting at this frame after per-class nms at 0.45 and thresholding the scores at 0.01
          the columns are <label> <score> and then <x1> <y1> <x2> <y2> for each of the frame in the tubelet
        - dets_all contains the tubelets obtained after a global nms at 0.7 and thresholding the scores at 0.01
            it is a numpy array with 4*K + L + 1 containing the coordinates of the tubelets and the scores for all labels

    the function prints a message and returns early when an RGB or flow image cannot be read

    note: this version is inefficient: it is better to estimate the per-frame features once
    """
    d = GetDataset(dname)

    if gpu >= 0:
        caffe.set_mode_gpu()
        caffe.set_device(gpu)

    model_dir = os.path.join(os.path.dirname(__file__),
                             '../models/ACT-detector/', dname)
    output_dir = os.path.join(os.path.dirname(__file__),
                              '../results/ACT-detector/', dname)

    # load the RGB network
    rgb_proto = os.path.join(model_dir, "deploy_RGB.prototxt")
    rgb_model = os.path.join(model_dir, "RGB.caffemodel")
    net_rgb = caffe.Net(rgb_proto, caffe.TEST, weights=rgb_model)

    # load the FLOW5 network
    flo_proto = os.path.join(model_dir, "deploy_FLOW5.prototxt")
    flo_model = os.path.join(model_dir, "FLOW5.caffemodel")
    net_flo = caffe.Net(flo_proto, caffe.TEST, weights=flo_model)

    vlist = d.test_vlist()
    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv + 1, len(vlist), v))
        h, w = d.resolution(v)
        nframes = d.nframes(v)

        # network output is normalized between 0,1 ; so we will multiply it by the following array
        resolution_array = np.array([w, h, w, h] * K, dtype=np.float32)

        # now process each frame
        for i in range(1, 1 + nframes - K + 1):
            outfile = os.path.join(output_dir, d.frame_format(v, i) + ".pkl")

            # skip if already computed
            if os.path.isfile(outfile) and not redo:
                continue

            # read the frames for the forward
            kwargs_rgb = {}
            kwargs_flo = {}
            for j in range(K):
                im = cv2.imread(d.imfile(v, i + j))
                if im is None:
                    print("Image {:s} does not exist".format(d.imfile(
                        v, i + j)))
                    return
                imscale = cv2.resize(im, (IMGSIZE, IMGSIZE),
                                     interpolation=cv2.INTER_LINEAR)
                kwargs_rgb['data_stream' + str(j)] = np.transpose(
                    imscale - MEAN, (2, 0, 1))[None, :, :, :]

                # NFLOWS consecutive flow images, clamped to the last frame of the video
                flow_files = [
                    d.flowfile(v, min(nframes, i + j + iflow))
                    for iflow in range(NFLOWS)
                ]
                imf = [cv2.imread(flow_file) for flow_file in flow_files]
                # cv2.imread returns None for an unreadable file; each image
                # has to be tested on its own (`np.any(imf) is None` is never true)
                missing = [
                    flow_file for flow_file, flow in zip(flow_files, imf)
                    if flow is None
                ]
                if missing:
                    print("Flow image {:s} does not exist".format(missing[0]))
                    return
                imscalef = [
                    cv2.resize(flow, (IMGSIZE, IMGSIZE),
                               interpolation=cv2.INTER_LINEAR) for flow in imf
                ]
                timscale = [
                    np.transpose(flow - MEAN, (2, 0, 1))[None, :, :, :]
                    for flow in imscalef
                ]
                kwargs_flo['data_stream' + str(j) + 'flow'] = np.concatenate(
                    timscale, axis=1)

            # compute rgb and flow scores
            # two forward passes: one for the rgb and one for the flow
            net_rgb.forward(
                end="mbox_conf_flatten",
                **kwargs_rgb)  # forward of rgb with confidence and regression
            net_flo.forward(
                end="mbox_conf_flatten",
                **kwargs_flo)  # forward of flow5 with confidence and regression

            # compute late fusion of rgb and flow scores (keep regression from rgb)
            # use net_rgb for standard detections, net_flo for having all boxes
            scores = 0.5 * (net_rgb.blobs['mbox_conf_flatten'].data +
                            net_flo.blobs['mbox_conf_flatten'].data)
            net_rgb.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_loc'].data[
                ...] = net_rgb.blobs['mbox_loc'].data

            # two forward passes, only for the last layer
            # dets is the detections after per-class NMS and thresholding (standard)
            # dets_all contains all the scores and regressions for all tubelets
            dets = net_rgb.forward(
                start='detection_out')['detection_out'][0, 0, :, 1:]
            dets_all = net_flo.forward(
                start='detection_out_full')['detection_out_full'][0, 0, :, 1:]

            # parse detections with per-class NMS
            # a single row filled with -1 is treated as "no detection"
            if dets.shape[0] == 1 and np.all(dets == -1):
                dets = np.empty((0, dets.shape[1]), dtype=np.float32)

            # network output was normalized in [0..1]
            dets[:, 2:] *= resolution_array
            # label 0 was background, come back to label in [0..nlabels-1]
            dets[:, 0] -= 1
            # clip the boxes to the image
            dets[:, 2::2] = np.maximum(0, np.minimum(w, dets[:, 2::2]))
            dets[:, 3::2] = np.maximum(0, np.minimum(h, dets[:, 3::2]))

            # parse detections with global NMS at 0.7 (top 300)
            # coordinates were normalized in [0..1]
            dets_all[:, 0:4 * K] *= resolution_array
            dets_all[:, 0:4 * K:2] = np.maximum(
                0, np.minimum(w, dets_all[:, 0:4 * K:2]))
            dets_all[:, 1:4 * K:2] = np.maximum(
                0, np.minimum(h, dets_all[:, 1:4 * K:2]))
            # the nms uses the best score of each tubelet over the labels
            idx = nms_tubelets(
                np.concatenate(
                    (dets_all[:, :4 * K],
                     np.max(dets_all[:, 4 * K + 1:], axis=1)[:, None]),
                    axis=1), 0.7, 300)
            dets_all = dets_all[idx, :]

            # save file
            # create the output directory without going through a shell
            outdir = os.path.dirname(outfile)
            try:
                os.makedirs(outdir)
            except OSError:
                # already existing (possibly created concurrently) is fine
                if not os.path.isdir(outdir):
                    raise

            with open(outfile, 'wb') as fid:
                pickle.dump((dets, dets_all), fid)
Example #9
0
import tensorflow as tf
import numpy as np
from Dataset import GetDataset
from Embeddings import GetEmbeddings
from datetime import datetime

# Build the input pipeline and the model graph at import time.
# NOTE(review): GetDataset here comes from the local Dataset module and returns
# a (train, test) pair; the meaning of the 10000 argument is not visible here.
trainDataset, testDataset = GetDataset(10000)
# String handle fed at run time to choose which iterator supplies the batches.
handle = tf.placeholder(tf.string, shape=[])
iterator = tf.data.Iterator.from_string_handle(handle,
                                               trainDataset.output_types,
                                               trainDataset.output_shapes)
# Each element of the datasets unpacks into these three tensors.
text, label, tokens = iterator.get_next()

# One initializable iterator per split.
trainIterator = trainDataset.make_initializable_iterator()
testIterator = testDataset.make_initializable_iterator()

# Frozen embedding layer initialized from the embeddings returned by
# GetEmbeddings; mask_zero=True makes token id 0 a masked (padding) value.
# NOTE(review): the layer is declared with vocabSize + 1 rows while the
# initializer receives an array with vocabSize rows - confirm the shapes agree.
embeddings = GetEmbeddings()
vocabSize, numberOfEmbeddings = embeddings.shape
embeddedTokens = tf.keras.layers.Embedding(
    vocabSize + 1,
    numberOfEmbeddings,
    embeddings_initializer=tf.keras.initializers.Constant(embeddings),
    mask_zero=True,
    trainable=False)(tokens)

# LSTM with 50 units run over the embedded token sequence.
cell = tf.keras.layers.LSTMCell(50)
rnn = tf.keras.layers.RNN(cell)
semantics = rnn(embeddedTokens)

# Single sigmoid output computed from the LSTM output.
prediction = tf.keras.layers.Dense(1, activation='sigmoid')(semantics)
def videoAP(dname, th=0.5, redo=False):
    """Compute and print the video-level average precision (video-AP).

    Args:
        dname: dataset name, passed to GetDataset.
        th: minimum ``iou3dt`` overlap between a detected tube and a
            ground-truth tube for the detection to be a true positive.
        redo: if True, recompute the curves even when the cached result
            file ``videoAP<th>.pkl`` already exists.

    The per-class precision/recall arrays are pickled in the results
    directory of the dataset; per-class AP and the mAP are printed.
    The process exits if the tubes of a test video have not been built.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "videoAP{:g}.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()

        # load detections
        # alldets = for each label index, list of tuples (v, score, tube)
        alldets = {ilabel: [] for ilabel in range(d.nlabels)}
        for v in vlist:
            tubename = os.path.join(dirname, v + '_tubes.pkl')
            if not os.path.isfile(tubename):
                print("ERROR: Missing extracted tubes " + tubename)
                sys.exit()

            with open(tubename, 'rb') as fid:
                tubes = pickle.load(fid)

            for ilabel in range(d.nlabels):
                ltubes = tubes[ilabel]
                # spatio-temporal NMS between the tubes of the same class
                idx = nms3dt(ltubes, 0.3)
                alldets[ilabel] += [(v, ltubes[i][1], ltubes[i][0]) for i in idx]

        # compute AP for each class
        res = {}
        for ilabel in range(d.nlabels):
            detections = alldets[ilabel]

            # load ground-truth: video -> list of tubes not matched yet
            gt = {}
            for v in vlist:
                tubes = d.gttubes(v)

                if ilabel not in tubes:
                    continue

                # Copy the list: matched tubes are deleted below and the
                # container returned by the dataset must stay untouched.
                vtubes = list(tubes[ilabel])
                if vtubes:
                    gt[v] = vtubes

            # precision,recall
            pr = np.empty((len(detections) + 1, 2), dtype=np.float32)
            pr[0, 0] = 1.0
            pr[0, 1] = 0.0

            fn = sum([len(g) for g in gt.values()])  # false negatives
            fp = 0  # false positives
            tp = 0  # true positives

            # visit the detections by decreasing score
            order = np.argsort(-np.array([dd[1] for dd in detections]))
            for i, j in enumerate(order):
                v, score, tube = detections[j]
                ispositive = False

                if v in gt:
                    ious = [iou3dt(g, tube) for g in gt[v]]
                    amax = np.argmax(ious)
                    if ious[amax] >= th:
                        ispositive = True
                        # each ground-truth tube can be matched only once
                        del gt[v][amax]
                        if len(gt[v]) == 0:
                            del gt[v]

                if ispositive:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                # tp + fn is the number of ground-truth tubes of the class;
                # when it is zero recall is undefined, so store zeros (same
                # convention as frameAP) instead of dividing by zero.
                if tp + fn > 0:
                    pr[i + 1, 0] = float(tp) / float(tp + fp)
                    pr[i + 1, 1] = float(tp) / float(tp + fn)
                else:
                    pr[i + 1, 0] = 0
                    pr[i + 1, 1] = 0

            res[d.labels[ilabel]] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    ap = 100 * np.array([pr_to_ap(res[label]) for label in d.labels])
    print("videoAP")
    for il, _ in enumerate(d.labels):
        print("{:20s} {:8.2f}".format('', ap[il]))

    print("{:20s} {:8.2f}".format("mAP", np.mean(ap)))
    print("")
def BuildTubes(dname, redo=False):
    """Link the tubelets of every test video into tubes and save them.

    For each test video, the tubelets extracted at each starting frame are
    greedily linked over time, independently for each label. The result is
    pickled to ``<v>_tubes.pkl`` as a dict mapping each label index to a list
    of ``(tube, score)`` tuples, where ``tube`` is a float32 array with one
    row per frame and 6 columns: frame number, the 4 box coordinates and the
    score, the last five being averaged over the tubelets covering the frame.

    args:
        - dname: dataset name, passed to GetDataset
        - redo: whether or not to rebuild tubes whose output file exists

    The process exits if the tubelets of a starting frame are missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join( os.path.dirname(__file__), '../results/ACT-detector/', dname)
    vlist = d.test_vlist()

    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv + 1, len(vlist), v))

        outfile = os.path.join(dirname, v + "_tubes.pkl")
        
        # skip videos that were already processed
        if os.path.isfile(outfile) and not redo:
            continue
        
        RES = {}
        nframes = d.nframes(v)
        
        # load detected tubelets
        # VDets[startframe] is the second element (dets_all) of the pickled tuple
        VDets = {}
        for startframe in xrange(1, nframes + 2 - K):
            resname = os.path.join(dirname, d.frame_format(v, startframe) + '.pkl')
            
            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()
            
            with open(resname, 'rb') as fid:
                _, VDets[startframe] = pickle.load(fid)
        
        for ilabel in xrange(d.nlabels):
            FINISHED_TUBES = []
            CURRENT_TUBES = [] # tubes is a list of tuple (frame, lstubelets)

            # score of a tube = mean of the scores (last column) of its tubelets
            def tubescore(tt):
                return np.mean(np.array([tt[i][1][-1] for i in xrange(len(tt))]))

            for frame in xrange(1, d.nframes(v) + 2 - K):
                # load boxes of the new frame and do nms while keeping Nkeep highest scored
                # NOTE: `range(4*K) + [...]` is a list concatenation, which relies on
                # Python 2 where range() returns a list
                ltubelets = VDets[frame][:,range(4*K) + [4*K + 1 + ilabel]] # Nx(4K+1) with (x1 y1 x2 y2)*K ilabel-score
                idx = nms_tubelets(ltubelets, 0.3, top_k=10)
                ltubelets = ltubelets[idx,:]
                
                # just start new tubes
                if frame == 1:
                    for i in xrange(ltubelets.shape[0]):
                        CURRENT_TUBES.append( [(1,ltubelets[i,:])] )
                    continue

                # sort current tubes according to average score
                # (the best tubes get to pick their continuation first)
                avgscore = [tubescore(t) for t in CURRENT_TUBES ]
                argsort = np.argsort(-np.array(avgscore))
                CURRENT_TUBES = [CURRENT_TUBES[i] for i in argsort]
                
                # loop over tubes
                finished = []
                for it, t in enumerate(CURRENT_TUBES):
                    # compute ious between the last box of t and ltubelets
                    last_frame, last_tubelet = t[-1]
                    ious = []
                    # number of frames between the start of the last linked tubelet and the current frame
                    offset = frame - last_frame
                    if offset < K:
                        # both tubelets share nov frames: average the per-frame IoU over them
                        nov = K - offset
                        ious = sum([iou2d(ltubelets[:, 4*iov:4*iov+4], last_tubelet[4*(iov+offset):4*(iov+offset+1)]) for iov in xrange(nov)])/float(nov)
                    else:
                        # no common frame: compare the first box of the candidates with the last box of the tube
                        ious = iou2d(ltubelets[:, :4], last_tubelet[4*K-4:4*K])
                    
                    # candidates overlapping enough with the end of the tube
                    valid = np.where(ious >= 0.2)[0]
                    
                    if valid.size>0:
                        # take the one with maximum score
                        idx = valid[ np.argmax(ltubelets[valid, -1])]
                        CURRENT_TUBES[it].append((frame, ltubelets[idx,:]))
                        # a tubelet can extend only one tube
                        ltubelets = np.delete(ltubelets, idx, axis=0)
                    else:
                        # skip
                        # the tube is terminated once its last linked tubelet starts 5 or more frames back
                        if offset>=5:
                            finished.append(it)

                # finished tubes that are done
                for it in finished[::-1]: # process in reverse order to delete them with the right index
                    FINISHED_TUBES.append( CURRENT_TUBES[it][:])
                    del CURRENT_TUBES[it]
                
                # start new tubes
                # (from the tubelets that did not extend any existing tube)
                for i in xrange(ltubelets.shape[0]):
                    CURRENT_TUBES.append([(frame,ltubelets[i,:])])

            # the tubes still open at the end of the video are kept as well
            FINISHED_TUBES += CURRENT_TUBES

            # build real tubes
            output = []
            for t in FINISHED_TUBES:
                score = tubescore(t)
                
                # discard tubes with a low average score
                if score< 0.01:
                    continue
                
                # the last tubelet starts at t[-1][0] and covers K frames
                beginframe = t[0][0]
                endframe = t[-1][0]+K-1
                length = endframe+1-beginframe
                
                # delete tubes with short duration
                if length < 15:
                    continue

                # build final tubes by average the tubelets
                # columns of out: frame number, 4 box coordinates, score
                out = np.zeros((length, 6), dtype=np.float32)
                out[:, 0] = np.arange(beginframe,endframe+1)
                # number of tubelets covering each frame of the tube
                n_per_frame = np.zeros((length, 1), dtype=np.int32)
                for i in xrange(len(t)):
                    frame, box = t[i]
                    for k in xrange(K):
                        out[frame-beginframe+k, 1:5] += box[4*k:4*k+4]
                        out[frame-beginframe+k, -1] += box[-1]
                        n_per_frame[frame-beginframe+k ,0] += 1
                # turn the accumulated sums into per-frame averages
                out[:,1:] /= n_per_frame
                output.append((out, score))

            RES[ilabel] = output
        
        with open(outfile, 'wb') as fid:
            pickle.dump(RES, fid)
def frameCLASSIF(dname, redo=False):
    """Compute and print the per-class frame classification accuracy.

    For every ground-truth box, the class scores of all the tubelet boxes
    overlapping it (IoU >= 0.7) in the same frame are summed; the box is
    correctly classified when the best-scoring class is its label.

    Args:
        dname: dataset name, passed to GetDataset.
        redo: if True, recompute even when ``frameCLASSIF.pkl`` exists.

    The cached result is a list with one accuracy per label index; labels
    without any ground-truth box are stored as NaN and are ignored in the
    printed average. The process exits if extracted tubelets are missing.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "frameCLASSIF.pkl")

    CLASSIF = None
    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            CLASSIF = pickle.load(fid)

        # Caches written by older versions dropped the labels without
        # samples, so their entries cannot be mapped back to labels.
        if len(CLASSIF) != d.nlabels:
            CLASSIF = None

    if CLASSIF is None:
        vlist = d.test_vlist()
        CORRECT = [0 for ilabel in range(d.nlabels)]
        TOTAL = [0 for ilabel in range(d.nlabels)]

        for v in vlist:
            nframes = d.nframes(v)

            # load all tubelets (dets_all of each starting frame)
            VDets = {}
            for startframe in range(1, nframes + 2 - K):
                resname = os.path.join(dirname, d.frame_format(v, startframe) + '.pkl')

                if not os.path.isfile(resname):
                    print("ERROR: Missing extracted tubelets " + resname)
                    sys.exit()

                with open(resname, 'rb') as fid:
                    _, VDets[startframe] = pickle.load(fid)

            # iterate over ground-truth
            tubes = d.gttubes(v)
            for ilabel in tubes:
                for g in tubes[ilabel]:
                    for i in range(g.shape[0]):
                        frame = int(g[i, 0])

                        # just in case a tube is longer than the video
                        if frame > nframes:
                            continue

                        gtbox = g[i, 1:5]
                        scores = np.zeros((d.nlabels,), dtype=np.float32)

                        # accumulate the scores over all the tubelets that
                        # contain this frame (up to K starting frames)
                        for sf in range(max(1, frame - K + 1), min(nframes - K + 1, frame) + 1):
                            overlaps = iou2d(VDets[sf][:, 4*(frame-sf):4*(frame-sf)+4], gtbox)
                            scores += np.sum(VDets[sf][overlaps >= 0.7, 4*K + 1:], axis=0)

                        # check classif
                        if np.argmax(scores) == ilabel:
                            CORRECT[ilabel] += 1

                        TOTAL[ilabel] += 1
        print(TOTAL)
        print(CORRECT)

        # Keep one entry per label so that CLASSIF[il] always refers to
        # d.labels[il]; labels without samples get NaN.
        CLASSIF = [
            float(CORRECT[ilabel]) / float(TOTAL[ilabel]) if TOTAL[ilabel] != 0 else float('nan')
            for ilabel in range(d.nlabels)
        ]

        with open(eval_file, 'wb') as fid:
            pickle.dump(CLASSIF, fid)

    # print classif results
    for il, la in enumerate(d.labels):
        print("{:20s} {:6.2f}".format(la, 100*CLASSIF[il]))

    # average over the labels that have at least one sample
    print("{:20s} {:6.2f}".format("CLASSIF", 100*np.nanmean(np.array(CLASSIF))))
def frameAP(dname, th=0.5, redo=False):
    """Compute and print the frame-level average precision (frame-AP).

    Args:
        dname: dataset name, passed to GetDataset.
        th: minimum ``iou2d`` overlap between a detected box and a
            ground-truth box for the detection to be a true positive.
        redo: if True, recompute the curves even when the cached result
            file ``frameAP<th>.pkl`` already exists.

    The per-class precision/recall arrays are pickled in the results
    directory of the dataset; per-class AP and the mAP are printed.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)

    eval_file = os.path.join(dirname, "frameAP{:g}.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()
        print(vlist)
        # load per-frame detections (per-class NMS at 0.3 inside each frame)
        alldets = load_frame_detections(d, vlist, dirname, 0.3)
        res = {}

        # compute AP for each class
        for ilabel, label in enumerate(d.labels):
            # detections of this class (column 2 holds the label index)
            detections = alldets[alldets[:, 2] == ilabel, :]

            # load ground-truth of this class:
            # (video index, frame) -> boxes not matched yet
            gt = {}
            for iv, v in enumerate(vlist):
                tubes = d.gttubes(v)

                if ilabel not in tubes:
                    continue

                for tube in tubes[ilabel]:
                    for i in range(tube.shape[0]):
                        k = (iv, int(tube[i, 0]))
                        gt.setdefault(k, []).append(tube[i, 1:5].tolist())

            for k in gt:
                gt[k] = np.array(gt[k])

            # pr will be an array containing precision-recall values
            pr = np.empty((detections.shape[0] + 1, 2), dtype=np.float32)  # precision,recall
            pr[0, 0] = 1.0
            pr[0, 1] = 0.0
            fn = sum([g.shape[0] for g in gt.values()])  # false negatives
            fp = 0  # false positives
            tp = 0  # true positives

            # visit the detections by decreasing score (column 3)
            for i, j in enumerate(np.argsort(-detections[:, 3])):
                k = (int(detections[j, 0]), int(detections[j, 1]))
                box = detections[j, 4:8]
                ispositive = False

                if k in gt:
                    ious = iou2d(gt[k], box)
                    amax = np.argmax(ious)

                    if ious[amax] >= th:
                        ispositive = True
                        # each ground-truth box can be matched only once
                        gt[k] = np.delete(gt[k], amax, 0)

                        if gt[k].size == 0:
                            del gt[k]

                if ispositive:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                # tp + fn is the number of ground-truth boxes of the class;
                # store zeros when the class has none
                if tp + fn > 0:
                    pr[i+1, 0] = float(tp) / float(tp + fp)
                    pr[i+1, 1] = float(tp) / float(tp + fn)
                else:
                    pr[i+1, 0] = 0
                    pr[i+1, 1] = 0
            res[label] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    ap = 100*np.array([pr_to_ap(res[label]) for label in d.labels])
    print("frameAP")

    for il, _ in enumerate(d.labels):
        print("{:20s} {:8.2f}".format('', ap[il]))

    print("{:20s} {:8.2f}".format("mAP", np.mean(ap)))
    print("")
def extract_tubelets(dname, gpu=-1, redo=False):
    """Extract the tubelets for a given dataset

    args:
        - dname: dataset name (example: 'JHMDB')
        - gpu (default -1): use gpu given in argument, or use cpu if -1
        - redo: whether or not to recompute already computed files

    save a pickle file for each frame
    the file contains a tuple (dets, dets_all)
        - dets is a numpy array with 2+4*K columns containing the tubelets starting at this frame after per-class nms at 0.45 and thresholding the scores at 0.01
          the columns are <label> <score> and then <x1> <y1> <x2> <y2> for each of the frame in the tubelet
        - dets_all contains the tubelets obtained after a global nms at 0.7 and thresholding the scores at 0.01
            it is a numpy array with 4*K + L + 1 containing the coordinates of the tubelets and the scores for all labels

    the extraction stops (the function returns) as soon as an RGB frame or a flow image cannot be read

    note: this version is inefficient: it is better to estimate the per-frame features once
    """
    d = GetDataset(dname)

    if gpu >= 0:
        caffe.set_mode_gpu()
        caffe.set_device(gpu)

    model_dir = os.path.join(os.path.dirname(__file__), '../models/ACT-detector/', dname)
    output_dir = os.path.join(os.path.dirname(__file__), '../results/ACT-detector/', dname)

    # load the RGB network
    rgb_proto = os.path.join(model_dir, "deploy_RGB.prototxt")
    rgb_model = os.path.join(model_dir, "../generated_AVA_iter_118662.caffemodel")
    net_rgb = caffe.Net(rgb_proto, caffe.TEST, weights=rgb_model)

    # load the FLOW5 network
    flo_proto = os.path.join(model_dir, "deploy_FLOW5.prototxt")
    flo_model = os.path.join(model_dir, "../generated_AVA_iter_59463.caffemodel")
    net_flo = caffe.Net(flo_proto, caffe.TEST, weights=flo_model)

    vlist = d.test_vlist()
    for iv, v in enumerate(vlist):
        print("Processing video {:d}/{:d}: {:s}".format(iv+1, len(vlist), v))
        h, w = d.resolution(v)

        # network output is normalized between 0,1 ; so we will multiply it by the following array
        resolution_array = np.array([w, h, w, h]*K, dtype=np.float32)

        # now process each frame
        for i in range(1, 1 + d.nframes(v) - K + 1):
            outfile = os.path.join(output_dir, d.frame_format(v, i) + ".pkl")

            # skip if already computed
            if os.path.isfile(outfile) and not redo:
                continue

            # read the frames for the forward
            kwargs_rgb = {}
            kwargs_flo = {}
            for j in range(K):
                # the RGB frame is decoded from the video file; property 1
                # (CAP_PROP_POS_FRAMES in OpenCV) is the 0-based index of the
                # next frame to decode, hence the -1 on the 1-based frame number
                cap = cv2.VideoCapture(d.vidfile(v, 0))
                cap.set(1, i + j - 1)
                im = cap.read()[1]
                cap.release()
                if im is None:
                    print("Image {:s} does not exist".format(d.imfile(v, i+j)))
                    return
                imscale = cv2.resize(im, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR)
                kwargs_rgb['data_stream' + str(j)] = np.transpose(imscale-MEAN, (2, 0, 1))[None, :, :, :]

                # stack of NFLOWS consecutive flow images, clamped to the last frame of the video
                flow_name = v.split(".")[0]
                imf = [cv2.imread(d.flowfile(flow_name, min(d.nframes(v), i + j + iflow))) for iflow in range(NFLOWS)]
                # cv2.imread returns None for an unreadable file; test every
                # image (np.any(imf) is never None, so that test could not fire)
                if any(flow_im is None for flow_im in imf):
                    print("Flow image {:s} does not exist".format(d.flowfile(v, i+j)))
                    return
                imscalef = [cv2.resize(flow_im, (IMGSIZE, IMGSIZE), interpolation=cv2.INTER_LINEAR) for flow_im in imf]
                timscale = [np.transpose(flow_im-MEAN, (2, 0, 1))[None, :, :, :] for flow_im in imscalef]
                kwargs_flo['data_stream' + str(j) + 'flow'] = np.concatenate(timscale, axis=1)

            # compute rgb and flow scores
            # two forward passes: one for the rgb and one for the flow
            net_rgb.forward(end="mbox_conf_flatten", **kwargs_rgb)  # forward of rgb with confidence and regression
            net_flo.forward(end="mbox_conf_flatten", **kwargs_flo)  # forward of flow5 with confidence and regression

            # compute late fusion of rgb and flow scores (keep regression from rgb)
            # use net_rgb for standard detections, net_flo for having all boxes
            scores = 0.5 * (net_rgb.blobs['mbox_conf_flatten'].data + net_flo.blobs['mbox_conf_flatten'].data)
            net_rgb.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_conf_flatten'].data[...] = scores
            net_flo.blobs['mbox_loc'].data[...] = net_rgb.blobs['mbox_loc'].data

            # two forward passes, only for the last layer
            # dets is the detections after per-class NMS and thresholding (standard)
            # dets_all contains all the scores and regressions for all tubelets
            dets = net_rgb.forward(start='detection_out')['detection_out'][0, 0, :, 1:]
            dets_all = net_flo.forward(start='detection_out_full')['detection_out_full'][0, 0, :, 1:]

            # parse detections with per-class NMS
            # a single row filled with -1 means that nothing was detected
            if dets.shape[0] == 1 and np.all(dets == -1):
                dets = np.empty((0, dets.shape[1]), dtype=np.float32)

            dets[:, 2:] *= resolution_array  # network output was normalized in [0..1]
            dets[:, 0] -= 1  # label 0 was background, come back to label in [0..nlabels-1]
            dets[:, 2::2] = np.maximum(0, np.minimum(w, dets[:, 2::2]))
            dets[:, 3::2] = np.maximum(0, np.minimum(h, dets[:, 3::2]))

            # parse detections with global NMS at 0.7 (top 300)
            # coordinates were normalized in [0..1]
            dets_all[:, 0:4*K] *= resolution_array
            dets_all[:, 0:4*K:2] = np.maximum(0, np.minimum(w, dets_all[:, 0:4*K:2]))
            dets_all[:, 1:4*K:2] = np.maximum(0, np.minimum(h, dets_all[:, 1:4*K:2]))
            # the NMS score of a tubelet is its best score over the columns after 4*K + 1
            idx = nms_tubelets(np.concatenate((dets_all[:, :4*K], np.max(dets_all[:, 4*K+1:], axis=1)[:, None]), axis=1), 0.7, 300)
            dets_all = dets_all[idx, :]

            # save file
            # (os.makedirs instead of a shell `mkdir -p`, which breaks on
            # paths containing spaces or shell metacharacters)
            outdir = os.path.dirname(outfile)
            if not os.path.isdir(outdir):
                os.makedirs(outdir)

            with open(outfile, 'wb') as fid:
                pickle.dump((dets, dets_all), fid)
Example #15
0
def videoAP(dname, th=0.5, redo=False):
    """Compute and print the video-level average precision (video-AP).

    Args:
        dname: dataset name, passed to GetDataset.
        th: minimum ``iou3dt`` overlap between a detected tube and a
            ground-truth tube for the detection to be a true positive.
        redo: if True, recompute the curves even when the cached result
            file ``videoAP<th>.pkl`` already exists.

    The per-class precision/recall arrays are pickled in the results
    directory of the dataset; per-class AP and the mAP are printed.
    The process exits if the tubes of a test video have not been built.
    """
    d = GetDataset(dname)
    dirname = os.path.join(os.path.dirname(__file__),
                           '../results/ACT-detector/', dname)
    eval_file = os.path.join(dirname, "videoAP{:g}.pkl".format(th))

    if os.path.isfile(eval_file) and not redo:
        with open(eval_file, 'rb') as fid:
            res = pickle.load(fid)
    else:
        vlist = d.test_vlist()

        # load detections
        # alldets = for each label index, list of tuples (v, score, tube)
        alldets = {ilabel: [] for ilabel in range(d.nlabels)}
        for v in vlist:
            tubename = os.path.join(dirname, v + '_tubes.pkl')
            if not os.path.isfile(tubename):
                print("ERROR: Missing extracted tubes " + tubename)
                sys.exit()

            with open(tubename, 'rb') as fid:
                tubes = pickle.load(fid)

            for ilabel in range(d.nlabels):
                ltubes = tubes[ilabel]
                # spatio-temporal NMS between the tubes of the same class
                idx = nms3dt(ltubes, 0.3)
                alldets[ilabel] += [(v, ltubes[i][1], ltubes[i][0])
                                    for i in idx]

        # compute AP for each class
        res = {}
        for ilabel in range(d.nlabels):
            detections = alldets[ilabel]

            # load ground-truth: video -> list of tubes not matched yet
            gt = {}
            for v in vlist:
                tubes = d.gttubes(v)

                if ilabel not in tubes:
                    continue

                # Copy the list: matched tubes are deleted below and the
                # container returned by the dataset must stay untouched.
                vtubes = list(tubes[ilabel])
                if vtubes:
                    gt[v] = vtubes

            # precision,recall
            pr = np.empty((len(detections) + 1, 2), dtype=np.float32)
            pr[0, 0] = 1.0
            pr[0, 1] = 0.0

            fn = sum([len(g) for g in gt.values()])  # false negatives
            fp = 0  # false positives
            tp = 0  # true positives

            # visit the detections by decreasing score
            order = np.argsort(-np.array([dd[1] for dd in detections]))
            for i, j in enumerate(order):
                v, score, tube = detections[j]
                ispositive = False

                if v in gt:
                    ious = [iou3dt(g, tube) for g in gt[v]]
                    amax = np.argmax(ious)
                    if ious[amax] >= th:
                        ispositive = True
                        # each ground-truth tube can be matched only once
                        del gt[v][amax]
                        if len(gt[v]) == 0:
                            del gt[v]

                if ispositive:
                    tp += 1
                    fn -= 1
                else:
                    fp += 1

                # tp + fn is the number of ground-truth tubes of the class;
                # when it is zero recall is undefined, so store zeros
                # instead of dividing by zero.
                if tp + fn > 0:
                    pr[i + 1, 0] = float(tp) / float(tp + fp)
                    pr[i + 1, 1] = float(tp) / float(tp + fn)
                else:
                    pr[i + 1, 0] = 0
                    pr[i + 1, 1] = 0

            res[d.labels[ilabel]] = pr

        # save results
        with open(eval_file, 'wb') as fid:
            pickle.dump(res, fid)

    # display results
    ap = 100 * np.array([pr_to_ap(res[label]) for label in d.labels])
    print("videoAP")
    for il, _ in enumerate(d.labels):
        print("{:20s} {:8.2f}".format('', ap[il]))

    print("{:20s} {:8.2f}".format("mAP", np.mean(ap)))
    print("")
Example #16
0
def load_frame_detections(d, vlist, dirname, nms):
    """Gather the per-frame detections of a list of videos.

    Args:
        d: dataset object, or a dataset name that is passed to GetDataset.
        vlist: list of videos to process.
        dirname: directory containing the pickled tubelets of each
            starting frame.
        nms: threshold given to ``nms2d`` for the per-class NMS performed
            inside each frame.

    Returns:
        A float array with one row per detection and 8 columns:
        <video_index> <frame_index> <ilabel> <score> <x1> <y1> <x2> <y2>.
        The array has shape (0, 8) when there is no detection at all.

    The process exits if the tubelets of a starting frame are missing.
    """
    if isinstance(d, str):
        d = GetDataset(d)

    alldets = []  # one array per (video, frame) that has detections
    for iv, v in enumerate(vlist):
        nframes = d.nframes(v)

        # aggregate the results for each frame
        # columns: x1, y1, x2, y2, score, ilabel
        vdets = {
            i: np.empty((0, 6), dtype=np.float32)
            for i in range(1, 1 + nframes)
        }

        # load results for each starting frame
        for i in range(1, 1 + nframes - K + 1):
            resname = os.path.join(dirname, d.frame_format(v, i) + '.pkl')

            if not os.path.isfile(resname):
                print("ERROR: Missing extracted tubelets " + resname)
                sys.exit()

            with open(resname, 'rb') as fid:
                dets, _ = pickle.load(fid)

            if dets.size == 0:
                continue

            # the k-th box of a tubelet starting at frame i belongs to frame
            # i + k; dets columns are <label> <score> then 4 coordinates per frame
            for k in range(K):
                cols = np.array(
                    [2 + 4 * k, 3 + 4 * k, 4 + 4 * k, 5 + 4 * k, 1, 0])
                vdets[i + k] = np.concatenate(
                    (vdets[i + k], dets[:, cols]), axis=0)

        # Perform NMS in each frame
        for i in vdets:
            idx = np.empty((0, ), dtype=np.int32)
            for ilabel in range(d.nlabels):
                a = np.where(vdets[i][:, 5] == ilabel)[0]

                if a.size == 0:
                    continue

                # nms2d returns indices relative to the rows of this label
                idx = np.concatenate(
                    (idx, a[nms2d(vdets[i][a, :5], nms)]), axis=0)

            if idx.size == 0:
                continue

            kept = vdets[i][idx, :]
            alldets.append(
                np.concatenate(
                    (iv * np.ones((idx.size, 1), dtype=np.float32),
                     i * np.ones((idx.size, 1), dtype=np.float32),
                     kept[:, np.array([5, 4, 0, 1, 2, 3], dtype=np.int32)]),
                    axis=1))

    # np.concatenate raises ValueError on an empty list
    if not alldets:
        return np.empty((0, 8), dtype=np.float32)

    return np.concatenate(alldets, axis=0)
Example #17
0
    def setup(self, bottom, top):
        """Configure the layer from ``param_str`` and size the top blobs.

        ``param_str`` must evaluate to a mapping holding 'dataset_name' and
        'K' (> 0); every other recognised key is optional and falls back to
        a default. Each setting is stored on the layer as ``self._<key>``.
        The list of valid (video, start frame) training chunks is built and
        shuffled, and the first K top blobs are reshaped to the batch size.
        """
        # NOTE(review): param_str is evaluated as a Python expression, so it
        # must only ever come from a trusted prototxt.
        params = eval(self.param_str)

        assert 'dataset_name' in params
        self._dataset = GetDataset(params['dataset_name'])

        assert 'K' in params
        self._K = params['K']
        assert self._K > 0

        # optional settings and their default values
        optional = {
            'rand_seed': 0,
            'shuffle': True,
            'batch_size': 32 // self._K,
            'mean_values': [104, 117, 123],
            'resize_height': 300,
            'resize_width': 300,
            'restart_iter': 0,
            'flow': False,
            'ninput': 1,
        }
        for name, fallback in optional.items():
            value = params[name] if name in params else fallback
            setattr(self, '_' + name, value)

        if not self._flow and self._ninput > 1:
            raise NotImplementedError("ACT-detector: Not implemented: ninput > 1 with rgb frames")

        dataset = self._dataset
        length = self._K

        # index of the valid chunks: a chunk (video, start) is kept when both
        # tubelet_in_out_tubes and tubelet_has_gt accept it
        self._indices = []
        for video in dataset.train_vlist():
            # all the ground-truth tubes of the video, whatever their label
            vtubes = sum(dataset.gttubes(video).values(), [])
            for start in range(1, dataset.nframes(video) + 2 - length):
                if tubelet_in_out_tubes(vtubes, start, length) and tubelet_has_gt(vtubes, start, length):
                    self._indices.append((video, start))

        self._nseqs = len(self._indices)

        self._iter = 0
        self._nshuffles = 0
        self.shuffle()

        # resuming: replay the shuffles that happened before restart_iter
        # and move the cursor to the position reached at that iteration
        if self._restart_iter > 0:
            # presumably shuffle() resets the cursor -- checked here
            assert self._next == 0

            self._iter = self._restart_iter
            remaining = self._restart_iter * self._batch_size

            while remaining > self._nseqs:
                self.shuffle()
                remaining -= self._nseqs

            self._next = remaining

        # one image blob per frame of the sequence, then one extra blob
        frame_shape = (self._batch_size, 3 * self._ninput, self._resize_height, self._resize_width)
        for idx in range(length):
            top[idx].reshape(*frame_shape)

        top[length].reshape(1, 1, 1, 8)
def ACT_generate_prototxt(dname, K=6, flow=False):
    """ Generates the train, deploy and solver prototxts for the datasets used in ACT-detector.

        Three files are written into
        <directory of this file>/../models/ACT-detector/generated_<dataset NAME>/ :
        train_<MODE>.prototxt, deploy_<MODE>.prototxt and solver_<MODE>.prototxt,
        where MODE is 'RGB' or 'FLOW5'. The directory is created if it is missing.

        dname: 'UCFSports', 'JHMDB', 'JHMDB2', 'JHMDB3', 'UCF101', 'UCF101v2', 'AVA'
        K: length of the tubelet and input sequence. In ACT-detector K=6
        flow: if true, then use modality = FLOW5; if false, then modality = RGB

        Raises Exception when no training schedule is defined for dname.
    """

    ######################### Frame PARAMS #########################
    IMGSIZE = 300

    ######################### General PARAMS #########################
    modality_str = 'flow' if flow else ''
    mode_str = 'FLOW5' if flow else 'RGB'
    # number of frames stacked per input stream (5 flow frames, or a single rgb frame)
    ninput = 5 if flow else 1

    ######################### Dataset PARAMS #########################
    dd = GetDataset(dname)
    num_classes = dd.nlabels + 1 # +1 for background
    if dname == 'UCFSports':
        niter = 60000
        lr_steps = [40000, 55000]
    elif dname in ['JHMDB', 'JHMDB2', 'JHMDB3']:
        niter = 240000
        lr_steps = [160000, 220000]
    elif dname in ['UCF101', 'UCF101v2']:
        niter = 600000
        lr_steps = [400000, 550000]
    elif dname == 'AVA':
        niter = 240000
        lr_steps = [160000, 220000]
    else:
        raise Exception("Unknown dataset " + dname)

    ######################### Model PATHS #########################
    dirname = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "models", "ACT-detector", 'generated_' + dd.NAME))
    # Create the directory without going through a shell, so that paths
    # containing spaces or shell metacharacters are handled correctly.
    if not os.path.isdir(dirname):
        os.makedirs(dirname)
    deploy_net_file = "{}/deploy_{}.prototxt".format(dirname, mode_str)
    train_net_file = "{}/train_{}.prototxt".format(dirname, mode_str)
    solver_file = "{}/solver_{}.prototxt".format(dirname, mode_str)
    model_name = "ACTdetector_{}_{}".format(dname, mode_str)
    # NOTE: no pretrained-model path is written into any of the generated files.

    ############ BATCH NORM PARAMS ######################
    # If true, use batch norm for all newly added layers.
    # Currently only the non batch norm version has been tested.
    use_batchnorm = False
    lr_mult = 1

    ############ MultiBoxLoss PARAMS ######################
    share_location = True
    background_label_id = 0
    train_on_diff_gt = True
    normalization_mode = P.Loss.VALID
    code_type = P.PriorBox.CENTER_SIZE
    ignore_cross_boundary_bbox = False
    mining_type = P.MultiBoxLoss.MAX_NEGATIVE
    neg_pos_ratio = 3.
    loc_weight = (neg_pos_ratio + 1.) / 4.
    multibox_loss_param = {
        'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1,
        'conf_loss_type': P.MultiBoxLoss.SOFTMAX,
        # the localization weight is divided by the sequence length K
        'loc_weight': loc_weight / float(K),
        'num_classes': num_classes,
        'share_location': share_location,
        'match_type': P.MultiBoxLoss.PER_PREDICTION,
        'overlap_threshold': 0.5,
        'use_prior_for_matching': True,
        'background_label_id': background_label_id,
        'use_difficult_gt': train_on_diff_gt,
        'neg_pos_ratio': neg_pos_ratio,
        'neg_overlap': 0.5,
        'code_type': code_type,
        'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox,
        'mining_type': mining_type,
        }
    act_cuboid_loss_param = {
        'sequence_length': K,
    }
    loss_param = {
        'normalization': normalization_mode,
    }

    ############ PARAMS for generating PRIORS ######################
    # minimum dimension of input image
    min_dim = IMGSIZE
    mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']
    # in percent %
    min_ratio = 20
    max_ratio = 90
    step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2)))
    ratios = list(range(min_ratio, max_ratio + 1, step))
    # the first source layer gets an extra, smaller scale (10% -> 20%)
    min_sizes = [min_dim * 10 / 100.] + [min_dim * ratio / 100. for ratio in ratios]
    max_sizes = [min_dim * 20 / 100.] + [min_dim * (ratio + step) / 100. for ratio in ratios]
    aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
    normalizations = [20, -1, -1, -1, -1, -1]
    steps = [8, 16, 32, 64, 100, 300]
    # variance used to encode/decode prior bboxes.
    if code_type == P.PriorBox.CENTER_SIZE:
        prior_variance = [0.1, 0.1, 0.2, 0.2]
    else:
        prior_variance = [0.1]
    flip = True
    clip = False

    ############# GPU & SOLVER PARAMS ######################
    # Defining which GPUs to use.
    gpulist = [0]

    # Same value as the data layer's default batch size (32 // K).
    batch_size = 32 // K
    if len(gpulist) > 0:
        solver_mode = P.Solver.GPU
        device_id = int(gpulist[0])
    else:
        solver_mode = P.Solver.CPU
        device_id = 0

    # Which layers to freeze (no backward) during training.
    freeze_layers = []

    # NOTE: base_lr and iter_size are fixed constants here; they are not
    # rescaled with the batch size, number of GPUs or loss normalization mode.
    solver_param = {
        # Train parameters
        'base_lr': 0.0001,
        'weight_decay': 0.0005,
        'lr_policy': "multistep",
        'stepvalue': lr_steps,
        'gamma': 0.1,
        'momentum': 0.9,
        'max_iter': niter,
        'snapshot': 10000,
        'display': 10,
        'average_loss': 10,
        'type': "SGD",
        'solver_mode': solver_mode,
        'device_id': device_id,
        'debug_info': False,
        'snapshot_after_train': True,
        'iter_size': 1,
        }

    # parameters for generating detection output.
    det_out_param = {
        'num_classes': num_classes,
        'share_location': share_location,
        'background_label_id': background_label_id,
        'nms_param': {'nms_threshold': 0.45, 'top_k': 400},
        'keep_top_k': 200,
        'confidence_threshold': 0.01,
        'code_type': code_type,
        }

    def _build_cuboid_net(with_label):
        # Build the part shared by the train and deploy nets: K data streams
        # (plus the label top when with_label is true), VGG body, extra
        # layers and cuboid head. Returns (net, mbox_layers).
        net = caffe.NetSpec()

        top_datalayer = ACT_DataLayer(dname, K, batch_size, resize_height=IMGSIZE, resize_width=IMGSIZE,
            restart_iter=0, flow=flow, ninput=ninput)
        assert len(top_datalayer) == K + 1

        for i in range(K):
            net['data_stream' + str(i) + modality_str] = top_datalayer[i]
        if with_label:
            net['label'] = top_datalayer[K]

        ACT_VGGNetBody(net, from_layer='data', K=K, fully_conv=True, reduced=True, dilated=True,
            dropout=False, freeze_layers=freeze_layers, m=modality_str, lr_mult=1.0/float(K))

        ACT_AddExtraLayers300(net, K, use_batchnorm, m=modality_str, lr_mult=lr_mult/float(K))
        mbox_layers = ACT_CreateCuboidHead(net, K, data_layer='data_stream0' + modality_str, from_layers=mbox_source_layers,
            use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes,
            aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations,
            num_classes=num_classes, share_location=share_location, flip=flip, clip=clip,
            prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult, m=modality_str)
        return net, mbox_layers

    ######################### TRAIN PROTOTXT #########################
    net, mbox_layers = _build_cuboid_net(with_label=True)
    mbox_layers.append(net.label)

    # CUBOID loss
    # propagate_down has one entry per bottom; only the first two receive gradients.
    net["mbox_loss"] = L.ACTCuboidLoss(*mbox_layers, multibox_loss_param=multibox_loss_param,
            act_cuboid_loss_param=act_cuboid_loss_param,
            loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')),
            propagate_down=[True, True, False, False])

    # Saving ..
    with open(train_net_file, 'w') as f:
        print('name: "{}_train"'.format(model_name), file=f)
        print(net.to_proto(), file=f)

    ######################### DEPLOY PROTOTXT #########################
    # The data layer built here is fake: it only exists so that the data
    # streams are available as tops, and it is deleted from the proto below.
    net, mbox_layers = _build_cuboid_net(with_label=False)

    # Turn the raw confidences into scores, matching the training loss type.
    conf_name = "mbox_conf"
    if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX:
        reshape_name = "{}_reshape".format(conf_name)
        net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes]))
        softmax_name = "{}_softmax".format(conf_name)
        net[softmax_name] = L.Softmax(net[reshape_name], axis=2)
        flatten_name = "{}_flatten".format(conf_name)
        net[flatten_name] = L.Flatten(net[softmax_name], axis=1)
        mbox_layers[1] = net[flatten_name]
    elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC:
        sigmoid_name = "{}_sigmoid".format(conf_name)
        net[sigmoid_name] = L.Sigmoid(net[conf_name])
        mbox_layers[1] = net[sigmoid_name]

    # Detection output layer:
    # Saving detections for ACT-detector
    # -- The RGB stream saves boxes after per-class nms at 0.45 and thresholding scores
    # -- The flow stream saves all the regressed cuboids (with their scores)
    if modality_str == "":
        net.detection_out = L.ACTDetectionOutput(*mbox_layers,
            detection_output_param=det_out_param,
            act_detection_output_param={'sequence_length': K},
            include=dict(phase=caffe_pb2.Phase.Value('TEST')))
    else:
        net.detection_out_full = L.ACTDetectionOutput(*mbox_layers,
            detection_output_param=det_out_param,
            act_detection_output_param={'sequence_length': K, 'save_full': True},
            include=dict(phase=caffe_pb2.Phase.Value('TEST')))

    net_param = net.to_proto()
    # drop the fake data layer and declare the K streams as plain net inputs instead
    del net_param.layer[0]
    net_param.name = '{}_deploy'.format(model_name)
    for stream in range(K):
        net_param.input.extend(['data_stream' + str(stream) + modality_str])
        net_param.input_shape.extend([
            caffe_pb2.BlobShape(dim=[1, 3 * ninput, IMGSIZE, IMGSIZE])])

    # Saving ..
    with open(deploy_net_file, 'w') as f:
        print(net_param, file=f)

    ######################### SOLVER PROTOTXT #########################
    # NOTE(review): snapshot_prefix is the directory path itself (no trailing
    # separator or model name) -- confirm the resulting snapshot file names
    # are the intended ones.
    solver = caffe_pb2.SolverParameter(
        train_net=train_net_file, snapshot_prefix=dirname, **solver_param)

    # Saving ..
    with open(solver_file, 'w') as f:
        print(solver, file=f)