    def __init__(self, hparam):
        super().__init__()  # required before registering sub-modules on an nn.Module
        self.load_from_checkpoint(hparam['pretrain_path'])
        self.backbone = BackBone()
        self.bbox_regress = ComponentRegress()
        self.roi_align = RoIAlign(output_size=(32, 32),
                                  spatial_scale=128. / 512.,
                                  sampling_ratio=-1)
        self.inner_Module = inner_Module(num=4)
        self.outer_Seg = outer_Seg(n_class=2)
        self.criterion = CrossEntropyLoss()
Example #2
    def __init__(self, hparam):
        super(Hybird, self).__init__()
        self.backbone = BackBone()
        self.bbox_regress = ComponentRegress()
        self.roi_align = RoIAlign(output_size=(32, 32),
                                  spatial_scale=128. / 512.,
                                  sampling_ratio=-1)
        self.inner_Module = inner_Module(num=4)
        self.outer_Seg = outer_Seg(n_class=3)
        self.load_from_checkpoint(hparam.pretrain_path,
                                  torch.device(f"cuda:{hparam.cuda}"))
Example #3
    def __init__(self):
        super(Net, self).__init__()
        # define private variables
        self.time_step = C.RPIN.INPUT_SIZE
        self.ve_feat_dim = C.RPIN.VE_FEAT_DIM  # visual encoder feature dimension
        self.in_feat_dim = C.RPIN.IN_FEAT_DIM  # interaction net feature dimension
        self.num_objs = C.RPIN.MAX_NUM_OBJS
        self.mask_size = C.RPIN.MASK_SIZE

        # build image encoder
        self.backbone = build_backbone(C.RPIN.BACKBONE, self.ve_feat_dim, C.INPUT.IMAGE_CHANNEL)

        # extract object feature -> convert to object state
        pool_size = C.RPIN.ROI_POOL_SIZE
        self.roi_align = RoIAlign(
            (pool_size, pool_size),
            spatial_scale=C.RPIN.ROI_POOL_SPATIAL_SCALE,
            sampling_ratio=C.RPIN.ROI_POOL_SAMPLE_R,
        )

        roi2state = [nn.Conv2d(self.ve_feat_dim, self.in_feat_dim, kernel_size=3, padding=1),
                     nn.ReLU(inplace=True)]

        # note: these extra convs declare ve_feat_dim input channels but receive
        # in_feat_dim activations, so they only work when the two dims are equal
        for _ in range(C.RPIN.N_EXTRA_ROI_F):
            roi2state.append(nn.Conv2d(self.ve_feat_dim, self.in_feat_dim,
                                       kernel_size=C.RPIN.EXTRA_F_KERNEL, stride=1, padding=C.RPIN.EXTRA_F_PADDING))
            roi2state.append(nn.ReLU(inplace=True))
        self.roi2state = nn.Sequential(*roi2state)

        graph = []
        for i in range(self.time_step):
            graph.append(InterNet(self.in_feat_dim))
        self.graph = nn.ModuleList(graph)

        predictor = [nn.Conv2d(self.in_feat_dim * self.time_step, self.in_feat_dim, kernel_size=1), nn.ReLU()]

        for _ in range(C.RPIN.N_EXTRA_PRED_F):
            predictor.append(nn.Conv2d(self.in_feat_dim, self.in_feat_dim,
                                       kernel_size=C.RPIN.EXTRA_F_KERNEL, stride=1, padding=C.RPIN.EXTRA_F_PADDING))
            predictor.append(nn.ReLU(inplace=True))
        self.predictor = nn.Sequential(*predictor)

        self.decoder_output = 4
        self.bbox_decoder = nn.Linear(self.in_feat_dim * pool_size * pool_size, self.decoder_output)

        if C.RPIN.MASK_LOSS_WEIGHT > 0:
            self.mask_decoder = nn.Sequential(
                nn.Linear(self.in_feat_dim * pool_size * pool_size, self.in_feat_dim),
                nn.ReLU(inplace=True),
                nn.Linear(self.in_feat_dim, self.mask_size * self.mask_size),
                nn.Sigmoid(),
            )
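The forward pass is not part of this example, so the following is a hedged shape-flow sketch (with made-up dimensions in place of the C.RPIN config values) of roi_align -> roi2state -> bbox_decoder, assuming the per-object state is flattened before the box decoder.

import torch
import torch.nn as nn
from torchvision.ops import RoIAlign

ve_feat_dim, in_feat_dim, pool_size = 32, 64, 7        # stand-ins for the C.RPIN.* values
roi_align = RoIAlign((pool_size, pool_size), spatial_scale=0.25, sampling_ratio=-1)
roi2state = nn.Sequential(nn.Conv2d(ve_feat_dim, in_feat_dim, 3, padding=1), nn.ReLU(inplace=True))
bbox_decoder = nn.Linear(in_feat_dim * pool_size * pool_size, 4)

feat = torch.randn(2, ve_feat_dim, 32, 32)             # (batch, C, H, W) backbone features
rois = torch.tensor([[0., 8., 8., 64., 64.],
                     [1., 16., 16., 96., 96.]])         # (K, 5): batch index + box in image coords
state = roi2state(roi_align(feat, rois))                # (K, in_feat_dim, pool_size, pool_size)
bbox = bbox_decoder(state.flatten(1))                   # (K, 4) box outputs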
Example #4
    def __init__(self, pretrained=True, average_pool=True, semantic=True, final_dim=1024):
        """
        :param average_pool: whether or not to average pool the representations
        :param pretrained: whether to load ImageNet-pretrained backbone weights (False means training from scratch)
        :param semantic: Whether or not we want to introduce the mask and the class label early on (default Yes)
        """
        super(SimpleDetector, self).__init__()
        # huge thx to https://github.com/ruotianluo/pytorch-faster-rcnn/blob/master/lib/nets/resnet_v1.py
        backbone = _load_resnet_imagenet(pretrained=pretrained) if USE_IMAGENET_PRETRAINED else _load_resnet(
            pretrained=pretrained)

        self.backbone = nn.Sequential(
            backbone.conv1,
            backbone.bn1,
            backbone.relu,
            backbone.maxpool,
            backbone.layer1,
            backbone.layer2,
            backbone.layer3,
            # backbone.layer4
        )
        self.roi_align = RoIAlign((7, 7) if USE_IMAGENET_PRETRAINED else (14, 14),
                                  spatial_scale=1 / 16, sampling_ratio=0)

        if semantic:
            self.mask_dims = 32
            self.object_embed = torch.nn.Embedding(num_embeddings=81, embedding_dim=128)
            self.mask_upsample = torch.nn.Conv2d(1, self.mask_dims, kernel_size=3,
                                                 stride=2 if USE_IMAGENET_PRETRAINED else 1,
                                                 padding=1, bias=True)
        else:
            self.object_embed = None
            self.mask_upsample = None

        after_roi_align = [backbone.layer4]
        self.final_dim = final_dim
        if average_pool:
            after_roi_align += [nn.AvgPool2d(7, stride=1), Flattener()]

        self.after_roi_align = torch.nn.Sequential(*after_roi_align)

        self.obj_downsample = torch.nn.Sequential(
            torch.nn.Dropout(p=0.1),
            torch.nn.Linear(2048 + (128 if semantic else 0), final_dim),
            torch.nn.ReLU(inplace=True),
        )
        self.regularizing_predictor = torch.nn.Linear(2048, 81)
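An illustrative sketch (an assumption about usage, not SimpleDetector's actual forward) of how the 2048-d pooled ROI features and the 128-d class embedding defined above could be fused by obj_downsample; dummy tensors stand in for the backbone / RoIAlign / after_roi_align outputs.

import torch
import torch.nn as nn

final_dim = 1024
object_embed = nn.Embedding(num_embeddings=81, embedding_dim=128)
obj_downsample = nn.Sequential(nn.Dropout(p=0.1), nn.Linear(2048 + 128, final_dim), nn.ReLU(inplace=True))

roi_feats = torch.randn(5, 2048)                        # pooled + flattened features for 5 boxes
class_ids = torch.randint(0, 81, (5,))                  # COCO-style class labels for those boxes
obj_reps = obj_downsample(torch.cat([roi_feats, object_embed(class_ids)], dim=1))  # (5, 1024)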
Example #5
    def __init__(self, anchors, is_train, num_class):
        '''
        A tiny Faster R-CNN, intended only for feature maps of shape [1024, 50, 80].
        '''
        super().__init__()
        self.is_train = is_train
        self.num_class = num_class

        self.backbone = TinyBackbone()

        self.rpn = TinyRPN(anchors, self.is_train)

        self.roi_align = RoIAlign(output_size=(14, 14),
                                  spatial_scale=0.0625,
                                  sampling_ratio=0)

        self.fastrcnn_fc_layer = nn.Linear(1024 * 14 * 14, 1024)
        self.class_head = nn.Linear(1024, num_class + 1)  # Add bg
        self.box_head = nn.Linear(1024, num_class * 4)
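A hedged sketch of the detection-head shapes above: ROI features pooled to 14x14 are flattened into the 1024*14*14 fc layer and then split into class and box predictions. Dummy tensors replace the TinyBackbone / TinyRPN outputs, which are not part of this example.

import torch
import torch.nn as nn
from torchvision.ops import RoIAlign

num_class = 3                                           # hypothetical class count
roi_align = RoIAlign(output_size=(14, 14), spatial_scale=0.0625, sampling_ratio=0)
fc = nn.Linear(1024 * 14 * 14, 1024)
class_head = nn.Linear(1024, num_class + 1)             # +1 for background
box_head = nn.Linear(1024, num_class * 4)

feature_map = torch.randn(1, 1024, 50, 80)              # the [1024, 50, 80] map from the docstring
rois = torch.tensor([[0., 32., 32., 256., 256.]])       # box in input-image coordinates (stride-16 map)
x = fc(roi_align(feature_map, rois).flatten(1))         # (K, 1024)
cls_logits, box_deltas = class_head(x), box_head(x)     # (K, num_class + 1), (K, num_class * 4)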
Example #6
    def __init__(self, anchors, is_train, num_class):
        '''
        A tiny Faster R-CNN, intended only for feature maps of shape [1024, 50, 80].
        '''
        super().__init__()
        self.is_train = is_train
        self.num_class = num_class

        self.backbone = TinyBackbone()

        self.rpn = TinyRPN(anchors, self.is_train)

        self.roi_align = RoIAlign(output_size=(14, 14),
                                  spatial_scale=0.0625,
                                  sampling_ratio=0)

        self.fastrcnn_fc_layer = nn.Linear(1024 * 14 * 14, 1024)
        self.class_head = nn.Linear(1024, num_class + 1)  # Add bg
        self.box_head = nn.Linear(1024, num_class * 4)
        self.mask_head = nn.Sequential(
            nn.ConvTranspose2d(1024, 256, kernel_size=2, stride=2, padding=0),
            nn.Conv2d(256, num_class, kernel_size=1, stride=1))
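A shape sketch (an assumption about usage, not this repo's forward pass) for the mask branch above: the ConvTranspose2d doubles the 14x14 ROI grid to 28x28 and the 1x1 conv emits one mask logit map per class for each ROI.

import torch
import torch.nn as nn

num_class = 3                                           # hypothetical class count
mask_head = nn.Sequential(
    nn.ConvTranspose2d(1024, 256, kernel_size=2, stride=2, padding=0),
    nn.Conv2d(256, num_class, kernel_size=1, stride=1))

roi_feats = torch.randn(4, 1024, 14, 14)                # pooled features for 4 ROIs
masks = mask_head(roi_feats)                            # (4, num_class, 28, 28) mask logits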
Example #7
    def __init__(self):
        super(Net, self).__init__()
        # define private variables
        self.time_step = C.RIN.INPUT_SIZE
        self.ve_feat_dim = C.RIN.VE_FEAT_DIM  # visual encoder feature dimension
        self.in_feat_dim = C.RIN.IN_FEAT_DIM  # interaction net feature dimension
        self.num_objs = C.RIN.NUM_OBJS
        self.mask_size = C.RIN.MASK_SIZE
        self.po_feat_dim = (
            self.in_feat_dim if C.RIN.COOR_FEATURE_EMBEDDING or C.RIN.COOR_FEATURE_SINUSOID else 2
        ) if C.RIN.COOR_FEATURE else 0  # position feature dimension

        # build image encoder
        self.backbone = build_backbone(C.RIN.BACKBONE, self.ve_feat_dim, C.INPUT.IMAGE_CHANNEL)

        # extract object feature -> convert to object state
        pool_size = C.RIN.ROI_POOL_SIZE
        self.roi_align = RoIAlign(
            (pool_size, pool_size),
            spatial_scale=C.RIN.ROI_POOL_SPATIAL_SCALE,
            sampling_ratio=C.RIN.ROI_POOL_SAMPLE_R,
        )
        roi2state = [nn.Linear(self.ve_feat_dim * pool_size * pool_size, self.in_feat_dim), nn.ReLU()]
        for _ in range(C.RIN.N_EXTRA_ROI_F):
            roi2state.append(nn.Linear(self.in_feat_dim, self.in_feat_dim))
            roi2state.append(nn.ReLU(inplace=True))
        self.roi2state = nn.Sequential(*roi2state)

        graph = []
        for i in range(self.time_step):
            graph.append(InterNet(self.in_feat_dim))
        self.graph = nn.ModuleList(graph)

        predictor = [nn.Linear(self.in_feat_dim * self.time_step, self.in_feat_dim), nn.ReLU()]
        for _ in range(C.RIN.N_EXTRA_PRED_F):
            predictor.append(nn.Linear(self.in_feat_dim, self.in_feat_dim))
            predictor.append(nn.ReLU(inplace=True))
        self.predictor = nn.Sequential(*predictor)

        self.decoder_output = 4
        self.bbox_decoder = nn.Linear(self.in_feat_dim, self.decoder_output)

        if C.RIN.MASK_LOSS_WEIGHT > 0:
            self.mask_decoder = nn.Sequential(
                nn.Linear(self.in_feat_dim, self.in_feat_dim),
                nn.Linear(self.in_feat_dim, self.mask_size * self.mask_size),
                nn.Sigmoid(),
            )

        self.image2prior = nn.Sequential(
            nn.Conv2d(self.ve_feat_dim * 2, self.ve_feat_dim, 3, 2, 1),
            nn.ReLU(),
            nn.Conv2d(self.ve_feat_dim, self.ve_feat_dim, 3, 2, 1),
            nn.ReLU(),
            nn.Conv2d(self.ve_feat_dim, self.ve_feat_dim, 3, 2, 1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.vae_dim = 8
        self.lstm_layers = 1
        self.vae_lstm = nn.LSTM(self.vae_dim, self.vae_dim, self.lstm_layers)
        self.vae_mu_head = nn.Linear(self.ve_feat_dim, 8)
        self.vae_logvar_head = nn.Linear(self.ve_feat_dim, 8)
        self.red_prior = nn.Linear(self.in_feat_dim + 8, self.in_feat_dim)
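A hedged sketch of how the VAE heads above are commonly used (the reparameterisation trick). Pairing image2prior with vae_mu_head / vae_logvar_head is an assumption based on their matching dimensions; the actual sampling code is not part of this example.

import torch
import torch.nn as nn

ve_feat_dim = 32                                        # stand-in for C.RIN.VE_FEAT_DIM
vae_mu_head = nn.Linear(ve_feat_dim, 8)
vae_logvar_head = nn.Linear(ve_feat_dim, 8)

prior_feat = torch.randn(2, ve_feat_dim, 1, 1)          # AdaptiveAvgPool2d((1, 1)) output of image2prior
h = prior_feat.flatten(1)                               # (B, ve_feat_dim)
mu, logvar = vae_mu_head(h), vae_logvar_head(h)
z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar) # sampled 8-d latent, (B, 8)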
Example #8
    def __init__(self, trans, conv):
        super(Net, self).__init__()
        # a bunch of temporary flags; the useful settings will be merged into the config file
        # transformer parameters
        self.trans = trans
        self.conv = conv
        # these are here just to make it easy to set up experiments
        self.use_ln = False
        self.norm_before_relu = False
        self.pos_feat_ar = False
        # define private variables
        self.time_step = C.RIN.INPUT_SIZE
        self.ve_feat_dim = C.RIN.VE_FEAT_DIM  # visual encoder feature dimension
        self.in_feat_dim = C.RIN.IN_FEAT_DIM  # interaction net feature dimension
        self.num_objs = C.RIN.NUM_OBJS
        self.mask_size = C.RIN.MASK_SIZE
        self.po_feat_dim = self.in_feat_dim if C.RIN.COOR_FEATURE else 0  # position feature dimension

        # build image encoder
        self.backbone = build_backbone(C.RIN.BACKBONE, self.ve_feat_dim, C.INPUT.IMAGE_CHANNEL)

        # extract object feature -> convert to object state
        pool_size = C.RIN.ROI_POOL_SIZE
        self.roi_align = RoIAlign(
            (pool_size, pool_size),
            spatial_scale=C.RIN.ROI_POOL_SPATIAL_SCALE,
            sampling_ratio=C.RIN.ROI_POOL_SAMPLE_R,
        )
        roi2state = [nn.Conv2d(self.ve_feat_dim, self.in_feat_dim, kernel_size=3, padding=1),
                     nn.ReLU(inplace=True)]
        assert C.RIN.N_EXTRA_ROI_F > 0
        for i in range(C.RIN.N_EXTRA_ROI_F):
            roi2state.append(nn.Conv2d(self.ve_feat_dim, self.in_feat_dim,
                                       kernel_size=C.RIN.EXTRA_F_KERNEL, stride=1, padding=C.RIN.EXTRA_F_PADDING))
            # when a norm layer is applied before the activation, skip the final ReLU here
            if self.norm_before_relu and i == C.RIN.N_EXTRA_ROI_F - 1:
                continue
            roi2state.append(nn.ReLU(inplace=True))
        self.roi2state = nn.Sequential(*roi2state)

        graph = []
        for i in range(self.time_step):
            graph.append(InterNet(self.in_feat_dim, trans=self.trans, conv=self.conv))
        self.graph = nn.ModuleList(graph)

        assert C.RIN.N_EXTRA_PRED_F == 0
        if self.norm_before_relu:
            predictor = [nn.Conv2d(self.in_feat_dim * self.time_step, self.in_feat_dim, kernel_size=1)]
        else:
            predictor = [nn.Conv2d(self.in_feat_dim * self.time_step, self.in_feat_dim, kernel_size=1), nn.ReLU()]

        for _ in range(C.RIN.N_EXTRA_PRED_F):
            predictor.append(nn.Conv2d(self.in_feat_dim, self.in_feat_dim,
                                       kernel_size=C.RIN.EXTRA_F_KERNEL, stride=1, padding=C.RIN.EXTRA_F_PADDING))
            predictor.append(nn.ReLU(inplace=True))
        self.predictor = nn.Sequential(*predictor)

        self.decoder_output = 4
        self.bbox_decoder = nn.Linear(self.in_feat_dim * pool_size * pool_size, self.decoder_output)

        if C.RIN.COOR_FEATURE:
            self.pos_encoder = nn.Sequential(
                nn.Linear(4, self.po_feat_dim),
                nn.ReLU(inplace=True),
                nn.Linear(self.po_feat_dim, self.po_feat_dim),
                nn.ReLU(inplace=True),
            )
            if self.norm_before_relu:
                self.pos_merger = nn.Sequential(
                    nn.Conv2d(self.in_feat_dim + self.po_feat_dim, self.in_feat_dim, kernel_size=3, stride=1,
                              padding=1),
                )
            else:
                self.pos_merger = nn.Sequential(
                    nn.Conv2d(self.in_feat_dim + self.po_feat_dim, self.in_feat_dim, kernel_size=3, stride=1,
                              padding=1),
                    nn.ReLU(inplace=True),
                )
        
        if C.RIN.MASK_LOSS_WEIGHT > 0:
            self.mask_decoder = nn.Sequential(
                nn.Linear(self.in_feat_dim * pool_size * pool_size, self.in_feat_dim),
                nn.ReLU(inplace=True),
                nn.Linear(self.in_feat_dim, self.mask_size * self.mask_size),
                nn.Sigmoid(),
            )

        if C.RIN.SEQ_CLS_LOSS_WEIGHT > 0:
            self.seq_feature = nn.Sequential(
                nn.Linear(self.in_feat_dim * pool_size * pool_size, self.in_feat_dim * 4),
                nn.ReLU(inplace=True),
                nn.Linear(self.in_feat_dim * 4, self.in_feat_dim),
                nn.ReLU(inplace=True),
            )
            self.seq_score = nn.Sequential(
                nn.Linear(self.in_feat_dim, 1),
                nn.Sigmoid()
            )

        if self.use_ln:
            norms = [nn.LayerNorm([self.in_feat_dim, 5, 5]) for _ in range(self.time_step)]
            self.norms = nn.ModuleList(norms)
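A sketch, under stated assumptions, of how the pos_encoder / pos_merger pair above can inject box coordinates into the convolutional object state: the 4-d position is embedded, broadcast over the pooled grid, concatenated channel-wise, and fused by the 3x3 conv. This illustrates the pattern only; it is not the repository's forward code.

import torch
import torch.nn as nn

in_feat_dim, po_feat_dim, pool_size = 64, 64, 5         # stand-ins for the C.RIN values
pos_encoder = nn.Sequential(nn.Linear(4, po_feat_dim), nn.ReLU(inplace=True),
                            nn.Linear(po_feat_dim, po_feat_dim), nn.ReLU(inplace=True))
pos_merger = nn.Sequential(nn.Conv2d(in_feat_dim + po_feat_dim, in_feat_dim, 3, 1, 1),
                           nn.ReLU(inplace=True))

state = torch.randn(6, in_feat_dim, pool_size, pool_size)    # per-object ROI state
coords = torch.rand(6, 4)                                     # normalized (x1, y1, x2, y2) per object
pos = pos_encoder(coords)[..., None, None].expand(-1, -1, pool_size, pool_size)
state = pos_merger(torch.cat([state, pos], dim=1))            # (6, in_feat_dim, pool_size, pool_size)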
Example #9
    def __init__(self):
        super(Net, self).__init__()
        # a bunch of temporary flags; the useful settings will be merged into the config file
        # these are here just to make it easy to set up experiments
        self.use_conv_for_mask = False
        # define private variables
        self.time_step = C.RIN.INPUT_SIZE
        self.ve_feat_dim = C.RIN.VE_FEAT_DIM  # visual encoder feature dimension
        self.in_feat_dim = C.RIN.IN_FEAT_DIM  # interaction net feature dimension
        self.num_objs = C.RIN.NUM_OBJS
        self.mask_size = C.RIN.MASK_SIZE
        self.po_feat_dim = self.in_feat_dim if C.RIN.COOR_FEATURE else 0  # position feature dimension

        # build image encoder
        self.backbone = build_backbone(C.RIN.BACKBONE, self.ve_feat_dim,
                                       C.INPUT.IMAGE_CHANNEL)

        # extract object feature -> convert to object state
        pool_size = C.RIN.ROI_POOL_SIZE
        self.roi_align = RoIAlign(
            (pool_size, pool_size),
            spatial_scale=C.RIN.ROI_POOL_SPATIAL_SCALE,
            sampling_ratio=C.RIN.ROI_POOL_SAMPLE_R,
        )
        roi2state = [
            nn.Conv2d(self.ve_feat_dim,
                      self.in_feat_dim,
                      kernel_size=3,
                      padding=1),
            nn.ReLU(inplace=True)
        ]
        for _ in range(C.RIN.N_EXTRA_ROI_F):
            roi2state.append(
                nn.Conv2d(self.ve_feat_dim,
                          self.in_feat_dim,
                          kernel_size=C.RIN.EXTRA_F_KERNEL,
                          stride=1,
                          padding=C.RIN.EXTRA_F_PADDING))
            roi2state.append(nn.ReLU(inplace=True))
        self.roi2state = nn.Sequential(*roi2state)

        graph = []
        for i in range(self.time_step):
            graph.append(InterNet(self.in_feat_dim))
        self.graph = nn.ModuleList(graph)

        predictor = [
            nn.Conv2d(self.in_feat_dim * self.time_step,
                      self.in_feat_dim,
                      kernel_size=1),
            nn.ReLU()
        ]
        for _ in range(C.RIN.N_EXTRA_PRED_F):
            predictor.append(
                nn.Conv2d(self.in_feat_dim,
                          self.in_feat_dim,
                          kernel_size=C.RIN.EXTRA_F_KERNEL,
                          stride=1,
                          padding=C.RIN.EXTRA_F_PADDING))
            predictor.append(nn.ReLU(inplace=True))
        self.predictor = nn.Sequential(*predictor)

        self.decoder_output = 4
        self.bbox_decoder = nn.Linear(self.in_feat_dim * pool_size * pool_size,
                                      self.decoder_output)

        if C.RIN.COOR_FEATURE:
            self.pos_encoder = nn.Sequential(
                nn.Linear(4, self.in_feat_dim),
                nn.ReLU(inplace=True),
                nn.Linear(self.in_feat_dim, self.in_feat_dim),
                nn.ReLU(inplace=True),
            )
            self.pos_merger = nn.Sequential(
                nn.Conv2d(self.in_feat_dim + self.po_feat_dim,
                          self.in_feat_dim,
                          kernel_size=3,
                          stride=1,
                          padding=1),
                nn.ReLU(inplace=True),
            )

        if C.RIN.MASK_LOSS_WEIGHT > 0:
            if self.use_conv_for_mask:
                self.mask_decoder = nn.Sequential(
                    nn.Conv2d(self.in_feat_dim,
                              self.in_feat_dim,
                              kernel_size=1),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(self.in_feat_dim,
                              self.in_feat_dim,
                              kernel_size=3,
                              padding=1),
                    nn.ReLU(inplace=True),
                    nn.UpsamplingNearest2d(size=(C.RIN.MASK_SIZE,
                                                 C.RIN.MASK_SIZE)),
                    nn.Conv2d(self.in_feat_dim,
                              self.in_feat_dim,
                              kernel_size=3,
                              padding=1),
                    nn.ReLU(inplace=True),
                    nn.Conv2d(self.in_feat_dim, 1, kernel_size=3, padding=1),
                    nn.Sigmoid(),
                )
            else:
                self.mask_decoder = nn.Sequential(
                    nn.Linear(self.in_feat_dim * pool_size * pool_size,
                              self.in_feat_dim),
                    nn.ReLU(inplace=True),
                    nn.Linear(self.in_feat_dim,
                              self.mask_size * self.mask_size),
                    nn.Sigmoid(),
                )

        self.image2prior = nn.Sequential(
            nn.Conv2d(self.ve_feat_dim * 2, self.ve_feat_dim, 3, 2, 1),
            nn.ReLU(),
            nn.Conv2d(self.ve_feat_dim, self.ve_feat_dim, 3, 2, 1),
            nn.ReLU(),
            nn.Conv2d(self.ve_feat_dim, self.ve_feat_dim, 3, 2, 1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1)),
        )
        self.vae_dim = 8
        self.lstm_layers = 1
        self.vae_lstm = nn.LSTM(self.vae_dim, self.vae_dim, self.lstm_layers)
        self.vae_mu_head = nn.Linear(self.ve_feat_dim, 8)
        self.vae_logvar_head = nn.Linear(self.ve_feat_dim, 8)
        self.red_prior = nn.Conv2d(self.in_feat_dim + 8,
                                   self.in_feat_dim,
                                   kernel_size=1)
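A quick shape-check sketch (assumed usage, not the repo's forward) for the convolutional mask decoder variant above: it maps per-object (in_feat_dim, pool, pool) state to a single (MASK_SIZE, MASK_SIZE) probability map via nearest-neighbour upsampling.

import torch
import torch.nn as nn

in_feat_dim, pool_size, mask_size = 64, 5, 14           # stand-ins for the C.RIN values
mask_decoder = nn.Sequential(
    nn.Conv2d(in_feat_dim, in_feat_dim, kernel_size=1), nn.ReLU(inplace=True),
    nn.Conv2d(in_feat_dim, in_feat_dim, kernel_size=3, padding=1), nn.ReLU(inplace=True),
    nn.UpsamplingNearest2d(size=(mask_size, mask_size)),
    nn.Conv2d(in_feat_dim, in_feat_dim, kernel_size=3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(in_feat_dim, 1, kernel_size=3, padding=1), nn.Sigmoid())

masks = mask_decoder(torch.randn(6, in_feat_dim, pool_size, pool_size))  # (6, 1, 14, 14)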
Example #10
def get_visual_data(save_dir, data_dir, video_dir, video_name, resnext101, device):
    tracks_path = os.path.join(save_dir, 'tracks.pkl')
    length_path = os.path.join(save_dir, 'length.pkl')
    video_path = os.path.join(video_dir, video_name + '.avi')
    visual_feature_dir = os.path.join(data_dir, 'objects')
    class_feature_dir = os.path.join(data_dir, 'category')
    visual_feature_path = os.path.join(visual_feature_dir, video_name + '.npy')
    class_feature_path = os.path.join(class_feature_dir, video_name + '.npy')
    objects_mask_path = os.path.join(visual_feature_dir, video_name + '_mask.pkl')
    hastracks_path = os.path.join(visual_feature_dir, video_name + '_hastracks.pkl')

    with open(tracks_path, 'rb') as f:
        tracks = pickle.load(f)
    with open(length_path, 'rb') as f:
        frames_numb = pickle.load(f)

    interval = frames_numb // 15
    tracks = [item for item in tracks if len(item['frame_ids']) >= interval]
    tracks = sorted(tracks, key=lambda x: len(x['frame_ids']), reverse=True)
    if len(tracks) > 20:
        tracks = tracks[:20]
    num_tracks = len(tracks)

    if num_tracks == 0:
        has_tracks = False
        visual_features = np.zeros([15, 1, 2048, 4, 4])
        objects_mask = np.zeros([15, 1])
        class_features = np.zeros([1, 1000], dtype=int)

        np.save(visual_feature_path, visual_features)
        np.save(class_feature_path, class_features)
        with open(objects_mask_path, 'wb') as f:
            pickle.dump(objects_mask, f)
        with open(hastracks_path, 'wb') as f:
            pickle.dump(has_tracks, f)
        return

    frames_store = []
    import cv2
    vdo = cv2.VideoCapture()
    vdo.open(video_path)
    im_width, im_height = int(vdo.get(cv2.CAP_PROP_FRAME_WIDTH)), int(vdo.get(cv2.CAP_PROP_FRAME_HEIGHT))
    while vdo.grab():
        _, ori_im = vdo.retrieve()
        im = cv2.cvtColor(ori_im, cv2.COLOR_BGR2RGB)
        im = Image.fromarray(im)
        frames_store.append(trans(im).unsqueeze(0))
    new_height, new_width = frames_store[0].shape[-2], frames_store[0].shape[-1]
    assert len(frames_store) == frames_numb, \
        'expected len(frames_store) == frames_numb, ' \
        'but got {} and {}'.format(len(frames_store), frames_numb)

    visit = np.zeros(frames_numb + 5, dtype=int)
    for track in tracks:
        frames_ids = track['frame_ids']
        for frame_id in frames_ids:
            new_id = frame_id - 1
            assert new_id >= 0
            visit[new_id] = 1

    class_store = []
    for track in tracks:
        class_id = track['class_id']
        class_vec = np.zeros(1000, dtype=int)
        class_vec[class_id] = 1
        class_store.append(class_vec)
    class_store = np.concatenate([item[np.newaxis, ...] for item in class_store], axis=0)
    assert class_store.shape == (num_tracks, 1000), \
        'expected class_store.shape == ({}, 1000), but got {}'.\
            format(num_tracks, class_store.shape)

    time_span = 15
    step = max(int(visit.sum()) // time_span, 1)
    anchor_idxs = []
    has_tracks = True
    cnt = 0
    for i in range(frames_numb):
        if visit[i] == 0: continue
        cnt += 1
        if cnt % step == 0:
            anchor_idxs.append(i)
    origin_anchors_len = len(anchor_idxs)
    idxs = np.linspace(0, len(anchor_idxs), time_span, endpoint=False, dtype=int).tolist()
    anchor_idxs = [anchor_idxs[i] for i in idxs] if origin_anchors_len > time_span else anchor_idxs
    if origin_anchors_len >= time_span:
        assert len(anchor_idxs) == time_span, \
            'expected len(anchor_idxs) == time_span, but got {}, and now step is {}'.\
                format(len(anchor_idxs), step)
    else:
        assert int(visit.sum()) < time_span, \
            'expected visit.sum() < time_span, but got {} and step is {}'.format(int(visit.sum()), step)

    pool_size = 4
    spatial_scale = 1.0 / 32.0
    fake_feature = np.zeros([1, resnext_dim, pool_size, pool_size])
    roi_align = RoIAlign((pool_size, pool_size), spatial_scale=spatial_scale, sampling_ratio=1).to(device)

    result_feature = []
    objects_mask = np.zeros([time_span, len(tracks)])
    for i in range(time_span):
        idx = anchor_idxs[i] if i < len(anchor_idxs) else None
        temp_store = []
        feature_map = resnext101(frames_store[idx].to(device)) if idx is not None else None  # (1, 2048, H/32, W/32)
        for j, item in enumerate(tracks):
            frames_ids, positions, class_id = item['frame_ids'], item['positions'], item['class_id']
            if idx is None or idx not in frames_ids:
                objects_mask[i, j] = 0
                temp_store.append(fake_feature)
            else:
                ptr = frames_ids.index(idx)
                position = positions[ptr]
                position = process_pos(position, im_width, im_height, new_width, new_height)
                x1, y1, x2, y2 = position[0], position[1], position[2], position[3]
                bbox = torch.FloatTensor([x1, y1, x2, y2]).unsqueeze(dim=0).to(device)
                roi_feature = roi_align(feature_map, [bbox])
                assert roi_feature.shape == (1, resnext_dim, pool_size, pool_size), \
                    'expected roi_feature.shape == {}, but got {}'.\
                    format((1, resnext_dim, pool_size, pool_size), roi_feature.shape)

                objects_mask[i, j] = 1
                temp_store.append(roi_feature.detach().cpu().numpy())

        temp_store = np.concatenate(temp_store, axis=0)
        assert temp_store.shape == (num_tracks, resnext_dim, pool_size, pool_size), \
            'expected temp_store.shape == {}, but got {}'.\
                format((num_tracks, resnext_dim, pool_size, pool_size), temp_store.shape)
        result_feature.append(temp_store)

    result_feature = np.concatenate([item[np.newaxis, ...] for item in result_feature], axis=0)
    assert result_feature.shape == (time_span, num_tracks, resnext_dim, pool_size, pool_size), \
        'expected result_feature.shape == {}, but got {}'.\
            format((time_span, num_tracks, resnext_dim, pool_size, pool_size), result_feature.shape)
    assert objects_mask[len(anchor_idxs):, ...].sum() == 0., \
        'expected objects_mask[len(anchor_idxs):, ...].sum() == 0., but got {}'.\
            format(objects_mask[len(anchor_idxs):, ...].sum())

    np.save(visual_feature_path, result_feature)
    np.save(class_feature_path, class_store)
    with open(objects_mask_path, 'wb') as f:
        pickle.dump(objects_mask, f)
    with open(hastracks_path, 'wb') as f:
        pickle.dump(has_tracks, f)

    print('{video_name} has {x} objects'.format(video_name=video_name, x=len(tracks) if has_tracks else 0))
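get_visual_data relies on a few globals defined elsewhere in the original file (trans, resnext_dim) and on a resnext101 model that returns a (1, 2048, H/32, W/32) feature map. The sketch below shows one plausible setup; the exact transform, input resolution, and weights used by the authors are assumptions here.

import torch
import torchvision
from torchvision import transforms

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
resnext_dim = 2048                                      # channel dim of the last ResNeXt conv stage
trans = transforms.Compose([transforms.Resize((480, 640)),          # assumed resize; not from the source
                            transforms.ToTensor(),
                            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                 std=[0.229, 0.224, 0.225])])

# strip avgpool/fc so the model returns conv features rather than logits
backbone = torchvision.models.resnext101_32x8d(pretrained=True)
resnext101 = torch.nn.Sequential(*list(backbone.children())[:-2]).to(device).eval()

# example call (paths are placeholders):
# get_visual_data(save_dir, data_dir, video_dir, 'some_video', resnext101, device)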