def __init__(self, hparam):
    super().__init__()  # register as an nn.Module before assigning submodules
    self.load_from_checkpoint(hparam['pretrain_path'])
    self.backbone = BackBone()
    self.bbox_regress = ComponentRegress()
    self.roi_align = RoIAlign(output_size=(32, 32), spatial_scale=128. / 512., sampling_ratio=-1)
    self.inner_Module = inner_Module(num=4)
    self.outer_Seg = outer_Seg(n_class=2)
    self.criterion = CrossEntropyLoss()
def __init__(self, hparam):
    super(Hybird, self).__init__()
    self.backbone = BackBone()
    self.bbox_regress = ComponentRegress()
    self.roi_align = RoIAlign(output_size=(32, 32), spatial_scale=128. / 512., sampling_ratio=-1)
    self.inner_Module = inner_Module(num=4)
    self.outer_Seg = outer_Seg(n_class=3)
    self.load_from_checkpoint(hparam.pretrain_path, torch.device(f"cuda:{hparam.cuda}"))
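# Hedged usage sketch (not part of the original sources): the Hybird module above sets
# output_size=(32, 32) and spatial_scale=128/512, which implies 512x512 inputs, a 128x128
# backbone feature map, and boxes given in input-image coordinates. The snippet below only
# checks that bookkeeping with toy tensors, and it uses torchvision.ops.RoIAlign, which may
# differ from the project's own RoIAlign import.
import torch
from torchvision.ops import RoIAlign

roi_align = RoIAlign(output_size=(32, 32), spatial_scale=128. / 512., sampling_ratio=-1)
feat = torch.randn(1, 64, 128, 128)                # fake backbone output for a 512x512 image
boxes = [torch.tensor([[64., 64., 448., 448.]])]   # one (x1, y1, x2, y2) box per image, in input pixels
crops = roi_align(feat, boxes)
assert crops.shape == (1, 64, 32, 32)              # one 32x32 crop per box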
def __init__(self):
    super(Net, self).__init__()
    # define private variables
    self.time_step = C.RPIN.INPUT_SIZE
    self.ve_feat_dim = C.RPIN.VE_FEAT_DIM  # visual encoder feature dimension
    self.in_feat_dim = C.RPIN.IN_FEAT_DIM  # interaction net feature dimension
    self.num_objs = C.RPIN.MAX_NUM_OBJS
    self.mask_size = C.RPIN.MASK_SIZE
    # build image encoder
    self.backbone = build_backbone(C.RPIN.BACKBONE, self.ve_feat_dim, C.INPUT.IMAGE_CHANNEL)
    # extract object feature -> convert to object state
    pool_size = C.RPIN.ROI_POOL_SIZE
    self.roi_align = RoIAlign(
        (pool_size, pool_size),
        spatial_scale=C.RPIN.ROI_POOL_SPATIAL_SCALE,
        sampling_ratio=C.RPIN.ROI_POOL_SAMPLE_R,
    )
    roi2state = [nn.Conv2d(self.ve_feat_dim, self.in_feat_dim, kernel_size=3, padding=1),
                 nn.ReLU(inplace=True)]
    for _ in range(C.RPIN.N_EXTRA_ROI_F):
        # note: the extra layers take ve_feat_dim as in_channels even though the first conv
        # already outputs in_feat_dim, so this only works when the two dims are equal
        roi2state.append(nn.Conv2d(self.ve_feat_dim, self.in_feat_dim,
                                   kernel_size=C.RPIN.EXTRA_F_KERNEL, stride=1,
                                   padding=C.RPIN.EXTRA_F_PADDING))
        roi2state.append(nn.ReLU(inplace=True))
    self.roi2state = nn.Sequential(*roi2state)
    graph = []
    for i in range(self.time_step):
        graph.append(InterNet(self.in_feat_dim))
    self.graph = nn.ModuleList(graph)
    predictor = [nn.Conv2d(self.in_feat_dim * self.time_step, self.in_feat_dim, kernel_size=1),
                 nn.ReLU()]
    for _ in range(C.RPIN.N_EXTRA_PRED_F):
        predictor.append(nn.Conv2d(self.in_feat_dim, self.in_feat_dim,
                                   kernel_size=C.RPIN.EXTRA_F_KERNEL, stride=1,
                                   padding=C.RPIN.EXTRA_F_PADDING))
        predictor.append(nn.ReLU(inplace=True))
    self.predictor = nn.Sequential(*predictor)
    self.decoder_output = 4
    self.bbox_decoder = nn.Linear(self.in_feat_dim * pool_size * pool_size, self.decoder_output)
    if C.RPIN.MASK_LOSS_WEIGHT > 0:
        self.mask_decoder = nn.Sequential(
            nn.Linear(self.in_feat_dim * pool_size * pool_size, self.in_feat_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.in_feat_dim, self.mask_size * self.mask_size),
            nn.Sigmoid(),
        )
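# Hedged shape sketch (illustrative values, not the real C.RPIN config): it shows why
# bbox_decoder above takes in_feat_dim * pool_size * pool_size inputs -- the per-object RoI
# feature map stays spatial through roi2state/predictor and is only flattened right before
# the linear bounding-box decoder.
import torch
import torch.nn as nn

in_feat_dim, pool_size, num_objs = 32, 5, 3
roi_feat = torch.randn(num_objs, in_feat_dim, pool_size, pool_size)  # stand-in for roi2state output
bbox_decoder = nn.Linear(in_feat_dim * pool_size * pool_size, 4)
bbox = bbox_decoder(roi_feat.flatten(start_dim=1))                   # (num_objs, 4) box prediction
assert bbox.shape == (num_objs, 4)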
def __init__(self, pretrained=True, average_pool=True, semantic=True, final_dim=1024):
    """
    :param average_pool: whether or not to average pool the representations
    :param pretrained: whether to load ImageNet-pretrained backbone weights (as opposed to training from scratch)
    :param semantic: whether or not we want to introduce the mask and the class label early on (default Yes)
    """
    super(SimpleDetector, self).__init__()
    # huge thx to https://github.com/ruotianluo/pytorch-faster-rcnn/blob/master/lib/nets/resnet_v1.py
    backbone = _load_resnet_imagenet(pretrained=pretrained) if USE_IMAGENET_PRETRAINED else _load_resnet(
        pretrained=pretrained)
    self.backbone = nn.Sequential(
        backbone.conv1,
        backbone.bn1,
        backbone.relu,
        backbone.maxpool,
        backbone.layer1,
        backbone.layer2,
        backbone.layer3,
        # backbone.layer4
    )
    self.roi_align = RoIAlign((7, 7) if USE_IMAGENET_PRETRAINED else (14, 14),
                              spatial_scale=1 / 16, sampling_ratio=0)
    if semantic:
        self.mask_dims = 32
        self.object_embed = torch.nn.Embedding(num_embeddings=81, embedding_dim=128)
        self.mask_upsample = torch.nn.Conv2d(1, self.mask_dims, kernel_size=3,
                                             stride=2 if USE_IMAGENET_PRETRAINED else 1,
                                             padding=1, bias=True)
    else:
        self.object_embed = None
        self.mask_upsample = None
    after_roi_align = [backbone.layer4]
    self.final_dim = final_dim
    if average_pool:
        after_roi_align += [nn.AvgPool2d(7, stride=1), Flattener()]
    self.after_roi_align = torch.nn.Sequential(*after_roi_align)
    self.obj_downsample = torch.nn.Sequential(
        torch.nn.Dropout(p=0.1),
        torch.nn.Linear(2048 + (128 if semantic else 0), final_dim),
        torch.nn.ReLU(inplace=True),
    )
    self.regularizing_predictor = torch.nn.Linear(2048, 81)
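# Hedged sketch of how features flow through a detector head like SimpleDetector above.
# The real module pools ResNet layer3 output (stride 16, 1024 channels) with RoIAlign and
# finishes each object with layer4 + average pooling to a 2048-d vector; everything below
# uses toy tensors and torchvision's functional roi_align, so treat it as an illustration
# of the spatial_scale / batch-index bookkeeping, not the project's API.
import torch
from torchvision.ops import roi_align

feat = torch.randn(2, 1024, 14, 14)              # stand-in for layer3 features of 2 images (stride 16)
rois = torch.tensor([[0., 32., 32., 96., 96.],   # (batch_index, x1, y1, x2, y2) in input pixels
                     [1., 0., 0., 160., 160.]])
obj_feats = roi_align(feat, rois, output_size=(14, 14), spatial_scale=1 / 16, sampling_ratio=0)
assert obj_feats.shape == (2, 1024, 14, 14)      # one 14x14 crop per box, ready for layer4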
def __init__(self, anchors, is_train, num_class):
    '''
    A tiny Faster R-CNN. Only supports a feature map of shape [1024, 50, 80].
    '''
    super().__init__()
    self.is_train = is_train
    self.num_class = num_class
    self.backbone = TinyBackbone()
    self.rpn = TinyRPN(anchors, self.is_train)
    self.roi_align = RoIAlign(output_size=(14, 14), spatial_scale=0.0625, sampling_ratio=0)
    self.fastrcnn_fc_layer = nn.Linear(1024 * 14 * 14, 1024)
    self.class_head = nn.Linear(1024, num_class + 1)  # Add bg
    self.box_head = nn.Linear(1024, num_class * 4)
def __init__(self, anchors, is_train, num_class):
    '''
    A tiny Faster R-CNN with a mask head. Only supports a feature map of shape [1024, 50, 80].
    '''
    super().__init__()
    self.is_train = is_train
    self.num_class = num_class
    self.backbone = TinyBackbone()
    self.rpn = TinyRPN(anchors, self.is_train)
    self.roi_align = RoIAlign(output_size=(14, 14), spatial_scale=0.0625, sampling_ratio=0)
    self.fastrcnn_fc_layer = nn.Linear(1024 * 14 * 14, 1024)
    self.class_head = nn.Linear(1024, num_class + 1)  # Add bg
    self.box_head = nn.Linear(1024, num_class * 4)
    self.mask_head = nn.Sequential(
        nn.ConvTranspose2d(1024, 256, kernel_size=2, stride=2, padding=0),
        nn.Conv2d(256, num_class, kernel_size=1, stride=1))
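# Hedged shape check (toy tensors, not the project's pipeline): the mask head above
# upsamples each 14x14 RoI feature to 28x28 with a stride-2 transposed conv and then
# predicts one 28x28 mask logit map per class with a 1x1 conv.
import torch
import torch.nn as nn

num_class = 3
mask_head = nn.Sequential(
    nn.ConvTranspose2d(1024, 256, kernel_size=2, stride=2, padding=0),
    nn.Conv2d(256, num_class, kernel_size=1, stride=1))
roi_feats = torch.randn(5, 1024, 14, 14)      # 5 proposals pooled by RoIAlign
masks = mask_head(roi_feats)
assert masks.shape == (5, num_class, 28, 28)  # per-class mask maps per proposal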
def __init__(self):
    super(Net, self).__init__()
    # define private variables
    self.time_step = C.RIN.INPUT_SIZE
    self.ve_feat_dim = C.RIN.VE_FEAT_DIM  # visual encoder feature dimension
    self.in_feat_dim = C.RIN.IN_FEAT_DIM  # interaction net feature dimension
    self.num_objs = C.RIN.NUM_OBJS
    self.mask_size = C.RIN.MASK_SIZE
    # position feature dimension
    self.po_feat_dim = (
        self.in_feat_dim if C.RIN.COOR_FEATURE_EMBEDDING or C.RIN.COOR_FEATURE_SINUSOID else 2
    ) if C.RIN.COOR_FEATURE else 0
    # build image encoder
    self.backbone = build_backbone(C.RIN.BACKBONE, self.ve_feat_dim, C.INPUT.IMAGE_CHANNEL)
    # extract object feature -> convert to object state
    pool_size = C.RIN.ROI_POOL_SIZE
    self.roi_align = RoIAlign(
        (pool_size, pool_size),
        spatial_scale=C.RIN.ROI_POOL_SPATIAL_SCALE,
        sampling_ratio=C.RIN.ROI_POOL_SAMPLE_R,
    )
    roi2state = [nn.Linear(self.ve_feat_dim * pool_size * pool_size, self.in_feat_dim), nn.ReLU()]
    for _ in range(C.RIN.N_EXTRA_ROI_F):
        roi2state.append(nn.Linear(self.in_feat_dim, self.in_feat_dim))
        roi2state.append(nn.ReLU(inplace=True))
    self.roi2state = nn.Sequential(*roi2state)
    graph = []
    for i in range(self.time_step):
        graph.append(InterNet(self.in_feat_dim))
    self.graph = nn.ModuleList(graph)
    predictor = [nn.Linear(self.in_feat_dim * self.time_step, self.in_feat_dim), nn.ReLU()]
    for _ in range(C.RIN.N_EXTRA_PRED_F):
        predictor.append(nn.Linear(self.in_feat_dim, self.in_feat_dim))
        predictor.append(nn.ReLU(inplace=True))
    self.predictor = nn.Sequential(*predictor)
    self.decoder_output = 4
    self.bbox_decoder = nn.Linear(self.in_feat_dim, self.decoder_output)
    if C.RIN.MASK_LOSS_WEIGHT > 0:
        self.mask_decoder = nn.Sequential(
            nn.Linear(self.in_feat_dim, self.in_feat_dim),
            nn.Linear(self.in_feat_dim, self.mask_size * self.mask_size),
            nn.Sigmoid(),
        )
    self.image2prior = nn.Sequential(
        nn.Conv2d(self.ve_feat_dim * 2, self.ve_feat_dim, 3, 2, 1),
        nn.ReLU(),
        nn.Conv2d(self.ve_feat_dim, self.ve_feat_dim, 3, 2, 1),
        nn.ReLU(),
        nn.Conv2d(self.ve_feat_dim, self.ve_feat_dim, 3, 2, 1),
        nn.ReLU(),
        nn.AdaptiveAvgPool2d((1, 1)),
    )
    self.vae_dim = 8
    self.lstm_layers = 1
    self.vae_lstm = nn.LSTM(self.vae_dim, self.vae_dim, self.lstm_layers)
    self.vae_mu_head = nn.Linear(self.ve_feat_dim, 8)
    self.vae_logvar_head = nn.Linear(self.ve_feat_dim, 8)
    self.red_prior = nn.Linear(self.in_feat_dim + 8, self.in_feat_dim)
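# Hedged sketch of the VAE-style prior wiring above (toy dims, and the forward logic is
# assumed, since only __init__ is shown): image2prior pools a stacked pair of frames to a
# ve_feat_dim vector, the two heads produce an 8-d mean and log-variance, a latent is drawn
# with the usual reparameterization trick, and red_prior folds it back into the object state.
import torch
import torch.nn as nn

ve_feat_dim, in_feat_dim = 16, 32
prior_feat = torch.randn(4, ve_feat_dim)                # stand-in for flattened image2prior output
vae_mu_head = nn.Linear(ve_feat_dim, 8)
vae_logvar_head = nn.Linear(ve_feat_dim, 8)
mu, logvar = vae_mu_head(prior_feat), vae_logvar_head(prior_feat)
z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()    # reparameterization trick
red_prior = nn.Linear(in_feat_dim + 8, in_feat_dim)
state = torch.randn(4, in_feat_dim)                     # stand-in object state
state = red_prior(torch.cat([state, z], dim=-1))        # latent concatenated onto the state
assert state.shape == (4, in_feat_dim)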
def __init__(self, trans, conv):
    super(Net, self).__init__()
    # a bunch of temporary flags; the useful settings will be merged into the config file
    # transformer parameters
    self.trans = trans
    self.conv = conv
    # these are here just to easily set up experiments
    self.use_ln = False
    self.norm_before_relu = False
    self.pos_feat_ar = False
    # define private variables
    self.time_step = C.RIN.INPUT_SIZE
    self.ve_feat_dim = C.RIN.VE_FEAT_DIM  # visual encoder feature dimension
    self.in_feat_dim = C.RIN.IN_FEAT_DIM  # interaction net feature dimension
    self.num_objs = C.RIN.NUM_OBJS
    self.mask_size = C.RIN.MASK_SIZE
    self.po_feat_dim = self.in_feat_dim if C.RIN.COOR_FEATURE else 0  # position feature dimension
    # build image encoder
    self.backbone = build_backbone(C.RIN.BACKBONE, self.ve_feat_dim, C.INPUT.IMAGE_CHANNEL)
    # extract object feature -> convert to object state
    pool_size = C.RIN.ROI_POOL_SIZE
    self.roi_align = RoIAlign(
        (pool_size, pool_size),
        spatial_scale=C.RIN.ROI_POOL_SPATIAL_SCALE,
        sampling_ratio=C.RIN.ROI_POOL_SAMPLE_R,
    )
    roi2state = [nn.Conv2d(self.ve_feat_dim, self.in_feat_dim, kernel_size=3, padding=1),
                 nn.ReLU(inplace=True)]
    assert C.RIN.N_EXTRA_ROI_F > 0
    for _ in range(C.RIN.N_EXTRA_ROI_F):
        # note: in_channels is ve_feat_dim even though the previous layer outputs
        # in_feat_dim, so this only works when the two dims are equal
        roi2state.append(nn.Conv2d(self.ve_feat_dim, self.in_feat_dim,
                                   kernel_size=C.RIN.EXTRA_F_KERNEL, stride=1,
                                   padding=C.RIN.EXTRA_F_PADDING))
        if self.norm_before_relu and _ == C.RIN.N_EXTRA_ROI_F - 1:
            continue
        roi2state.append(nn.ReLU(inplace=True))
    self.roi2state = nn.Sequential(*roi2state)
    graph = []
    for i in range(self.time_step):
        graph.append(InterNet(self.in_feat_dim, trans=self.trans, conv=self.conv))
    self.graph = nn.ModuleList(graph)
    assert C.RIN.N_EXTRA_PRED_F == 0
    if self.norm_before_relu:
        predictor = [nn.Conv2d(self.in_feat_dim * self.time_step, self.in_feat_dim, kernel_size=1)]
    else:
        predictor = [nn.Conv2d(self.in_feat_dim * self.time_step, self.in_feat_dim, kernel_size=1),
                     nn.ReLU()]
    for _ in range(C.RIN.N_EXTRA_PRED_F):
        predictor.append(nn.Conv2d(self.in_feat_dim, self.in_feat_dim,
                                   kernel_size=C.RIN.EXTRA_F_KERNEL, stride=1,
                                   padding=C.RIN.EXTRA_F_PADDING))
        predictor.append(nn.ReLU(inplace=True))
    self.predictor = nn.Sequential(*predictor)
    self.decoder_output = 4
    self.bbox_decoder = nn.Linear(self.in_feat_dim * pool_size * pool_size, self.decoder_output)
    if C.RIN.COOR_FEATURE:
        self.pos_encoder = nn.Sequential(
            nn.Linear(4, self.po_feat_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.po_feat_dim, self.po_feat_dim),
            nn.ReLU(inplace=True),
        )
        if self.norm_before_relu:
            self.pos_merger = nn.Sequential(
                nn.Conv2d(self.in_feat_dim + self.po_feat_dim, self.in_feat_dim,
                          kernel_size=3, stride=1, padding=1),
            )
        else:
            self.pos_merger = nn.Sequential(
                nn.Conv2d(self.in_feat_dim + self.po_feat_dim, self.in_feat_dim,
                          kernel_size=3, stride=1, padding=1),
                nn.ReLU(inplace=True),
            )
    if C.RIN.MASK_LOSS_WEIGHT > 0:
        self.mask_decoder = nn.Sequential(
            nn.Linear(self.in_feat_dim * pool_size * pool_size, self.in_feat_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.in_feat_dim, self.mask_size * self.mask_size),
            nn.Sigmoid(),
        )
    if C.RIN.SEQ_CLS_LOSS_WEIGHT > 0:
        self.seq_feature = nn.Sequential(
            nn.Linear(self.in_feat_dim * pool_size * pool_size, self.in_feat_dim * 4),
            nn.ReLU(inplace=True),
            nn.Linear(self.in_feat_dim * 4, self.in_feat_dim),
            nn.ReLU(inplace=True),
        )
        self.seq_score = nn.Sequential(
            nn.Linear(self.in_feat_dim, 1),
            nn.Sigmoid()
        )
    if self.use_ln:
        norms = [nn.LayerNorm([self.in_feat_dim, 5, 5]) for _ in range(self.time_step)]
        self.norms = nn.ModuleList(norms)
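# Hedged sketch of the coordinate-feature path above (toy dims; the forward logic is
# assumed, since only the modules are defined): each object's normalized box (4 numbers) is
# embedded by pos_encoder, broadcast over the pooled grid, concatenated with the RoI feature
# map along channels, and fused back to in_feat_dim by pos_merger.
import torch
import torch.nn as nn

in_feat_dim = po_feat_dim = 32
pool_size, num_objs = 5, 3
pos_encoder = nn.Sequential(nn.Linear(4, po_feat_dim), nn.ReLU(),
                            nn.Linear(po_feat_dim, po_feat_dim), nn.ReLU())
pos_merger = nn.Conv2d(in_feat_dim + po_feat_dim, in_feat_dim, kernel_size=3, padding=1)

state = torch.randn(num_objs, in_feat_dim, pool_size, pool_size)   # pooled object features
boxes = torch.rand(num_objs, 4)                                    # normalized (x1, y1, x2, y2)
pos = pos_encoder(boxes)[:, :, None, None].expand(-1, -1, pool_size, pool_size)
state = pos_merger(torch.cat([state, pos], dim=1))
assert state.shape == (num_objs, in_feat_dim, pool_size, pool_size)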
def __init__(self):
    super(Net, self).__init__()
    # a bunch of temporary flags; the useful settings will be merged into the config file
    # these are here just to easily set up experiments
    self.use_conv_for_mask = False
    # define private variables
    self.time_step = C.RIN.INPUT_SIZE
    self.ve_feat_dim = C.RIN.VE_FEAT_DIM  # visual encoder feature dimension
    self.in_feat_dim = C.RIN.IN_FEAT_DIM  # interaction net feature dimension
    self.num_objs = C.RIN.NUM_OBJS
    self.mask_size = C.RIN.MASK_SIZE
    self.po_feat_dim = self.in_feat_dim if C.RIN.COOR_FEATURE else 0  # position feature dimension
    # build image encoder
    self.backbone = build_backbone(C.RIN.BACKBONE, self.ve_feat_dim, C.INPUT.IMAGE_CHANNEL)
    # extract object feature -> convert to object state
    pool_size = C.RIN.ROI_POOL_SIZE
    self.roi_align = RoIAlign(
        (pool_size, pool_size),
        spatial_scale=C.RIN.ROI_POOL_SPATIAL_SCALE,
        sampling_ratio=C.RIN.ROI_POOL_SAMPLE_R,
    )
    roi2state = [
        nn.Conv2d(self.ve_feat_dim, self.in_feat_dim, kernel_size=3, padding=1),
        nn.ReLU(inplace=True)
    ]
    for _ in range(C.RIN.N_EXTRA_ROI_F):
        # note: in_channels is ve_feat_dim even though the previous layer outputs
        # in_feat_dim, so this only works when the two dims are equal
        roi2state.append(
            nn.Conv2d(self.ve_feat_dim, self.in_feat_dim,
                      kernel_size=C.RIN.EXTRA_F_KERNEL, stride=1,
                      padding=C.RIN.EXTRA_F_PADDING))
        roi2state.append(nn.ReLU(inplace=True))
    self.roi2state = nn.Sequential(*roi2state)
    graph = []
    for i in range(self.time_step):
        graph.append(InterNet(self.in_feat_dim))
    self.graph = nn.ModuleList(graph)
    predictor = [
        nn.Conv2d(self.in_feat_dim * self.time_step, self.in_feat_dim, kernel_size=1),
        nn.ReLU()
    ]
    for _ in range(C.RIN.N_EXTRA_PRED_F):
        predictor.append(
            nn.Conv2d(self.in_feat_dim, self.in_feat_dim,
                      kernel_size=C.RIN.EXTRA_F_KERNEL, stride=1,
                      padding=C.RIN.EXTRA_F_PADDING))
        predictor.append(nn.ReLU(inplace=True))
    self.predictor = nn.Sequential(*predictor)
    self.decoder_output = 4
    self.bbox_decoder = nn.Linear(self.in_feat_dim * pool_size * pool_size, self.decoder_output)
    if C.RIN.COOR_FEATURE:
        self.pos_encoder = nn.Sequential(
            nn.Linear(4, self.in_feat_dim),
            nn.ReLU(inplace=True),
            nn.Linear(self.in_feat_dim, self.in_feat_dim),
            nn.ReLU(inplace=True),
        )
        self.pos_merger = nn.Sequential(
            nn.Conv2d(self.in_feat_dim + self.po_feat_dim, self.in_feat_dim,
                      kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
        )
    if C.RIN.MASK_LOSS_WEIGHT > 0:
        if self.use_conv_for_mask:
            self.mask_decoder = nn.Sequential(
                nn.Conv2d(self.in_feat_dim, self.in_feat_dim, kernel_size=1),
                nn.ReLU(inplace=True),
                nn.Conv2d(self.in_feat_dim, self.in_feat_dim, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
                nn.UpsamplingNearest2d(size=(C.RIN.MASK_SIZE, C.RIN.MASK_SIZE)),
                nn.Conv2d(self.in_feat_dim, self.in_feat_dim, kernel_size=3, padding=1),
                nn.ReLU(inplace=True),
                nn.Conv2d(self.in_feat_dim, 1, kernel_size=3, padding=1),
                nn.Sigmoid(),
            )
        else:
            self.mask_decoder = nn.Sequential(
                nn.Linear(self.in_feat_dim * pool_size * pool_size, self.in_feat_dim),
                nn.ReLU(inplace=True),
                nn.Linear(self.in_feat_dim, self.mask_size * self.mask_size),
                nn.Sigmoid(),
            )
    self.image2prior = nn.Sequential(
        nn.Conv2d(self.ve_feat_dim * 2, self.ve_feat_dim, 3, 2, 1),
        nn.ReLU(),
        nn.Conv2d(self.ve_feat_dim, self.ve_feat_dim, 3, 2, 1),
        nn.ReLU(),
        nn.Conv2d(self.ve_feat_dim, self.ve_feat_dim, 3, 2, 1),
        nn.ReLU(),
        nn.AdaptiveAvgPool2d((1, 1)),
    )
    self.vae_dim = 8
    self.lstm_layers = 1
    self.vae_lstm = nn.LSTM(self.vae_dim, self.vae_dim, self.lstm_layers)
    self.vae_mu_head = nn.Linear(self.ve_feat_dim, 8)
    self.vae_logvar_head = nn.Linear(self.ve_feat_dim, 8)
    self.red_prior = nn.Conv2d(self.in_feat_dim + 8, self.in_feat_dim, kernel_size=1)
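# Hedged shape check for the convolutional mask decoder variant above (toy dims; the real
# in_feat_dim and MASK_SIZE come from the config): it keeps the pooled RoI grid spatial,
# upsamples it to the mask size with nearest-neighbour interpolation, and ends with a
# single-channel sigmoid mask per object.
import torch
import torch.nn as nn

in_feat_dim, pool_size, mask_size, num_objs = 32, 5, 14, 3
mask_decoder = nn.Sequential(
    nn.Conv2d(in_feat_dim, in_feat_dim, kernel_size=1), nn.ReLU(inplace=True),
    nn.Conv2d(in_feat_dim, in_feat_dim, kernel_size=3, padding=1), nn.ReLU(inplace=True),
    nn.UpsamplingNearest2d(size=(mask_size, mask_size)),
    nn.Conv2d(in_feat_dim, in_feat_dim, kernel_size=3, padding=1), nn.ReLU(inplace=True),
    nn.Conv2d(in_feat_dim, 1, kernel_size=3, padding=1), nn.Sigmoid(),
)
masks = mask_decoder(torch.randn(num_objs, in_feat_dim, pool_size, pool_size))
assert masks.shape == (num_objs, 1, mask_size, mask_size)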
def get_visual_data(save_dir, data_dir, video_dir, video_name, resnext101, device):
    # resolve input/output paths
    tracks_path = os.path.join(save_dir, 'tracks.pkl')
    length_path = os.path.join(save_dir, 'length.pkl')
    video_path = os.path.join(video_dir, video_name + '.avi')
    visual_feature_dir = os.path.join(data_dir, 'objects')
    class_feature_dir = os.path.join(data_dir, 'category')
    visual_feature_path = os.path.join(visual_feature_dir, video_name + '.npy')
    class_feature_path = os.path.join(class_feature_dir, video_name + '.npy')
    objects_mask_path = os.path.join(visual_feature_dir, video_name + '_mask.pkl')
    hastracks_path = os.path.join(visual_feature_dir, video_name + '_hastracks.pkl')
    with open(tracks_path, 'rb') as f:
        tracks = pickle.load(f)
    with open(length_path, 'rb') as f:
        frames_numb = pickle.load(f)
    # drop short tracks and keep at most the 20 longest ones
    interval = frames_numb // 15
    tracks = [item for item in tracks if len(item['frame_ids']) >= interval]
    tracks = sorted(tracks, key=lambda x: len(x['frame_ids']), reverse=True)
    if len(tracks) > 20:
        tracks = tracks[:20]
    num_tracks = len(tracks)
    if num_tracks == 0:
        # no usable tracks: save placeholder features and masks, then bail out
        has_tracks = False
        visual_features = np.zeros([15, 1, 2048, 4, 4])
        objects_mask = np.zeros([15, 1])
        class_features = np.zeros([1, 1000], dtype=int)
        np.save(visual_feature_path, visual_features)
        np.save(class_feature_path, class_features)
        with open(objects_mask_path, 'wb') as f:
            pickle.dump(objects_mask, f)
        with open(hastracks_path, 'wb') as f:
            pickle.dump(has_tracks, f)
        return
    # decode all frames; `trans` (frame transform) and `resnext_dim` are defined elsewhere in this module
    frames_store = []
    import cv2
    vdo = cv2.VideoCapture()
    vdo.open(video_path)
    im_width, im_height = int(vdo.get(cv2.CAP_PROP_FRAME_WIDTH)), int(vdo.get(cv2.CAP_PROP_FRAME_HEIGHT))
    while vdo.grab():
        _, ori_im = vdo.retrieve()
        im = cv2.cvtColor(ori_im, cv2.COLOR_BGR2RGB)
        im = Image.fromarray(im)
        frames_store.append(trans(im).unsqueeze(0))
    new_height, new_width = frames_store[0].shape[-2], frames_store[0].shape[-1]
    assert len(frames_store) == frames_numb, \
        'expect len(frames_store) == frames_numb, ' \
        'but got {} and {}'.format(len(frames_store), frames_numb)
    # mark frames that contain at least one tracked object
    visit = np.zeros(frames_numb + 5, dtype=int)
    for track in tracks:
        frames_ids = track['frame_ids']
        for id in frames_ids:
            new_id = id - 1
            assert new_id >= 0
            visit[new_id] = 1
    # one-hot class vector per track
    class_store = []
    for track in tracks:
        class_id = track['class_id']
        class_vec = np.zeros(1000, dtype=int)
        class_vec[class_id] = 1
        class_store.append(class_vec)
    class_store = np.concatenate([item[np.newaxis, ...] for item in class_store], axis=0)
    assert class_store.shape == (num_tracks, 1000), \
        'expected class_store.shape == ({}, 1000), but got {}'.format(num_tracks, class_store.shape)
    # pick up to 15 anchor frames among the visited ones
    time_span = 15
    step = max(int(visit.sum()) // time_span, 1)
    anchor_idxs = []
    has_tracks = True
    cnt = 0
    for i in range(frames_numb):
        if visit[i] == 0:
            continue
        cnt += 1
        if cnt % step == 0:
            anchor_idxs.append(i)
    origin_anchors_len = len(anchor_idxs)
    idxs = np.linspace(0, len(anchor_idxs), time_span, endpoint=False, dtype=int).tolist()
    anchor_idxs = [anchor_idxs[i] for i in idxs] if origin_anchors_len > time_span else anchor_idxs
    if origin_anchors_len >= time_span:
        assert len(anchor_idxs) == time_span, \
            'expected len(anchor_idxs) == time_span, but got {}, and now step is {}'.format(len(anchor_idxs), step)
    else:
        assert int(visit.sum()) < time_span, \
            'expect visit.sum() < time_span, but get {} and step is {}'.format(int(visit.sum()), step)
    # pool a 4x4 RoI feature per track for each anchor frame
    pool_size = 4
    spatial_scale = 1.0 / 32.0
    fake_feature = np.zeros([1, resnext_dim, pool_size, pool_size])
    roi_align = RoIAlign((pool_size, pool_size), spatial_scale=spatial_scale, sampling_ratio=1).to(device)
    result_feature = []
    objects_mask = np.zeros([time_span, len(tracks)])
    for i in range(time_span):
        idx = anchor_idxs[i] if i < len(anchor_idxs) else None
        temp_store = []
        feature_map = resnext101(frames_store[idx].to(device)) if idx is not None else None  # (1, 2048, H/32, W/32)
        for j, item in enumerate(tracks):
            frames_ids, positions, class_id = item['frame_ids'], item['positions'], item['class_id']
            if idx is None or idx not in frames_ids:
                objects_mask[i, j] = 0
                temp_store.append(fake_feature)
            else:
                ptr = frames_ids.index(idx)
                position = positions[ptr]
                position = process_pos(position, im_width, im_height, new_width, new_height)
                x1, y1, x2, y2 = position[0], position[1], position[2], position[3]
                bbox = torch.FloatTensor([x1, y1, x2, y2]).unsqueeze(dim=0).to(device)
                roi_feature = roi_align(feature_map, [bbox])
                assert roi_feature.shape == (1, resnext_dim, pool_size, pool_size), \
                    'expected roi_feature.shape is {} but got {}'.format((1, resnext_dim, pool_size, pool_size), roi_feature.shape)
                objects_mask[i, j] = 1
                temp_store.append(roi_feature.detach().cpu().numpy())
        temp_store = np.concatenate([item for item in temp_store], axis=0)
        assert temp_store.shape == (num_tracks, resnext_dim, pool_size, pool_size), \
            'expected temp_store.shape == {}, but got {}'.format((num_tracks, resnext_dim, pool_size, pool_size), temp_store.shape)
        result_feature.append(temp_store)
    result_feature = np.concatenate([item[np.newaxis, ...] for item in result_feature], axis=0)
    assert result_feature.shape == (time_span, num_tracks, resnext_dim, pool_size, pool_size), \
        'expected result_feature.shape == {}, but got {}'.format((time_span, num_tracks, resnext_dim, pool_size, pool_size), result_feature.shape)
    assert objects_mask[len(anchor_idxs):, ...].sum() == 0., \
        'expect 0. in objects_mask[len(anchor_idxs):, ...], but got {}'.format(objects_mask[len(anchor_idxs):, ...].sum())
    # persist features, class vectors, and masks
    np.save(visual_feature_path, result_feature)
    np.save(class_feature_path, class_store)
    with open(objects_mask_path, 'wb') as f:
        pickle.dump(objects_mask, f)
    with open(hastracks_path, 'wb') as f:
        pickle.dump(has_tracks, f)
    print('{video_name} has {x} objects'.format(video_name=video_name, x=len(tracks) if has_tracks else 0))
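# Hedged illustration of the anchor-frame subsampling logic used above (standalone numpy,
# invented frame counts): frames that contain at least one track are kept, every `step`-th
# kept frame becomes a candidate anchor, and np.linspace(..., endpoint=False) then thins the
# candidates down to exactly time_span anchors when there are too many.
import numpy as np

time_span = 15
visit = np.zeros(100, dtype=int)
visit[10:80] = 1                                    # pretend frames 10..79 contain tracked objects
step = max(int(visit.sum()) // time_span, 1)
anchor_idxs = [i for cnt, i in enumerate(np.flatnonzero(visit), start=1) if cnt % step == 0]
if len(anchor_idxs) > time_span:
    idxs = np.linspace(0, len(anchor_idxs), time_span, endpoint=False, dtype=int)
    anchor_idxs = [anchor_idxs[i] for i in idxs]
assert len(anchor_idxs) == time_span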