def __init__(self, cfg):
    super(GCNnet_collective, self).__init__()
    self.cfg = cfg

    D = self.cfg.emb_features
    K = self.cfg.crop_size[0]
    NFB = self.cfg.num_features_boxes
    NFR, NFG = self.cfg.num_features_relation, self.cfg.num_features_gcn

    self.backbone = MyInception_v3(transform_input=False, pretrained=True)

    if not self.cfg.train_backbone:
        for p in self.backbone.parameters():
            p.requires_grad = False

    self.roi_align = RoIAlign(*self.cfg.crop_size)

    self.fc_emb_1 = nn.Linear(K * K * D, NFB)
    self.nl_emb_1 = nn.LayerNorm([NFB])

    self.gcn_list = torch.nn.ModuleList(
        [GCN_Module(self.cfg) for i in range(self.cfg.gcn_layers)])

    self.dropout_global = nn.Dropout(p=self.cfg.train_dropout_prob)

    self.fc_actions = nn.Linear(NFG, self.cfg.num_actions)
    self.fc_activities = nn.Linear(NFG, self.cfg.num_activities)

    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
def __init__(self, cfg):
    super(Basenet_collective, self).__init__()
    self.cfg = cfg

    D = self.cfg.emb_features
    K = self.cfg.crop_size[0]
    NFB = self.cfg.num_features_boxes
    NFR, NFG = self.cfg.num_features_relation, self.cfg.num_features_gcn

    self.backbone = MyInception_v3(transform_input=False, pretrained=True)
    # self.backbone = MyVGG16(pretrained=True)

    if not self.cfg.train_backbone:
        for p in self.backbone.parameters():
            p.requires_grad = False

    self.roi_align = RoIAlign(*self.cfg.crop_size)

    self.fc_emb_1 = nn.Linear(K * K * D, NFB)
    self.dropout_emb_1 = nn.Dropout(p=self.cfg.train_dropout_prob)
    # self.nl_emb_1 = nn.LayerNorm([NFB])

    self.fc_actions = nn.Linear(NFB, self.cfg.num_actions)
    self.fc_activities = nn.Linear(NFB, self.cfg.num_activities)

    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight)
def run_roi_align(frame_feature_map, bbox_in, crop_size=(7, 7)):
    roi_align = RoIAlign(crop_size[0], crop_size[1])

    B, C, H, W = frame_feature_map.size()   # B here is batch * frames (B*T)
    input_size = bbox_in.size()             # (B, T, N, 4)

    boxes_in_flat = torch.reshape(bbox_in, (-1, 4))  # (B*T*N, 4)

    # Pair each box with the feature map it is cropped from.
    boxes_idx = [i * torch.ones(input_size[2], dtype=torch.int)
                 for i in range(input_size[0] * input_size[1])]
    boxes_idx = torch.stack(boxes_idx).to(device=bbox_in.device)  # (B*T, N)
    boxes_idx_flat = torch.reshape(
        boxes_idx, (input_size[0] * input_size[1] * input_size[2],))  # (B*T*N,)
    del boxes_idx

    # Boxes are fixed inputs; no gradient should flow through them.
    boxes_in_flat = boxes_in_flat.detach()
    boxes_idx_flat = boxes_idx_flat.detach()

    # RoI Align: (B*T*N, C, crop_h, crop_w)
    boxes_features = roi_align(frame_feature_map, boxes_in_flat, boxes_idx_flat)
    del boxes_in_flat

    # Keep the (box, channel) ordering produced by RoIAlign. Reshaping to
    # (B, C, -1, h, w) as in the original would scramble boxes and channels.
    boxes_features = torch.reshape(
        boxes_features, (input_size[0], -1, C, crop_size[0], crop_size[1]))
    return boxes_features
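# A minimal usage sketch for run_roi_align above (all tensor sizes here are
# illustrative assumptions, not values from the source).
import torch

B, T, N, C = 2, 3, 4, 256
frame_feature_map = torch.randn(B * T, C, 28, 28)
x1y1 = torch.rand(B, T, N, 2) * 10        # top-left corners in feature-map coords
wh = torch.rand(B, T, N, 2) * 10 + 1      # box widths/heights
bbox_in = torch.cat([x1y1, x1y1 + wh], dim=-1)
feats = run_roi_align(frame_feature_map, bbox_in, crop_size=(7, 7))
print(feats.shape)                        # torch.Size([2, 12, 256, 7, 7])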
def __init__(self, cfg):
    super(Basenet_volleyball, self).__init__()
    self.cfg = cfg

    NFB = self.cfg.num_features_boxes
    D = self.cfg.emb_features
    K = self.cfg.crop_size[0]

    if cfg.backbone == 'inv3':
        self.backbone = MyInception_v3(transform_input=False, pretrained=True)
    elif cfg.backbone == 'vgg16':
        self.backbone = MyVGG16(pretrained=True)
    elif cfg.backbone == 'vgg19':
        self.backbone = MyVGG19(pretrained=True)
    else:
        assert False

    self.roi_align = RoIAlign(*self.cfg.crop_size)

    self.fc_emb = nn.Linear(K * K * D, NFB)
    self.dropout_emb = nn.Dropout(p=self.cfg.train_dropout_prob)

    self.fc_actions = nn.Linear(NFB, self.cfg.num_actions)
    self.fc_activities = nn.Linear(NFB, self.cfg.num_activities)

    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight)
            if m.bias is not None:
                nn.init.zeros_(m.bias)
def __init__(self):
    super(Proposal, self).__init__()
    self.width = 1
    self.height = 1
    self.region_width = 70
    self.region_height = 70
    self.stride = 1
    # using 5 layers PatchGAN
    self.receptive_field = 70.
    self.roialign = RoIAlign(self.region_height, self.region_width,
                             transform_fpcoor=True)
def __init__(self, crop_dim, max_bboxes):
    super(RoIFeats, self).__init__()
    # Check whether the scale ratio needs to be given as input here, or
    # whether the box coordinates should be adjusted by the caller instead.
    # We'll do a square crop of the image features for each bounding box.
    self.crop_dim = crop_dim
    self.max_bboxes = max_bboxes
    self.roi_align = RoIAlign(self.crop_dim, self.crop_dim,
                              transform_fpcoor=False)
    self.avg_pool = nn.AvgPool2d(self.crop_dim)
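# Hypothetical usage of RoIFeats (only __init__ is shown above; this sketch
# just exercises the two submodules it builds, with illustrative shapes):
import torch

feats_module = RoIFeats(crop_dim=7, max_bboxes=10)
fmap = torch.randn(1, 256, 32, 32)
boxes = torch.tensor([[4.0, 4.0, 20.0, 20.0]])         # (x1, y1, x2, y2) in feature coords
box_index = torch.zeros(1, dtype=torch.int)            # all boxes from image 0
crops = feats_module.roi_align(fmap, boxes, box_index)  # (1, 256, 7, 7)
pooled = feats_module.avg_pool(crops).flatten(1)        # (1, 256) per-box vector
print(pooled.shape)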
def __init__(self, config):
    super(Proposal, self).__init__()
    self.width = config['window_width']
    self.height = config['window_height']
    self.region_width = config['region_width']
    self.region_height = config['region_height']
    self.stride = 1
    # using 5 layers PatchGAN
    self.receptive_field = config['receptive_field']
    self.roialign = RoIAlign(self.region_height, self.region_width,
                             transform_fpcoor=True)
    # use mask operation or not
    self.mask_opt = config['mask_operation']
def __init__(self, cfg):
    super(GCNnet_volleyball, self).__init__()
    self.cfg = cfg

    T, N = self.cfg.num_frames, self.cfg.num_boxes
    D = self.cfg.emb_features
    K = self.cfg.crop_size[0]
    NFB = self.cfg.num_features_boxes
    NFR, NFG = self.cfg.num_features_relation, self.cfg.num_features_gcn

    if cfg.backbone == 'inv3':
        self.backbone = MyInception_v3(transform_input=False, pretrained=True)
    elif cfg.backbone == 'vgg16':
        self.backbone = MyVGG16(pretrained=True)
    elif cfg.backbone == 'vgg19':
        self.backbone = MyVGG19(pretrained=False)
    else:
        assert False

    if not cfg.train_backbone:
        for p in self.backbone.parameters():
            p.requires_grad = False

    self.roi_align = RoIAlign(*self.cfg.crop_size)

    self.fc_emb_1 = nn.Linear(K * K * D, NFB)
    self.nl_emb_1 = nn.LayerNorm([NFB])

    self.dropout_global = nn.Dropout(p=self.cfg.train_dropout_prob)
    self.fc_activities = nn.Linear(NFG, self.cfg.num_activities)

    self.gcn_list = GCN_Module(cfg)
    self.encoder = st_gcn(cfg, in_channels=1024, out_channels=64,
                          kernel_size=(3, 1), stride=1)
    self.encoder_a = st_gcn_short(cfg, in_channels=1024, out_channels=64,
                                  kernel_size=(1, 1), stride=1)
    self.decoder_a = st_gcn_decoder(cfg, in_channels=1024, out_channels=64,
                                    kernel_size=(1, 1), stride=1)
    self.encoder_p = ConvGraphicalEncoder(in_channels=2, out_channels=1024,
                                          kernel_size=1)
    self.decoder_p = ConvGraphicalDecoder(in_channels=1024, out_channels=2,
                                          kernel_size=1)

    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight)
def __init__(self, args):
    super(BBoxModule, self).__init__()
    self.args = args
    for supervision in args.supervision:
        if supervision['name'] == 'bbox':
            self.crop_height = int(supervision['other']['pool_size'])
            self.crop_width = int(supervision['other']['pool_size'])
    self.roi_align = RoIAlign(self.crop_height, self.crop_width,
                              transform_fpcoor=True)
    self.down_sampling_rate = self.args.down_sampling_rate
    self.feat_dim = args.feat_dim * self.crop_width * self.crop_height
    self.num_class = args.num_base_class
    self.regress = nn.Linear(self.feat_dim, 4)
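# Illustrative construction of BBoxModule with a minimal stand-in args object
# (all field values below are assumptions for this sketch, not project defaults):
import torch
from types import SimpleNamespace

args = SimpleNamespace(
    supervision=[{'name': 'bbox', 'other': {'pool_size': 7}}],
    down_sampling_rate=8, feat_dim=256, num_base_class=80)
module = BBoxModule(args)
rois = module.roi_align(torch.randn(1, 256, 32, 32),
                        torch.tensor([[2.0, 2.0, 18.0, 18.0]]),
                        torch.zeros(1, dtype=torch.int))
print(module.regress(rois.flatten(1)).shape)   # torch.Size([1, 4]) box offsets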
def __init__(self, cfg):
    super(Basenet_collective, self).__init__()
    self.cfg = cfg

    D = self.cfg.emb_features
    K = self.cfg.crop_size[0]
    NFB = self.cfg.num_features_boxes

    # START: Original code by Zijian and Xinran
    if cfg.backbone == 'inv3':
        self.backbone = MyInception_v3(transform_input=False, pretrained=True)
    elif cfg.backbone == 'vgg16':
        self.backbone = MyVGG16(pretrained=True)
    elif cfg.backbone == 'vgg19':
        self.backbone = MyVGG19(pretrained=True)
    elif cfg.backbone == 'mobilenet':
        self.backbone = MyMobileNet(pretrained=True)
    else:
        assert False
    # END: Original code by Zijian and Xinran

    if not self.cfg.train_backbone:
        for p in self.backbone.parameters():
            p.requires_grad = False

    self.roi_align = RoIAlign(*self.cfg.crop_size)

    # START: Original code by Zijian and Xinran
    # Note: the embedding layer is only defined for the inv3 and mobilenet
    # backbones; the vgg variants would need their own input size here.
    if cfg.backbone == 'inv3':
        self.fc_emb_1 = nn.Linear(K * K * D, NFB)
    elif cfg.backbone == 'mobilenet':
        self.fc_emb_1 = nn.Linear(32000, NFB)
    # END: Original code by Zijian and Xinran

    self.dropout_emb_1 = nn.Dropout(p=self.cfg.train_dropout_prob)
    self.fc_actions = nn.Linear(NFB, self.cfg.num_actions)
    self.fc_activities = nn.Linear(NFB, self.cfg.num_activities)

    for m in self.modules():
        if isinstance(m, nn.Linear):
            nn.init.kaiming_normal_(m.weight)
def test_roialign(is_cuda=True):
    # generate data
    crop_height = 3
    crop_width = 3

    image_data, boxes_data, box_index_data = generate_data(
        batch_size=2, depth=2, im_height=10, im_width=10,
        n_boxes=2, xyxy=True, box_normalize=False)
    max_inp = np.abs(image_data).max()
    print('max_input:', max_inp)

    image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
    boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
    box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

    roi_align = RoIAlign(crop_height, crop_width, transform_fpcoor=False)
    gradcheck(roi_align, (image_torch, boxes, box_index), eps=max_inp / 500)
    print('test ok')
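# A complementary shape sanity check, standalone from generate_data. The data
# values are illustrative assumptions, and the to_varabile helper is assumed
# to be the same one the test above uses.
import numpy as np

img = to_varabile(np.random.rand(2, 2, 10, 10).astype(np.float32),
                  requires_grad=False, is_cuda=False)
bxs = to_varabile(np.asarray([[0, 0, 9, 9], [2, 2, 6, 6]], dtype=np.float32),
                  requires_grad=False, is_cuda=False)
idx = to_varabile(np.asarray([0, 1], dtype=np.int32),
                  requires_grad=False, is_cuda=False)
crops = RoIAlign(3, 3, transform_fpcoor=False)(img, bxs, idx)
assert crops.shape == (2, 2, 3, 3)   # (n_boxes, depth, crop_height, crop_width)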
def __init__(self, args, backbone, classifier):
    super(BaseLearningModule, self).__init__()
    self.args = args
    self.backbone = backbone
    self.classifier = classifier

    self.crop_height = int(args.crop_height)
    self.crop_width = int(args.crop_width)
    self.roi_align = RoIAlign(args.crop_height, args.crop_width,
                              transform_fpcoor=True)
    self.down_sampling_rate = args.down_sampling_rate

    # supervision modules are generated in train
    # args.module example:
    # [{'name': 'seg', 'module': seg_module}]
    if hasattr(args, 'module'):
        for module in args.module:
            setattr(self, module['name'], module['module'])

    self.mode = 'train'
    if self.classifier is not None:
        self.classifier.mode = self.mode
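# Sketch of the coordinate mapping this module relies on: boxes given in image
# pixels are divided by down_sampling_rate to land in feature-map coordinates
# before RoIAlign (all values below are illustrative assumptions):
import torch
from roi_align.roi_align import RoIAlign

down_sampling_rate = 8
roi_align = RoIAlign(7, 7, transform_fpcoor=True)
feature_map = torch.randn(1, 512, 64, 64)               # a 512x512 image at rate 8
boxes_img = torch.tensor([[32.0, 48.0, 256.0, 320.0]])  # (x1, y1, x2, y2) in pixels
boxes_feat = boxes_img / down_sampling_rate             # pixel -> feature coords
box_index = torch.zeros(1, dtype=torch.int)
print(roi_align(feature_map, boxes_feat, box_index).shape)  # torch.Size([1, 512, 7, 7])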
images = ImageReader()
image_data = DataLoader(dataset=images, num_workers=4, batch_size=1,
                        shuffle=False, drop_last=False)
# How to make this work for batch size > 1?

os.environ["CUDA_VISIBLE_DEVICES"] = str(2)

crop_height = 7
crop_width = 7
roi_dir = '/scratch/medhini2/paragraph_generation/dataset/RoI/'

# RoIAlign layer (construct once, outside the loop)
roi_align = RoIAlign(crop_height, crop_width)

for batch_idx, (filename, featuremap_data, boxes_data, boxes_idx) in enumerate(image_data):
    featuremap = Variable(featuremap_data, requires_grad=True).cuda()
    if boxes_idx[0] != 0:
        boxes = Variable(boxes_data, requires_grad=False).cuda()
        box_index = Variable(boxes_idx.type(torch.cuda.IntTensor),
                             requires_grad=False).cuda()
        crops = roi_align(featuremap, boxes[0], box_index)
        roi_file = os.path.join(roi_dir, filename[0])
        torch.save(crops.data, roi_file)
def main(args):
    # Image preprocessing
    transform = transforms.Compose([transforms.ToTensor()])

    num_classes = 80

    yolov3 = Darknet(args.cfg_file)
    yolov3.load_weights(args.weights_file)
    yolov3.net_info["height"] = args.reso
    inp_dim = int(yolov3.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32
    print("yolo-v3 network successfully loaded")

    attribute_size = [15, 7, 3, 5, 8, 4, 15, 7, 3, 5, 3, 3, 4]

    encoder = EncoderClothing(args.embed_size, device, args.roi_size, attribute_size)

    yolov3.to(device)
    encoder.to(device)

    yolov3.eval()
    encoder.eval()

    encoder.load_state_dict(torch.load(args.encoder_path))

    # cap = cv2.VideoCapture('demo2.mp4')
    cap = cv2.VideoCapture(0)
    assert cap.isOpened(), 'Cannot capture source'

    frames = 0
    start = time.time()

    counter = Counter()
    color_stream = list()
    pattern_stream = list()
    gender_stream = list()
    season_stream = list()
    class_stream = list()
    sleeves_stream = list()

    ret, frame = cap.read()
    if ret:
        image, orig_img, dim = prep_image2(frame, inp_dim)
        im_dim = torch.FloatTensor(dim).repeat(1, 2)
        image_tensor = image.to(device)
        detections = yolov3(image_tensor, device, True)
        os.system('clear')

        cv2.imshow("frame", orig_img)
        cv2.moveWindow("frame", 50, 50)
        text_img = np.zeros((200, 1750, 3))
        cv2.imshow("text", text_img)
        cv2.moveWindow("text", 50, dim[1] + 110)

    while cap.isOpened():
        ret, frame = cap.read()
        #### ret, frame = ros_message_cam_image()
        if ret:
            image, orig_img, dim = prep_image2(frame, inp_dim)
            im_dim = torch.FloatTensor(dim).repeat(1, 2)

            image_tensor = image.to(device)
            im_dim = im_dim.to(device)

            # prediction mode for yolo-v3
            detections = yolov3(image_tensor, device, True)
            detections = write_results(detections, args.confidence, num_classes,
                                       device, nms=True, nms_conf=args.nms_thresh)

            #### detections = ros_message_rois()
            #### ros_rois --> [0, 0, x1, y1, x2, y2]
            # original image dimension --> im_dim
            # view_image(detections)

            text_img = np.zeros((200, 1750, 3))

            if type(detections) != int:
                if detections.shape[0]:
                    bboxs = detections[:, 1:5].clone()

                    im_dim = im_dim.repeat(detections.shape[0], 1)
                    scaling_factor = torch.min(inp_dim / im_dim, 1)[0].view(-1, 1)

                    # undo letterbox padding and rescale boxes to original pixels
                    detections[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim[:, 0].view(-1, 1)) / 2
                    detections[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim[:, 1].view(-1, 1)) / 2
                    detections[:, 1:5] /= scaling_factor

                    small_object_ratio = torch.FloatTensor(detections.shape[0])
                    for i in range(detections.shape[0]):
                        detections[i, [1, 3]] = torch.clamp(detections[i, [1, 3]], 0.0, im_dim[i, 0])
                        detections[i, [2, 4]] = torch.clamp(detections[i, [2, 4]], 0.0, im_dim[i, 1])

                        object_area = (detections[i, 3] - detections[i, 1]) * (detections[i, 4] - detections[i, 2])
                        orig_img_area = im_dim[i, 0] * im_dim[i, 1]
                        small_object_ratio[i] = object_area / orig_img_area

                    detections = detections[small_object_ratio > 0.05]
                    im_dim = im_dim[small_object_ratio > 0.05]

                    if detections.size(0) > 0:
                        feature = yolov3.get_feature()
                        feature = feature.repeat(detections.size(0), 1, 1, 1)

                        orig_img_dim = im_dim[:, 1:]
                        orig_img_dim = orig_img_dim.repeat(1, 2)

                        # map pixel boxes onto the 1/16-resolution feature map
                        scaling_val = 16
                        bboxs /= scaling_val
                        bboxs = bboxs.round()
                        bboxs_index = torch.arange(bboxs.size(0), dtype=torch.int)
                        bboxs_index = bboxs_index.to(device)
                        bboxs = bboxs.to(device)

                        roi_align = RoIAlign(args.roi_size, args.roi_size,
                                             transform_fpcoor=True).to(device)
                        roi_features = roi_align(feature, bboxs, bboxs_index)

                        outputs = encoder(roi_features)

                        for i in range(detections.shape[0]):
                            sampled_caption = []
                            # attr_fc = outputs[]
                            for j in range(len(outputs)):
                                max_index = torch.max(outputs[j][i].data, 0)[1]
                                word = attribute_pool[j][max_index]
                                sampled_caption.append(word)

                            # swap entries 10 and 11 (lower type / lower length)
                            c11 = sampled_caption[11]
                            sampled_caption[11] = sampled_caption[10]
                            sampled_caption[10] = c11

                            sentence = ' '.join(sampled_caption)

                            sys.stdout.write('                 \r')
                            sys.stdout.write(sentence + '   \r')
                            sys.stdout.flush()

                            write(detections[i], orig_img, sentence, i + 1,
                                  coco_classes, colors)
                            cv2.putText(text_img, sentence, (0, i * 40 + 35),
                                        cv2.FONT_HERSHEY_PLAIN, 2, [255, 255, 255], 1)

            cv2.imshow("frame", orig_img)
            cv2.imshow("text", text_img)

            key = cv2.waitKey(1)
            if key & 0xFF == ord('q'):
                break
            if key & 0xFF == ord('w'):
                wait(0)
            if key & 0xFF == ord('s'):
                continue
            frames += 1
            # print("FPS of the video is {:5.2f}".format(frames / (time.time() - start)))
        else:
            break
import numpy as np
import torch
from torch.autograd import Variable
from roi_align.roi_align import RoIAlign


def to_varabile(arr, requires_grad=False, is_cuda=True):
    tensor = torch.from_numpy(arr)
    if is_cuda:
        tensor = tensor.cuda()
    var = Variable(tensor, requires_grad=requires_grad)
    return var


# the data you want
is_cuda = False

image_data = np.tile(np.arange(7, dtype=np.float32), 7).reshape(7, 7)
image_data = image_data[np.newaxis, np.newaxis]
boxes_data = np.asarray([[0, 0, 6, 6]], dtype=np.float32)
box_index_data = np.asarray([0], dtype=np.int32)

image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

# Set transform_fpcoor to False to get plain crop_and_resize behavior.
roi_align = RoIAlign(7, 7, transform_fpcoor=True)
print(roi_align(image_torch, boxes, box_index))
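# To make the transform_fpcoor comment above concrete, the same crop under
# both conventions (a minimal sketch reusing the tensors above; the exact
# output values depend on the roi_align package's sampling):
roi_align_fp = RoIAlign(7, 7, transform_fpcoor=True)   # Mask R-CNN style pixel-corner mapping
roi_align_cr = RoIAlign(7, 7, transform_fpcoor=False)  # plain crop_and_resize semantics
print(roi_align_fp(image_torch, boxes, box_index))
print(roi_align_cr(image_torch, boxes, box_index))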
def main(args):
    # Image preprocessing
    transform = transforms.Compose([transforms.ToTensor()])

    # Load vocabulary wrapper / build the models
    # CUDA = torch.cuda.is_available()
    num_classes = 80

    yolov3 = Darknet(args.cfg_file)
    yolov3.load_weights(args.weights_file)
    yolov3.net_info["height"] = args.reso
    inp_dim = int(yolov3.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32
    print("yolo-v3 network successfully loaded")

    attribute_size = [15, 7, 3, 5, 8, 4, 15, 7, 3, 5, 3, 3, 4]

    encoder = EncoderClothing(args.embed_size, device, args.roi_size, attribute_size)

    # Prepare an image
    images = "test"
    try:
        list_dir = os.listdir(images)
        # list_dir.sort(key=lambda x: int(x[:-4]))
        imlist = [osp.join(osp.realpath('.'), images, img)
                  for img in list_dir
                  if os.path.splitext(img)[1] in ('.jpg', '.JPG', '.png')]
    except NotADirectoryError:
        imlist = []
        imlist.append(osp.join(osp.realpath('.'), images))
        print('Not a directory error')
    except FileNotFoundError:
        print("No file or directory with the name {}".format(images))
        exit()

    yolov3.to(device)
    encoder.to(device)

    yolov3.eval()
    encoder.eval()

    encoder.load_state_dict(torch.load(args.encoder_path))

    for inx, image in enumerate(imlist):
        # print(image)
        image, orig_img, im_dim = prep_image(image, inp_dim)
        im_dim = torch.FloatTensor(im_dim).repeat(1, 2)

        image_tensor = image.to(device)
        im_dim = im_dim.to(device)

        # prediction mode for yolo-v3
        detections = yolov3(image_tensor, device, True)
        detections = write_results(detections, args.confidence, num_classes,
                                   device, nms=True, nms_conf=args.nms_thresh)

        # original image dimension --> im_dim
        # view_image(detections)
        os.system('clear')

        if type(detections) != int:
            if detections.shape[0]:
                bboxs = detections[:, 1:5].clone()

                im_dim = im_dim.repeat(detections.shape[0], 1)
                scaling_factor = torch.min(inp_dim / im_dim, 1)[0].view(-1, 1)

                detections[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim[:, 0].view(-1, 1)) / 2
                detections[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim[:, 1].view(-1, 1)) / 2
                detections[:, 1:5] /= scaling_factor

                small_object_ratio = torch.FloatTensor(detections.shape[0])
                for i in range(detections.shape[0]):
                    detections[i, [1, 3]] = torch.clamp(detections[i, [1, 3]], 0.0, im_dim[i, 0])
                    detections[i, [2, 4]] = torch.clamp(detections[i, [2, 4]], 0.0, im_dim[i, 1])

                    object_area = (detections[i, 3] - detections[i, 1]) * (detections[i, 4] - detections[i, 2])
                    orig_img_area = im_dim[i, 0] * im_dim[i, 1]
                    small_object_ratio[i] = object_area / orig_img_area

                detections = detections[small_object_ratio > 0.02]
                im_dim = im_dim[small_object_ratio > 0.02]

                if detections.size(0) > 0:
                    feature = yolov3.get_feature()
                    feature = feature.repeat(detections.size(0), 1, 1, 1)
                    # orig_img_dim = im_dim[:, 1:]
                    # orig_img_dim = orig_img_dim.repeat(1, 2)

                    # map pixel boxes onto the 1/16-resolution feature map
                    scaling_val = 16
                    bboxs /= scaling_val
                    bboxs = bboxs.round()
                    bboxs_index = torch.arange(bboxs.size(0), dtype=torch.int)
                    bboxs_index = bboxs_index.to(device)
                    bboxs = bboxs.to(device)

                    roi_align = RoIAlign(args.roi_size, args.roi_size,
                                         transform_fpcoor=True).to(device)
                    roi_features = roi_align(feature, bboxs, bboxs_index)
                    # print(roi_features)
                    # print(roi_features.size())
                    # roi_features = roi_features.reshape(roi_features.size(0), -1)
                    # roi_align_feature = encoder(roi_features)

                    outputs = encoder(roi_features)
                    # attribute_size = [15, 7, 3, 5, 7, 4, 15, 7, 3, 5, 4, 3, 4]
                    # losses = [criteria[i](outputs[i], targets[i]) for i in range(len(attribute_size))]

                    for i in range(detections.shape[0]):
                        sampled_caption = []
                        # attr_fc = outputs[]
                        for j in range(len(outputs)):
                            # temp = outputs[j][i].data
                            max_index = torch.max(outputs[j][i].data, 0)[1]
                            word = attribute_pool[j][max_index]
                            sampled_caption.append(word)

                        # swap entries 10 and 11 (lower type / lower length)
                        c11 = sampled_caption[11]
                        sampled_caption[11] = sampled_caption[10]
                        sampled_caption[10] = c11

                        sentence = ' '.join(sampled_caption)
                        # again sampling for testing
                        # print('---------------------------')
                        print(str(i + 1) + ': ' + sentence)
                        write(detections[i], orig_img, sentence, i + 1,
                              coco_classes, colors)
                        # list(map(lambda x: write(x, orig_img, captions), detections[i].unsqueeze(0)))

        cv2.imshow("frame", orig_img)
        key = cv2.waitKey(0)
        os.system('clear')
        if key & 0xFF == ord('q'):
            break
import torch
import cv2
import numpy as np
from torch.autograd import Variable
from roi_align.roi_align import RoIAlign  # missing in the original snippet


def to_varabile(data, requires_grad, is_cuda):
    if is_cuda:
        data = data.cuda()
    data = Variable(data, requires_grad=requires_grad)
    return data


# input data
is_cuda = torch.cuda.is_available()

# image_data = cv2.imread('/data/2019AAAI/data/ctw15/test/text_image/1002.jpg')
image_data = np.ones((100, 100, 3))
image_data = image_data.transpose((2, 0, 1)).astype(np.float32)
image_data = torch.from_numpy(image_data)

boxes_data = torch.Tensor([[0, 0, 200, 200], [0, 0, 200, 200]])
# box_index must hold one image index per box; the original [0] was one short.
box_index_data = torch.IntTensor([0, 0])

image = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
image = image.unsqueeze(0)
print(image.size())
boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)
print(image, boxes, box_index)

# RoIAlign layer; the boxes extend past the 100x100 image, so out-of-bounds
# samples take extrapolation_value=0.
roi_align = RoIAlign(7, 7, extrapolation_value=0)
crops = roi_align(image, boxes, box_index)
print(crops)
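# box_index pairs each box with one image in the batch; a two-image sketch
# (shapes and values are illustrative assumptions):
batch = torch.ones(2, 3, 100, 100)
boxes2 = torch.Tensor([[0, 0, 50, 50], [10, 10, 90, 90]])
index2 = torch.IntTensor([0, 1])   # first box crops image 0, second crops image 1
print(RoIAlign(7, 7)(batch, boxes2, index2).shape)   # torch.Size([2, 3, 7, 7])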
import numpy as np
import torch
from torch.autograd import Variable
from roi_align.roi_align import RoIAlign


def to_varabile(arr, requires_grad=False, is_cuda=True):
    tensor = torch.from_numpy(arr)
    if is_cuda:
        tensor = tensor.cuda()
    var = Variable(tensor, requires_grad=requires_grad)
    return var


# the data you want
is_cuda = False

image_data = np.tile(np.arange(7, dtype=np.float32), 7).reshape(7, 7)
image_data = image_data[np.newaxis, np.newaxis]
boxes_data = np.asarray([[0, 0, 3, 3]], dtype=np.float32)
box_index_data = np.asarray([0], dtype=np.int32)

image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

# Set transform_fpcoor to False to get plain crop_and_resize behavior.
# RoIAlign is an nn.Module, so it must be instantiated and called; the
# original RoIAlign.apply(...) would hit nn.Module.apply instead.
roi_align = RoIAlign(3, 3, transform_fpcoor=True)
print(roi_align(image_torch, boxes, box_index))
def __init__(self):
    super(LPR, self).__init__()
    self.header = torch.IntTensor([0, 0, 0, 0])

    # Set transform_fpcoor to False to get plain crop_and_resize behavior.
    # self.roi_align = RoIAlign(28, 28, transform_fpcoor=True)
    self.roi_align = RoIAlign(28, 28)

    self.conv_1 = LPConv(3, 32, 3, 1, 1, bias=False)
    self.conv_2 = LPConv(32, 64, 3, 2, 1, bias=False)
    self.res1 = ResBlock(32, 64, bias=False)
    self.conv_3 = LPConv(64, 128, 3, 2, 1, bias=False)
    self.res2_1 = ResBlock(64, 128, bias=False)
    self.res2_2 = ResBlock(64, 128, bias=False)
    self.conv_4 = LPConv(128, 256, 3, 2, 1, bias=False)
    self.res3_1 = ResBlock(128, 256, bias=False)
    self.res3_2 = ResBlock(128, 256, bias=False)
    self.res3_3 = ResBlock(128, 256, bias=False)
    self.res3_4 = ResBlock(128, 256, bias=False)
    self.res3_5 = ResBlock(128, 256, bias=False)
    self.res3_6 = ResBlock(128, 256, bias=False)
    self.res3_7 = ResBlock(128, 256, bias=False)
    self.res3_8 = ResBlock(128, 256, bias=False)
    self.conv_5 = LPConv(256, 512, 3, 2, 1, bias=False)
    self.res4_1 = ResBlock(256, 512, bias=False)
    self.res4_2 = ResBlock(256, 512, bias=False)
    self.res4_3 = ResBlock(256, 512, bias=False)
    self.res4_4 = ResBlock(256, 512, bias=False)
    self.res4_5 = ResBlock(256, 512, bias=False)
    self.res4_6 = ResBlock(256, 512, bias=False)
    self.res4_7 = ResBlock(256, 512, bias=False)
    self.res4_8 = ResBlock(256, 512, bias=False)
    self.conv_6 = LPConv(512, 1024, 3, 2, 1, bias=False)
    self.res5_1 = ResBlock(512, 1024, bias=False)
    self.res5_2 = ResBlock(512, 1024, bias=False)
    self.res5_3 = ResBlock(512, 1024, bias=False)
    self.res5_4 = ResBlock(512, 1024, bias=False)

    self.detection3 = DetectionBlock(512, 1024, bias=False)
    self.concat3 = UpsampleBlock(256, 512, bias=False)
    self.detection2 = DetectionBlock(256, 512, 256, bias=False)
    self.concat2 = UpsampleBlock(128, 256, bias=False)
    self.detection1 = DetectionBlock(128, 256, 128, bias=False)

    self.feature1 = nn.Sequential(
        self.conv_1, self.conv_2, self.res1,
        self.conv_3, self.res2_1, self.res2_2,
        self.conv_4, self.res3_1, self.res3_2, self.res3_3, self.res3_4,
        self.res3_5, self.res3_6, self.res3_7, self.res3_8,
    )
    self.feature2 = nn.Sequential(
        self.conv_5, self.res4_1, self.res4_2, self.res4_3, self.res4_4,
        self.res4_5, self.res4_6, self.res4_7, self.res4_8,
    )
    self.feature3 = nn.Sequential(
        self.conv_6, self.res5_1, self.res5_2, self.res5_3, self.res5_4,
    )

    self.module_list = nn.ModuleList()
    self.module_list.append(self.feature1)
    self.module_list.append(self.feature2)
    self.module_list.append(self.feature3)
    self.module_list.append(self.detection3)
    self.module_list.append(self.concat3)
    self.module_list.append(self.detection2)
    self.module_list.append(self.concat2)
    self.module_list.append(self.detection1)

    # freeze everything built so far; only the heads below stay trainable
    for p in self.parameters():
        p.requires_grad = False

    self.lpdetection1 = LprDetection1(64, 256, bias=False)
    self.lpdetection1_1 = LprDetection1(64, 128, bias=False)
    self.lpdetection2 = LPR_Classifer1()

    # CornerNet-style heads
    cnv_dim = 128
    out_dim = 1
    curr_dim = 128

    self.cnvs = nn.ModuleList([make_cnv_layer(curr_dim, cnv_dim)])
    self.ct_cnvs = nn.ModuleList([make_ct_layer(cnv_dim)])

    # keypoint heatmaps
    self.ct_heats = nn.ModuleList([make_kp_layer(cnv_dim, curr_dim, out_dim)])
    for ct_heat in self.ct_heats:
        ct_heat[-1].bias.data.fill_(-2.19)

    self.ct_regrs = nn.ModuleList([make_kp_layer(cnv_dim, curr_dim, 2)])
    self.regboxes = nn.ModuleList([make_kp_layer(cnv_dim, curr_dim, 2)])
    self.affine = nn.ModuleList([make_kp_layer(cnv_dim, curr_dim, 6)])

    self.loss = AELoss2(pull_weight=1e-1, wh_weight=2e-1, focal_loss=_neg_loss)
import numpy as np
import torch
from torch.autograd import Variable
from roi_align.roi_align import RoIAlign


def to_varabile(arr, requires_grad=False, is_cuda=True):
    tensor = torch.from_numpy(arr)
    if is_cuda:
        tensor = tensor.cuda()
    var = Variable(tensor, requires_grad=requires_grad)
    return var


# the data you want
is_cuda = False

image_data = np.tile(np.arange(7, dtype=np.float32), 7).reshape(7, 7)
image_data = image_data[np.newaxis, np.newaxis]
boxes_data = np.asarray([[0, 0, 3, 3]], dtype=np.float32)
box_index_data = np.asarray([0], dtype=np.int32)

image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

# Set transform_fpcoor to False to get plain crop_and_resize behavior.
roi_align = RoIAlign(3, 3, transform_fpcoor=True)
print(roi_align(image_torch, boxes, box_index))
def detect_attributes(image, yolo_dim, yolov3, encoder):
    '''Detect clothing attributes for each person box YOLOv3 finds in image.

    Relies on module-level args, device, attribute_pool, coco_classes and
    colors. Note: text_results is currently returned empty.
    '''
    text_results = []

    image, orig_img, im_dim = prep_image(image, yolo_dim)
    im_dim = torch.FloatTensor(im_dim).repeat(1, 2)

    image_tensor = image.to(device)
    im_dim = im_dim.to(device)

    # prediction mode for yolo-v3
    detections = yolov3(image_tensor, device, True)
    detections = write_results(
        detections,
        args.confidence,
        device,
        num_classes=80,
        nms=True,
        nms_conf=args.nms_thresh,
    )
    # original image dimension --> im_dim
    # view_image(detections)
    os.system("clear")

    if not isinstance(detections, int):
        if detections.shape[0]:
            bboxs = detections[:, 1:5].clone()

            im_dim = im_dim.repeat(detections.shape[0], 1)
            scaling_factor = torch.min(yolo_dim / im_dim, 1)[0].view(-1, 1)

            detections[:, [1, 3]] -= (yolo_dim - scaling_factor * im_dim[:, 0].view(-1, 1)) / 2
            detections[:, [2, 4]] -= (yolo_dim - scaling_factor * im_dim[:, 1].view(-1, 1)) / 2
            detections[:, 1:5] /= scaling_factor

            small_object_ratio = torch.FloatTensor(detections.shape[0])
            for i in range(detections.shape[0]):
                detections[i, [1, 3]] = torch.clamp(detections[i, [1, 3]], 0.0, im_dim[i, 0])
                detections[i, [2, 4]] = torch.clamp(detections[i, [2, 4]], 0.0, im_dim[i, 1])

                object_area = (detections[i, 3] - detections[i, 1]) * (detections[i, 4] - detections[i, 2])
                orig_img_area = im_dim[i, 0] * im_dim[i, 1]
                small_object_ratio[i] = object_area / orig_img_area

            detections = detections[small_object_ratio > 0.02]
            im_dim = im_dim[small_object_ratio > 0.02]

            if detections.size(0) > 0:
                feature = yolov3.get_feature()
                feature = feature.repeat(detections.size(0), 1, 1, 1)

                # map pixel boxes onto the 1/16-resolution feature map
                scaling_val = 16
                bboxs /= scaling_val
                bboxs = bboxs.round()
                bboxs_index = torch.arange(bboxs.size(0), dtype=torch.int)
                bboxs_index = bboxs_index.to(device)
                bboxs = bboxs.to(device)

                roi_align = RoIAlign(args.roi_size, args.roi_size,
                                     transform_fpcoor=True).to(device)
                roi_features = roi_align(feature, bboxs, bboxs_index)
                outputs = encoder(roi_features)

                for i in range(detections.shape[0]):
                    sampled_caption = []
                    for j in range(len(outputs) - 1):
                        max_index = torch.max(outputs[j][i].data, 0)[1]
                        word = attribute_pool[j][max_index]
                        sampled_caption.append(word)

                    # swap for reversed lower length and lower type
                    c11 = sampled_caption[11]
                    sampled_caption[11] = sampled_caption[10]
                    sampled_caption[10] = c11

                    sentence = " ".join(sampled_caption)
                    print(str(i + 1) + ": " + sentence)
                    write(
                        detections[i],
                        orig_img,
                        sentence,
                        i + 1,
                        coco_classes,
                        colors,
                    )

    return text_results, orig_img