Example #1
 def __init__(self, cfg):
     super(GCNnet_collective, self).__init__()
     self.cfg=cfg
     
     D=self.cfg.emb_features
     K=self.cfg.crop_size[0]
     NFB=self.cfg.num_features_boxes
     NFR, NFG=self.cfg.num_features_relation, self.cfg.num_features_gcn
     
     self.backbone=MyInception_v3(transform_input=False,pretrained=True)
     
     if not self.cfg.train_backbone:
         for p in self.backbone.parameters():
             p.requires_grad=False
     
     self.roi_align=RoIAlign(*self.cfg.crop_size)
     
     self.fc_emb_1=nn.Linear(K*K*D,NFB)
     self.nl_emb_1=nn.LayerNorm([NFB])
     
     
     self.gcn_list = torch.nn.ModuleList([ GCN_Module(self.cfg)  for i in range(self.cfg.gcn_layers) ])    
     
     
     self.dropout_global=nn.Dropout(p=self.cfg.train_dropout_prob)
 
     self.fc_actions=nn.Linear(NFG,self.cfg.num_actions)
     self.fc_activities=nn.Linear(NFG,self.cfg.num_activities)
     
     for m in self.modules():
         if isinstance(m,nn.Linear):
             nn.init.kaiming_normal_(m.weight)
             if m.bias is not None:
                 nn.init.zeros_(m.bias)
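
For orientation, a minimal standalone sketch (not from this repository) of the crop-and-embed path that roi_align, fc_emb_1 and nl_emb_1 above set up; D, K and NFB are hypothetical stand-ins for the cfg values:

import torch
import torch.nn as nn
from roi_align.roi_align import RoIAlign

# stand-ins for cfg.emb_features, cfg.crop_size[0] and cfg.num_features_boxes
D, K, NFB = 1056, 5, 1024
roi_align = RoIAlign(K, K)
fc_emb_1 = nn.Linear(K * K * D, NFB)
nl_emb_1 = nn.LayerNorm([NFB])

feature_map = torch.randn(2, D, 57, 87)            # pretend backbone output, (B*T, D, OH, OW)
boxes = torch.tensor([[10., 10., 40., 60.]])       # one box, x1 y1 x2 y2
box_index = torch.zeros(1, dtype=torch.int)        # batch index of each box
crop = roi_align(feature_map, boxes, box_index)    # (num_boxes, D, K, K)
emb = torch.relu(nl_emb_1(fc_emb_1(crop.reshape(crop.size(0), -1))))  # (num_boxes, NFB)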
Example #2
    def __init__(self, cfg):
        super(Basenet_collective, self).__init__()
        self.cfg=cfg
        
        D=self.cfg.emb_features
        K=self.cfg.crop_size[0]
        NFB=self.cfg.num_features_boxes
        NFR, NFG=self.cfg.num_features_relation, self.cfg.num_features_gcn
        
        self.backbone=MyInception_v3(transform_input=False,pretrained=True)
#         self.backbone=MyVGG16(pretrained=True)
        
        if not self.cfg.train_backbone:
            for p in self.backbone.parameters():
                p.requires_grad=False
        
        self.roi_align=RoIAlign(*self.cfg.crop_size)
        
        self.fc_emb_1=nn.Linear(K*K*D,NFB)
        self.dropout_emb_1 = nn.Dropout(p=self.cfg.train_dropout_prob)
#         self.nl_emb_1=nn.LayerNorm([NFB])
        
        
        self.fc_actions=nn.Linear(NFB,self.cfg.num_actions)
        self.fc_activities=nn.Linear(NFB,self.cfg.num_activities)
        
        for m in self.modules():
            if isinstance(m,nn.Linear):
                nn.init.kaiming_normal_(m.weight)
Example #3
def run_roi_align(frame_feature_map, bbox_in, crop_size=(7, 7)):
    roi_align = RoIAlign(crop_size[0], crop_size[1])

    B, C, H, W = frame_feature_map.size()
    input_size = bbox_in.size()
    boxes_in_flat = torch.reshape(bbox_in, (-1, 4))  #B*T*N, 4

    boxes_idx = [
        i * torch.ones(input_size[2], dtype=torch.int)
        for i in range(input_size[0] * input_size[1])
    ]
    boxes_idx = torch.stack(boxes_idx).to(device=bbox_in.device)  # B*T, N
    boxes_idx_flat = torch.reshape(
        boxes_idx, (input_size[0] * input_size[1] * input_size[2], ))  #B*T*N,
    del boxes_idx
    boxes_in_flat.requires_grad = False
    boxes_idx_flat.requires_grad = False

    # RoI Align
    boxes_features = roi_align(frame_feature_map, boxes_in_flat,
                               boxes_idx_flat)
    del boxes_in_flat
    boxes_features = torch.reshape(
        boxes_features, (input_size[0], C, -1, crop_size[0], crop_size[1]))
    return boxes_features
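
A hypothetical call of run_roi_align on dummy tensors, purely to illustrate the expected shapes (B, T, N, C, H, W are made up; it assumes the RoIAlign import used in the other examples is available):

import torch

# made-up sizes, only to check the shape bookkeeping of run_roi_align
B, T, N, C, H, W = 2, 3, 4, 256, 32, 32
frame_feature_map = torch.randn(B * T, C, H, W)    # one feature map per frame
bbox_in = torch.rand(B, T, N, 4) * 20              # dummy boxes; values don't matter here
feats = run_roi_align(frame_feature_map, bbox_in, crop_size=(7, 7))
print(feats.shape)                                 # (B, C, T*N, 7, 7), here (2, 256, 12, 7, 7)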
Example #4
    def __init__(self, cfg):
        super(Basenet_volleyball, self).__init__()
        self.cfg=cfg
        
        NFB=self.cfg.num_features_boxes
        D=self.cfg.emb_features
        K=self.cfg.crop_size[0]
        

        if cfg.backbone=='inv3':
            self.backbone=MyInception_v3(transform_input=False,pretrained=True)
        elif cfg.backbone=='vgg16':
            self.backbone=MyVGG16(pretrained=True)
        elif cfg.backbone=='vgg19':
            self.backbone=MyVGG19(pretrained=True)
        else:
            assert False
        
        self.roi_align=RoIAlign(*self.cfg.crop_size)
        
        
        self.fc_emb = nn.Linear(K*K*D,NFB)
        self.dropout_emb = nn.Dropout(p=self.cfg.train_dropout_prob)
        
        self.fc_actions=nn.Linear(NFB,self.cfg.num_actions)
        self.fc_activities=nn.Linear(NFB,self.cfg.num_activities)
        
        
        for m in self.modules():
            if isinstance(m,nn.Linear):
                nn.init.kaiming_normal_(m.weight)
                nn.init.zeros_(m.bias)
Example #5
 def __init__(self):
     super(Proposal, self).__init__()
     self.width = 1
     self.height = 1
     self.region_width = 70
     self.region_height = 70
     self.stride = 1
     # using 5 layers PatchGAN
     self.receptive_field = 70.
     self.roialign = RoIAlign(self.region_height, self.region_width, transform_fpcoor=True)
Example #6
	def __init__(self, crop_dim, max_bboxes):
		
		super(RoIFeats, self).__init__()
		
		# TODO: decide whether the scale ratio should be passed in here, or whether the box coordinates should be rescaled by the caller.

		# We'll do a square crop of the image features for each bounding box
		self.crop_dim = crop_dim
		self.max_bboxes = max_bboxes

		self.roi_align = RoIAlign(self.crop_dim, self.crop_dim, transform_fpcoor=False)
		self.avg_pool = nn.AvgPool2d(self.crop_dim)
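
The class's forward is not shown here; a rough standalone sketch of the pattern these two layers suggest (crop each box with RoIAlign, then average-pool the crop into a single vector per box), with made-up shapes:

import torch
import torch.nn as nn
from roi_align.roi_align import RoIAlign

crop_dim = 7
roi_align = RoIAlign(crop_dim, crop_dim, transform_fpcoor=False)
avg_pool = nn.AvgPool2d(crop_dim)

image_feats = torch.randn(1, 256, 32, 32)          # (B, C, H, W) feature map
boxes = torch.tensor([[2., 2., 20., 20.]])         # (num_boxes, 4), x1 y1 x2 y2
box_index = torch.zeros(1, dtype=torch.int)        # batch index per box
crops = roi_align(image_feats, boxes, box_index)   # (num_boxes, C, crop_dim, crop_dim)
roi_feats = avg_pool(crops).flatten(1)             # (num_boxes, C), one descriptor per box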
Example #7
 def __init__(self, config):
     super(Proposal, self).__init__()
     self.width = config['window_width']
     self.height = config['window_height']
     self.region_width = config['region_width']
     self.region_height = config['region_height']
     self.stride = 1
     # using 5 layers PatchGAN
     self.receptive_field = config['receptive_field']
     self.roialign = RoIAlign(self.region_height,
                              self.region_width,
                              transform_fpcoor=True)
     # use mask operation or not
     self.mask_opt = config['mask_operation']
Example #8
    def __init__(self, cfg):
        super(GCNnet_volleyball, self).__init__()
        self.cfg=cfg
        
        T, N=self.cfg.num_frames, self.cfg.num_boxes
        D=self.cfg.emb_features
        K=self.cfg.crop_size[0]
        NFB=self.cfg.num_features_boxes
        NFR, NFG=self.cfg.num_features_relation, self.cfg.num_features_gcn
        
        
        if cfg.backbone=='inv3':
            self.backbone=MyInception_v3(transform_input=False,pretrained=True)
        elif cfg.backbone=='vgg16':
            self.backbone=MyVGG16(pretrained=True)
        elif cfg.backbone=='vgg19':
            self.backbone=MyVGG19(pretrained=False)
        else:
            assert False
        
        if not cfg.train_backbone:
            for p in self.backbone.parameters():
                p.requires_grad=False
        
        self.roi_align=RoIAlign(*self.cfg.crop_size)
        
        self.fc_emb_1=nn.Linear(K*K*D,NFB)
        self.nl_emb_1=nn.LayerNorm([NFB])
                   
        self.dropout_global=nn.Dropout(p=self.cfg.train_dropout_prob)

        self.fc_activities=nn.Linear(NFG,self.cfg.num_activities)

        self.gcn_list= GCN_Module(cfg)
        
        self.encoder = st_gcn(cfg,in_channels=1024,out_channels=64,kernel_size=(3,1),stride=1)
        self.encoder_a = st_gcn_short(cfg,in_channels=1024,out_channels=64,kernel_size=(1,1),stride=1)
        self.decoder_a = st_gcn_decoder(cfg,in_channels=1024,out_channels=64,kernel_size=(1,1),stride=1)

        self.encoder_p = ConvGraphicalEncoder(in_channels=2, out_channels=1024, \
            kernel_size=1)

        self.decoder_p = ConvGraphicalDecoder(in_channels=1024, out_channels=2, \
            kernel_size=1)

        for m in self.modules():
            if isinstance(m,nn.Linear):
                nn.init.kaiming_normal_(m.weight)
Example #9
    def __init__(self, args):
        super(BBoxModule, self).__init__()
        self.args = args
        for supervision in args.supervision:
            if supervision['name'] == 'bbox':
                self.crop_height = int(supervision['other']['pool_size'])
                self.crop_width = int(supervision['other']['pool_size'])
        self.roi_align = RoIAlign(self.crop_height,
                                  self.crop_width,
                                  transform_fpcoor=True)

        self.down_sampling_rate = self.args.down_sampling_rate

        self.feat_dim = args.feat_dim * self.crop_width * self.crop_height
        self.num_class = args.num_base_class
        self.regress = nn.Linear(self.feat_dim, 4)
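
The forward pass is not shown either; a standalone sketch of what these pieces plausibly do together (crop per-box features at pool_size, flatten, regress four box offsets); pool_size, feat_dim, the feature map and the down-sampling factor are invented for illustration:

import torch
import torch.nn as nn
from roi_align.roi_align import RoIAlign

pool_size, feat_dim, down_sampling_rate = 7, 256, 8   # invented settings
roi_align = RoIAlign(pool_size, pool_size, transform_fpcoor=True)
regress = nn.Linear(feat_dim * pool_size * pool_size, 4)

feats = torch.randn(1, feat_dim, 64, 64)           # backbone feature map
rois = torch.tensor([[16., 16., 80., 80.]]) / down_sampling_rate  # boxes mapped to feature coords
roi_index = torch.zeros(1, dtype=torch.int)
crops = roi_align(feats, rois, roi_index)          # (num_rois, feat_dim, pool_size, pool_size)
deltas = regress(crops.flatten(1))                 # (num_rois, 4) regressed box offsets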
Example #10
    def __init__(self, cfg):
        super(Basenet_collective, self).__init__()
        self.cfg=cfg
        
        D=self.cfg.emb_features
        K=self.cfg.crop_size[0]
        NFB=self.cfg.num_features_boxes

        # START: Original code by Zijian and Xinran
        if cfg.backbone == 'inv3':
            self.backbone = MyInception_v3(transform_input=False, pretrained=True)
        elif cfg.backbone == 'vgg16':
            self.backbone = MyVGG16(pretrained=True)
        elif cfg.backbone == 'vgg19':
            self.backbone = MyVGG19(pretrained=True)
        elif cfg.backbone == 'mobilenet':
            self.backbone = MyMobileNet(pretrained=True)
        else:
            assert False
        # END: Original code by Zijian and Xinran

        if not self.cfg.train_backbone:
            for p in self.backbone.parameters():
                p.requires_grad=False
        
        self.roi_align=RoIAlign(*self.cfg.crop_size)

        # START: Original code by Zijian and Xinran
        if cfg.backbone == 'inv3':
            self.fc_emb_1 = nn.Linear(K * K * D, NFB)
        elif cfg.backbone == 'mobilenet':
            self.fc_emb_1 = nn.Linear(32000, NFB)
        # END: Original code by Zijian and Xinran

        self.dropout_emb_1 = nn.Dropout(p=self.cfg.train_dropout_prob)
        self.fc_actions=nn.Linear(NFB,self.cfg.num_actions)
        self.fc_activities=nn.Linear(NFB,self.cfg.num_activities)
        
        for m in self.modules():
            if isinstance(m,nn.Linear):
                nn.init.kaiming_normal_(m.weight)
Example #11
def test_roialign(is_cuda=True):
    # generate data
    crop_height = 3
    crop_width = 3
    image_data, boxes_data, box_index_data = generate_data(
        batch_size=2,
        depth=2,
        im_height=10,
        im_width=10,
        n_boxes=2,
        xyxy=True, box_normalize=False)
    max_inp = np.abs(image_data).max()
    print('max_input:', max_inp)

    image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
    boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
    box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

    roi_align = RoIAlign(crop_height, crop_width, transform_fpcoor=False)
    gradcheck(roi_align, (image_torch, boxes, box_index), eps=max_inp/500)

    print('test ok')
Example #12
    def __init__(self, args, backbone, classifier):
        super(BaseLearningModule, self).__init__()
        self.args = args
        self.backbone = backbone
        self.classifier = classifier

        self.crop_height = int(args.crop_height)
        self.crop_width = args.crop_width
        self.roi_align = RoIAlign(args.crop_height,
                                  args.crop_width,
                                  transform_fpcoor=True)
        self.down_sampling_rate = args.down_sampling_rate

        # supervision modules are generated in train
        # args.module example:
        # [{'name': 'seg', 'module': seg_module}]
        if hasattr(args, 'module'):
            for module in args.module:
                setattr(self, module['name'], module['module'])

        self.mode = 'train'
        if self.classifier is not None:
            self.classifier.mode = self.mode
Example #13
images = ImageReader()
image_data = DataLoader(dataset=images, num_workers=4, batch_size=1, shuffle=False, drop_last=False)  # TODO: make this work for batch_size > 1




os.environ["CUDA_VISIBLE_DEVICES"] = str(2)
crop_height = 7
crop_width = 7
roi_dir = '/scratch/medhini2/paragraph_generation/dataset/RoI/'
for batch_idx, (filename, featuremap_data, boxes_data, boxes_idx) in enumerate(image_data):
    featuremap = Variable(featuremap_data, requires_grad=True).cuda()
    if boxes_idx[0]!=0:
        boxes = Variable(boxes_data, requires_grad=False).cuda()
        box_index = Variable(boxes_idx.type(torch.cuda.IntTensor), requires_grad=False).cuda()
        #RoIAlign layer
        roi_align = RoIAlign(crop_height, crop_width)
        crops = roi_align(featuremap, boxes[0], box_index)
        roi_file = os.path.join(roi_dir,filename[0])
        torch.save(crops.data,roi_file)
        


Example #14
def main(args):
    # Image preprocessing
    transform = transforms.Compose([transforms.ToTensor()])

    num_classes = 80
    yolov3 = Darknet(args.cfg_file)
    yolov3.load_weights(args.weights_file)
    yolov3.net_info["height"] = args.reso
    inp_dim = int(yolov3.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32
    print("yolo-v3 network successfully loaded")

    attribute_size = [15, 7, 3, 5, 8, 4, 15, 7, 3, 5, 3, 3, 4]

    encoder = EncoderClothing(args.embed_size, device, args.roi_size,
                              attribute_size)

    yolov3.to(device)
    encoder.to(device)

    yolov3.eval()
    encoder.eval()

    encoder.load_state_dict(torch.load(args.encoder_path))

    #cap = cv2.VideoCapture('demo2.mp4')

    cap = cv2.VideoCapture(0)
    assert cap.isOpened(), 'Cannot capture source'

    frames = 0
    start = time.time()

    counter = Counter()
    color_stream = list()
    pattern_stream = list()
    gender_stream = list()
    season_stream = list()
    class_stream = list()
    sleeves_stream = list()

    ret, frame = cap.read()
    if ret:

        image, orig_img, dim = prep_image2(frame, inp_dim)
        im_dim = torch.FloatTensor(dim).repeat(1, 2)

        image_tensor = image.to(device)
    detections = yolov3(image_tensor, device, True)

    os.system('clear')
    cv2.imshow("frame", orig_img)
    cv2.moveWindow("frame", 50, 50)
    text_img = np.zeros((200, 1750, 3))
    cv2.imshow("text", text_img)
    cv2.moveWindow("text", 50, dim[1] + 110)

    while cap.isOpened():

        ret, frame = cap.read()
        #### ret, frame = ros_message_cam_image()
        if ret:

            image, orig_img, dim = prep_image2(frame, inp_dim)
            im_dim = torch.FloatTensor(dim).repeat(1, 2)

            image_tensor = image.to(device)
            im_dim = im_dim.to(device)

            # Generate a caption from the image
            detections = yolov3(image_tensor, device,
                                True)  # prediction mode for yolo-v3
            detections = write_results(detections,
                                       args.confidence,
                                       num_classes,
                                       device,
                                       nms=True,
                                       nms_conf=args.nms_thresh)

            #### detections = ros_message_rois()
            #### ros_rois --> [0,0, x1, y1, x2, y2]

            # original image dimension --> im_dim

            #view_image(detections)
            text_img = np.zeros((200, 1750, 3))

            if type(detections) != int:
                if detections.shape[0]:
                    bboxs = detections[:, 1:5].clone()

                    im_dim = im_dim.repeat(detections.shape[0], 1)
                    scaling_factor = torch.min(inp_dim / im_dim,
                                               1)[0].view(-1, 1)

                    detections[:, [1, 3]] -= (inp_dim - scaling_factor *
                                              im_dim[:, 0].view(-1, 1)) / 2
                    detections[:, [2, 4]] -= (inp_dim - scaling_factor *
                                              im_dim[:, 1].view(-1, 1)) / 2

                    detections[:, 1:5] /= scaling_factor

                    small_object_ratio = torch.FloatTensor(detections.shape[0])

                    for i in range(detections.shape[0]):
                        detections[i, [1, 3]] = torch.clamp(
                            detections[i, [1, 3]], 0.0, im_dim[i, 0])
                        detections[i, [2, 4]] = torch.clamp(
                            detections[i, [2, 4]], 0.0, im_dim[i, 1])

                        object_area = (detections[i, 3] - detections[i, 1]) * (
                            detections[i, 4] - detections[i, 2])
                        orig_img_area = im_dim[i, 0] * im_dim[i, 1]
                        small_object_ratio[i] = object_area / orig_img_area

                    detections = detections[small_object_ratio > 0.05]
                    im_dim = im_dim[small_object_ratio > 0.05]

                    if detections.size(0) > 0:
                        feature = yolov3.get_feature()
                        feature = feature.repeat(detections.size(0), 1, 1, 1)

                        orig_img_dim = im_dim[:, 1:]
                        orig_img_dim = orig_img_dim.repeat(1, 2)

                        scaling_val = 16

                        bboxs /= scaling_val
                        bboxs = bboxs.round()
                        bboxs_index = torch.arange(bboxs.size(0),
                                                   dtype=torch.int)
                        bboxs_index = bboxs_index.to(device)
                        bboxs = bboxs.to(device)

                        roi_align = RoIAlign(args.roi_size,
                                             args.roi_size,
                                             transform_fpcoor=True).to(device)
                        roi_features = roi_align(feature, bboxs, bboxs_index)

                        outputs = encoder(roi_features)

                        for i in range(detections.shape[0]):

                            sampled_caption = []
                            #attr_fc = outputs[]
                            for j in range(len(outputs)):
                                max_index = torch.max(outputs[j][i].data, 0)[1]
                                word = attribute_pool[j][max_index]
                                sampled_caption.append(word)

                            c11 = sampled_caption[11]
                            sampled_caption[11] = sampled_caption[10]
                            sampled_caption[10] = c11

                            sentence = ' '.join(sampled_caption)

                            sys.stdout.write(
                                '                                                                                        '
                                + '\r')

                            sys.stdout.write(sentence + '             ' + '\r')
                            sys.stdout.flush()
                            write(detections[i], orig_img, sentence, i + 1,
                                  coco_classes, colors)

                            cv2.putText(text_img, sentence, (0, i * 40 + 35),
                                        cv2.FONT_HERSHEY_PLAIN, 2,
                                        [255, 255, 255], 1)

            cv2.imshow("frame", orig_img)
            cv2.imshow("text", text_img)

            key = cv2.waitKey(1)
            if key & 0xFF == ord('q'):
                break
            if key & 0xFF == ord('w'):
                wait(0)
            if key & 0xFF == ord('s'):
                continue
            frames += 1
            #print("FPS of the video is {:5.2f}".format( frames / (time.time() - start)))

        else:
            break
Example #15
import numpy as np
import torch
from torch.autograd import Variable

from roi_align.roi_align import RoIAlign


def to_varabile(arr, requires_grad=False, is_cuda=True):
    tensor = torch.from_numpy(arr)
    if is_cuda:
        tensor = tensor.cuda()
    var = Variable(tensor, requires_grad=requires_grad)
    return var


# the data you want
is_cuda = False
image_data = np.tile(np.arange(7, dtype=np.float32), 7).reshape(7, 7)
image_data = image_data[np.newaxis, np.newaxis]
boxes_data = np.asarray([[0, 0, 6, 6]], dtype=np.float32)
box_index_data = np.asarray([0], dtype=np.int32)

image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

# set transform_fpcoor to False to get plain crop_and_resize behavior
roi_align = RoIAlign(7, 7, transform_fpcoor=True)
print(roi_align(image_torch, boxes, box_index))
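
To see what the transform_fpcoor flag changes, the same crop can be run with transform_fpcoor=False (plain crop_and_resize) and compared; this continues the snippet above:

roi_align_cr = RoIAlign(7, 7, transform_fpcoor=False)  # plain crop_and_resize behavior
print(roi_align_cr(image_torch, boxes, box_index))     # typically differs slightly from the output above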
Example #16
def main(args):
    # Image preprocessing
    transform = transforms.Compose([transforms.ToTensor()])

    # Load vocabulary wrapper

    # Build the models
    #CUDA = torch.cuda.is_available()

    num_classes = 80
    yolov3 = Darknet(args.cfg_file)
    yolov3.load_weights(args.weights_file)
    yolov3.net_info["height"] = args.reso
    inp_dim = int(yolov3.net_info["height"])
    assert inp_dim % 32 == 0
    assert inp_dim > 32
    print("yolo-v3 network successfully loaded")

    attribute_size = [15, 7, 3, 5, 8, 4, 15, 7, 3, 5, 3, 3, 4]

    encoder = EncoderClothing(args.embed_size, device, args.roi_size,
                              attribute_size)

    # Prepare an image
    images = "test"

    try:
        list_dir = os.listdir(images)
        #   list_dir.sort(key=lambda x: int(x[:-4]))
        imlist = [
            osp.join(osp.realpath('.'), images, img) for img in list_dir
            if os.path.splitext(img)[1] == '.jpg' or os.path.splitext(img)[1]
            == '.JPG' or os.path.splitext(img)[1] == '.png'
        ]
    except NotADirectoryError:
        imlist = []
        imlist.append(osp.join(osp.realpath('.'), images))
        print('Not a directory error')
    except FileNotFoundError:
        print("No file or directory with the name {}".format(images))
        exit()

    yolov3.to(device)
    encoder.to(device)

    yolov3.eval()
    encoder.eval()

    encoder.load_state_dict(torch.load(args.encoder_path))

    for inx, image in enumerate(imlist):

        #print(image)
        image, orig_img, im_dim = prep_image(image, inp_dim)
        im_dim = torch.FloatTensor(im_dim).repeat(1, 2)

        image_tensor = image.to(device)
        im_dim = im_dim.to(device)

        # Generate a caption from the image
        detections = yolov3(image_tensor, device,
                            True)  # prediction mode for yolo-v3
        detections = write_results(detections,
                                   args.confidence,
                                   num_classes,
                                   device,
                                   nms=True,
                                   nms_conf=args.nms_thresh)
        # original image dimension --> im_dim
        #view_image(detections)

        os.system('clear')
        if type(detections) != int:
            if detections.shape[0]:
                bboxs = detections[:, 1:5].clone()
                im_dim = im_dim.repeat(detections.shape[0], 1)
                scaling_factor = torch.min(inp_dim / im_dim, 1)[0].view(-1, 1)

                detections[:, [1, 3]] -= (
                    inp_dim - scaling_factor * im_dim[:, 0].view(-1, 1)) / 2
                detections[:, [2, 4]] -= (
                    inp_dim - scaling_factor * im_dim[:, 1].view(-1, 1)) / 2

                detections[:, 1:5] /= scaling_factor

                small_object_ratio = torch.FloatTensor(detections.shape[0])

                for i in range(detections.shape[0]):
                    detections[i,
                               [1, 3]] = torch.clamp(detections[i, [1, 3]],
                                                     0.0, im_dim[i, 0])
                    detections[i,
                               [2, 4]] = torch.clamp(detections[i, [2, 4]],
                                                     0.0, im_dim[i, 1])

                    object_area = (detections[i, 3] - detections[i, 1]) * (
                        detections[i, 4] - detections[i, 2])
                    orig_img_area = im_dim[i, 0] * im_dim[i, 1]
                    small_object_ratio[i] = object_area / orig_img_area

                detections = detections[small_object_ratio > 0.02]
                im_dim = im_dim[small_object_ratio > 0.02]

                if detections.size(0) > 0:
                    feature = yolov3.get_feature()
                    feature = feature.repeat(detections.size(0), 1, 1, 1)

                    #orig_img_dim = im_dim[:, 1:]
                    #orig_img_dim = orig_img_dim.repeat(1, 2)

                    scaling_val = 16

                    bboxs /= scaling_val
                    bboxs = bboxs.round()
                    bboxs_index = torch.arange(bboxs.size(0), dtype=torch.int)
                    bboxs_index = bboxs_index.to(device)
                    bboxs = bboxs.to(device)

                    roi_align = RoIAlign(args.roi_size,
                                         args.roi_size,
                                         transform_fpcoor=True).to(device)
                    roi_features = roi_align(feature, bboxs, bboxs_index)
                    #    print(roi_features)
                    #    print(roi_features.size())

                    #roi_features = roi_features.reshape(roi_features.size(0), -1)

                    #roi_align_feature = encoder(roi_features)

                    outputs = encoder(roi_features)
                    #attribute_size = [15, 7, 3, 5, 7, 4, 15, 7, 3, 5, 4, 3, 4]
                    #losses = [criteria[i](outputs[i], targets[i]) for i in range(len(attribute_size))]

                    for i in range(detections.shape[0]):

                        sampled_caption = []
                        #attr_fc = outputs[]
                        for j in range(len(outputs)):
                            #temp = outputs[j][i].data
                            max_index = torch.max(outputs[j][i].data, 0)[1]
                            word = attribute_pool[j][max_index]
                            sampled_caption.append(word)

                        c11 = sampled_caption[11]
                        sampled_caption[11] = sampled_caption[10]
                        sampled_caption[10] = c11

                        sentence = ' '.join(sampled_caption)

                        # again sampling for testing
                        #print ('---------------------------')
                        print(str(i + 1) + ': ' + sentence)
                        write(detections[i], orig_img, sentence, i + 1,
                              coco_classes, colors)
                        #list(map(lambda x: write(x, orig_img, captions), detections[i].unsqueeze(0)))

        cv2.imshow("frame", orig_img)
        key = cv2.waitKey(0)
        os.system('clear')
        if key & 0xFF == ord('q'):
            break
Example #17
import torch
import cv2
import numpy as np
from torch.autograd import Variable
from roi_align.roi_align import RoIAlign

def to_varabile(data,requires_grad,is_cuda):
    if is_cuda:
        data = data.cuda()
    data = Variable(data,requires_grad=requires_grad)
    return data

# input data
is_cuda = torch.cuda.is_available()
# image_data = cv2.imread('/data/2019AAAI/data/ctw15/test/text_image/1002.jpg')
image_data = np.ones((100,100,3))
image_data = image_data.transpose((2, 0, 1)).astype(np.float32)
image_data = torch.from_numpy((image_data))
boxes_data = torch.Tensor([[0,0,200,200],[0,0,200,200]])
box_index_data = torch.IntTensor([0, 0])  # one batch index per box
image = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
image = image.unsqueeze(0)
print(image.size())
boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)
print(image,boxes,box_index)
# RoIAlign layer
roi_align = RoIAlign(7, 7,extrapolation_value=0)
crops = roi_align(image, boxes, box_index)
print(crops)

Example #18
import numpy as np
import torch
from torch.autograd import Variable

from roi_align.roi_align import RoIAlign


def to_varabile(arr, requires_grad=False, is_cuda=True):
    tensor = torch.from_numpy(arr)
    if is_cuda:
        tensor = tensor.cuda()
    var = Variable(tensor, requires_grad=requires_grad)
    return var


# the data you want
is_cuda = False
image_data = np.tile(np.arange(7, dtype=np.float32), 7).reshape(7, 7)
image_data = image_data[np.newaxis, np.newaxis]
boxes_data = np.asarray([[0, 0, 3, 3]], dtype=np.float32)
box_index_data = np.asarray([0], dtype=np.int32)

image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

# set transform_fpcoor to False to get plain crop_and_resize behavior
print(
    RoIAlign.apply(image_torch, boxes, box_index, 3, 3, transform_fpcoor=True))
Example #19
    def __init__(self):
        super(LPR, self).__init__()
        self.header = torch.IntTensor([0, 0, 0, 0])
        # set transform_fpcoor to False to get plain crop_and_resize behavior
        #self.roi_align = RoIAlign(28, 28, transform_fpcoor=True)
        self.roi_align = RoIAlign(28, 28)

        self.conv_1 = LPConv(3, 32, 3, 1, 1, bias=False)

        self.conv_2 = LPConv(32, 64, 3, 2, 1, bias=False)
        self.res1 = ResBlock(32, 64, bias=False)

        self.conv_3 = LPConv(64, 128, 3, 2, 1, bias=False)
        self.res2_1 = ResBlock(64, 128, bias=False)
        self.res2_2 = ResBlock(64, 128, bias=False)

        self.conv_4 = LPConv(128, 256, 3, 2, 1, bias=False)
        self.res3_1 = ResBlock(128, 256, bias=False)
        self.res3_2 = ResBlock(128, 256, bias=False)
        self.res3_3 = ResBlock(128, 256, bias=False)
        self.res3_4 = ResBlock(128, 256, bias=False)
        self.res3_5 = ResBlock(128, 256, bias=False)
        self.res3_6 = ResBlock(128, 256, bias=False)
        self.res3_7 = ResBlock(128, 256, bias=False)
        self.res3_8 = ResBlock(128, 256, bias=False)

        self.conv_5 = LPConv(256, 512, 3, 2, 1, bias=False)
        self.res4_1 = ResBlock(256, 512, bias=False)
        self.res4_2 = ResBlock(256, 512, bias=False)
        self.res4_3 = ResBlock(256, 512, bias=False)
        self.res4_4 = ResBlock(256, 512, bias=False)
        self.res4_5 = ResBlock(256, 512, bias=False)
        self.res4_6 = ResBlock(256, 512, bias=False)
        self.res4_7 = ResBlock(256, 512, bias=False)
        self.res4_8 = ResBlock(256, 512, bias=False)


        self.conv_6 = LPConv(512, 1024, 3, 2, 1, bias=False)
        self.res5_1 = ResBlock(512, 1024, bias=False)
        self.res5_2 = ResBlock(512, 1024, bias=False)
        self.res5_3 = ResBlock(512, 1024, bias=False)
        self.res5_4 = ResBlock(512, 1024, bias=False)

        self.detection3 = DetectionBlock(512, 1024, bias=False)
        self.concat3 = UpsampleBlock(256, 512, bias=False)
        self.detection2 = DetectionBlock(256, 512, 256, bias=False)
        self.concat2 = UpsampleBlock(128, 256, bias=False)
        self.detection1 = DetectionBlock(128, 256, 128, bias=False)

        self.feature1 = nn.Sequential(
            self.conv_1, self.conv_2, self.res1,
            self.conv_3, self.res2_1, self.res2_2,
            self.conv_4, self.res3_1, self.res3_2, self.res3_3, self.res3_4, self.res3_5, self.res3_6, self.res3_7, self.res3_8,
        )

        self.feature2 = nn.Sequential(
            self.conv_5, self.res4_1, self.res4_2, self.res4_3, self.res4_4, self.res4_5, self.res4_6, self.res4_7, self.res4_8,
        )
        self.feature3 = nn.Sequential(
            self.conv_6, self.res5_1, self.res5_2, self.res5_3, self.res5_4,
        )

        self.module_list = nn.ModuleList()
        self.module_list.append(self.feature1)
        self.module_list.append(self.feature2)
        self.module_list.append(self.feature3)
        self.module_list.append(self.detection3)
        self.module_list.append(self.concat3)
        self.module_list.append(self.detection2)
        self.module_list.append(self.concat2)
        self.module_list.append(self.detection1)
        for p in self.parameters():
            p.requires_grad=False

        self.lpdetection1 = LprDetection1(64, 256, bias=False)
        self.lpdetection1_1 = LprDetection1(64, 128, bias=False)
        self.lpdetection2 = LPR_Classifer1()

        # CornerNet
        cnv_dim = 128
        out_dim = 1
        curr_dim = 128

        self.cnvs = nn.ModuleList([
            make_cnv_layer(curr_dim, cnv_dim)
        ])

        self.ct_cnvs = nn.ModuleList([
            make_ct_layer(cnv_dim)
        ])
        ## keypoint heatmaps
        self.ct_heats = nn.ModuleList([
            make_kp_layer(cnv_dim, curr_dim, out_dim)
        ])
        for ct_heat in self.ct_heats:
            ct_heat[-1].bias.data.fill_(-2.19)

        self.ct_regrs = nn.ModuleList([
            make_kp_layer(cnv_dim, curr_dim, 2)
        ])
        self.regboxes = nn.ModuleList([
            make_kp_layer(cnv_dim, curr_dim, 2)
        ])
        self.affine = nn.ModuleList([
            make_kp_layer(cnv_dim, curr_dim, 6)
        ])

        self.loss = AELoss2(pull_weight=1e-1, wh_weight=2e-1, focal_loss=_neg_loss)
Example #20
import numpy as np
import torch
from torch.autograd import Variable

from roi_align.roi_align import RoIAlign


def to_varabile(arr, requires_grad=False, is_cuda=True):
    tensor = torch.from_numpy(arr)
    if is_cuda:
        tensor = tensor.cuda()
    var = Variable(tensor, requires_grad=requires_grad)
    return var


# the data you want
is_cuda = False
image_data = np.tile(np.arange(7, dtype=np.float32), 7).reshape(7, 7)
image_data = image_data[np.newaxis, np.newaxis]
boxes_data = np.asarray([[0, 0, 3, 3]], dtype=np.float32)
box_index_data = np.asarray([0], dtype=np.int32)

image_torch = to_varabile(image_data, requires_grad=True, is_cuda=is_cuda)
boxes = to_varabile(boxes_data, requires_grad=False, is_cuda=is_cuda)
box_index = to_varabile(box_index_data, requires_grad=False, is_cuda=is_cuda)

# set transform_fpcoor to False to get plain crop_and_resize behavior
roi_align = RoIAlign(3, 3, transform_fpcoor=True)
print(roi_align(image_torch, boxes, box_index))
Example #21
def detect_attributes(image, yolo_dim, yolov3, encoder):
    '''Run YOLOv3 on an image and caption each detection with its clothing attributes.'''
    text_results = []
    image, orig_img, im_dim = prep_image(image, yolo_dim)
    im_dim = torch.FloatTensor(im_dim).repeat(1, 2)

    image_tensor = image.to(device)
    im_dim = im_dim.to(device)

    # Generate a caption from the image
    # prediction mode for yolo-v3
    detections = yolov3(image_tensor, device, True)
    detections = write_results(
        detections,
        args.confidence,
        device,
        num_classes=80,
        nms=True,
        nms_conf=args.nms_thresh,
    )
    # original image dimension --> im_dim
    # view_image(detections)

    os.system("clear")
    if not isinstance(detections, int):
        if detections.shape[0]:
            bboxs = detections[:, 1:5].clone()
            im_dim = im_dim.repeat(detections.shape[0], 1)
            scaling_factor = torch.min(yolo_dim / im_dim, 1)[0].view(-1, 1)

            detections[:, [1, 3]] -= (
                yolo_dim - scaling_factor * im_dim[:, 0].view(-1, 1)) / 2
            detections[:, [2, 4]] -= (
                yolo_dim - scaling_factor * im_dim[:, 1].view(-1, 1)) / 2

            detections[:, 1:5] /= scaling_factor

            small_object_ratio = torch.FloatTensor(detections.shape[0])

            for i in range(detections.shape[0]):
                detections[i, [1, 3]] = torch.clamp(detections[i, [1, 3]], 0.0,
                                                    im_dim[i, 0])
                detections[i, [2, 4]] = torch.clamp(detections[i, [2, 4]], 0.0,
                                                    im_dim[i, 1])

                object_area = (detections[i, 3] - detections[i, 1]) * (
                    detections[i, 4] - detections[i, 2])
                orig_img_area = im_dim[i, 0] * im_dim[i, 1]
                small_object_ratio[i] = object_area / orig_img_area

            detections = detections[small_object_ratio > 0.02]
            im_dim = im_dim[small_object_ratio > 0.02]

            if detections.size(0) > 0:
                feature = yolov3.get_feature()
                feature = feature.repeat(detections.size(0), 1, 1, 1)

                scaling_val = 16

                bboxs /= scaling_val
                bboxs = bboxs.round()
                bboxs_index = torch.arange(bboxs.size(0), dtype=torch.int)
                bboxs_index = bboxs_index.to(device)
                bboxs = bboxs.to(device)

                roi_align = RoIAlign(args.roi_size,
                                     args.roi_size,
                                     transform_fpcoor=True).to(device)
                roi_features = roi_align(feature, bboxs, bboxs_index)

                outputs = encoder(roi_features)

                for i in range(detections.shape[0]):

                    sampled_caption = []

                    for j in range(len(outputs) - 1):
                        max_index = torch.max(outputs[j][i].data, 0)[1]
                        word = attribute_pool[j][max_index]
                        sampled_caption.append(word)
                    # swap entries 10 and 11 (lower-body length and lower-body type)
                    c11 = sampled_caption[11]
                    sampled_caption[11] = sampled_caption[10]
                    sampled_caption[10] = c11

                    sentence = " ".join(sampled_caption)

                    print(str(i + 1) + ": " + sentence)
                    write(
                        detections[i],
                        orig_img,
                        sentence,
                        i + 1,
                        coco_classes,
                        colors,
                    )
                return text_results, orig_img
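
A hypothetical call site, assuming yolov3, encoder, device, args, attribute_pool, coco_classes and colors have been set up as in Example #16; the image path and input size are placeholders:

result = detect_attributes("sample.jpg", 416, yolov3, encoder)  # placeholder path and YOLO input size
if result is not None:                                          # the function only returns when something was detected
    text_results, annotated = result
    cv2.imshow("attributes", annotated)
    cv2.waitKey(0)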