class EmotionDetector:
    def __init__(self, model='VGG19', main_dir=main_dir_path,
                 face_detector='undefined', use_cuda=False, reliability=0.8):
        self.main_dir = main_dir
        self.face_detector = face_detector
        self.use_cuda = use_cuda
        self.reliability = reliability
        self.cut_size = 44
        self.transform_test = transforms.Compose([
            transforms.TenCrop(self.cut_size),
            transforms.Lambda(lambda crops: torch.stack(
                [transforms.ToTensor()(crop) for crop in crops])),
        ])
        self.class_names = [
            'Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'
        ]
        if model == 'VGG19':
            self.net = VGG('VGG19')
        elif model == 'Resnet18':
            self.net = ResNet18()
        self.checkpoint = torch.load(os.path.join(self.main_dir,
                                                  'pretrained_model', model,
                                                  'PrivateTest_model.t7'),
                                     map_location='cpu')
        self.net.load_state_dict(self.checkpoint['net'])
        if self.use_cuda:
            self.net.cuda()
        self.net.eval()

    def rgb2gray(self, rgb):
        # ITU-R BT.601 luma weights
        return np.dot(rgb[..., :3], [0.299, 0.587, 0.114])

    def _classify_face(self, face_img):
        '''
        Shared per-face pipeline: grayscale, resize to 48x48, ten-crop,
        average the network outputs over the crops, and threshold the
        top softmax score against self.reliability.
        '''
        gray = self.rgb2gray(face_img)
        gray = resize(gray, (48, 48), mode='symmetric').astype(np.uint8)
        img = gray[:, :, np.newaxis]
        img = np.concatenate((img, img, img), axis=2)
        img = Image.fromarray(img)
        inputs = self.transform_test(img)
        ncrops, c, h, w = np.shape(inputs)
        inputs = inputs.view(-1, c, h, w)
        if self.use_cuda:
            inputs = inputs.cuda()
        # torch.no_grad() replaces the deprecated Variable(..., volatile=True)
        with torch.no_grad():
            outputs = self.net(inputs)
        outputs_avg = outputs.view(ncrops, -1).mean(0)  # avg over crops
        score = F.softmax(outputs_avg, dim=0)
        _, predicted = torch.max(outputs_avg.data, 0)
        if torch.max(score) > self.reliability:
            return score, self.class_names[int(predicted.cpu().numpy())]
        return score, 'UNK'

    def detect_emotion_single_face(self, raw_img):
        '''
        Detect the facial emotion in an image containing a single face.
        '''
        return self._classify_face(raw_img)

    def detect_emotion_multiple_face(self, raw_img):
        '''
        Detect facial emotions in an image containing multiple faces.
        '''
        if isinstance(self.face_detector, MTCNN):
            bounding_boxes, _, _ = self.face_detector.align(raw_img)
        else:
            # TODO: add more face-detection models to experiment with
            print('No MTCNN face detector found.')
            return [], [], []
        scores = []
        predicteds = []
        for facebox in bounding_boxes:
            face_img = raw_img[int(facebox[1]):int(facebox[3]),
                               int(facebox[0]):int(facebox[2])]
            score, predicted = self._classify_face(face_img)
            scores.append(score)
            predicteds.append(predicted)
        return bounding_boxes, scores, predicteds

    def detect_emotion_from_faceboxes(self, faceboxes):
        '''
        Detect facial emotions for a list of already-cropped face images.
        '''
        scores = []
        predicteds = []
        for face_img in faceboxes:
            score, predicted = self._classify_face(face_img)
            scores.append(score)
            predicteds.append(predicted)
        return scores, predicteds
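# Example usage (a minimal sketch, not part of the module): the MTCNN
# construction, the checkpoint layout under `main_dir`, and the image file
# below are assumptions made for illustration.
#
#   from PIL import Image
#   import numpy as np
#
#   detector = EmotionDetector(model='VGG19',
#                              face_detector=MTCNN(),  # hypothetical construction
#                              use_cuda=False,
#                              reliability=0.8)
#   raw_img = np.asarray(Image.open('group_photo.jpg'))  # hypothetical file
#   boxes, scores, labels = detector.detect_emotion_multiple_face(raw_img)
#   for box, label in zip(boxes, labels):
#       print(box[:4], label)  # label is 'UNK' when max score <= reliability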
class CNNDetector(object):
    def __init__(self,
                 net_12_param_path=None,
                 net_48_param_path=None,
                 net_vgg_param_path=None,
                 use_cuda=True,
                 pthreshold=0.7,
                 rthreshold=0.9):
        if not use_cuda:
            self.device = torch.device('cpu')
        else:
            self.device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')
        # initialize to None so detect_face can test which nets are loaded
        self.net_12 = None
        self.net_48 = None
        self.net_vgg = None
        if net_12_param_path is not None:
            self.net_12 = Net12()
            self.net_12.load_state_dict(
                torch.load(net_12_param_path,
                           map_location=lambda storage, loc: storage))
            self.net_12.to(self.device)
            self.net_12.eval()
        if net_48_param_path is not None:
            self.net_48 = Net48()
            self.net_48.load_state_dict(
                torch.load(net_48_param_path,
                           map_location=lambda storage, loc: storage))
            self.net_48.to(self.device)
            self.net_48.eval()
        if net_vgg_param_path is not None:
            self.net_vgg = VGG('VGG19')
            self.net_vgg.load_state_dict(
                torch.load(net_vgg_param_path,
                           map_location=lambda storage, loc: storage))
            self.net_vgg.to(self.device)
            self.net_vgg.eval()
        self.pthreshold = pthreshold
        self.rthreshold = rthreshold

    def generate_stage(self, img):
        """
        Args:
            img: source image
        Rets:
            bounding boxes, numpy array, n x 5

        Generate face bounding box proposals using net-12.
        """
        proposals = list()
        downscaling_factor = 0.7
        current_height, current_width, _ = img.shape
        current_scale = 1.0
        # limit maximum height to 500
        if current_height > 500:
            current_scale = 500.0 / current_height
        receptive_field = 12
        stride = 2
        while True:
            # get the resized image at the current scale
            im_resized = imageproc.resize_image(img, current_scale)
            current_height, current_width, _ = im_resized.shape
            # stop once the image is smaller than net-12's receptive field
            if min(current_height, current_width) <= receptive_field:
                break
            # transpose hwc (numpy) to chw (tensor)
            feed_imgs = (
                transforms.ToTensor()(im_resized)).unsqueeze(0).float()
            # feed to net-12
            with torch.no_grad():
                feed_imgs = feed_imgs.to(self.device)
                bbox_class, bbox_regress = self.net_12(feed_imgs)
                bbox_class = bbox_class.cpu().squeeze(0).detach().numpy()
                bbox_regress = bbox_regress.cpu().squeeze(0).detach().numpy()
            # filter classes with threshold
            up_thresh_masked_index = np.where(bbox_class > self.pthreshold)
            up_thresh_masked_index = up_thresh_masked_index[1:3]
            filtered_results = np.vstack([
                # pixel coordinates of the receptive window
                np.round((stride * up_thresh_masked_index[1]) / current_scale),
                np.round((stride * up_thresh_masked_index[0]) / current_scale),
                np.round((stride * up_thresh_masked_index[1] +
                          receptive_field) / current_scale),
                np.round((stride * up_thresh_masked_index[0] +
                          receptive_field) / current_scale),
                # original bbox output from the network
                bbox_class[0, up_thresh_masked_index[0],
                           up_thresh_masked_index[1]],
                bbox_regress[:, up_thresh_masked_index[0],
                             up_thresh_masked_index[1]],
            ]).T
            keep_mask = imageproc.neighbour_supression(
                filtered_results[:, :5], 0.7, 'Union')
            filtered_results = filtered_results[keep_mask]
            current_scale *= downscaling_factor
            proposals.append(filtered_results)
        # aggregate proposals from the list
        proposals = np.vstack(proposals)
        keep_mask = imageproc.neighbour_supression(proposals[:, 0:5], 0.5,
                                                   'Union')
        proposals = proposals[keep_mask]
        if len(proposals) == 0:  # no proposal generated
            return None
        # convert multi-scale bboxes to unified bboxes at the original image scale
        receptive_window_width_pixels = proposals[:, 2] - proposals[:, 0] + 1
        receptive_window_height_pixels = proposals[:, 3] - proposals[:, 1] + 1
        bbox_aligned = np.vstack([
            proposals[:, 0] +
            proposals[:, 5] * receptive_window_width_pixels,   # upleft_x
            proposals[:, 1] +
            proposals[:, 6] * receptive_window_height_pixels,  # upleft_y
            proposals[:, 2] +
            proposals[:, 7] * receptive_window_width_pixels,   # downright_x
            proposals[:, 3] +
            proposals[:, 8] * receptive_window_height_pixels,  # downright_y
            proposals[:, 4],  # classes
        ])
        bbox_aligned = bbox_aligned.T
        return bbox_aligned

    def refine_stage(self, img, proposal_bbox):
        """
        Args:
            img: source image
            proposal_bbox: bounding box proposals from the generate stage
        Rets:
            bounding boxes, numpy array, n x 5

        Apply delta coordinates to bboxes using net-48.
        """
        if proposal_bbox is None:
            return None, None
        proposal_bbox = imageproc.convert_to_square(proposal_bbox)
        cropped_tmp_tensors = imageproc.bbox_crop(img, proposal_bbox)
        # feed to net-48 (plain tensors; Variable is deprecated)
        with torch.no_grad():
            feed_imgs = torch.stack(cropped_tmp_tensors)
            feed_imgs = feed_imgs.to(self.device)
            bbox_class, bbox_regress, landmark = self.net_48(feed_imgs)
            bbox_class = bbox_class.cpu().detach().numpy()
            bbox_regress = bbox_regress.cpu().detach().numpy()
            landmark = landmark.cpu().detach().numpy()
        # threshold
        up_thresh_masked_index = np.where(bbox_class > self.rthreshold)[0]
        boxes = proposal_bbox[up_thresh_masked_index]
        bbox_class = bbox_class[up_thresh_masked_index]
        bbox_regress = bbox_regress[up_thresh_masked_index]
        landmark = landmark[up_thresh_masked_index]
        # aggregate
        keep_mask = imageproc.neighbour_supression(boxes, 0.5, mode="Minimum")
        if len(keep_mask) == 0:
            return None, None
        proposals = boxes[keep_mask]
        bbox_class = bbox_class[keep_mask]
        bbox_regress = bbox_regress[keep_mask]
        landmark = landmark[keep_mask]
        receptive_window_width_pixels = proposals[:, 2] - proposals[:, 0] + 1
        receptive_window_height_pixels = proposals[:, 3] - proposals[:, 1] + 1
        # get new bounding boxes
        boxes_align = np.vstack([
            proposals[:, 0] +
            bbox_regress[:, 0] * receptive_window_width_pixels,   # upleft_x
            proposals[:, 1] +
            bbox_regress[:, 1] * receptive_window_height_pixels,  # upleft_y
            proposals[:, 2] +
            bbox_regress[:, 2] * receptive_window_width_pixels,   # downright_x
            proposals[:, 3] +
            bbox_regress[:, 3] * receptive_window_height_pixels,  # downright_y
            bbox_class[:, 0],
        ]).T
        # get facial landmarks
        align_landmark_topx = proposals[:, 0]
        align_landmark_topy = proposals[:, 1]
        landmark_align = np.vstack([
            align_landmark_topx +
            landmark[:, 0] * receptive_window_width_pixels,   # lefteye_x
            align_landmark_topy +
            landmark[:, 1] * receptive_window_height_pixels,  # lefteye_y
            align_landmark_topx +
            landmark[:, 2] * receptive_window_width_pixels,   # righteye_x
            align_landmark_topy +
            landmark[:, 3] * receptive_window_height_pixels,  # righteye_y
            align_landmark_topx +
            landmark[:, 4] * receptive_window_width_pixels,   # nose_x
            align_landmark_topy +
            landmark[:, 5] * receptive_window_height_pixels,  # nose_y
            align_landmark_topx +
            landmark[:, 6] * receptive_window_width_pixels,   # leftmouth_x
            align_landmark_topy +
            landmark[:, 7] * receptive_window_height_pixels,  # leftmouth_y
            align_landmark_topx +
            landmark[:, 8] * receptive_window_width_pixels,   # rightmouth_x
            align_landmark_topy +
            landmark[:, 9] * receptive_window_height_pixels,  # rightmouth_y
        ]).T
        return boxes_align, landmark_align

    def detect_face(self, img, atleastone=True):
        """
        Args:
            img: source image
            atleastone: whether a box covering the whole image should be
                returned when no face is found
        Rets:
            bounding boxes, numpy array
            landmarks, numpy array

        Detect faces in the image.
        """
        boxes_align = None
        landmark_align = None
        if self.net_12:
            boxes_align = self.generate_stage(img)
        if self.net_48:
            boxes_align, landmark_align = self.refine_stage(img, boxes_align)
        if boxes_align is None:
            if atleastone:
                boxes_align = np.array([[0, 0, img.shape[1], img.shape[0]]])
            else:
                boxes_align = np.array([])
        if landmark_align is None:
            landmark_align = np.array([])
        return boxes_align, landmark_align

    def crop_faces(self, img, bbox=None):
        """see imageproc.bbox_crop"""
        return imageproc.bbox_crop(img, bbox, totensor=False)

    def vgg_net(self, img):
        """
        Args:
            img: source image
        Rets:
            prob of each expression, in the order
            ['Angry', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral'],
            and the index of the predicted class

        Detect the facial expression in the image.
        """
        grey_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        grey_img = cv2.resize(grey_img, (48, 48)).astype(np.uint8)
        grey_img = grey_img[:, :, np.newaxis]
        grey_img = np.concatenate((grey_img, grey_img, grey_img), axis=2)
        receptive_field = 44
        # get ten crops at the corners and center
        tencrops = transforms.Compose([
            transforms.ToPILImage(),
            transforms.TenCrop(receptive_field),
            transforms.Lambda(lambda crops: torch.stack(
                [transforms.ToTensor()(crop) for crop in crops])),
        ])
        inputs = tencrops(grey_img)
        ncrops, c, h, w = np.shape(inputs)
        # feed to the VGG net
        with torch.no_grad():
            inputs = inputs.view(-1, c, h, w)
            inputs = inputs.to(self.device)
            outputs = self.net_vgg(inputs)
        # get mean value across all the crops
        outputs_avg = outputs.view(ncrops, -1).mean(0)
        probabilities = F.softmax(outputs_avg, dim=0)
        # max prob as the detection result
        _, predicted_class = torch.max(outputs_avg.data, 0)
        probabilities = probabilities.cpu().numpy()
        predicted_class = int(predicted_class.cpu().numpy())
        return probabilities, predicted_class
class DPP(object):
    def __init__(self, args):
        self.criterion = nn.CrossEntropyLoss().cuda()
        self.lr = args.lr
        self.epochs = args.epochs
        self.save_dir = './' + args.save_dir  # later change
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)
        if args.model == 'vgg16':
            self.model = VGG('VGG16', 0)
            self.optimizer = torch.optim.SGD(
                filter(lambda p: p.requires_grad, self.model.parameters()),
                lr=self.lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay)
            self.model = torch.nn.DataParallel(self.model)
            self.model.cuda()
        elif args.model == 'dpp_vgg16':
            self.model = integrated_kernel(args)
            self.optimizer = torch.optim.SGD(
                filter(lambda p: p.requires_grad, self.model.parameters()),
                lr=self.lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay)
            # parallelize, mirroring the vgg16 branch
            self.model = torch.nn.DataParallel(self.model)
            self.model.cuda()
        num_params = sum(p.numel() for p in self.model.parameters()
                         if p.requires_grad)
        print('The number of parameters of the model is', num_params)
        if args.save_load:
            location = args.save_location
            print("location", location)
            checkpoint = torch.load(location)
            self.model.load_state_dict(checkpoint['state_dict'])

    def train(self, train_loader, test_loader, graph):
        best_prec = 0
        for epoch in range(self.epochs):
            # test() leaves the model in eval mode, so re-enter train mode
            self.model.train()
            # fresh meters each epoch so the logged averages are per-epoch
            losses = AverageMeter()
            top1 = AverageMeter()
            # self.adjust_learning_rate(epoch)
            for k, (inputs, target) in enumerate(train_loader):
                # non_blocking replaces async=True (a keyword in Python 3.7+)
                target = target.cuda(non_blocking=True)
                input_var = inputs.cuda()
                target_var = target
                output = self.model(input_var)
                loss = self.criterion(output, target_var)
                # compute gradient and do an SGD step
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                # measure accuracy and record loss
                prec1 = self.accuracy(output.data, target)[0]
                losses.update(loss.item(), inputs.size(0))
                top1.update(prec1.item(), inputs.size(0))
            graph.train_loss(losses.avg, epoch, 'train_loss')
            graph.train_acc(top1.avg, epoch, 'train_acc')
            # test accuracy; checkpoint on improvement
            prec = self.test(test_loader, epoch, graph)
            if prec > best_prec:
                print("Acc", prec)
                best_prec = prec
                self.save_checkpoint(
                    {
                        'best_prec1': best_prec,
                        'state_dict': self.model.state_dict(),
                    },
                    filename=os.path.join(self.save_dir,
                                          'checkpoint_{}.tar'.format(epoch)))

    def test(self, test_loader, epoch, test_graph):
        self.model.eval()
        losses = AverageMeter()
        top1 = AverageMeter()
        with torch.no_grad():  # evaluation only, no gradients needed
            for k, (inputs, target) in enumerate(test_loader):
                target = target.cuda()
                inputs = inputs.cuda()
                # forward pass only
                output = self.model(inputs)
                loss = self.criterion(output, target)
                # measure accuracy and record loss
                prec1 = self.accuracy(output.data, target)[0]
                losses.update(loss.item(), inputs.size(0))
                top1.update(prec1.item(), inputs.size(0))
        test_graph.test_loss(losses.avg, epoch, 'test_loss')
        test_graph.test_acc(top1.avg, epoch, 'test_acc')
        return top1.avg

    def accuracy(self, output, target, topk=(1, )):
        maxk = max(topk)
        batch_size = target.size(0)
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            # reshape handles the non-contiguous slice of the transposed tensor
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

    def adjust_learning_rate(self, epoch):
        # decay the learning rate by 10x every 90 epochs
        self.lr = self.lr * (0.1**(epoch // 90))
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.lr

    def save_checkpoint(self, state, filename='checkpoint.pth.tar'):
        torch.save(state, filename)
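# Example usage (a minimal sketch, not part of the module): the argument
# values are assumptions, `graph` stands in for whatever logging/plotting
# object the training script passes (only the methods DPP calls are assumed),
# and the data loaders are supplied by the caller.
#
#   import argparse
#
#   args = argparse.Namespace(model='vgg16', lr=0.1, epochs=300,
#                             momentum=0.9, weight_decay=5e-4,
#                             save_dir='checkpoints', save_load=False,
#                             save_location='')
#   runner = DPP(args)
#   runner.train(train_loader, test_loader, graph)  # loaders/graph from caller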