def main():
    """Run the webcam demo: DensePose -> Sanitizer -> Tracker -> UV maps.

    Grabs frames from the default camera in an endless loop, pushes each
    frame through the processing pipeline, shows one window per extracted
    UV texture plus a combined debug window, and stops when Escape is
    pressed.

    Raises:
        AssertionError: if a frame cannot be read from the camera.
    """
    densepose = DensePoseWrapper()
    sanitizer = Sanitizer()
    sanitizer.loadModel("./models/Sanitizer.pth")
    tracker = Tracker()
    uvMapper = UVMapper()
    descriptionExtractor = DescriptionExtractor()

    # Open the camera last so that a failing model constructor cannot leave
    # the device open.
    cam = cv2.VideoCapture(0)
    try:
        while True:
            # Get image from webcam
            return_value, image = cam.read()
            assert return_value, "Failed to read from web camera"

            # White balance the image to get better color features
            image = white_balance(image)
            debugImage = image.copy()

            # Send image to DensePose
            people = densepose.extract(image)
            debugImage = densepose.renderDebug(debugImage, people)

            # Refine DensePose output to get actual people
            people = sanitizer.extract(people)
            debugImage = sanitizer.renderDebug(debugImage, alpha=0.2)

            # Track the people (which modifies the people variables)
            tracker.extract(people, True)
            debugImage = tracker.renderDebug(debugImage, people)

            # Extract UV map for each person
            peopleMaps = uvMapper.extract(people, image)
            peopleTextures = uvMapper.getPeopleTexture(peopleMaps)
            for i, texture in enumerate(peopleTextures):
                cv2.imshow("UV image " + str(i), texture)

            # Classify what the person is wearing (the result is not shown
            # in this demo, so it is not kept)
            descriptionExtractor.extract(peopleMaps)

            # Show image
            print("Show image")
            cv2.imshow("debug image", debugImage)

            # Quit on escape
            if cv2.waitKey(1) == 27:
                break
            print("")
    finally:
        # Release the device and close every window even when a frame
        # fails or the pipeline raises.
        cam.release()
        cv2.destroyAllWindows()
class ActionClassifier(DenseSense.algorithms.Algorithm.Algorithm): actions = { 4: "dance", 11: "sit", 14: "walk", 69: "hand wave", 12: "idle", # stand 17: "idle", # carry/hold (an object) 36: "idle", # lift/pick up 37: "idle", # listen 47: "idle", # put down } COCO_Datasets = ["val2014", "train2014", "val2017", "train2017"] AVA_Datasets = [ "ava_val", "ava_train", "ava_val_predictive", "ava_train_predictive" ] def __init__(self): print("Initiating ActionClassifier") super().__init__() self._modelPath = None self._AE_model = AutoEncoder() self._training = False def loadModel(self, modelPath): # TODO: load multiple models, refactor name self._modelPath = modelPath print("Loading ActionClassifier file from: " + self._modelPath) self._AE_model.load_state_dict( torch.load(self._modelPath, map_location=device)) def saveModel(self, modelPath): if modelPath is None: print("Don't know where to save model") self._modelPath = modelPath print("Saving ActionClassifier model to: " + self._modelPath), self._modelPath) def extract_ae(self, people, delta_time=None): S = _tensorify_people(people) if S.shape[0] == 0: return # Run prediction with torch.no_grad(): embeddings = self._AE_model.encode(S, delta_time) # Add prediction to people for i, embedding in enumerate(embeddings): people[i].pose_vector = embedding.detach().cpu().numpy() def _initTraining(self, learningRate, datasetName, useLMDB): self.datasetName = datasetName from DenseSense.algorithms.DensePoseWrapper import DensePoseWrapper from DenseSense.algorithms.Sanitizer import Sanitizer self.denseposeExtractor = DensePoseWrapper() self.sanitizer = Sanitizer() self.sanitizer.load_model(topDir + "/models/Sanitizer.pth") if datasetName in ActionClassifier.COCO_Datasets: print("Loading COCO dataset: " + datasetName) from pycocotools.coco import COCO from os import path annFile = topDir + '/annotations/instances_{}.json'.format( datasetName) self.cocoPath = topDir + '/data/{}'.format(datasetName) self.coco = COCO(annFile) 
personCatID = self.coco.getCatIds(catNms=['person'])[0] self.dataset = self.coco.getImgIds(catIds=personCatID) elif datasetName in ActionClassifier.AVA_Datasets: print("Loading AVA dataset: " + datasetName) import csv from collections import defaultdict from DenseSense.utils.YoutubeLoader import YoutubeLoader annFile = topDir + "/annotations/{}.csv".format( datasetName.replace("_predictive", "")) self.dataset = defaultdict(lambda: defaultdict(defaultdict)) with open(annFile, 'r') as csvFile: reader = csv.reader(csvFile) for row in reader: video, t, x1, y1, x2, y2, action, person = row actions = {action} if person in self.dataset[video][t]: actions = actions.union( self.dataset[video][t][person]["actions"]) self.dataset[video][t][person] = { "bbox": (x1, y1, x2, y2), "actions": actions } ordered_data = [] for key, video in self.dataset.items(): ordered_data.append((key, [])) for t, annotation in video.items(): ordered_data[-1][1].append((int(t), annotation)) ordered_data[-1][1].sort(key=lambda x: x[0]) self.dataset = ordered_data self.youtubeLoader = YoutubeLoader(verbose=False) for key, video in self.dataset: self.youtubeLoader.queue_video(key, video[0][0], video[-1][0]) self.current_video_index = 0 self.current_video_frame_index = 0 self.tracker = Tracker() else: raise Exception("Unknown dataset") self.useLMDB = useLMDB if useLMDB: self.lmdb = LMDBHelper("a", max_size=1028 * 1028 * 1028 * 32) self.lmdb.verbose = False self.optimizer = torch.optim.Adam(self._AE_model.parameters(), lr=learningRate) self.loss_function = torch.nn.BCELoss() def _load(self, index=None): # Load next if index is None if self.datasetName in ActionClassifier.COCO_Datasets: people = None # Load image from disk and process cocoImage = self.coco.loadImgs(self.dataset[index])[0] if self.useLMDB: people = self.lmdb.get("DensePoseWrapper_Sanitized_Coco", str(cocoImage["id"])) if people is None: image = cv2.imread(self.cocoPath + "/" + cocoImage["file_name"]) if image is None: raise 
Exception("Could not find image: " + str(index)) people = self.denseposeExtractor.extract(image) people = self.sanitizer.extract(people) if self.useLMDB:"DensePoseWrapper_Sanitized_Coco", str(cocoImage["id"]), people) return people, cocoImage elif self.datasetName in ActionClassifier.AVA_Datasets: data = None image = None people, frame_time, is_last = None, None, False key = self.dataset[self.current_video_index][0] if self.useLMDB: data = self.lmdb.get( "DensePoseWrapper_Sanitized_AVA", str(key) + "_" + str(self.current_video_frame_index)) if data is None: image, frame_time, is_last = self.youtubeLoader.get( self.current_video_index, self.current_video_frame_index) if image is None: people = [] frame_time = 0 else: people = self.denseposeExtractor.extract(image) people = self.sanitizer.extract(people) if self.useLMDB: # Save processed data "DensePoseWrapper_Sanitized_AVA", str(key) + "_" + str(self.current_video_frame_index), (people, frame_time, is_last)) else: people, frame_time, is_last = data timestamp = np.round(frame_time) ava_annotation = None sameTimestamp = [ v[1] for v in self.dataset[self.current_video_index][1] if v[0] == timestamp ] if len(sameTimestamp) == 1: ava_annotation = sameTimestamp[0] # To show the whole dataset as it's being downloaded if image is not None and True: if ava_annotation is not None: for k, p in ava_annotation.items(): bbox = np.array([ float(p["bbox"][0]), float(p["bbox"][1]), float(p["bbox"][2]), float(p["bbox"][3]) ]) p1 = bbox[:2] * np.array( [image.shape[1], image.shape[0]], dtype=np.float) p2 = bbox[2:] * np.array( [image.shape[1], image.shape[0]], dtype=np.float) image = cv2.rectangle(image, tuple(p1.astype(np.int32)), tuple(p2.astype(np.int32)), (20, 20, 200), 1) cv2.imshow("frame", image) cv2.waitKey(1) # Change increment video and frame if is_last: self.current_video_frame_index = 0 self.current_video_index += 1 if len(self.dataset) == self.current_video_index: self.current_video_index = 0 else: 
self.current_video_frame_index += 1 return people, frame_time, is_last, ava_annotation def trainAutoEncoder(self, epochs=100, learningRate=0.005, dataset="Coco", useLMDB=True, printUpdateEvery=40, visualize=0, tensorboard=False): self._training = True self._initTraining(learningRate, dataset, useLMDB) # Tensorboard setup if tensorboard or type(tensorboard) == str: from torch.utils.tensorboard import SummaryWriter if type(tensorboard) == str: writer = SummaryWriter(topDir + "/data/tensorboard/" + tensorboard) else: writer = SummaryWriter(topDir + "/data/tensorboard/") tensorboard = True # Start the training process total_iterations = len(self.dataset) visualize_counter = 0 open_windows = set() if self.datasetName in ActionClassifier.COCO_Datasets: print("Starting COCO dataset training") for epoch in range(epochs): epochLoss = np.float64(0) for i in range(total_iterations): people, annotation = self._load(i) S = _tensorify_people(people) if S.shape[0] == 0: continue # Run prediction embedding = self._AE_model.encode(S) out = self._AE_model.decode(embedding) # Optimize lossSize = self.loss_function(out, S) lossSize.backward() self.optimizer.step() self.optimizer.zero_grad() lossSize = lossSize.cpu().item() # Give feedback of training process epochLoss += lossSize / total_iterations visualize_counter += 1 if (i - 1) % printUpdateEvery == 0: print("Iteration {} / {}, epoch {} / {}".format( i, total_iterations, epoch, epochs)) print("Loss size: {}\n".format(lossSize / printUpdateEvery)) if visualize != 0 and visualize <= visualize_counter: visualize_counter = 0 new_open_windows = set() for index, _ in enumerate(S): inpS = (S[index, 0].detach()).cpu().to( torch.float).numpy() outS = (out[index, 0].detach()).cpu().to( torch.float32).numpy() emb = embedding.detach().cpu().numpy() debug_image = self._get_ae_from_embedding( index, inpS, emb, outS, None) cv2.imshow("person " + str(index), debug_image) new_open_windows.add("person " + str(index)) break # Loss size:".format( 
epoch, epochs, epochLoss)) self.saveModel(self._modelPath) elif self.datasetName in ActionClassifier.AVA_Datasets: # Unfortunately, needs to run through the whole AVA dataset to determine the size in frames print("Going through ava dataset once to determine the size") total_iterations = 0 for video_i in range(len(self.dataset)): is_last = False while not is_last: people, frame_time, is_last, annotation = self._load( ) # Load next total_iterations += 1 if (total_iterations - 1) % 500 == 0: print("Frame/iteration {} (video {} / {})".format( total_iterations, video_i, len(self.dataset))) print("Total number of iterations are {}".format(total_iterations)) print("Starting AVA dataset training") last_frame_time = None last_people = [] S_next = None current_video = 0 was_last = False for epoch in range(epochs): epochLoss = np.float64(0) for i in range(total_iterations): people, frame_time, is_last, annotation = self._load( ) # Load next current_video += is_last if "predictive" in self.datasetName: # Track the next frame self.tracker.extract(people, time_now=frame_time) if is_last: # If new video next self.tracker = Tracker() last_frame_time = None # Only save the people who exist in all frames old_ids = list(map(lambda p:, last_people)) new_ids = list(map(lambda p:, people)) old_people = list( filter(lambda p: in new_ids, last_people.copy())) new_people = list( filter(lambda p: in old_ids, people.copy())) # Filter old Ss S = _tensorify_people(old_people, True) S_next = _tensorify_people(new_people, False) last_people = people else: frame_time = last_frame_time S = _tensorify_people(people) if S.shape[0] == 0: continue delta_time = 0 if last_frame_time is not None and was_last is False: delta_time = frame_time - last_frame_time last_frame_time = frame_time # Run prediction embedding = self._AE_model.encode(S, delta_time) out = self._AE_model.decode(embedding) # Optimize if "predictive" in self.datasetName: lossSize = self.loss_function(out, S_next) else: lossSize = 
self.loss_function(out, S) lossSize.backward() self.optimizer.step() self.optimizer.zero_grad() lossSize = lossSize.cpu().item() # Give feedback of training process epochLoss += lossSize / total_iterations visualize_counter += 1 was_last = is_last if (i - 1) % printUpdateEvery == 0: print("Iteration {} / {} (video {}/{}), epoch {} / {}". format(i, total_iterations, current_video, len(self.dataset), epoch, epochs)) print("Loss size: {}\n".format(lossSize / printUpdateEvery)) if visualize != 0 and visualize <= visualize_counter: visualize_counter = 0 new_open_windows = set() for index, _ in enumerate(S): inpS = (S[index, 0].detach()).cpu().to( torch.float).numpy() outS = (out[index, 0].detach()).cpu().to( torch.float32).numpy() emb = embedding.detach().cpu().numpy() debug_image = self._get_ae_from_embedding( index, inpS, emb, outS, S_next) cv2.imshow("person " + str(index), debug_image) new_open_windows.add("person " + str(index)) break # Only show one person for window in open_windows.difference( new_open_windows): cv2.destroyWindow(window) open_windows = new_open_windows cv2.waitKey(1) if tensorboard: absI = i + epoch * total_iterations writer.add_scalar("Loss size", lossSize, absI) print("Finished epoch {} / {}. 
def main():
    """Run the full webcam demo including clothing labels and pose embedding.

    Each frame from the default camera goes through DensePose, the
    Sanitizer, the Tracker, the UV mapper, the DescriptionExtractor and the
    ActionClassifier auto-encoder. For every person two windows are shown
    (UV/label composite and auto-encoder debug image); windows belonging to
    people that are no longer present are closed. Escape quits.

    Raises:
        AssertionError: if a frame cannot be read from the camera.
    """
    densepose = DensePoseWrapper()
    sanitizer = Sanitizer()
    sanitizer.load_model("./models/Sanitizer.pth")
    tracker = Tracker()
    uvMapper = UVMapper()
    descriptionExtractor = DescriptionExtractor()
    descriptionExtractor.loadModel("./models/DescriptionExtractor.pth")
    actionClassifier = ActionClassifier()
    actionClassifier.loadModel("./models/ActionClassifier_AutoEncoder.pth")

    cam = cv2.VideoCapture(0)
    frameIndex = 0
    frame_time = time.time()
    oldOpenWindows = set()

    try:
        while True:
            # Get image from webcam
            return_value, image = cam.read()
            assert return_value, "Failed to read from web camera"

            # Sample the clock once so delta_time and frame_time agree
            now = time.time()
            delta_time = now - frame_time
            frame_time = now

            # White balance the image to get better color features
            image = white_balance(image)
            debugImage = image.copy()

            # Send image to DensePose
            people = densepose.extract(image)
            debugImage = densepose.renderDebug(debugImage, people)
            print("DensePose people:", len(people))

            # Refine DensePose output to get actual people
            people = sanitizer.extract(people)
            debugImage = sanitizer.renderDebug(debugImage, people, alpha=0.2)
            print("Sanitizer people", len(people))

            # Track the people (which modifies the people variables)
            tracker.extract(people, True)
            debugImage = tracker.renderDebug(debugImage, people)
            print("Tracker people", len(people))

            # Extract UV map for each person
            peopleMaps = uvMapper.extract(people, image)
            peopleTextures = uvMapper.getPeopleTexture(peopleMaps)

            # Classify what the person is wearing; the call is made right
            # before getLabelImage(), and its return value is not used here
            descriptionExtractor.extract(peopleMaps)
            clothingImages = descriptionExtractor.getLabelImage()

            # Get pose embedding
            actionClassifier.extract_ae(people, delta_time)
            debugACAE = actionClassifier.get_ae_debug(people)

            # Per person window management
            newOpenWindows = set()
            for i, person in enumerate(people):
                # NOTE(review): the identifier used in the window titles was
                # lost from the original text; presumably the tracker's
                # per-person id - confirm. Falls back to the frame-local
                # index when no such attribute exists.
                personKey = str(getattr(person, "id", i))

                # Show UV map and label
                S_ROI = (person.I * (255 / 25)).astype(np.uint8)
                S_ROI = cv2.applyColorMap(S_ROI, cv2.COLORMAP_PARULA)
                S_ROI = cv2.resize(S_ROI, (160, 160))

                personWindow = cv2.resize(peopleTextures[i],
                                          (int(5 / 3 * 160), 160))

                # Thin vertical strip filled with person.color
                coloredSlice = np.zeros((160, 3, 3), dtype=np.uint8)
                coloredSlice[:, :] = person.color
                personWindow = np.hstack(
                    (coloredSlice, S_ROI, personWindow, clothingImages[i]))

                # View window
                windowName = "UV image " + personKey
                newOpenWindows.add(windowName)
                cv2.imshow(windowName, personWindow)
                cv2.resizeWindow(windowName, 600, 600)

                # ... and a window for ac ae
                windowName = "ActionClassifier_AutoEncoder image " + personKey
                newOpenWindows.add(windowName)
                aeImage = debugACAE[i]
                cv2.imshow(
                    windowName,
                    cv2.resize(aeImage,
                               (aeImage.shape[1] * 3, aeImage.shape[0] * 3)))

            # Close windows that were shown last frame but not this one
            for staleWindow in oldOpenWindows - newOpenWindows:
                cv2.destroyWindow(staleWindow)
            oldOpenWindows = newOpenWindows

            # Show image
            print("Show frame:", frameIndex, "\n")
            cv2.imshow("debug image", debugImage)
            frameIndex += 1

            # Quit on escape
            if cv2.waitKey(1) == 27:
                break
    finally:
        # Release the device and close every window even when a frame
        # fails or the pipeline raises.
        cam.release()
        cv2.destroyAllWindows()
class DescriptionExtractor(DenseSense.algorithms.Algorithm.Algorithm): iteration = 0 availableLabels = { 0: "none", 1: "short sleeve top", 2: "long sleeve top", 3: "short sleeve outwear", 4: "long sleeve outwear", 5: "vest", 6: "sling", 7: "shorts", 8: "trousers", 9: "skirt", 10: "short sleeve dress", 11: "long sleeve dress", 12: "dress vest", 13: "sling dress" } # 0 : none # 1 : trousers # 2 : R hand # 3 : L hand # 4 : R foot # 5 : L foot # 6 : R thigh # 7 : L thigh # 8 : R calf # 9 : L calf # 10 : L upper arm # 11 : R upper arm # 12 : L lower arm # 13 : R lower arm # 14 : head labelColorCheck = { 0: [], 1: [1, 10, 11], 2: [1, 10, 11, 12, 13], 3: [1, 10, 11], 4: [1, 10, 11, 12, 13], 5: [1, 10, 11], 6: [1, 10, 11], 7: [6, 7], 8: [6, 7, 8, 9], 9: [6, 7], 10: [1, 10, 11], 11: [1, 10, 11, 12, 13], 12: [1, 10, 11], 13: [1, 10, 11] } colors = [((255, 255, 255), "white"), ((210, 209, 218), "white"), ((145, 164, 164), "white"), ((169, 144, 135), "white"), ((197, 175, 177), "white"), ((117, 126, 115), "white"), ((124, 126, 129), "white"), ((0, 0, 0), "black"), ((10, 10, 10), "black"), ((1, 6, 9), "black"), ((5, 10, 6), "black"), ((18, 15, 11), "black"), ((18, 22, 9), "black"), ((16, 16, 14), "black"), ((153, 153, 0), "yellow"), ((144, 115, 99), "pink"), ((207, 185, 174), "pink"), ((206, 191, 131), "pink"), ((208, 179, 54), "pink"), ((202, 19, 43), "red"), ((206, 28, 50), "red"), ((82, 30, 26), "red"), ((156, 47, 35), "orange"), ((126, 78, 47), "wine red"), ((74, 72, 77), "green"), ((31, 38, 38), "green"), ((40, 52, 79), "green"), ((100, 82, 116), "green"), ((8, 17, 55), "green"), ((29, 31, 37), "dark green"), ((46, 46, 36), "blue"), ((29, 78, 60), "blue"), ((74, 97, 85), "blue"), ((60, 68, 67), "blue"), ((181, 195, 232), "neon blue"), ((40, 148, 184), "bright blue"), ((210, 40, 69), "orange"), ((66, 61, 52), "gray"), ((154, 120, 147), "gray"), ((124, 100, 86), "gray"), ((46, 55, 46), "gray"), ((119, 117, 122), "gray"), ((88, 62, 62), "brown"), ((60, 29, 17), "brown"), 
((153, 50, 204), "purple"), ((77, 69, 30), "purple"), ((153, 91, 14), "violet"), ((207, 185, 151), "beige")] colorsHSV = None class Network(nn.Module): def __init__(self, labels): # 0 is None, and not trained on anyways continue label = self.availableLabels[i] info = {"activation": value} if determineColorThreshold < value: # If certainty is above threshold, take the time to calculate the average color averageOfAreas = np.zeros(3, dtype=np.int64) relevantAreas = torch.as_tensor( self.labelColorCheck[i], dtype=torch.int64).to(device) nonBlackAreas = 0 for areaIndex in relevantAreas: if (averages[areaIndex] == -1).all(): # Calculate average relevantPixels = peopleMapsDevice[personIndex, areaIndex, :, :] relevantPixels = relevantPixels[ torch.sum(relevantPixels, axis=2) != 0] if relevantPixels.shape[0] == 0: # All black averages[areaIndex] = np.zeros(3) continue average = relevantPixels.mean( axis=0).cpu().numpy().astype(np.uint8) averages[areaIndex] = average nonBlackAreas += 1 averageOfAreas += averages[areaIndex] averageOfAreas = (averageOfAreas / float(nonBlackAreas)).astype(np.uint8) info.update(self._findColorName(averageOfAreas)) labels[label] = info self.peopleLabels.append(labels) return self.peopleLabels def train(self, epochs=100, learningRate=0.005, dataset="Coco", useDatabase=True, printUpdateEvery=40, visualize=False, tensorboard=False): self._training = True self._initTraining(learningRate, dataset, useDatabase) # Deal with tensorboard if tensorboard or type(tensorboard) == str: from torch.utils.tensorboard import SummaryWriter if type(tensorboard) == str: writer = SummaryWriter(topDir + "/data/tensorboard/" + tensorboard) else: writer = SummaryWriter(topDir + "/data/tensorboard/") tensorboard = True def findBestROI(ROIs, label): bestMatch = 0 bestIndex = -1 for i, ROI in enumerate(ROIs): lbox = np.array(label["bbox"]) larea = lbox[2:] - lbox[:2] larea = larea[0] * larea[1] rbox = ROI.bounds rarea = rbox[2:] - rbox[:2] rarea = rarea[0] * rarea[1] SI = 
np.maximum(0, np.minimum(lbox[2], rbox[2]) - np.maximum(lbox[0], rbox[0])) * \ np.maximum(0, np.minimum(lbox[3], rbox[3]) - np.maximum(lbox[1], rbox[1])) SU = larea + rarea - SI overlap = SI / SU if bestMatch < overlap and SU != 0: bestMatch = overlap bestIndex = i return bestIndex Iterations = len(self.dataset) print("Starting training") for epoch in range(epochs): epochLoss = np.float64(0) for i in range(Iterations): ROIs, peopleTextures, labels = self._load(i) # Loss size:".format( epoch, epochs, epochLoss)) self.saveModel(self.modelPath) self._training = False def getLabelImage(self): images = [] for personLabel in self.peopleLabels: # Sort labels by score labels = sorted(list(personLabel.items()), key=lambda x: x[1]["activation"], reverse=True) # Create image image = np.zeros((160, 210, 3)) for i, label in enumerate(labels): name, classification = label text = "{0:4d}% {1}".format( int(classification["activation"] * 100), name) color = (255, 255, 255) if classification[ "activation"] < 0.75: # FIXME: magic number, tune color = (128, 128, 128) image = cv2.putText(image, text, (0, 12 + 12 * i), cv2.FONT_HERSHEY_DUPLEX, .3, color, 1, cv2.LINE_AA) # Add color if "bestMatch" in classification: colorText = classification["bestMatch"][1] colorTextColor = classification["color"] colorTextColor = (int(colorTextColor[0]), int(colorTextColor[1]), int(colorTextColor[2])) image = cv2.putText(image, colorText, (150, 12 + 12 * i), cv2.FONT_HERSHEY_DUPLEX, .3, colorTextColor, 1, cv2.LINE_AA) images.append(image.astype(np.uint8)) return images def _load(self, index): cocoImage = self.dataset[index] ROIs = None if self.useDatabase: ROIs = self.lmdb.get("DensePoseWrapper_Sanitized_deepfashion2", str(cocoImage["id"])) if ROIs is None: ROIs = self.denseposeExtractor.extract(cocoImage[0]) ROIs = self.sanitizer.extract(ROIs) if self.useDatabase:"DensePoseWrapper_Sanitized_deepfashion2", str(cocoImage["id"]), ROIs) peopleTextures = None if self.useDatabase: peopleTextures = 
self.lmdb.get("UVMapper_deepfashion2", str(index)) if peopleTextures is None: peopleTextures = self.uvMapper.extract(ROIs, cocoImage[0]) if self.useDatabase:"UVMapper_deepfashion2", str(index), peopleTextures) return ROIs, peopleTextures, cocoImage[1] def _findColorName(self, color): b = color[0] g = color[1] r = color[2] # This prints the color colored in the terminal colorRepr = '\033[{};2;{};{};{}m'.format(38, r, g, b) \ + "rgb("+str(r)+", "+str(g)+", "+str(b)+")"+'\033[0m' # Get nearest color name HSVobj = convert_color(sRGBColor(r, g, b), HSVColor) nearestIndex = -1 diffMin = 100000 for i in range(len(self.colorsHSV)): colEntry = self.colorsHSV[i] d = HSVobj.hsv_h - colEntry.hsv_h dh = min(abs(d), 360 - abs(d)) / 180.0 ds = abs(HSVobj.hsv_s - colEntry.hsv_s) dv = abs(HSVobj.hsv_v - colEntry.hsv_v) / 255.0 diff = np.sqrt(dh * dh + ds * ds + dv * dv) if diff < diffMin: diffMin = diff nearestIndex = i return { "color": tuple(color), "colorDistance": diffMin, "coloredStr": colorRepr, "bestMatch": self.colors[nearestIndex] }
print(color["color"]+" "+color["coloredStr"]) labels[label] = info labelsPeople.append(labels) return labelsPeople def train(self, epochs=100, learningRate=0.005, dataset="Coco", useDatabase=True, printUpdateEvery=40, visualize=False, tensorboard=False): self._training = True self._initTraining(learningRate, dataset, useDatabase) # Deal with tensorboard if tensorboard or type(tensorboard) == str: from torch.utils.tensorboard import SummaryWriter if type(tensorboard) == str: writer = SummaryWriter("./data/tensorboard/" + tensorboard) else: writer = SummaryWriter("./data/tensorboard/") tensorboard = True def findBestROI(ROIs, label): bestMatch = 0 bestIndex = -1 for i, ROI in enumerate(ROIs): lbox = np.array(label["bbox"]) larea = lbox[2:] - lbox[:2] larea = larea[0] * larea[1] rbox = ROI.bounds rarea = rbox[2:] - rbox[:2] rarea = rarea[0] * rarea[1] SI = np.maximum(0, np.minimum(lbox[2], rbox[2]) - np.maximum(lbox[0], rbox[0])) * \ np.maximum(0, np.minimum(lbox[3], rbox[3]) - np.maximum(lbox[1], rbox[1])) SU = larea + rarea - SI overlap = SI / SU if bestMatch < overlap and SU != 0: bestMatch = overlap bestIndex = i return bestIndex Iterations = len(self.dataset) print("Starting training") for epoch in range(epochs): epochLoss = np.float64(0) for i in range(Iterations): ROIs, peopleTextures, labels = self._load(i) # Figure out what ROI belongs to what label groundtruth = np.zeros((len(ROIs), 14), dtype=np.float32) for label in labels: mostMatching = findBestROI(ROIs, label) if mostMatching != -1: groundtruth[mostMatching][label["category_id"]] = 1 # Most items in this dataset will be bypassed because no people were found or overlapping with gt if len(ROIs) == 0 or not np.any(groundtruth != 0): continue groundtruth = torch.from_numpy(groundtruth).to(device) # Apply noise to peopleTextures noise = np.random.randn(*peopleTextures.shape) * 5 b = peopleTextures.astype(np.int32) peopleTextures = peopleTextures.astype( np.int32) + noise.astype(np.int32) peopleTextures = 
np.clip(peopleTextures, 0, 255) peopleTextures = peopleTextures.astype(np.uint8) peopleTextures = torch.Tensor(peopleTextures).to(device) predictions = self.classifier.forward(peopleTextures) print(groundtruth) print(predictions) print("\n") lossSize = self.lossFunction(predictions, groundtruth) lossSize.backward() self.optimizer.step() self.optimizer.zero_grad() lossSize = lossSize.cpu().item() epochLoss += lossSize / Iterations if (i - 1) % printUpdateEvery == 0: print("Iteration {} / {}, epoch {} / {}".format( i, Iterations, epoch, epochs)) print("Loss size: {}\n".format(lossSize / printUpdateEvery)) if tensorboard: absI = i + epoch * Iterations writer.add_scalar("Loss size", lossSize, absI) # class Sanitizer(DenseSense.algorithms.Algorithm.Algorithm): # UNet, inspired by # But with a fully connected layer in the middle class MaskGenerator(nn.Module): def __init__(self): super(Sanitizer.MaskGenerator, self).__init__() self.dconv1 = nn.Sequential( nn.Conv2d(1, 8, 3, padding=2), nn.LeakyReLU(inplace=True), ) self.dconv2 = nn.Sequential( nn.Conv2d(8, 4, 3, padding=1), nn.LeakyReLU(inplace=True), ) self.dconv3 = nn.Sequential( nn.Conv2d(3, 1, 3, padding=1), nn.LeakyReLU(inplace=True), ) self.fcImg = nn.Linear(14*14*2+2, 14*14) self.maxpool = nn.MaxPool2d(2) self.upsample1 = nn.Upsample(size=(29, 29), mode="bilinear") self.upsample2 = nn.Upsample(size=(56, 56), mode="bilinear") self.sigmoid = nn.Sigmoid() self.leakyReLU = nn.LeakyReLU() def forward(self, people): if len(people) == 0: return np.array([]), torch.Tensor([]).to(device) # Send data to device S = torch.Tensor(len(people), 1, 56, 56) b = torch.Tensor(len(people), 2) for i in range(len(people)): person = people[i] S[i][0] = torch.from_numpy(person.S) bnds = person.bounds area = np.power(np.sqrt((bnds[2] - bnds[0]) * (bnds[3] - bnds[1])), 0.2) if bnds[3] == bnds[1]: aspect = 0 else: aspect = (bnds[2] - bnds[0]) / (bnds[3] - bnds[1]) b[i] = torch.Tensor([area, aspect]) S = b = batchSize = S.shape[0] # 
Normalize input x = S.clone() x[0 < x] = x[0 < x] / 15.0 * 0.2 + 0.8 # Convolutions x = self.dconv1(x) # 1 -> 8, 56x56 -> 58x58 x = self.maxpool(x) # 58x58 -> 29x29 conv = self.dconv2(x) # 8 -> 4 x = self.maxpool(conv[:, :2]) # 29x29 -> 14x14 # Fully connected layer x = x.view(batchSize, 14*14*2) x =[x, b], dim=1) # Image and bbox info x = self.fcImg(x) x = self.leakyReLU(x) x = x.view(batchSize, 1, 14, 14) # Merge fully connected with past convolution calculation x = self.upsample1(x) # 14x14 -> 29x29 x =[x, conv[:, 2:]], dim=1) x = self.dconv3(x) # 3 -> 1 x = self.sigmoid(x) x = self.upsample2(x) # 29x29 -> 56x56 return x, S def __init__(self): super().__init__() # Generate and maybe load mask generator model() self.maskGenerator = Sanitizer.MaskGenerator() self.modelPath = None self._training = False self._trainingInitiated = False self._ROI_masks = torch.Tensor() self._ROIs = torch.Tensor() self._ROI_bounds = np.array([]) self._overlappingROIs = np.array([]) self._overlappingROIsValues = np.array([]) def loadModel(self, modelPath): self.modelPath = modelPath print("Loading Sanitizer MaskGenerator file from: " + self.modelPath) self.maskGenerator.load_state_dict(torch.load(self.modelPath, map_location=device)) def saveModel(self, modelPath): if modelPath is None: print("Don't know where to save model") self.modelPath = modelPath print("Saving Sanitizer MaskGenerator model to: "+self.modelPath), self.modelPath) def _initTraining(self, learningRate, dataset, useDatabase): # Dataset is COCO print("Initiating training of Sanitizer MaskGenerator") print("Loading COCO") from pycocotools.coco import COCO from os import path # TODO: support other data sets than Coco annFile = './annotations/instances_{}.json'.format(dataset) self.cocoPath = './data/{}'.format(dataset) self.coco = COCO(annFile) self.personCatID = self.coco.getCatIds(catNms=['person'])[0] self.cocoImageIds = self.coco.getImgIds(catIds=self.personCatID) def isNotCrowd(imgId): annIds = 
self.coco.getAnnIds(imgIds=imgId, catIds=self.personCatID, iscrowd=False) annotation = self.coco.loadAnns(annIds)[0] return not annotation["iscrowd"] self.cocoImageIds = list(filter(isNotCrowd, self.cocoImageIds)) self.cocoOnDisk = path.exists(self.cocoPath) print("Coco dataset size: {}".format(len(self.cocoImageIds))) print("Coco images found on disk:", self.cocoOnDisk) # Init LMDB_helper if useDatabase: self.lmdb = LMDBHelper("a") self.lmdb.verbose = False # Init loss function and optimizer self.optimizer = torch.optim.Adam(self.maskGenerator.parameters(), lr=learningRate, amsgrad=True) self.lossFunction = torch.nn.MSELoss() # Init DensePose extractor self.denseposeExtractor = DensePoseWrapper() def extract(self, people): # Generate masks for all ROIs (people) using neural network model with torch.no_grad(): self._generateMasks(people) if len(self._ROI_masks) == 0: return people # Multiply masks with with segmentation mask from DensePose masked = self._ROI_masks*self._ROIs # Find overlapping ROIs overlaps, overlapLow, overlapHigh = self._overlappingMatrix( self._ROI_bounds.astype(np.int32), self._ROI_bounds.astype(np.int32) ) overlaps[np.triu_indices(overlaps.shape[0])] = False overlapsInds = np.array(list(zip(*np.where(overlaps)))) overlapsCorr = np.full_like(overlaps, 0, dtype=np.float) if overlapsInds.shape[0] != 0: for a, b in overlapsInds: # For every overlap # Extract part that overlaps from mask and make sizes match to smallest dim xCoords = np.array([overlapLow[0][a, b], overlapHigh[0][a, b]]) yCoords = np.array([overlapLow[1][a, b], overlapHigh[1][a, b]]) aMask = self._getTransformedROI(masked[a, 0], self._ROI_bounds[a], xCoords, yCoords) bMask = self._getTransformedROI(masked[b, 0], self._ROI_bounds[b], xCoords, yCoords) aArea = aMask.shape[0]*aMask.shape[1] bArea = bMask.shape[0]*bMask.shape[1] if aArea < bArea: bMask = bMask.unsqueeze(0) bMask = F.adaptive_avg_pool2d(bMask, aMask.shape)[0] else: aMask = aMask.unsqueeze(0) aMask = 
F.adaptive_avg_pool2d(aMask, bMask.shape)[0] # Calculate correlation aMean = aMask.mean() bMean = bMask.mean() correlation = torch.sum((aMask-aMean)*(bMask-bMean))/(aMask.shape[0]*aMask.shape[1]-1) overlapsCorr[a, b] = correlation # Find best disjoint sets of overlapping ROIs threshold = 0.06 # Must be above 0 goodCorrelations = np.argwhere(threshold < overlapsCorr) sortedCorrelations = overlapsCorr[goodCorrelations[:, 0], goodCorrelations[:, 1]].argsort() goodCorrelations = goodCorrelations[sortedCorrelations] overlapsCorr += overlapsCorr.T coupled = {} def getBiPotential(a, diff): potential = 0 for bOther in np.argwhere(overlapsCorr[diff] != 0): bOther = bOther[0] if bOther in coupled[a][0]: potential += overlapsCorr[a, bOther] return potential for a, b in goodCorrelations: aIn = a in coupled bIn = b in coupled if aIn: if bIn: potential = overlapsCorr[a, b] for diff in coupled[b][0]: potential += getBiPotential(a, diff) if 0 < potential: coupled[a][0].update(coupled[b][0]) for diff in coupled[b][0]: coupled[diff] = coupled[a] coupled[a][1] += potential else: potential = overlapsCorr[a, b] + getBiPotential(a, b) if 0 < potential: coupled[a][0].add(b) coupled[a][1] += potential coupled[b] = coupled[a] elif bIn: potential = overlapsCorr[b, a] + getBiPotential(b, a) if 0 < potential: coupled[b][0].add(a) coupled[b][1] += potential coupled[a] = coupled[b] else: n = [{a, b}, overlapsCorr[a, b]] coupled[a] = n coupled[b] = n newPeople = [] # Update all people data their data while len(coupled) != 0: instance = next(iter(coupled)) instances = list(coupled[instance][0]) for i in instances: del coupled[i] instances = list(map(lambda i: people[i], instances)) instances[0].merge(instances[1:]) newPeople.append(instances[0]) return newPeople def train(self, epochs=100, learningRate=0.005, dataset="Coco", useDatabase=True, printUpdateEvery=40, visualize=False, tensorboard=False): self._training = True if not self._trainingInitiated: self._initTraining(learningRate, dataset, 
useDatabase) if tensorboard or type(tensorboard) == str: from torch.utils.tensorboard import SummaryWriter if type(tensorboard) == str: writer = SummaryWriter("./data/tensorboard/"+tensorboard) else: writer = SummaryWriter("./data/tensorboard/") tensorboard = True # dummy_input = torch.Tensor(5, 1, 56, 56) # writer.add_graph(self.maskGenerator, dummy_input) # writer.close() Iterations = len(self.cocoImageIds) meanPixels = [] print("Starting training") for epoch in range(epochs): epochLoss = np.float64(0) interestingImage = None interestingMeasure = -100000 for i in range(Iterations): # Load instance of COCO dataset cocoImage, image = self._getCocoImage(i) if image is None: # FIXME print("Image is None??? Skipping.", i) print(cocoImage) continue # Get annotation annIds = self.coco.getAnnIds(imgIds=cocoImage["id"], catIds=self.personCatID, iscrowd=False) annotation = self.coco.loadAnns(annIds) # Draw each person in annotation to separate mask segs = [] seg_bounds = [] for person in annotation: mask = np.zeros(image.shape[0:2], dtype=np.uint8) for s in person["segmentation"]: s = np.reshape(np.array(s, dtype=np.int32), (-2, 2)) cv2.fillPoly(mask, [s], 1) segs.append(mask) bbox = person["bbox"] seg_bounds.append(np.array([bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3]])) seg_bounds = np.array(seg_bounds, dtype=np.int32) # Get DensePose data from DB or Extractor generated = False ROIs = None if useDatabase: ROIs = self.lmdb.get(DensePoseWrapper, "coco" + str(cocoImage["id"])) if ROIs is None: ROIs = self.denseposeExtractor.extract(image) generated = True if useDatabase and generated:, "coco" + str(cocoImage["id"]), ROIs) # Run prediction self._generateMasks(ROIs) if len(self._ROI_masks) == 0: continue if tensorboard: means = [torch.mean(ROI).detach().cpu().numpy() for ROI in self._ROI_masks] meanPixels.append(sum(means)/len(means)) # Find overlaps between bboxes of segs and ROIs overlaps, overlapLow, overlapHigh = self._overlappingMatrix( 
seg_bounds.astype(np.int32), self._ROI_bounds.astype(np.int32) ) overlapsInds = np.array(list(zip(*np.where(overlaps)))) if overlapsInds.shape[0] == 0: continue # Get average value where there is overlap between COCO-mask for each person and predictions for contentAverage = {} for a, b in overlapsInds: # For every overlap xCoords = np.array([overlapLow[0][a, b], overlapHigh[0][a, b]]) yCoords = np.array([overlapLow[1][a, b], overlapHigh[1][a, b]]) ROI_mask = self._getTransformedROI(self._ROI_masks[a, 0], self._ROI_bounds[a], xCoords, yCoords) # Segmentation overlap area segOverlap = segs[b][yCoords[0]:yCoords[1], xCoords[0]:xCoords[1]] # Transform segmentation segOverlap = cv2.resize(segOverlap, (ROI_mask.shape[1], ROI_mask.shape[0]), interpolation=cv2.INTER_AREA) # Calculate sum of product of the ROI mask and segment overlap segOverlap = torch.from_numpy(segOverlap).float().to(device) avgVariable = torch.sum(ROI_mask * segOverlap) # Store this sum if str(a) not in contentAverage: contentAverage[str(a)] = [] contentAverage[str(a)].append((avgVariable, segOverlap, ROI_mask)) self._overlappingROIs = np.unique(overlapsInds[:, 0]) # Choose which segment each ROI should be compared with losses = [] for j in range(len(self._overlappingROIs)): # For every ROI with overlap a = self._overlappingROIs[j] AL = list(contentAverage[str(a)]) AV = np.array([float(x[0].cpu()) for x in AL]) ind = AV.argmax() lossSize = self.lossFunction(AL[ind][2], AL[ind][1]) losses.append(lossSize) # Modify weights losses = torch.stack(losses) lossSize = torch.sum(losses) lossSize.backward() self.optimizer.step() self.optimizer.zero_grad() lossSize = lossSize.cpu().item() epochLoss += lossSize/Iterations if (i-1) % printUpdateEvery == 0: print("Iteration {} / {}, epoch {} / {}".format(i, Iterations, epoch, epochs)) print("Loss size: {}\n".format(lossSize / printUpdateEvery)) if tensorboard: absI = i + epoch * Iterations writer.add_scalar("Loss size", lossSize, absI) writer.add_histogram("Mean ROI 
pixel value", np.array(meanPixels), absI) meanPixels = [] if tensorboard: interestingness = np.random.random() # just choose a random one if interestingMeasure < interestingness: interestingImage = self.renderDebug(image.copy()) interestingMeasure = interestingness # Show visualization if visualize: image = self.renderDebug(image) plt.ion() plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) plt.draw() plt.pause(4) print("Finished epoch {} / {}. Loss size:".format(epoch, epochs, epochLoss)) if tensorboard: writer.add_scalar("epoch loss size", epochLoss, Iterations*epoch) if interestingImage is not None: interestingImage = cv2.cvtColor(interestingImage, cv2.COLOR_BGR2RGB) interestingImage = torch.from_numpy(interestingImage).permute(2, 0, 1) writer.add_image("interesting image", interestingImage, Iterations*epoch) self.saveModel(self.modelPath) self._training = False def _generateMasks(self, ROIs): self._ROI_masks, self._ROIs = self.maskGenerator.forward(ROIs) self._ROIs[self._ROIs != 0] = 1 self._ROI_bounds = np.zeros((len(ROIs), 4), dtype=np.int32) for i in range(len(ROIs)): self._ROI_bounds[i] = np.array(ROIs[i].bounds, dtype=np.int32) ROIs[i].A = torch.round(self._ROI_masks[i, 0]).cpu().numpy() def _getCocoImage(self, index): if self.cocoOnDisk: # Load image from disk cocoImage = self.coco.loadImgs(self.cocoImageIds[index])[0] image = cv2.imread(self.cocoPath + "/" + cocoImage["file_name"]) return cocoImage, image else: raise FileNotFoundError("COCO image cant be found on disk") @staticmethod def _overlappingMatrix(a, b): xo_high = np.minimum(a[:, 2], b[:, None, 2]) xo_low = np.maximum(a[:, 0], b[:, None, 0]) xo = xo_high - xo_low yo_high = np.minimum(a[:, 3], b[:, None, 3]) yo_low = np.maximum(a[:, 1], b[:, None, 1]) yo = yo_high - yo_low overlappingMask = np.logical_and((0 < xo), (0 < yo)) return overlappingMask, (xo_low, yo_low), (xo_low + xo, yo_low + yo) @staticmethod def _getTransformedROI(ROI, bounds, xCoords, yCoords): # ROI transformed overlap area 
ROI_xCoords = (xCoords -bounds[0]) / (bounds[2] - bounds[0]) ROI_xCoords = (ROI_xCoords * 56).astype(np.int32) ROI_xCoords[1] += ROI_xCoords[0] == ROI_xCoords[1] ROI_yCoords = (yCoords - bounds[1]) / (bounds[3] - bounds[1]) ROI_yCoords = (ROI_yCoords * 56).astype(np.int32) ROI_yCoords[1] += ROI_yCoords[0] == ROI_yCoords[1] ROI_mask = ROI[ROI_yCoords[0]:ROI_yCoords[1], ROI_xCoords[0]:ROI_xCoords[1]] return ROI_mask def renderDebug(self, image, alpha=0.55): # Normalize ROIs from (0, 1) to (0, 255) ROIsMaskNorm = self._ROI_masks * 255 # Render masks on image for i in range(len(self._ROI_masks)): mask = ROIsMaskNorm[i, 0].cpu().detach().to(torch.uint8).numpy() bnds = self._ROI_bounds[i] # Change colors of mask if 0 < alpha: mask = cv2.applyColorMap(mask, cv2.COLORMAP_SUMMER) else: alpha = -alpha mask = cv2.applyColorMap(mask, cv2.COLORMAP_PINK) # TODO: render contours instead? # Resize mask to bounds dims = (bnds[2] - bnds[0], bnds[3] - bnds[1]) mask = cv2.resize(mask, dims, interpolation=cv2.INTER_AREA) # Overlay image overlap = image[bnds[1]:bnds[3], bnds[0]:bnds[2]] mask = mask * alpha + overlap * (1.0 - alpha) image[bnds[1]:bnds[3], bnds[0]:bnds[2]] = mask return image