def smoother(y, interval=100):
    """Replace every element of ``y`` with a running average and return ``y``.

    The sequence is modified in place: after element ``k`` has been fed to a
    ``MovingAverage(interval)``, slot ``k`` is overwritten with whatever
    ``get_avg()`` reports at that moment.

    Args:
        y: Mutable, indexable sequence of numbers. It is overwritten.
        interval: Value forwarded to the ``MovingAverage`` constructor
            (presumably the window length -- confirm against MovingAverage).

    Returns:
        The same ``y`` object, now holding the smoothed values.
    """
    running = MovingAverage(interval)
    for pos, sample in enumerate(y):
        running.append(sample)
        y[pos] = running.get_avg()
    return y
def evaluate(net, dataset, max_num=-1, during_training=False, cocoapi=False, traditional_nms=False):
    """Run ``net`` over the first images of ``dataset`` and compute the mAP table.

    Relies on module-level names that are not defined in this function:
    ``cfg``, ``iou_thresholds``, ``cuda``, ``timer``, ``NMS``, ``prep_metrics``
    and ``calc_map``.

    Args:
        net: Callable network; invoked as ``net(batch)``.
        dataset: Object providing ``__len__``, ``pull_item(idx)`` and ``ids``.
        max_num: Upper bound on the number of images to process; a negative
            value means "use the whole dataset".
        during_training: When true, per-image timings are not recorded and the
            reported fps stays 0.
        cocoapi: Forwarded unchanged to ``prep_metrics``.
        traditional_nms: Forwarded unchanged to ``NMS``.

    Returns:
        The ``(table, box_row, mask_row)`` triple produced by ``calc_map``.
    """
    timings = MovingAverage()

    available = len(dataset)
    dataset_size = available if max_num < 0 else min(max_num, available)
    # dataset_size never exceeds len(dataset), so this is the same prefix the
    # full index list would be truncated to.
    dataset_indices = list(range(dataset_size))
    progress_bar = ProgressBar(40, dataset_size)

    def _empty_ap_grid():
        # One APDataObject per (iou threshold, class), indexed [iouIdx][classIdx].
        return [[APDataObject() for _ in cfg.dataset.class_names] for _ in iou_thresholds]

    # For each class and iou, stores tuples (score, isPositive).
    # Index ap_data[type][iouIdx][classIdx]
    ap_data = {'box': _empty_ap_grid(), 'mask': _empty_ap_grid()}
    make_json = Make_json()

    # ``done`` is the 1-based count of images handled so far.
    for done, image_idx in enumerate(dataset_indices, start=1):
        timer.reset()

        with timer.env('Data loading'):
            img, gt, gt_masks, h, w, num_crowd = dataset.pull_item(image_idx)
            batch = img.unsqueeze(0)
            if cuda:
                batch = batch.cuda()

        with timer.env('Network forward'):
            net_outs = net(batch)
            nms_outs = NMS(net_outs, traditional_nms)

        prep_metrics(ap_data, nms_outs, gt, gt_masks, h, w, num_crowd,
                     dataset.ids[image_idx], make_json, cocoapi)

        # The first two images include graph construction, which is really
        # initialisation, so they are left out of the FPS statistics.
        if done > 2 and not during_training:
            timings.add(timer.total_time())
            fps = 1 / timings.get_avg()
        else:
            fps = 0

        progress = done / dataset_size * 100
        progress_bar.set_val(done)
        print('\rProcessing: %s %d / %d (%.2f%%) %.2f fps ' % (repr(progress_bar), done, dataset_size, progress, fps), end='')

    # The loop never breaks, so this always runs once it finishes.
    table, box_row, mask_row = calc_map(ap_data)
    print(table)
    return table, box_row, mask_row
def savevideo(net: Yolact, in_path: str, out_path: str):
    """Render network predictions onto a video file and save the result.

    The network is only evaluated on every ``frame_freq``-th frame (starting
    with frame 0); the frames in between reuse a copy of the most recent
    predictions. Output is written as mp4v with the FPS and frame size
    reported by the input capture.

    Args:
        net: Network invoked as ``net(batch)`` on a transformed frame batch.
        in_path: Path of the video to read.
        out_path: Path of the video to write.

    Notes:
        * Ctrl+C stops processing early; the frames written so far are kept.
        * The capture and the writer are released on every exit path,
          including unexpected exceptions.
        * If the capture yields fewer frames than ``CAP_PROP_FRAME_COUNT``
          reported, processing stops at the first failed read instead of
          crashing on a ``None`` frame.
    """
    vid = cv2.VideoCapture(in_path)

    target_fps = round(vid.get(cv2.CAP_PROP_FPS))
    frame_width = round(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = round(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    num_frames = round(vid.get(cv2.CAP_PROP_FRAME_COUNT))

    out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"), target_fps, (frame_width, frame_height))

    # Run the network on every 10th frame only.
    frame_freq = 10
    transform = FastBaseTransform()
    frame_times = MovingAverage()
    progress_bar = ProgressBar(30, num_frames)
    preds = None

    try:
        for i in range(num_frames):
            timer.reset()
            with timer.env('Video'):
                grabbed, raw_frame = vid.read()
                if not grabbed or raw_frame is None:
                    # The reported frame count is only an estimate; stop when
                    # the stream actually runs dry.
                    break

                frame = torch.from_numpy(raw_frame).cuda().float()

                # i == 0 always satisfies this, so ``preds`` is populated
                # before its first use below.
                if i % frame_freq == 0:
                    batch = transform(frame.unsqueeze(0))
                    preds = net(batch)

                # Work on a copy so the cached predictions can be reused for
                # the following frames.
                current_preds = make_copy(preds)
                processed = prep_display(current_preds, frame, None, None, undo_transform=False, class_color=True)

                out.write(processed)

            # Skip the first two frames in the timing statistics.
            if i > 1:
                frame_times.add(timer.total_time())
                fps = 1 / frame_times.get_avg()
                progress = (i + 1) / num_frames * 100
                progress_bar.set_val(i + 1)

                print(
                    '\rProcessing Frames %s %6d / %6d (%5.2f%%) %5.2f fps '
                    % (repr(progress_bar), i + 1, num_frames, progress, fps),
                    end='')
    except KeyboardInterrupt:
        print('Stopping early.')
    finally:
        # Always flush and close the files, whatever ended the loop.
        vid.release()
        out.release()

    print()
def play_video(): nonlocal frame_buffer, running, video_fps, is_webcam, slide_num video_frame_times = MovingAverage(100) frame_time_stabilizer = frame_time_target last_time = None stabilizer_step = 0.0005 while running: frame_time_start = time.time() if not frame_buffer.empty(): next_time = time.time() if last_time is not None: video_frame_times.add(next_time - last_time) video_fps = 1 / video_frame_times.get_avg() cv2.imshow(path, frame_buffer.get()) last_time = next_time key_press = cv2.waitKey(1) & 0xff if key_press == 27: # Press Escape to close running = False elif key_press == ord('n'): if slide_num < len(bg_imgs) - 1: slide_num = slide_num + 1 elif key_press == ord('b'): if 0 < slide_num: slide_num = slide_num - 1 buffer_size = frame_buffer.qsize() if buffer_size < args.video_multiframe: frame_time_stabilizer += stabilizer_step elif buffer_size > args.video_multiframe: frame_time_stabilizer -= stabilizer_step if frame_time_stabilizer < 0: frame_time_stabilizer = 0 new_target = frame_time_stabilizer if is_webcam else max(frame_time_stabilizer, frame_time_target) next_frame_target = max(2 * new_target - video_frame_times.get_avg(), 0) target_time = frame_time_start + next_frame_target - 0.001 # Let's just subtract a millisecond to be safe # This gives more accurate timing than if sleeping the whole amount at once while time.time() < target_time: time.sleep(0.001)
def play_video(): nonlocal frame_buffer, running, video_fps, is_webcam video_frame_times = MovingAverage(100) frame_time_stabilizer = frame_time_target last_time = None stabilizer_step = 0.0005 while running: frame_time_start = time.time() if not frame_buffer.empty(): next_time = time.time() if last_time is not None: video_frame_times.add(next_time - last_time) video_fps = 1 / video_frame_times.get_avg() cv2.imshow(path, frame_buffer.get()) # print("how many masks ", frame_buffer.shape) last_time = next_time #self.image_pub.publish(self.bridge.cv2_to_imgmsg(frame_buffer.get(), "bgr8")) if cv2.waitKey(1) == 27: # Press Escape to close running = False buffer_size = frame_buffer.qsize() if buffer_size < args.video_multiframe: frame_time_stabilizer += stabilizer_step elif buffer_size > args.video_multiframe: frame_time_stabilizer -= stabilizer_step if frame_time_stabilizer < 0: frame_time_stabilizer = 0 new_target = frame_time_stabilizer if is_webcam else max( frame_time_stabilizer, frame_time_target) next_frame_target = max( 2 * new_target - video_frame_times.get_avg(), 0) target_time = frame_time_start + next_frame_target - 0.001 # Let's just subtract a millisecond to be safe # This gives more accurate timing than if sleeping the whole amount at once while time.time() < target_time: time.sleep(0.001)
def play_video(): nonlocal frame_buffer, running, video_fps video_frame_times = MovingAverage(100) frame_time_stabilizer = frame_time_target last_time = None stabilizer_step = 0.0005 while running: frame_time_start = time.time() if not frame_buffer.empty(): next_time = time.time() if last_time is not None: video_frame_times.add(next_time - last_time) video_fps = 1 / video_frame_times.get_avg() cv2.imshow(path, frame_buffer.get()) last_time = next_time if cv2.waitKey(1) == 27: # Press Escape to close running = False buffer_size = frame_buffer.qsize() if buffer_size < args.video_multiframe: frame_time_stabilizer += stabilizer_step elif buffer_size > args.video_multiframe: frame_time_stabilizer -= stabilizer_step if frame_time_stabilizer < 0: frame_time_stabilizer = 0 next_frame_target = max( 2 * max(frame_time_stabilizer, frame_time_target) - video_frame_times.get_avg(), 0) target_time = frame_time_start + next_frame_target - 0.001 # Let's just subtract a millisecond to be safe # This gives more accurate timing than if sleeping the whole amount at once while time.time() < target_time: time.sleep(0.001)
def evalvideo(net: Yolact, path: str, out_path: str = None):
    """Run the network on a video or webcam stream, then display or save it.

    Frames are read, transformed, evaluated and rendered by a small pipeline
    of jobs dispatched to a thread pool, while a separate ``play_video`` job
    consumes the rendered frames from ``frame_buffer``.

    Args:
        net: Network invoked as ``net(imgs)`` on a batch of transformed frames.
        path: Video file path, or a string of digits which is parsed as a
            webcam index.
        out_path: If ``None``, frames are shown in a cv2 window; otherwise
            they are written to this path as an mp4v video.

    Note:
        This function does not return normally: it ends by calling
        ``cleanup_and_exit()``, which calls ``exit()``. It also calls
        ``exit(-1)`` if the capture cannot be opened.
    """
    # If the path is a digit, parse it as a webcam index
    is_webcam = path.isdigit()

    # If the input image size is constant, this make things faster (hence why we can use it in a video setting).
    #cudnn.benchmark = True

    if is_webcam:
        vid = cv2.VideoCapture(int(path))
    else:
        vid = cv2.VideoCapture(path)

    if not vid.isOpened():
        print('Could not open video "%s"' % path)
        exit(-1)

    target_fps = round(vid.get(cv2.CAP_PROP_FPS))
    frame_width = round(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = round(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # A webcam has no end, so the "all frames displayed" stop condition in
    # play_video can never trigger for it.
    if is_webcam:
        num_frames = float('inf')
    else:
        num_frames = round(vid.get(cv2.CAP_PROP_FRAME_COUNT))

    transform = FastBaseTransform()
    frame_times = MovingAverage(100)
    fps = 0
    # NOTE(review): raises ZeroDivisionError if the capture reports an FPS
    # that rounds to 0 -- confirm this cannot happen for the supported inputs.
    frame_time_target = 1 / target_fps
    running = True
    fps_str = ''
    vid_done = False
    frames_displayed = 0

    if out_path is not None:
        out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*"mp4v"), target_fps, (frame_width, frame_height))

    def cleanup_and_exit():
        """Terminate the pool, release capture/writer/windows and exit."""
        print()
        pool.terminate()
        vid.release()
        if out_path is not None:
            out.release()
        cv2.destroyAllWindows()
        exit()

    def get_next_frame(vid):
        """Read up to ``args.video_multiframe`` frames; fewer once reads fail."""
        frames = []
        for idx in range(args.video_multiframe):
            frame = vid.read()[1]
            if frame is None:
                return frames
            frames.append(frame)
        return frames

    def transform_frame(frames):
        """Convert raw frames to float arrays and return (frames, transformed batch)."""
        with jt.no_grad():
            frames = [jt.array(frame).float() for frame in frames]
            return frames, transform(jt.stack(frames, 0))

    def eval_network(inp):
        """Evaluate the network on a batch and return (frames, predictions)."""
        with jt.no_grad():
            frames, imgs = inp
            num_extra = 0
            # Pad a short batch by repeating its first image so the network
            # always sees args.video_multiframe images ...
            while imgs.size(0) < args.video_multiframe:
                imgs = jt.contrib.concat([imgs, imgs[0].unsqueeze(0)], dim=0)
                num_extra += 1
            out = net(imgs)
            # ... and drop the predictions that belong to the padding.
            if num_extra > 0:
                out = out[:-num_extra]
            return frames, out

    def prep_frame(inp, fps_str):
        """Render one (frame, predictions) pair via ``prep_display``."""
        with jt.no_grad():
            frame, preds = inp
            return prep_display(preds, frame, None, None, undo_transform=False, class_color=True, fps_str=fps_str)

    frame_buffer = Queue()
    video_fps = 0

    # All this timing code to make sure that
    def play_video():
        """Consume ``frame_buffer``: show or write frames at a paced rate."""
        try:
            nonlocal frame_buffer, running, video_fps, is_webcam, num_frames, frames_displayed, vid_done

            video_frame_times = MovingAverage(100)
            frame_time_stabilizer = frame_time_target
            last_time = None
            stabilizer_step = 0.0005
            progress_bar = ProgressBar(30, num_frames)

            while running:
                frame_time_start = time.time()

                if not frame_buffer.empty():
                    next_time = time.time()
                    if last_time is not None:
                        video_frame_times.add(next_time - last_time)
                        video_fps = 1 / video_frame_times.get_avg()
                    if out_path is None:
                        cv2.imshow(path, frame_buffer.get())
                    else:
                        out.write(frame_buffer.get())
                    frames_displayed += 1
                    last_time = next_time

                    # Progress output is only printed when writing to a file.
                    if out_path is not None:
                        if video_frame_times.get_avg() == 0:
                            fps = 0
                        else:
                            fps = 1 / video_frame_times.get_avg()
                        progress = frames_displayed / num_frames * 100
                        progress_bar.set_val(frames_displayed)

                        print(
                            '\rProcessing Frames %s %6d / %6d (%5.2f%%) %5.2f fps '
                            % (repr(progress_bar), frames_displayed, num_frames, progress, fps),
                            end='')

                # This is split because you don't want savevideo to require cv2 display functionality (see #197)
                if out_path is None and cv2.waitKey(1) == 27:
                    # Press Escape to close
                    running = False
                # Stop once every frame of the input has been shown/written.
                if not (frames_displayed < num_frames):
                    running = False

                if not vid_done:
                    # Adapt the frame time so the buffer level stays near
                    # args.video_multiframe frames.
                    buffer_size = frame_buffer.qsize()
                    if buffer_size < args.video_multiframe:
                        frame_time_stabilizer += stabilizer_step
                    elif buffer_size > args.video_multiframe:
                        frame_time_stabilizer -= stabilizer_step
                        if frame_time_stabilizer < 0:
                            frame_time_stabilizer = 0

                    new_target = frame_time_stabilizer if is_webcam else max(
                        frame_time_stabilizer, frame_time_target)
                else:
                    # Input exhausted: just drain the buffer at native speed.
                    new_target = frame_time_target

                next_frame_target = max(
                    2 * new_target - video_frame_times.get_avg(), 0)
                target_time = frame_time_start + next_frame_target - 0.001  # Let's just subtract a millisecond to be safe

                if out_path is None or args.emulate_playback:
                    # This gives more accurate timing than if sleeping the whole amount at once
                    while time.time() < target_time:
                        time.sleep(0.001)
                else:
                    # Let's not starve the main thread, now
                    time.sleep(0.001)
        except:
            # See issue #197 for why this is necessary
            import traceback
            traceback.print_exc()

    # Pick frame i and its prediction out of a (frames, predictions) batch;
    # when a detection exists, the frame is moved to the detection's device.
    extract_frame = lambda x, i: (x[0][i] if x[1][i]['detection'] is None else x[0][i].to(x[1][i]['detection']['box'].device), [x[1][i]])

    # Prime the network on the first frame because I do some thread unsafe things otherwise
    print('Initializing model.. ', end='')
    first_batch = eval_network(transform_frame(get_next_frame(vid)))
    print('Done.')

    # For each frame the sequence of functions it needs to go through to be processed (in reversed order)
    sequence = [prep_frame, eval_network, transform_frame]
    pool = ThreadPool(processes=len(sequence) + args.video_multiframe + 2)
    pool.apply_async(play_video)
    # The primed first batch only needs the final stage (idx 0 = prep_frame).
    active_frames = [{
        'value': extract_frame(first_batch, i),
        'idx': 0
    } for i in range(len(first_batch[0]))]

    print()
    if out_path is None:
        print('Press Escape to close.')
    try:
        while vid.isOpened() and running:
            # Hard limit on frames in buffer so we don't run out of memory >.>
            while frame_buffer.qsize() > 100:
                time.sleep(0.001)

            start_time = time.time()

            # Start loading the next frames from the disk
            if not vid_done:
                next_frames = pool.apply_async(get_next_frame, args=(vid, ))
            else:
                next_frames = None

            if not (vid_done and len(active_frames) == 0):
                # For each frame in our active processing queue, dispatch a job
                # for that frame using the current function in the sequence
                for frame in active_frames:
                    _args = [frame['value']]
                    # Only prep_frame (idx 0) takes the fps string.
                    if frame['idx'] == 0:
                        _args.append(fps_str)
                    frame['value'] = pool.apply_async(sequence[frame['idx']], args=_args)

                # For each frame whose job was the last in the sequence (i.e. for all final outputs)
                for frame in active_frames:
                    if frame['idx'] == 0:
                        frame_buffer.put(frame['value'].get())

                # Remove the finished frames from the processing queue
                active_frames = [x for x in active_frames if x['idx'] > 0]

                # Finish evaluating every frame in the processing queue and advanced their position in the sequence
                for frame in list(reversed(active_frames)):
                    frame['value'] = frame['value'].get()
                    frame['idx'] -= 1

                    if frame['idx'] == 0:
                        # Split this up into individual threads for prep_frame since it doesn't support batch size
                        active_frames += [{
                            'value': extract_frame(frame['value'], i),
                            'idx': 0
                        } for i in range(1, len(frame['value'][0]))]
                        frame['value'] = extract_frame(frame['value'], 0)

                # Finish loading in the next frames and add them to the processing queue
                if next_frames is not None:
                    frames = next_frames.get()
                    if len(frames) == 0:
                        vid_done = True
                    else:
                        active_frames.append({
                            'value': frames,
                            'idx': len(sequence) - 1
                        })

                # Compute FPS
                frame_times.add(time.time() - start_time)
                fps = args.video_multiframe / frame_times.get_avg()
            else:
                fps = 0

            fps_str = 'Processing FPS: %.2f | Video Playback FPS: %.2f | Frames in Buffer: %d' % (
                fps, video_fps, frame_buffer.qsize())
            # When the FPS is not drawn onto the frames, report it on the console.
            if not args.display_fps:
                print('\r' + fps_str + ' ', end='')

    except KeyboardInterrupt:
        print('\nStopping..')

    cleanup_and_exit()
def train():
    """Train a YOLACT network using the module-level ``args`` and ``cfg``.

    Builds the training (and optionally validation) dataset, creates the
    network, optionally resumes from a checkpoint, and runs SGD until
    ``cfg.max_iter`` iterations have been performed. Weights are saved every
    ``args.save_interval`` iterations and once more when training ends.

    Behaviour worth knowing:
        * ``args.resume`` may be ``'interrupt'`` or ``'latest'``; both are
          resolved to a concrete path (or ``None``) before loading.
        * Entries of ``cfg.delayed_settings`` are applied once their iteration
          is reached, and every loss moving average is reset at that point.
        * The learning rate is warmed up linearly until
          ``cfg.lr_warmup_until`` and decayed by ``args.gamma`` at each entry
          of ``cfg.lr_steps``.
        * Ctrl+C saves an ``*_interrupt`` checkpoint and exits the process.
    """
    if not os.path.exists(args.save_folder):
        os.mkdir(args.save_folder)

    dataset = COCODetection(image_path=cfg.dataset.train_images,
                            info_file=cfg.dataset.train_info,
                            transform=SSDAugmentation(MEANS))

    if args.validation_epoch > 0:
        setup_eval()
        val_dataset = COCODetection(image_path=cfg.dataset.valid_images,
                                    info_file=cfg.dataset.valid_info,
                                    transform=BaseTransform(MEANS))

    # Parallel wraps the underlying module, but when saving and loading we don't want that
    yolact_net = Yolact()
    net = yolact_net
    net.train()

    # Both of these can set args.resume to None, so do them before the check
    if args.resume == 'interrupt':
        args.resume = SavePath.get_interrupt(args.save_folder)
    elif args.resume == 'latest':
        args.resume = SavePath.get_latest(args.save_folder, cfg.name)

    if args.resume is not None:
        print('Resuming training, loading {}...'.format(args.resume))
        yolact_net.load_weights(args.resume)

        if args.start_iter == -1:
            args.start_iter = SavePath.from_str(args.resume).iteration
    else:
        print('Initializing weights...')
        yolact_net.init_weights(backbone_path=args.save_folder + cfg.backbone.path)

    optimizer = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum,
                          weight_decay=args.decay)
    criterion = MultiBoxLoss(num_classes=cfg.num_classes,
                             pos_threshold=cfg.positive_iou_threshold,
                             neg_threshold=cfg.negative_iou_threshold,
                             negpos_ratio=3)

    if args.cuda:
        cudnn.benchmark = True
        net = nn.DataParallel(net).cuda()
        criterion = nn.DataParallel(criterion).cuda()

    iteration = max(args.start_iter, 0)
    last_time = time.time()

    epoch_size = len(dataset) // args.batch_size
    num_epochs = math.ceil(cfg.max_iter / epoch_size)

    # Which learning rate adjustment step are we on? lr' = lr * gamma ^ step_index
    step_index = 0

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True, collate_fn=detection_collate,
                                  pin_memory=True)

    save_path = lambda epoch, iteration: SavePath(
        cfg.name, epoch, iteration).get_path(root=args.save_folder)
    time_avg = MovingAverage()

    loss_types = ['B', 'C', 'M', 'P', 'D', 'E', 'S']  # Forms the print order
    loss_avgs = {k: MovingAverage(100) for k in loss_types}

    print('Begin training!')
    print()
    # try-except so you can use ctrl+c to save early and stop training
    try:
        for epoch in range(num_epochs):
            # Resume from start_iter
            if (epoch + 1) * epoch_size < iteration:
                continue

            for datum in data_loader:
                # Stop if we've reached an epoch if we're resuming from start_iter
                if iteration == (epoch + 1) * epoch_size:
                    break

                # Stop at the configured number of iterations even if mid-epoch
                if iteration == cfg.max_iter:
                    break

                # Change a config setting if we've reached the specified iteration
                changed = False
                for change in cfg.delayed_settings:
                    if iteration >= change[0]:
                        changed = True
                        cfg.replace(change[1])

                        # Reset the loss averages because things might have changed.
                        # loss_avgs is a dict, so iterate its MovingAverage
                        # values; iterating the dict itself yields the string
                        # keys, which have no reset().
                        for avg in loss_avgs.values():
                            avg.reset()

                # If a config setting was changed, remove it from the list so we don't keep checking
                if changed:
                    cfg.delayed_settings = [
                        x for x in cfg.delayed_settings if x[0] > iteration
                    ]

                # Warm up by linearly interpolating the learning rate from some smaller value
                if cfg.lr_warmup_until > 0 and iteration <= cfg.lr_warmup_until:
                    set_lr(optimizer,
                           (args.lr - cfg.lr_warmup_init) * (iteration / cfg.lr_warmup_until) + cfg.lr_warmup_init)

                # Adjust the learning rate at the given iterations, but also if we resume from past that iteration
                while step_index < len(cfg.lr_steps) and iteration >= cfg.lr_steps[step_index]:
                    step_index += 1
                    set_lr(optimizer, args.lr * (args.gamma**step_index))

                # Load training data
                # Note, for training on multiple gpus this will use the custom replicate and gather I wrote up there
                images, targets, masks, num_crowds = prepare_data(datum)

                # Forward Pass
                out = net(images)

                # Compute Loss
                optimizer.zero_grad()

                wrapper = ScatterWrapper(targets, masks, num_crowds)
                losses = criterion(out, wrapper, wrapper.make_mask())

                losses = {k: v.mean() for k, v in losses.items()}  # Mean here because Dataparallel
                loss = sum([losses[k] for k in losses])

                # Backprop
                loss.backward()  # Do this to free up vram even if loss is not finite
                if torch.isfinite(loss).item():
                    optimizer.step()

                # Add the loss to the moving average for bookkeeping
                for k in losses:
                    loss_avgs[k].add(losses[k].item())

                cur_time = time.time()
                elapsed = cur_time - last_time
                last_time = cur_time

                # Exclude graph setup from the timing information
                if iteration != args.start_iter:
                    time_avg.add(elapsed)

                if iteration % 10 == 0:
                    eta_str = str(
                        datetime.timedelta(seconds=(cfg.max_iter - iteration) * time_avg.get_avg())).split('.')[0]

                    total = sum([loss_avgs[k].get_avg() for k in losses])
                    # Flattened [name, value, name, value, ...] in loss_types order.
                    loss_labels = sum([[k, loss_avgs[k].get_avg()]
                                       for k in loss_types if k in losses], [])

                    print(('[%3d] %7d ||' + (' %s: %.3f |' * len(losses)) + ' T: %.3f || ETA: %s || timer: %.3f')
                          % tuple([epoch, iteration] + loss_labels + [total, eta_str, elapsed]), flush=True)

                iteration += 1

                if iteration % args.save_interval == 0 and iteration != args.start_iter:
                    # Look up the previous checkpoint *before* writing the new
                    # one, so it can be deleted afterwards.
                    if args.keep_latest:
                        latest = SavePath.get_latest(args.save_folder, cfg.name)

                    print('Saving state, iter:', iteration)
                    yolact_net.save_weights(save_path(epoch, iteration))

                    if args.keep_latest and latest is not None:
                        if args.keep_latest_interval <= 0 or iteration % args.keep_latest_interval != args.save_interval:
                            print('Deleting old save...')
                            os.remove(latest)

            # This is done per epoch
            if args.validation_epoch > 0:
                if epoch % args.validation_epoch == 0 and epoch > 0:
                    compute_validation_map(yolact_net, val_dataset)
    except KeyboardInterrupt:
        print('Stopping early. Saving network...')

        # Delete previous copy of the interrupted network so we don't spam the weights folder
        SavePath.remove_interrupt(args.save_folder)

        yolact_net.save_weights(
            save_path(epoch, repr(iteration) + '_interrupt'))
        exit()

    yolact_net.save_weights(save_path(epoch, iteration))
def play_video(): try: nonlocal frame_buffer, running, video_fps, is_webcam, num_frames, frames_displayed, vid_done video_frame_times = MovingAverage(100) frame_time_stabilizer = frame_time_target last_time = None stabilizer_step = 0.0005 progress_bar = ProgressBar(30, num_frames) while running: frame_time_start = time.time() if not frame_buffer.empty(): next_time = time.time() if last_time is not None: video_frame_times.add(next_time - last_time) video_fps = 1 / video_frame_times.get_avg() if out_path is None or os.path.isdir(out_path): cv2.imshow(path, frame_buffer.get()[0]) else: out.write(frame_buffer.get()[0]) frames_displayed += 1 last_time = next_time if out_path is not None and not os.path.isdir(out_path): if video_frame_times.get_avg() == 0: fps = 0 else: fps = 1 / video_frame_times.get_avg() progress = frames_displayed / num_frames * 100 progress_bar.set_val(frames_displayed) print( '\rProcessing Frames %s %6d / %6d (%5.2f%%) %5.2f fps ' % (repr(progress_bar), frames_displayed, num_frames, progress, fps), end='') if (out_path is None or os.path.isdir(out_path)) and cv2.waitKey(1) == 27: running = False if not vid_done: buffer_size = frame_buffer.qsize() if buffer_size < args.video_multiframe: frame_time_stabilizer += stabilizer_step elif buffer_size > args.video_multiframe: frame_time_stabilizer -= stabilizer_step if frame_time_stabilizer < 0: frame_time_stabilizer = 0 new_target = frame_time_stabilizer if is_webcam else max( frame_time_stabilizer, frame_time_target) else: new_target = frame_time_target next_frame_target = max( 2 * new_target - video_frame_times.get_avg(), 0) target_time = frame_time_start + next_frame_target - 0.001 if out_path is None or os.path.isdir( out_path) or args.emulate_playback: while time.time() < target_time: time.sleep(0.001) else: time.sleep(0.001) except: import traceback traceback.print_exc()
def get_default_log_avgs():
    """Return a new dict mapping each entry of ``loss_types`` to a fresh ``MovingAverage(100)``."""
    averages = {}
    for loss_name in loss_types:
        averages[loss_name] = MovingAverage(100)
    return averages
def evaluate(net: Yolact, dataset, train_mode=False):
    """Run ``net`` over every image of ``dataset`` in the mode chosen by ``args``.

    Modes (read from the module-level ``args``):
        * ``args.display``: each prediction is passed to ``prep_display``.
        * ``args.benchmark``: each prediction is passed to ``prep_benchmark``.
        * otherwise: metrics are accumulated with ``prep_metrics``.

    Args:
        net: Network invoked as ``net(batch)``.
        dataset: Object providing ``__len__``, ``ids`` and ``pull_item(idx)``.
        train_mode: When true, the AP data is not pickled to
            ``args.ap_data_file`` before the mAP is computed.

    Returns:
        ``calc_map(ap_data)`` in metrics mode when ``args.output_coco_json`` is
        unset; ``None`` in every other case.
    """
    # net.detection.use_fast_nms = args.fast_nms
    # net.detection.use_cross_class_nms = args.cross_class_nms

    # Created as in the original, although the FPS/progress reporting that
    # used them is disabled in this variant.
    frame_times = MovingAverage()
    dataset_size = len(dataset)  # args.max_images is no longer honoured here
    progress_bar = ProgressBar(30, dataset_size)

    print()

    iou_thresholds = [pct / 100 for pct in range(50, 100, 5)]

    if not (args.display or args.benchmark):  # no display: compute scores directly
        # For each class and iou, stores tuples (score, isPositive)
        # Index ap_data[type][iouIdx][classIdx]
        ap_data = {
            kind: [[APDataObject() for _ in COCO_CLASSES] for _ in iou_thresholds]
            for kind in ('box', 'mask')
        }
        detections = Detections()

    dataset_indices = list(range(len(dataset)))

    if args.shuffle:
        random.shuffle(dataset_indices)
    elif not args.no_sort:
        # Do a deterministic shuffle based on the image ids.
        #
        # On python 3.5 dictionary key order is *random*, while in 3.6 it is
        # the order of insertion, so on 3.6 the images come in annotation-file
        # order, and the first images there happen to be the hardest. Sorting
        # by a hard-coded hash of the image ids gives the same order no matter
        # the python version or how pycocotools handles the data.
        hashed = [badhash(image_id) for image_id in dataset.ids]
        dataset_indices.sort(key=hashed.__getitem__)
    # In the remaining case the indices are left untouched.

    # With args.max_images gone this slice is no longer needed; it is kept
    # just in case.
    dataset_indices = dataset_indices[:dataset_size]

    # Main eval loop
    for image_idx in dataset_indices:
        # Load Data
        img, gt, gt_masks, h, w, num_crowd = dataset.pull_item(image_idx)

        batch = torch.autograd.Variable(img.unsqueeze(0))
        if args.cuda:
            batch = batch.cuda()

        # Feed the batch to the network
        preds = net(batch)

        # Perform the meat of the operation here depending on our mode.
        if args.display:
            prep_display(preds, img, h, w)  # display mode is not used by us
        elif args.benchmark:
            prep_benchmark(preds, h, w)  # benchmark mode is not used by us either
        else:
            prep_metrics(ap_data, preds, img, gt, gt_masks, h, w, num_crowd,
                         dataset.ids[image_idx], detections)

        # The per-image FPS measurement and progress-bar output that used to
        # follow here are disabled in this variant.

    if args.display or args.benchmark:
        return None

    print()
    if args.output_coco_json:
        print('Dumping detections...')
        if args.output_web_json:
            detections.dump_web()
        else:
            detections.dump()
        return None

    if not train_mode:
        print('Saving data...')
        with open(args.ap_data_file, 'wb') as f:
            pickle.dump(ap_data, f)

    return calc_map(ap_data)
def train(rank, args):
    """Run distributed training on the process with the given ``rank``.

    Sets up logging and the NCCL process group, builds the dataset(s)
    selected by ``cfg.dataset.name``, wraps the network in
    ``DistributedDataParallel`` when ``args.cuda`` is set, and runs the
    optimisation loop until ``cfg.max_iter`` iterations. Rank 0 writes
    checkpoints and visualisations, and runs validation.

    Args:
        rank: Index of this process; used as CUDA device index and as the
            distributed rank.
        args: Parsed command-line namespace (learning rate, batch size,
            folders, resume options, ...). ``args.resume`` and
            ``args.start_iter`` may be rewritten here.

    Returns:
        None. On ``KeyboardInterrupt`` the function returns early after rank 0
        has optionally saved an ``_interrupt`` checkpoint.
    """
    if args.num_gpus > 1:
        multi_gpu_rescale(args)
    if rank == 0:
        if not os.path.exists(args.save_folder):
            os.mkdir(args.save_folder)

    # set up logger
    setup_logger(output=os.path.join(args.log_folder, cfg.name), distributed_rank=rank)
    logger = logging.getLogger("yolact.train")

    w = SummaryHelper(distributed_rank=rank, log_dir=os.path.join(args.log_folder, cfg.name))
    w.add_text("argv", " ".join(sys.argv))
    logger.info("Args: {}".format(" ".join(sys.argv)))
    import git
    with git.Repo(search_parent_directories=True) as repo:
        w.add_text("git_hash", repo.head.object.hexsha)
        logger.info("git hash: {}".format(repo.head.object.hexsha))

    try:
        logger.info("Initializing torch.distributed backend...")
        dist.init_process_group(backend='nccl',
                                init_method=args.dist_url,
                                world_size=args.num_gpus,
                                rank=rank)
    except Exception:
        logger.error("Process group URL: {}".format(args.dist_url))
        # Bare raise keeps the original traceback intact.
        raise

    dist.barrier()

    if torch.cuda.device_count() > 1:
        logger.info('Multiple GPUs detected! Turning off JIT.')

    collate_fn = detection_collate
    if cfg.dataset.name == 'YouTube VIS':
        dataset = YoutubeVIS(image_path=cfg.dataset.train_images,
                             info_file=cfg.dataset.train_info,
                             configs=cfg.dataset,
                             transform=SSDAugmentationVideo(MEANS))

        if cfg.dataset.joint == 'coco':
            joint_dataset = COCODetection(
                image_path=cfg.joint_dataset.train_images,
                info_file=cfg.joint_dataset.train_info,
                transform=SSDAugmentation(MEANS))
            joint_collate_fn = detection_collate

        if args.validation_epoch > 0:
            setup_eval()
            val_dataset = YoutubeVIS(image_path=cfg.dataset.valid_images,
                                     info_file=cfg.dataset.valid_info,
                                     configs=cfg.dataset,
                                     transform=BaseTransformVideo(MEANS))
        collate_fn = collate_fn_youtube_vis

    elif cfg.dataset.name == 'FlyingChairs':
        dataset = FlyingChairs(image_path=cfg.dataset.trainval_images,
                               info_file=cfg.dataset.trainval_info)
        collate_fn = collate_fn_flying_chairs

    else:
        dataset = COCODetection(image_path=cfg.dataset.train_images,
                                info_file=cfg.dataset.train_info,
                                transform=SSDAugmentation(MEANS))

        if args.validation_epoch > 0:
            setup_eval()
            val_dataset = COCODetection(image_path=cfg.dataset.valid_images,
                                        info_file=cfg.dataset.valid_info,
                                        transform=BaseTransform(MEANS))

    # Set cuda device early to avoid duplicate model in master GPU
    if args.cuda:
        torch.cuda.set_device(rank)

    # Parallel wraps the underlying module, but when saving and loading we don't want that
    yolact_net = Yolact()
    net = yolact_net
    net.train()

    # I don't use the timer during training (I use a different timing method).
    # Apparently there's a race condition with multiple GPUs.

    # use timer for experiments
    timer.disable_all()

    # Both of these can set args.resume to None, so do them before the check
    if args.resume == 'interrupt':
        args.resume = SavePath.get_interrupt(args.save_folder)
    elif args.resume == 'latest':
        args.resume = SavePath.get_latest(args.save_folder, cfg.name)

    if args.resume is not None:
        logger.info('Resuming training, loading {}...'.format(args.resume))
        yolact_net.load_weights(args.resume, args=args)

        if args.start_iter == -1:
            args.start_iter = SavePath.from_str(args.resume).iteration
    else:
        logger.info('Initializing weights...')
        yolact_net.init_weights(backbone_path=args.save_folder + cfg.backbone.path)

    if cfg.flow.train_flow:
        criterion = OpticalFlowLoss()
    else:
        criterion = MultiBoxLoss(num_classes=cfg.num_classes,
                                 pos_threshold=cfg.positive_iou_threshold,
                                 neg_threshold=cfg.negative_iou_threshold,
                                 negpos_ratio=3)

    if args.cuda:
        cudnn.benchmark = True
        net.cuda(rank)
        criterion.cuda(rank)
        net = nn.parallel.DistributedDataParallel(net,
                                                  device_ids=[rank],
                                                  output_device=rank,
                                                  broadcast_buffers=False,
                                                  find_unused_parameters=True)
        # net = nn.DataParallel(net).cuda()
        # criterion = nn.DataParallel(criterion).cuda()

    optimizer = optim.SGD(filter(lambda x: x.requires_grad, net.parameters()),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.decay)

    iteration = max(args.start_iter, 0)
    w.set_step(iteration)
    last_time = time.time()

    epoch_size = len(dataset) // args.batch_size // args.num_gpus
    num_epochs = math.ceil(cfg.max_iter / epoch_size)

    # Which learning rate adjustment step are we on? lr' = lr * gamma ^ step_index
    step_index = 0

    from data.sampler_utils import InfiniteSampler, build_batch_data_sampler

    infinite_sampler = InfiniteSampler(dataset,
                                       seed=args.random_seed,
                                       num_replicas=args.num_gpus,
                                       rank=rank,
                                       shuffle=True)
    train_sampler = build_batch_data_sampler(infinite_sampler,
                                             images_per_batch=args.batch_size)

    data_loader = data.DataLoader(
        dataset,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
        multiprocessing_context="fork" if args.num_workers > 1 else None,
        batch_sampler=train_sampler)
    data_loader_iter = iter(data_loader)

    if cfg.dataset.joint:
        joint_infinite_sampler = InfiniteSampler(joint_dataset,
                                                 seed=args.random_seed,
                                                 num_replicas=args.num_gpus,
                                                 rank=rank,
                                                 shuffle=True)
        joint_train_sampler = build_batch_data_sampler(
            joint_infinite_sampler, images_per_batch=args.batch_size)
        joint_data_loader = data.DataLoader(
            joint_dataset,
            num_workers=args.num_workers,
            collate_fn=joint_collate_fn,
            multiprocessing_context="fork" if args.num_workers > 1 else None,
            batch_sampler=joint_train_sampler)
        joint_data_loader_iter = iter(joint_data_loader)

    dist.barrier()

    def save_path(epoch, iteration):
        # Checkpoint path for the given epoch / iteration under args.save_folder.
        return SavePath(cfg.name, epoch, iteration).get_path(root=args.save_folder)

    time_avg = MovingAverage()
    data_time_avg = MovingAverage(10)

    global loss_types  # Forms the print order
    loss_avgs = {k: MovingAverage(100) for k in loss_types}

    def backward_and_log(prefix, net_outs, targets, masks, num_crowds, extra_loss=None):
        # Compute the detection loss (plus optional extra terms), take one
        # optimizer step when the loss is finite, and log every loss term
        # under '<prefix>/<key>'. Returns the dict of loss tensors.
        optimizer.zero_grad()

        out = net_outs["pred_outs"]
        wrapper = ScatterWrapper(targets, masks, num_crowds)
        losses = criterion(out, wrapper, wrapper.make_mask())

        losses = {k: v.mean() for k, v in losses.items()}  # Mean here because Dataparallel

        if extra_loss is not None:
            assert isinstance(extra_loss, dict)
            losses.update(extra_loss)

        loss = sum([losses[k] for k in losses])

        # Backprop
        loss.backward()  # Do this to free up vram even if loss is not finite
        if torch.isfinite(loss).item():
            optimizer.step()

        # Add the loss to the moving average for bookkeeping
        for k in losses:
            loss_avgs[k].add(losses[k].item())
            w.add_scalar('{prefix}/{key}'.format(prefix=prefix, key=k), losses[k].item())

        return losses

    logger.info('Begin training!')
    # try-except so you can use ctrl+c to save early and stop training
    try:
        for epoch in range(num_epochs):
            # Resume from start_iter
            if (epoch + 1) * epoch_size < iteration:
                continue

            while True:
                data_start_time = time.perf_counter()
                datum = next(data_loader_iter)
                dist.barrier()
                data_end_time = time.perf_counter()
                data_time = data_end_time - data_start_time
                if iteration != args.start_iter:
                    data_time_avg.add(data_time)

                # Stop if we've reached an epoch if we're resuming from start_iter
                if iteration == (epoch + 1) * epoch_size:
                    break

                # Stop at the configured number of iterations even if mid-epoch
                if iteration == cfg.max_iter:
                    break

                # Change a config setting if we've reached the specified iteration
                changed = False
                for change in cfg.delayed_settings:
                    if iteration >= change[0]:
                        changed = True
                        cfg.replace(change[1])

                        # Reset the loss averages because things might have changed.
                        # loss_avgs is a dict, so iterate over the MovingAverage
                        # values (iterating the dict itself yields the string keys).
                        for avg in loss_avgs.values():
                            avg.reset()

                # If a config setting was changed, remove it from the list so we don't keep checking
                if changed:
                    cfg.delayed_settings = [
                        x for x in cfg.delayed_settings if x[0] > iteration
                    ]

                # Warm up by linearly interpolating the learning rate from some smaller value
                if (cfg.lr_warmup_until > 0 and iteration <= cfg.lr_warmup_until
                        and cfg.lr_warmup_init < args.lr):
                    set_lr(optimizer,
                           (args.lr - cfg.lr_warmup_init) *
                           (iteration / cfg.lr_warmup_until) + cfg.lr_warmup_init)
                elif cfg.lr_schedule == 'cosine':
                    set_lr(optimizer,
                           args.lr * ((math.cos(math.pi * iteration / cfg.max_iter) + 1.) * .5))

                # Adjust the learning rate at the given iterations, but also if we resume from past that iteration
                while (cfg.lr_schedule == 'step' and step_index < len(cfg.lr_steps)
                       and iteration >= cfg.lr_steps[step_index]):
                    step_index += 1
                    set_lr(optimizer, args.lr * (args.gamma**step_index))

                # Module-level value read for logging below; presumably kept
                # current by set_lr - confirm.
                global lr
                w.add_scalar('meta/lr', lr)

                if cfg.dataset.name == "FlyingChairs":
                    imgs_1, imgs_2, flows = prepare_flow_data(datum)
                    net_outs = net(None, extras=(imgs_1, imgs_2))

                    # Compute Loss
                    optimizer.zero_grad()

                    losses = criterion(net_outs, flows)

                    losses = {k: v.mean() for k, v in losses.items()}  # Mean here because Dataparallel
                    loss = sum([losses[k] for k in losses])

                    # Backprop
                    loss.backward()  # Do this to free up vram even if loss is not finite
                    if torch.isfinite(loss).item():
                        optimizer.step()

                    # Add the loss to the moving average for bookkeeping
                    for k in losses:
                        loss_avgs[k].add(losses[k].item())
                        w.add_scalar('loss/%s' % k, losses[k].item())

                elif cfg.dataset.joint or not cfg.dataset.is_video:
                    if cfg.dataset.joint:
                        joint_datum = next(joint_data_loader_iter)
                        dist.barrier()
                        # Load training data
                        # Note, for training on multiple gpus this will use the custom replicate and gather I wrote up there
                        images, targets, masks, num_crowds = prepare_data(joint_datum)
                    else:
                        images, targets, masks, num_crowds = prepare_data(datum)

                    extras = {
                        "backbone": "full",
                        "interrupt": False,
                        "moving_statistics": {
                            "aligned_feats": []
                        }
                    }
                    net_outs = net(images, extras=extras)
                    out = net_outs["pred_outs"]

                    # Compute Loss
                    optimizer.zero_grad()

                    wrapper = ScatterWrapper(targets, masks, num_crowds)
                    losses = criterion(out, wrapper, wrapper.make_mask())

                    losses = {k: v.mean() for k, v in losses.items()}  # Mean here because Dataparallel
                    loss = sum([losses[k] for k in losses])

                    # Backprop
                    loss.backward()  # Do this to free up vram even if loss is not finite
                    if torch.isfinite(loss).item():
                        optimizer.step()

                    # Add the loss to the moving average for bookkeeping
                    for k in losses:
                        loss_avgs[k].add(losses[k].item())
                        w.add_scalar('joint/%s' % k, losses[k].item())

                # Forward Pass
                if cfg.dataset.is_video:
                    # reference frames: every frame but the first, in reverse order
                    references = []
                    moving_statistics = {"aligned_feats": [], "conf_hist": []}
                    for frame in datum[:0:-1]:
                        images, annots = frame

                        extras = {
                            "backbone": "full",
                            "interrupt": True,
                            "keep_statistics": True,
                            "moving_statistics": moving_statistics
                        }

                        with torch.no_grad():
                            net_outs = net(images, extras=extras)

                        moving_statistics["feats"] = net_outs["feats"]
                        moving_statistics["lateral"] = net_outs["lateral"]

                        # Drop everything except the phase outputs.
                        keys_to_save = ("outs_phase_1", "outs_phase_2")
                        for key in set(net_outs.keys()) - set(keys_to_save):
                            del net_outs[key]
                        references.append(net_outs)

                    # key frame with annotation, but not compute full backbone
                    frame = datum[0]
                    images, annots = frame
                    frame = (
                        images,
                        annots,
                    )
                    images, targets, masks, num_crowds = prepare_data(frame)

                    extras = {
                        "backbone": "full",
                        "interrupt": not cfg.flow.base_backward,
                        "moving_statistics": moving_statistics
                    }
                    gt_net_outs = net(images, extras=extras)
                    if cfg.flow.base_backward:
                        losses = backward_and_log("compute", gt_net_outs, targets,
                                                  masks, num_crowds)

                    keys_to_save = ("outs_phase_1", "outs_phase_2")
                    for key in set(gt_net_outs.keys()) - set(keys_to_save):
                        del gt_net_outs[key]

                    # now do the warp
                    if len(references) > 0:
                        extras = {
                            "backbone": "partial",
                            "moving_statistics": moving_statistics
                        }

                        net_outs = net(images, extras=extras)
                        extra_loss = yolact_net.extra_loss(net_outs, gt_net_outs)

                        losses = backward_and_log("warp",
                                                  net_outs,
                                                  targets,
                                                  masks,
                                                  num_crowds,
                                                  extra_loss=extra_loss)

                cur_time = time.time()
                elapsed = cur_time - last_time
                last_time = cur_time

                w.add_scalar('meta/data_time', data_time)
                w.add_scalar('meta/iter_time', elapsed)

                # Exclude graph setup from the timing information
                if iteration != args.start_iter:
                    time_avg.add(elapsed)

                if iteration % 10 == 0:
                    eta_str = str(
                        datetime.timedelta(seconds=(cfg.max_iter - iteration) *
                                           time_avg.get_avg())).split('.')[0]
                    if torch.cuda.is_available():
                        max_mem_mb = torch.cuda.max_memory_allocated() / 1024.0 / 1024.0
                        # torch.cuda.reset_max_memory_allocated()
                        memory_str = "max_mem: {:.0f}M".format(max_mem_mb)
                    else:
                        # Formatting None with '{:.0f}' would raise TypeError.
                        memory_str = "max_mem: n/a"

                    logger.info(
                        "eta: {eta} epoch: {epoch} iter: {iter} "
                        "{losses} {loss_total} "
                        "time: {time} data_time: {data_time} lr: {lr} {memory}".format(
                            eta=eta_str,
                            epoch=epoch,
                            iter=iteration,
                            losses=" ".join([
                                "{}: {:.3f}".format(k, loss_avgs[k].get_avg())
                                for k in losses
                            ]),
                            loss_total="T: {:.3f}".format(
                                sum([loss_avgs[k].get_avg() for k in losses])),
                            data_time="{:.3f}".format(data_time_avg.get_avg()),
                            time="{:.3f}".format(elapsed),
                            lr="{:.6f}".format(lr),
                            memory=memory_str))

                if rank == 0 and iteration % 100 == 0:
                    if cfg.flow.train_flow:
                        import flowiz as fz
                        from layers.warp_utils import deform_op
                        tgt_size = (64, 64)
                        flow_size = flows.size()[2:]
                        vis_data = []
                        for pred_flow in net_outs:
                            vis_data.append(pred_flow)

                        deform_gt = deform_op(imgs_2, flows)
                        flows_pred = [
                            F.interpolate(x,
                                          size=flow_size,
                                          mode='bilinear',
                                          align_corners=False) for x in net_outs
                        ]
                        deform_preds = [deform_op(imgs_2, x) for x in flows_pred]

                        vis_data.append(F.interpolate(flows, size=tgt_size, mode='area'))

                        vis_data = [
                            F.interpolate(flow[:1], size=tgt_size) for flow in vis_data
                        ]
                        vis_data = [
                            fz.convert_from_flow(
                                flow[0].data.cpu().numpy().transpose(1, 2, 0)).transpose(
                                    2, 0, 1).astype('float32') / 255
                            for flow in vis_data
                        ]

                        def convert_image(image):
                            # Undo the input normalisation (STD / MEANS) on the
                            # first image of the batch for TensorBoard display.
                            image = F.interpolate(image, size=tgt_size, mode='area')
                            image = image[0]
                            image = image.data.cpu().numpy()
                            image = image[::-1]
                            image = image.transpose(1, 2, 0)
                            image = image * np.array(STD) + np.array(MEANS)
                            image = image.transpose(2, 0, 1)
                            image = image / 255
                            image = np.clip(image, -1, 1)
                            image = image[::-1]
                            return image

                        vis_data.append(convert_image(imgs_1))
                        vis_data.append(convert_image(imgs_2))
                        vis_data.append(convert_image(deform_gt))
                        vis_data.extend([convert_image(x) for x in deform_preds])

                        vis_data_stack = np.stack(vis_data, axis=0)
                        w.add_images("preds_flow", vis_data_stack)

                    elif cfg.flow.warp_mode == "flow":
                        import flowiz as fz
                        tgt_size = (64, 64)
                        vis_data = []
                        for pred_flow, _, _ in net_outs["preds_flow"]:
                            vis_data.append(pred_flow)

                        vis_data = [
                            F.interpolate(flow[:1], size=tgt_size) for flow in vis_data
                        ]
                        vis_data = [
                            fz.convert_from_flow(
                                flow[0].data.cpu().numpy().transpose(1, 2, 0)).transpose(
                                    2, 0, 1).astype('float32') / 255
                            for flow in vis_data
                        ]

                        input_image = F.interpolate(images, size=tgt_size, mode='area')
                        input_image = input_image[0]
                        input_image = input_image.data.cpu().numpy()
                        input_image = input_image.transpose(1, 2, 0)
                        input_image = input_image * np.array(STD[::-1]) + np.array(MEANS[::-1])
                        input_image = input_image.transpose(2, 0, 1)
                        input_image = input_image / 255
                        input_image = np.clip(input_image, -1, 1)
                        vis_data.append(input_image)

                        vis_data_stack = np.stack(vis_data, axis=0)
                        w.add_images("preds_flow", vis_data_stack)

                iteration += 1
                w.set_step(iteration)

                if (rank == 0 and iteration % args.save_interval == 0
                        and iteration != args.start_iter):
                    if args.keep_latest:
                        latest = SavePath.get_latest(args.save_folder, cfg.name)

                    logger.info('Saving state, iter: {}'.format(iteration))
                    yolact_net.save_weights(save_path(epoch, iteration))

                    # `latest` is only bound when args.keep_latest is set; the
                    # short-circuit below relies on that.
                    if args.keep_latest and latest is not None:
                        if (args.keep_latest_interval <= 0
                                or iteration % args.keep_latest_interval != args.save_interval):
                            logger.info('Deleting old save...')
                            os.remove(latest)

            # This is done per epoch
            if args.validation_epoch > 0:
                if epoch % args.validation_epoch == 0 and epoch > 0:
                    if rank == 0:
                        compute_validation_map(yolact_net, val_dataset)
                    # Every rank waits here while rank 0 validates.
                    dist.barrier()

    except KeyboardInterrupt:
        if args.interrupt_no_save:
            logger.info('No save on interrupt, just exiting...')
        elif rank == 0:
            print('Stopping early. Saving network...')
            # Delete previous copy of the interrupted network so we don't spam the weights folder
            SavePath.remove_interrupt(args.save_folder)

            yolact_net.save_weights(save_path(epoch, repr(iteration) + '_interrupt'))
        return

    if rank == 0:
        yolact_net.save_weights(save_path(epoch, iteration))
def evaluate(net: STMask, dataset):
    """Run ``net`` over ``dataset`` in batches and display, benchmark or score it.

    Behaviour is driven by the module-level ``args`` / ``cfg``:

    * ``args.display``: render predictions with ``prep_display`` and save
      PNG files (per instance when ``cfg.display_mask_single`` is set).
    * otherwise: post-process each item with ``postprocess_ytbvis`` and
      collect the results; when ``args.output_json`` is set they are dumped
      to ``args.mask_det_file`` and, if ``cfg.use_valid_sub`` or
      ``cfg.use_train_sub`` is set, evaluated with ``calc_metrics``.
    * ``args.benchmark``: print timing statistics at the end.

    Args:
        net: The STMask network; its ``detect.use_fast_nms`` is set from
            ``args.fast_nms``.
        dataset: Map-style dataset consumed through a ``DataLoader`` with
            ``detection_collate``.

    Returns:
        The value of ``calc_metrics(...)`` when metrics are computed,
        otherwise ``None``. ``KeyboardInterrupt`` is caught and reported.
    """
    net.detect.use_fast_nms = args.fast_nms
    cfg.mask_proto_debug = args.mask_proto_debug

    frame_times = MovingAverage()
    # NOTE(review): with args.max_images >= 0 this is an image count, yet it is
    # compared against the batch index below and the loop is never truncated
    # to it - confirm the intended semantics.
    dataset_size = math.ceil(len(dataset) / args.batch_size) if args.max_images < 0 else min(
        args.max_images, len(dataset))
    progress_bar = ProgressBar(30, dataset_size)

    print()
    data_loader = data.DataLoader(dataset,
                                  args.batch_size,
                                  shuffle=False,
                                  collate_fn=detection_collate,
                                  pin_memory=True)
    results = []

    try:
        # Main eval loop
        for it, data_batch in enumerate(data_loader):
            timer.reset()

            with timer.env('Load Data'):
                images, images_meta, ref_images, ref_images_meta = prepare_data(
                    data_batch, is_cuda=True, train_mode=False)
            pad_h, pad_w = images.size()[2:4]

            with timer.env('Network Extra'):
                preds = net(images,
                            img_meta=images_meta,
                            ref_x=ref_images,
                            ref_imgs_meta=ref_images_meta)

            # Perform the meat of the operation here depending on our mode.
            # The last batch only holds the remainder of the dataset. When the
            # dataset size is an exact multiple of the batch size the remainder
            # is 0 and the last batch is a full one; using 0 there would skip
            # the whole batch and divide by zero in the FPS computation.
            batch_size = images.size(0)
            if it == dataset_size - 1:
                remainder = len(dataset) % args.batch_size
                if remainder > 0:
                    batch_size = remainder

            for batch_id in range(batch_size):
                if args.display:
                    img_id = (images_meta[batch_id]['video_id'],
                              images_meta[batch_id]['frame_id'])
                    if not cfg.display_mask_single:
                        img_numpy = prep_display(preds[batch_id],
                                                 images[batch_id],
                                                 pad_h,
                                                 pad_w,
                                                 img_meta=images_meta[batch_id],
                                                 img_ids=img_id)
                    else:
                        # Render and save every detected instance separately.
                        for p in range(preds[batch_id]['detection']['box'].size(0)):
                            preds_single = {'detection': {}}
                            for k in preds[batch_id]['detection']:
                                if preds[batch_id]['detection'][k] is not None and k not in {'proto'}:
                                    preds_single['detection'][k] = preds[batch_id]['detection'][k][p]
                                else:
                                    preds_single['detection'][k] = None
                            preds_single['net'] = preds[batch_id]['net']
                            preds_single['detection']['box_ids'] = torch.tensor(-1)

                            img_numpy = prep_display(preds_single,
                                                     images[batch_id],
                                                     pad_h,
                                                     pad_w,
                                                     img_meta=images_meta[batch_id],
                                                     img_ids=img_id)
                            plt.imshow(img_numpy)
                            plt.axis('off')
                            # The output folder is args.mask_det_file minus its
                            # last 12 characters, plus 'out_single/'.
                            plt.savefig(''.join([
                                args.mask_det_file[:-12], 'out_single/',
                                str(img_id), '_',
                                str(p), '.png'
                            ]))
                            plt.clf()

                else:
                    cfg.preserve_aspect_ratio = True
                    preds_cur = postprocess_ytbvis(
                        preds[batch_id],
                        pad_h,
                        pad_w,
                        images_meta[batch_id],
                        score_threshold=cfg.eval_conf_thresh)
                    segm_results = bbox2result_with_id(preds_cur, cfg.classes)
                    results.append(segm_results)

            # First couple of images take longer because we're constructing the graph.
            # Since that's technically initialization, don't include those in the FPS calculations.
            if it > 1:
                frame_times.add(timer.total_time() / batch_size)

            if args.display and not cfg.display_mask_single:
                if it > 1:
                    print('Avg FPS: %.4f' % (1 / frame_times.get_avg()))
                # Only the last item of the batch (the loop variables left over
                # from the loop above) is shown and saved here.
                plt.imshow(img_numpy)
                plt.axis('off')
                plt.title(str(img_id))

                root_dir = ''.join([
                    args.mask_det_file[:-12], 'out/',
                    str(images_meta[batch_id]['video_id']), '/'
                ])
                if not os.path.exists(root_dir):
                    os.makedirs(root_dir)
                plt.savefig(''.join([
                    root_dir,
                    str(images_meta[batch_id]['frame_id']), '.png'
                ]))
                plt.clf()
                # plt.show()
            elif not args.no_bar:
                if it > 1:
                    fps = 1 / frame_times.get_avg()
                else:
                    fps = 0
                progress = (it + 1) / dataset_size * 100
                progress_bar.set_val(it + 1)
                print('\rProcessing Images %s %6d / %6d (%5.2f%%) %5.2f fps ' %
                      (repr(progress_bar), it + 1, dataset_size, progress, fps),
                      end='')

        if not args.display and not args.benchmark:
            print()
            if args.output_json:
                print('Dumping detections...')
                results2json_videoseg(dataset, results, args.mask_det_file)

                if cfg.use_valid_sub or cfg.use_train_sub:
                    if cfg.use_valid_sub:
                        print('calculate evaluation metrics ...')
                        ann_file = cfg.valid_sub_dataset.ann_file
                    else:
                        print('calculate train_sub metrics ...')
                        ann_file = cfg.train_dataset.ann_file
                    dt_file = args.mask_det_file
                    metrics = calc_metrics(ann_file, dt_file)

                    return metrics

        elif args.benchmark:
            print()
            print()
            print('Stats for the last frame:')
            timer.print_stats()
            avg_seconds = frame_times.get_avg()
            print('Average: %5.2f fps, %5.2f ms' %
                  (1 / frame_times.get_avg(), 1000 * avg_seconds))

    except KeyboardInterrupt:
        print('Stopping...')
def validation(net: STMask, valid_data=False, output_metrics_file=None):
    """Evaluate ``net`` on a validation split and dump the detections.

    Args:
        net: The STMask network to evaluate.
        valid_data: When False, ``cfg.valid_sub_dataset`` is used, detections
            are written to ``args.mask_det_file`` and metrics are computed
            with ``calc_metrics``. When True, ``cfg.valid_dataset`` is used
            and detections are only written to a JSON file.
        output_metrics_file: Passed to ``calc_metrics`` as ``output_file``
            when ``valid_data`` is False. When ``valid_data`` is True its
            ``'.txt'`` suffix is replaced by ``'.json'`` to form the
            detection file name, so it must be a string in that case.

    Returns:
        None. ``KeyboardInterrupt`` is caught and reported.
    """
    cfg.mask_proto_debug = args.mask_proto_debug
    if not valid_data:
        cfg.valid_sub_dataset.test_mode = True
        dataset = get_dataset(cfg.valid_sub_dataset)
    else:
        cfg.valid_dataset.test_mode = True
        dataset = get_dataset(cfg.valid_dataset)

    frame_times = MovingAverage()
    # NOTE(review): with args.max_images >= 0 this is an image count, yet it is
    # compared against the batch index below - confirm the intended semantics.
    dataset_size = math.ceil(len(dataset) / args.batch_size) if args.max_images < 0 else min(
        args.max_images, len(dataset))
    progress_bar = ProgressBar(30, dataset_size)

    print()
    data_loader = data.DataLoader(dataset,
                                  args.batch_size,
                                  shuffle=False,
                                  collate_fn=detection_collate,
                                  pin_memory=True)
    results = []

    try:
        # Main eval loop
        for it, data_batch in enumerate(data_loader):
            timer.reset()

            with timer.env('Load Data'):
                images, images_meta, ref_images, ref_images_meta = prepare_data(
                    data_batch, is_cuda=True, train_mode=False)
            pad_h, pad_w = images.size()[2:4]

            with timer.env('Network Extra'):
                preds = net(images,
                            img_meta=images_meta,
                            ref_x=ref_images,
                            ref_imgs_meta=ref_images_meta)

            # The last batch only holds the remainder of the dataset. When the
            # dataset size is an exact multiple of the batch size the remainder
            # is 0 and the last batch is a full one; using 0 there would drop
            # all of its detections.
            batch_size = images.size(0)
            if it == dataset_size - 1:
                remainder = len(dataset) % args.batch_size
                if remainder > 0:
                    batch_size = remainder

            for batch_id in range(batch_size):
                cfg.preserve_aspect_ratio = True
                preds_cur = postprocess_ytbvis(
                    preds[batch_id],
                    pad_h,
                    pad_w,
                    images_meta[batch_id],
                    score_threshold=cfg.eval_conf_thresh)
                segm_results = bbox2result_with_id(preds_cur, cfg.classes)
                results.append(segm_results)

            # First couple of images take longer because we're constructing the graph.
            # Since that's technically initialization, don't include those in the FPS calculations.
            if it > 1:
                # Guard against an empty batch so the division cannot fail.
                if batch_size == 0:
                    batch_size = 1
                frame_times.add(timer.total_time() / batch_size)

            if it > 1 and frame_times.get_avg() > 0:
                fps = 1 / frame_times.get_avg()
            else:
                fps = 0
            progress = (it + 1) / dataset_size * 100
            progress_bar.set_val(it + 1)
            print('\rProcessing Images %s %6d / %6d (%5.2f%%) %5.2f fps ' %
                  (repr(progress_bar), it + 1, dataset_size, progress, fps),
                  end='')

        print()
        print('Dumping detections...')
        if not valid_data:
            results2json_videoseg(dataset, results, args.mask_det_file)
            print('calculate evaluation metrics ...')
            ann_file = cfg.valid_sub_dataset.ann_file
            dt_file = args.mask_det_file
            calc_metrics(ann_file, dt_file, output_file=output_metrics_file)
        else:
            # NOTE(review): output_metrics_file defaults to None, which would
            # raise AttributeError here - callers must pass a path.
            results2json_videoseg(dataset, results,
                                  output_metrics_file.replace('.txt', '.json'))

    except KeyboardInterrupt:
        print('Stopping...')
def train(): if not os.path.exists(args.save_folder): os.mkdir(args.save_folder) dataset = COCODetection(image_path=cfg.dataset.train_images, info_file=cfg.dataset.train_info, transform=SSDAugmentation(MEANS)) if args.validation_epoch > 0: setup_eval() val_dataset = COCODetection(image_path=cfg.dataset.valid_images, info_file=cfg.dataset.valid_info, transform=BaseTransform(MEANS)) # Parallel wraps the underlying module, but when saving and loading we don't want that yolact_net = Yolact() net = yolact_net net.train() print('\n--- Generator created! ---') # NOTE # I maunally set the original image size and seg size as 138 # might change in the future, for example 550 if cfg.pred_seg: dis_size = 138 dis_net = Discriminator_Wgan(i_size = dis_size, s_size = dis_size) # Change the initialization inside the dis_net class inside # set the dis net's initial parameter values # dis_net.apply(gan_init) dis_net.train() print('--- Discriminator created! ---\n') if args.log: log = Log(cfg.name, args.log_folder, dict(args._get_kwargs()), overwrite=(args.resume is None), log_gpu_stats=args.log_gpu) # I don't use the timer during training (I use a different timing method). # Apparently there's a race condition with multiple GPUs, so disable it just to be safe. 
timer.disable_all() # Both of these can set args.resume to None, so do them before the check if args.resume == 'interrupt': args.resume = SavePath.get_interrupt(args.save_folder) elif args.resume == 'latest': args.resume = SavePath.get_latest(args.save_folder, cfg.name) if args.resume is not None: print('Resuming training, loading {}...'.format(args.resume)) yolact_net.load_weights(args.resume) if args.start_iter == -1: args.start_iter = SavePath.from_str(args.resume).iteration else: print('Initializing weights...') yolact_net.init_weights(backbone_path=args.save_folder + cfg.backbone.path) # optimizer_gen = optim.SGD(net.parameters(), lr=args.lr, momentum=args.momentum, # weight_decay=args.decay) # if cfg.pred_seg: # optimizer_dis = optim.SGD(dis_net.parameters(), lr=cfg.dis_lr, momentum=args.momentum, # weight_decay=args.decay) # schedule_dis = ReduceLROnPlateau(optimizer_dis, mode = 'min', patience=6, min_lr=1E-6) # NOTE: Using the Ranger Optimizer for the generator optimizer_gen = Ranger(net.parameters(), lr = args.lr, weight_decay=args.decay) # optimizer_gen = optim.RMSprop(net.parameters(), lr = args.lr) # FIXME: Might need to modify the lr in the optimizer carefually # check this # def make_D_optimizer(cfg, model): # params = [] # for key, value in model.named_parameters(): # if not value.requires_grad: # continue # lr = cfg.SOLVER.BASE_LR/5.0 # weight_decay = cfg.SOLVER.WEIGHT_DECAY # if "bias" in key: # lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR/5.0 # weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS # params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}] # optimizer = torch.optim.SGD(params, lr, momentum=cfg.SOLVER.MOMENTUM) # return optimizer if cfg.pred_seg: optimizer_dis = optim.SGD(dis_net.parameters(), lr=cfg.dis_lr) # optimizer_dis = optim.RMSprop(dis_net.parameters(), lr = cfg.dis_lr) schedule_dis = ReduceLROnPlateau(optimizer_dis, mode = 'min', patience=6, min_lr=1E-6) criterion = MultiBoxLoss(num_classes=cfg.num_classes, 
pos_threshold=cfg.positive_iou_threshold, neg_threshold=cfg.negative_iou_threshold, negpos_ratio=cfg.ohem_negpos_ratio, pred_seg=cfg.pred_seg) # criterion_dis = nn.BCELoss() # Take the advice from WGAN criterion_dis = DiscriminatorLoss_Maskrcnn() criterion_gen = GeneratorLoss_Maskrcnn() if args.batch_alloc is not None: # e.g. args.batch_alloc: 24,24 args.batch_alloc = [int(x) for x in args.batch_alloc.split(',')] if sum(args.batch_alloc) != args.batch_size: print('Error: Batch allocation (%s) does not sum to batch size (%s).' % (args.batch_alloc, args.batch_size)) exit(-1) net = CustomDataParallel(NetLoss(net, criterion, pred_seg=cfg.pred_seg)) if args.cuda: net = net.cuda() # NOTE if cfg.pred_seg: dis_net = nn.DataParallel(dis_net) dis_net = dis_net.cuda() # Initialize everything if not cfg.freeze_bn: yolact_net.freeze_bn() # Freeze bn so we don't kill our means yolact_net(torch.zeros(1, 3, cfg.max_size, cfg.max_size).cuda()) if not cfg.freeze_bn: yolact_net.freeze_bn(True) # loss counters loc_loss = 0 conf_loss = 0 iteration = max(args.start_iter, 0) last_time = time.time() epoch_size = len(dataset) // args.batch_size num_epochs = math.ceil(cfg.max_iter / epoch_size) # Which learning rate adjustment step are we on? lr' = lr * gamma ^ step_index step_index = 0 data_loader = data.DataLoader(dataset, args.batch_size, num_workers=args.num_workers, shuffle=True, collate_fn=detection_collate, pin_memory=True) # NOTE val_loader = data.DataLoader(val_dataset, args.batch_size, num_workers=args.num_workers*2, shuffle=True, collate_fn=detection_collate, pin_memory=True) save_path = lambda epoch, iteration: SavePath(cfg.name, epoch, iteration).get_path(root=args.save_folder) time_avg = MovingAverage() global loss_types # Forms the print order # TODO: global command can modify global variable inside of the function. 
loss_avgs = { k: MovingAverage(100) for k in loss_types } # NOTE # Enable AMP amp_enable = cfg.amp scaler = torch.cuda.amp.GradScaler(enabled=amp_enable) print('Begin training!') print() # try-except so you can use ctrl+c to save early and stop training try: for epoch in range(num_epochs): # Resume from start_iter if (epoch+1)*epoch_size < iteration: continue for datum in data_loader: # Stop if we've reached an epoch if we're resuming from start_iter if iteration == (epoch+1)*epoch_size: break # Stop at the configured number of iterations even if mid-epoch if iteration == cfg.max_iter: break # Change a config setting if we've reached the specified iteration changed = False for change in cfg.delayed_settings: if iteration >= change[0]: changed = True cfg.replace(change[1]) # Reset the loss averages because things might have changed for avg in loss_avgs: avg.reset() # If a config setting was changed, remove it from the list so we don't keep checking if changed: cfg.delayed_settings = [x for x in cfg.delayed_settings if x[0] > iteration] # Warm up by linearly interpolating the learning rate from some smaller value if cfg.lr_warmup_until > 0 and iteration <= cfg.lr_warmup_until: set_lr(optimizer_gen, (args.lr - cfg.lr_warmup_init) * (iteration / cfg.lr_warmup_until) + cfg.lr_warmup_init) # Adjust the learning rate at the given iterations, but also if we resume from past that iteration while step_index < len(cfg.lr_steps) and iteration >= cfg.lr_steps[step_index]: step_index += 1 set_lr(optimizer_gen, args.lr * (args.gamma ** step_index)) # NOTE if cfg.pred_seg: # ====== GAN Train ====== # train the gen and dis in different iteration # it_alter_period = iteration % (cfg.gen_iter + cfg.dis_iter) # FIXME: # present_time = time.time() for _ in range(cfg.dis_iter): # freeze_pretrain(yolact_net, freeze=False) # freeze_pretrain(net, freeze=False) # freeze_pretrain(dis_net, freeze=False) # if it_alter_period == 0: # print('--- Generator freeze ---') # print('--- Discriminator 
training ---') if cfg.amp: with torch.cuda.amp.autocast(): # ----- Discriminator part ----- # seg_list is the prediction mask # can be regarded as generated images from YOLACT # pred_list is the prediction label # seg_list dim: list of (138,138,instances) # pred_list dim: list of (instances) losses, seg_list, pred_list = net(datum) seg_clas, mask_clas, b, seg_size = seg_mask_clas(seg_list, pred_list, datum) # input image size is [b, 3, 550, 550] # downsample to [b, 3, seg_h, seg_w] image_list = [img.to(cuda0) for img in datum[0]] image = interpolate(torch.stack(image_list), size = seg_size, mode='bilinear',align_corners=False) # Because in the discriminator training, we do not # want the gradient flow back to the generator part # we detach seg_clas (mask_clas come the data, does not have grad) output_pred = dis_net(img = image.detach(), seg = seg_clas.detach()) output_grou = dis_net(img = image.detach(), seg = mask_clas.detach()) # p = elem_mul_p.squeeze().permute(1,2,0).cpu().detach().numpy() # g = elem_mul_g.squeeze().permute(1,2,0).cpu().detach().numpy() # image = image.squeeze().permute(1,2,0).cpu().detach().numpy() # from PIL import Image # seg_PIL = Image.fromarray(p, 'RGB') # mask_PIL = Image.fromarray(g, 'RGB') # seg_PIL.save('mul_seg.png') # mask_PIL.save('mul_mask.png') # raise RuntimeError # from matplotlib import pyplot as plt # fig, (ax1, ax2) = plt.subplots(1,2) # ax1.imshow(mask_show) # ax2.imshow(seg_show) # plt.show(block=False) # plt.pause(2) # plt.close() # if iteration % (cfg.gen_iter + cfg.dis_iter) == 0: # print(f'Probability of fake is fake: {output_pred.mean().item():.2f}') # print(f'Probability of real is real: {output_grou.mean().item():.2f}') # 0 for Fake/Generated # 1 for True/Ground Truth # fake_label = torch.zeros(b) # real_label = torch.ones(b) # Advice of practical implementation # from https://arxiv.org/abs/1611.08408 # loss_pred = -criterion_dis(output_pred,target=real_label) # loss_pred = 
criterion_dis(output_pred,target=fake_label) # loss_grou = criterion_dis(output_grou,target=real_label) # loss_dis = loss_pred + loss_grou # Wasserstein Distance (Earth-Mover) loss_dis = criterion_dis(input=output_grou,target=output_pred) # Backprop the discriminator # Scales loss. Calls backward() on scaled loss to create scaled gradients. scaler.scale(loss_dis).backward() scaler.step(optimizer_dis) scaler.update() optimizer_dis.zero_grad() # clip the updated parameters _ = [par.data.clamp_(-cfg.clip_value, cfg.clip_value) for par in dis_net.parameters()] # ----- Generator part ----- # freeze_pretrain(yolact_net, freeze=False) # freeze_pretrain(net, freeze=False) # freeze_pretrain(dis_net, freeze=False) # if it_alter_period == (cfg.dis_iter+1): # print('--- Generator training ---') # print('--- Discriminator freeze ---') # FIXME: # print(f'dis time pass: {time.time()-present_time:.2f}') # FIXME: # present_time = time.time() with torch.cuda.amp.autocast(): losses, seg_list, pred_list = net(datum) seg_clas, mask_clas, b, seg_size = seg_mask_clas(seg_list, pred_list, datum) image_list = [img.to(cuda0) for img in datum[0]] image = interpolate(torch.stack(image_list), size = seg_size, mode='bilinear',align_corners=False) # Perform forward pass of all-fake batch through D # NOTE this seg_clas CANNOT detach, in order to flow the # gradient back to the generator # output = dis_net(img = image, seg = seg_clas) # Since the log(1-D(G(x))) not provide sufficient gradients # We want log(D(G(x)) instead, this can be achieve by # use the real_label as target. # This step is crucial for the information of discriminator # to go into the generator. 
# Calculate G's loss based on this output # real_label = torch.ones(b) # loss_gen = criterion_dis(output,target=real_label) # GAN MaskRCNN output_pred = dis_net(img = image, seg = seg_clas) output_grou = dis_net(img = image, seg = mask_clas) # Advice from WGAN # loss_gen = -torch.mean(output) loss_gen = criterion_gen(input=output_grou,target=output_pred) # since the dis is already freeze, the gradients will only # record the YOLACT losses = { k: (v).mean() for k,v in losses.items() } # Mean here because Dataparallel loss = sum([losses[k] for k in losses]) loss += loss_gen # Generator backprop scaler.scale(loss).backward() scaler.step(optimizer_gen) scaler.update() optimizer_gen.zero_grad() # FIXME: # print(f'gen time pass: {time.time()-present_time:.2f}') # print('GAN part over') else: losses, seg_list, pred_list = net(datum) seg_clas, mask_clas, b, seg_size = seg_mask_clas(seg_list, pred_list, datum) image_list = [img.to(cuda0) for img in datum[0]] image = interpolate(torch.stack(image_list), size = seg_size, mode='bilinear',align_corners=False) output_pred = dis_net(img = image.detach(), seg = seg_clas.detach()) output_grou = dis_net(img = image.detach(), seg = mask_clas.detach()) loss_dis = criterion_dis(input=output_grou,target=output_pred) loss_dis.backward() optimizer_dis.step() optimizer_dis.zero_grad() _ = [par.data.clamp_(-cfg.clip_value, cfg.clip_value) for par in dis_net.parameters()] # ----- Generator part ----- # FIXME: # print(f'dis time pass: {time.time()-present_time:.2f}') # FIXME: # present_time = time.time() losses, seg_list, pred_list = net(datum) seg_clas, mask_clas, b, seg_size = seg_mask_clas(seg_list, pred_list, datum) image_list = [img.to(cuda0) for img in datum[0]] image = interpolate(torch.stack(image_list), size = seg_size, mode='bilinear',align_corners=False) # GAN MaskRCNN output_pred = dis_net(img = image, seg = seg_clas) output_grou = dis_net(img = image, seg = mask_clas) loss_gen = criterion_gen(input=output_grou,target=output_pred) 
# since the dis is already freeze, the gradients will only # record the YOLACT losses = { k: (v).mean() for k,v in losses.items() } # Mean here because Dataparallel loss = sum([losses[k] for k in losses]) loss += loss_gen loss.backward() # Do this to free up vram even if loss is not finite optimizer_gen.zero_grad() if torch.isfinite(loss).item(): # since the optimizer_gen is for YOLACT only # only the gen will be updated optimizer_gen.step() # FIXME: # print(f'gen time pass: {time.time()-present_time:.2f}') # print('GAN part over') else: # ====== Normal YOLACT Train ====== # Zero the grad to get ready to compute gradients optimizer_gen.zero_grad() # Forward Pass + Compute loss at the same time (see CustomDataParallel and NetLoss) losses = net(datum) losses = { k: (v).mean() for k,v in losses.items() } # Mean here because Dataparallel loss = sum([losses[k] for k in losses]) # no_inf_mean removes some components from the loss, so make sure to backward through all of it # all_loss = sum([v.mean() for v in losses.values()]) # Backprop loss.backward() # Do this to free up vram even if loss is not finite if torch.isfinite(loss).item(): optimizer_gen.step() # Add the loss to the moving average for bookkeeping _ = [loss_avgs[k].add(losses[k].item()) for k in losses] # for k in losses: # loss_avgs[k].add(losses[k].item()) cur_time = time.time() elapsed = cur_time - last_time last_time = cur_time # Exclude graph setup from the timing information if iteration != args.start_iter: time_avg.add(elapsed) if iteration % 10 == 0: eta_str = str(datetime.timedelta(seconds=(cfg.max_iter-iteration) * time_avg.get_avg())).split('.')[0] total = sum([loss_avgs[k].get_avg() for k in losses]) loss_labels = sum([[k, loss_avgs[k].get_avg()] for k in loss_types if k in losses], []) if cfg.pred_seg: print(('[%3d] %7d ||' + (' %s: %.3f |' * len(losses)) + ' T: %.3f || ETA: %s || timer: %.3f') % tuple([epoch, iteration] + loss_labels + [total, eta_str, elapsed]), flush=True) # print(f'Generator 
loss: {loss_gen:.2f} | Discriminator loss: {loss_dis:.2f}') # Loss Key: # - B: Box Localization Loss # - C: Class Confidence Loss # - M: Mask Loss # - P: Prototype Loss # - D: Coefficient Diversity Loss # - E: Class Existence Loss # - S: Semantic Segmentation Loss # - T: Total loss if args.log: precision = 5 loss_info = {k: round(losses[k].item(), precision) for k in losses} loss_info['T'] = round(loss.item(), precision) if args.log_gpu: log.log_gpu_stats = (iteration % 10 == 0) # nvidia-smi is sloooow log.log('train', loss=loss_info, epoch=epoch, iter=iteration, lr=round(cur_lr, 10), elapsed=elapsed) log.log_gpu_stats = args.log_gpu iteration += 1 if iteration % args.save_interval == 0 and iteration != args.start_iter: if args.keep_latest: latest = SavePath.get_latest(args.save_folder, cfg.name) print('Saving state, iter:', iteration) yolact_net.save_weights(save_path(epoch, iteration)) if args.keep_latest and latest is not None: if args.keep_latest_interval <= 0 or iteration % args.keep_latest_interval != args.save_interval: print('Deleting old save...') os.remove(latest) # This is done per epoch if args.validation_epoch > 0: # NOTE: Validation loss # if cfg.pred_seg: # net.eval() # dis_net.eval() # cfg.gan_eval = True # with torch.no_grad(): # for datum in tqdm(val_loader, desc='GAN Validation'): # losses, seg_list, pred_list = net(datum) # losses, seg_list, pred_list = net(datum) # # TODO: warp below as a function # seg_list = [v.permute(2,1,0).contiguous() for v in seg_list] # b = len(seg_list) # batch size # _, seg_h, seg_w = seg_list[0].size() # seg_clas = torch.zeros(b, cfg.num_classes-1, seg_h, seg_w) # mask_clas = torch.zeros(b, cfg.num_classes-1, seg_h, seg_w) # target_list = [target for target in datum[1][0]] # mask_list = [interpolate(mask.unsqueeze(0), size = (seg_h,seg_w),mode='bilinear', \ # align_corners=False).squeeze() for mask in datum[1][1]] # for idx in range(b): # for i, (pred, i_target) in enumerate(zip(pred_list[idx], target_list[idx])): # 
seg_clas[idx, pred, ...] += seg_list[idx][i,...] # mask_clas[idx, i_target[-1].long(), ...] += mask_list[idx][i,...] # seg_clas = torch.clamp(seg_clas, 0, 1) # image = interpolate(torch.stack(datum[0]), size = (seg_h,seg_w), # mode='bilinear',align_corners=False) # real_label = torch.ones(b) # output_pred = dis_net(img = image, seg = seg_clas) # output_grou = dis_net(img = image, seg = mask_clas) # loss_pred = -criterion_dis(output_pred,target=real_label) # loss_grou = criterion_dis(output_grou,target=real_label) # loss_dis = loss_pred + loss_grou # losses = { k: (v).mean() for k,v in losses.items() } # loss = sum([losses[k] for k in losses]) # val_loss = loss - cfg.lambda_dis*loss_dis # schedule_dis.step(loss_dis) # lr = [group['lr'] for group in optimizer_dis.param_groups] # print(f'Discriminator lr: {lr[0]}') # net.train() if epoch % args.validation_epoch == 0 and epoch > 0: cfg.gan_eval = False dis_net.eval() compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None) # Compute validation mAP after training is finished compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None) except KeyboardInterrupt: if args.interrupt: print('Stopping early. Saving network...') # Delete previous copy of the interrupted network so we don't spam the weights folder SavePath.remove_interrupt(args.save_folder) yolact_net.save_weights(save_path(epoch, repr(iteration) + '_interrupt')) exit() yolact_net.save_weights(save_path(epoch, iteration))
def evalvideo(net:Yolact, path:str):
    """Run the network on a video file or webcam and show annotated frames live.

    Frames travel through a three-stage pipeline (transform_frame ->
    eval_network -> prep_frame); the stages of different batches are
    dispatched concurrently on a thread pool.

    Args:
        net: The detection network; it is wrapped in CustomDataParallel and
            moved to the GPU here.
        path: Path of a video file, or a string of digits that is parsed as a
            webcam index.

    Reads the module-level ``args.video_multiframe`` for the number of frames
    per batch. This function does not return: it ends the process through
    exit() once the stream is exhausted, Escape is pressed, or the source
    cannot be opened (exit code -1).
    """
    # If the path is a digit, parse it as a webcam index
    if path.isdigit():
        vid = cv2.VideoCapture(int(path))
    else:
        vid = cv2.VideoCapture(path)

    if not vid.isOpened():
        print('Could not open video "%s"' % path)
        exit(-1)

    net = CustomDataParallel(net).cuda()
    transform = torch.nn.DataParallel(FastBaseTransform()).cuda()
    frame_times = MovingAverage()
    fps = 0

    # A capture backend may report 0 FPS; in that case do not throttle at all
    # rather than dividing by zero.
    # The 0.8 is to account for the overhead of time.sleep
    source_fps = vid.get(cv2.CAP_PROP_FPS)
    frame_time_target = 0.8 / source_fps if source_fps > 0 else 0

    def cleanup_and_exit():
        print()
        pool.terminate()
        vid.release()
        cv2.destroyAllWindows()
        exit()

    def get_next_frame(vid):
        # Stop at the first failed read so later stages never receive None.
        frames = []
        for _ in range(args.video_multiframe):
            frame = vid.read()[1]
            if frame is None:
                break
            frames.append(frame)
        return frames

    def transform_frame(frames):
        with torch.no_grad():
            frames = [torch.from_numpy(frame).cuda().float() for frame in frames]
            return frames, transform(torch.stack(frames, 0))

    def eval_network(inp):
        with torch.no_grad():
            frames, imgs = inp
            # A short final batch is padded up to the usual batch size with
            # copies of its first image and the extra predictions are dropped.
            # NOTE(review): presumably the data-parallel wrapper expects full
            # batches -- confirm.
            num_extra = 0
            while imgs.size(0) < args.video_multiframe:
                imgs = torch.cat([imgs, imgs[0].unsqueeze(0)], dim=0)
                num_extra += 1
            preds = net(imgs)
            if num_extra > 0:
                preds = preds[:-num_extra]
            return frames, preds

    def prep_frame(inp):
        with torch.no_grad():
            frame, preds = inp
            return prep_display(preds, frame, None, None, undo_transform=False, class_color=True)

    # Pick frame i (moved to the device of its detections, if any) together
    # with a one-element list holding its prediction.
    extract_frame = lambda x, i: (x[0][i] if x[1][i] is None else x[0][i].to(x[1][i]['box'].device), [x[1][i]])

    # For each frame the sequence of functions it needs to go through to be processed (in reversed order)
    sequence = [prep_frame, eval_network, transform_frame]
    pool = ThreadPool(processes=len(sequence) + args.video_multiframe)

    # Prime the network on the first frame because I do some thread unsafe things otherwise
    print('Initializing model... ', end='')
    first_frames = get_next_frame(vid)
    if not first_frames:
        print('Could not read any frames from "%s"' % path)
        cleanup_and_exit()
    eval_network(transform_frame(first_frames))
    print('Done.')

    active_frames = []
    stream_done = False

    print()
    # Keep looping until the source is exhausted AND every in-flight frame
    # has been displayed.
    while vid.isOpened() and (active_frames or not stream_done):
        start_time = time.time()

        # Start loading the next frames from the disk
        next_frames = None if stream_done else pool.apply_async(get_next_frame, args=(vid,))

        # For each frame in our active processing queue, dispatch a job
        # for that frame using the current function in the sequence
        for frame in active_frames:
            frame['value'] = pool.apply_async(sequence[frame['idx']], args=(frame['value'],))

        # For each frame whose job was the last in the sequence (i.e. for all final outputs)
        for frame in active_frames:
            if frame['idx'] == 0:
                # Wait here so that the frame has time to process and so that the video plays at the proper speed
                time.sleep(frame_time_target)

                cv2.imshow(path, frame['value'].get())
                if cv2.waitKey(1) == 27:
                    # Press Escape to close
                    cleanup_and_exit()

        # Remove the finished frames from the processing queue
        active_frames = [x for x in active_frames if x['idx'] > 0]

        # Finish evaluating every frame in the processing queue and advance their position in the sequence
        for frame in list(reversed(active_frames)):
            frame['value'] = frame['value'].get()
            frame['idx'] -= 1

            if frame['idx'] == 0:
                # Split this up into individual jobs for prep_frame since it doesn't support batch size.
                # Use the actual batch length: the final batch may be short.
                batch = frame['value']
                active_frames += [{'value': extract_frame(batch, i), 'idx': 0} for i in range(1, len(batch[0]))]
                frame['value'] = extract_frame(batch, 0)

        # Finish loading in the next frames and add them to the processing queue
        if next_frames is not None:
            frames = next_frames.get()
            if len(frames) < args.video_multiframe:
                # A failed read marks the end of the stream
                stream_done = True
            if frames:
                active_frames.append({'value': frames, 'idx': len(sequence)-1})

        # Compute FPS
        frame_times.add(time.time() - start_time)
        fps = args.video_multiframe / frame_times.get_avg()

        print('\rAvg FPS: %.2f ' % fps, end='')

    cleanup_and_exit()
def evalvideo(self, net: Yolact, path: str):
    """Run the network on a video file or webcam with a separate playback thread.

    The main loop pushes batches through transform_frame -> eval_network ->
    prep_frame on a thread pool and puts finished frames into a queue; the
    nested play_video job shows them with cv2.imshow at a regulated pace.

    Args:
        net: The detection network; wrapped in CustomDataParallel and moved
            to the GPU here.
        path: Path of a video file, or a string of digits that is parsed as a
            webcam index.

    Reads the module-level ``args.video_multiframe``. Does not return: the
    process is ended through exit() when the stream is exhausted, Escape is
    pressed, or the source cannot be opened (exit code -1).
    """
    # If the path is a digit, parse it as a webcam index
    is_webcam = path.isdigit()

    if is_webcam:
        vid = cv2.VideoCapture(int(path))
    else:
        vid = cv2.VideoCapture(path)

    if not vid.isOpened():
        print('Could not open video "%s"' % path)
        exit(-1)

    net = CustomDataParallel(net).cuda()
    transform = torch.nn.DataParallel(FastBaseTransform()).cuda()
    frame_times = MovingAverage(100)
    fps = 0

    # A capture backend may report 0 FPS; fall back to "no minimum frame
    # time" rather than dividing by zero.
    source_fps = vid.get(cv2.CAP_PROP_FPS)
    frame_time_target = 1 / source_fps if source_fps > 0 else 0
    running = True

    def cleanup_and_exit():
        print()
        pool.terminate()
        vid.release()
        cv2.destroyAllWindows()
        exit()

    def get_next_frame(vid):
        # Stop at the first failed read so later stages never receive None.
        frames = []
        for _ in range(args.video_multiframe):
            frame = vid.read()[1]
            if frame is None:
                break
            frames.append(frame)
        return frames

    def transform_frame(frames):
        with torch.no_grad():
            frames = [torch.from_numpy(frame).cuda().float() for frame in frames]
            return frames, transform(torch.stack(frames, 0))

    def eval_network(inp):
        with torch.no_grad():
            frames, imgs = inp
            # A short final batch is padded up to the usual batch size with
            # copies of its first image and the extra predictions are dropped.
            # NOTE(review): presumably the data-parallel wrapper expects full
            # batches -- confirm.
            num_extra = 0
            while imgs.size(0) < args.video_multiframe:
                imgs = torch.cat([imgs, imgs[0].unsqueeze(0)], dim=0)
                num_extra += 1
            preds = net(imgs)
            if num_extra > 0:
                preds = preds[:-num_extra]
            return frames, preds

    def prep_frame(inp):
        with torch.no_grad():
            frame, preds = inp
            return self.prep_display(preds, frame, None, None, undo_transform=False, class_color=True)

    frame_buffer = Queue()
    video_fps = 0

    def play_video():
        """Show queued frames, nudging the frame time to keep the buffer near one batch."""
        nonlocal frame_buffer, running, video_fps, is_webcam

        video_frame_times = MovingAverage(100)
        frame_time_stabilizer = frame_time_target
        last_time = None
        stabilizer_step = 0.0005

        while running:
            frame_time_start = time.time()

            if not frame_buffer.empty():
                next_time = time.time()
                if last_time is not None:
                    video_frame_times.add(next_time - last_time)
                    video_fps = 1 / video_frame_times.get_avg()
                cv2.imshow(path, frame_buffer.get())
                last_time = next_time
                #self.image_pub.publish(self.bridge.cv2_to_imgmsg(frame_buffer.get(), "bgr8"))

            if cv2.waitKey(1) == 27:
                # Press Escape to close
                running = False

            # Slow down when the buffer runs low, speed up when it grows
            buffer_size = frame_buffer.qsize()
            if buffer_size < args.video_multiframe:
                frame_time_stabilizer += stabilizer_step
            elif buffer_size > args.video_multiframe:
                frame_time_stabilizer -= stabilizer_step
                if frame_time_stabilizer < 0:
                    frame_time_stabilizer = 0

            # A file is never played faster than its own frame rate
            new_target = frame_time_stabilizer if is_webcam else max(frame_time_stabilizer, frame_time_target)

            next_frame_target = max(2 * new_target - video_frame_times.get_avg(), 0)
            target_time = frame_time_start + next_frame_target - 0.001  # Let's just subtract a millisecond to be safe

            # This gives more accurate timing than if sleeping the whole amount at once
            while time.time() < target_time:
                time.sleep(0.001)

    # Pick frame i (moved to the device of its detections, if any) together
    # with a one-element list holding its prediction.
    extract_frame = lambda x, i: (x[0][i] if x[1][i]['detection'] is None else x[0][i].to(x[1][i]['detection']['box'].device), [x[1][i]])

    # For each frame the sequence of functions it needs to go through to be processed (in reversed order)
    sequence = [prep_frame, eval_network, transform_frame]
    pool = ThreadPool(processes=len(sequence) + args.video_multiframe + 2)

    # Prime the network on the first frame because I do some thread unsafe things otherwise
    print('Initializing model... ', end='')
    first_frames = get_next_frame(vid)
    if not first_frames:
        print('Could not read any frames from "%s"' % path)
        cleanup_and_exit()
    eval_network(transform_frame(first_frames))
    print('Done.')

    player = pool.apply_async(play_video)

    active_frames = []
    stream_done = False

    print()
    # Keep looping until the source is exhausted AND every in-flight frame
    # has been handed to the playback buffer (or the user pressed Escape).
    while vid.isOpened() and running and (active_frames or not stream_done):
        start_time = time.time()

        # Start loading the next frames from the disk
        next_frames = None if stream_done else pool.apply_async(get_next_frame, args=(vid, ))

        # For each frame in our active processing queue, dispatch a job
        # for that frame using the current function in the sequence
        for frame in active_frames:
            frame['value'] = pool.apply_async(sequence[frame['idx']], args=(frame['value'], ))

        # For each frame whose job was the last in the sequence (i.e. for all final outputs)
        for frame in active_frames:
            if frame['idx'] == 0:
                frame_buffer.put(frame['value'].get())

        # Remove the finished frames from the processing queue
        active_frames = [x for x in active_frames if x['idx'] > 0]

        # Finish evaluating every frame in the processing queue and advance their position in the sequence
        for frame in list(reversed(active_frames)):
            frame['value'] = frame['value'].get()
            frame['idx'] -= 1

            if frame['idx'] == 0:
                # Split this up into individual jobs for prep_frame since it doesn't support batch size.
                # Use the actual batch length: the final batch may be short.
                batch = frame['value']
                active_frames += [{'value': extract_frame(batch, i), 'idx': 0} for i in range(1, len(batch[0]))]
                frame['value'] = extract_frame(batch, 0)

        # Finish loading in the next frames and add them to the processing queue
        if next_frames is not None:
            frames = next_frames.get()
            if len(frames) < args.video_multiframe:
                # A failed read marks the end of the stream
                stream_done = True
            if frames:
                active_frames.append({'value': frames, 'idx': len(sequence) - 1})

        # Compute FPS
        frame_times.add(time.time() - start_time)
        fps = args.video_multiframe / frame_times.get_avg()

        print('\rProcessing FPS: %.2f | Video Playback FPS: %.2f | Frames in Buffer: %d ' % (fps, video_fps, frame_buffer.qsize()), end='')

    # Give the playback job the chance to show what is still buffered, unless
    # the user quit or the playback job is no longer running.
    while running and not player.ready() and not frame_buffer.empty():
        time.sleep(0.001)

    cleanup_and_exit()
criterion = nn.DataParallel(criterion).cuda() dataset = COCODetection(image_path=cfg.dataset.train_images, info_file=cfg.dataset.train_info, augmentation=SSDAugmentation()) data_loader = data.DataLoader(dataset, cfg.batch_size, num_workers=8, shuffle=True, collate_fn=detection_collate, pin_memory=True) step_index = 0 start_step = resume_step if args.resume else 0 batch_time = MovingAverage() loss_types = ['B', 'C', 'M', 'S'] loss_avgs = {k: MovingAverage() for k in loss_types} map_tables = [] training = True step = start_step writer = SummaryWriter('tensorboard_log') try: # Use try-except to use ctrl+c to stop and save early. while training: for i, datum in enumerate(data_loader): if cfg.warmup_until > 0 and step <= cfg.warmup_until: # Warm up learning rate. set_lr(optimizer, (cfg.lr - cfg.warmup_init) * (step / cfg.warmup_until) + cfg.warmup_init) # Adjust the learning rate according to the current step. while step_index < len(
else: evalimage(net, args.image) return elif args.images is not None: inp, out = args.images.split(':') evalimages(net, inp, out) return elif args.video is not None: if ':' in args.video: inp, out = args.video.split(':') evalvideo(net, inp, out) else: evalvideo(net, args.video) return frame_times = MovingAverage() dataset_size = len(dataset) if args.max_images < 0 else min(args.max_images, len(dataset)) progress_bar = ProgressBar(30, dataset_size) print() if not args.display and not args.benchmark: # For each class and iou, stores tuples (score, isPositive) # Index ap_data[type][iouIdx][classIdx] ap_data = { 'box' : [[APDataObject() for _ in cfg.dataset.class_names] for _ in iou_thresholds], 'mask': [[APDataObject() for _ in cfg.dataset.class_names] for _ in iou_thresholds] } detections = Detections() else: timer.disable('Load Data')
# GPU net = net.cuda() torch.set_default_tensor_type('torch.cuda.FloatTensor') x = torch.zeros((1, 3, cfg.max_size, cfg.max_size)) y = net(x) for p in net.prediction_layers: print(p.last_conv_size) print() for k, a in y.items(): print(k + ': ', a.size(), torch.sum(a)) exit() net(x) # timer.disable('pass2') avg = MovingAverage() try: while True: timer.reset() with timer.env('everything else'): net(x) avg.add(timer.total_time()) print('\033[2J') # Moves console cursor to 0,0 timer.print_stats() print('Avg fps: %.2f\tAvg ms: %.2f ' % (1/avg.get_avg(), avg.get_avg()*1000)) except KeyboardInterrupt: pass
def train(args, dataset, val_dataset, data_loader, yolact_net, netloss, optimizer, log):
    """Run the training loop and return the path reported by the final save.

    Iterates over `data_loader` epoch by epoch until `cfg.max_iter`
    iterations have been performed, applying learning-rate warm-up and step
    decay, logging, periodic checkpointing and per-epoch validation.

    NOTE(review): `sig_num` is not defined in this function; it is read from
    an enclosing/module scope -- presumably set by a signal handler. Confirm.

    Returns:
        Whatever `save_yolact_net` returns for the final checkpoint.
    """
    iteration = max(args.start_iter, 0)
    last_time = time.time()

    # Index of the next learning-rate decay step: lr' = lr * gamma ^ step_index
    step_index = 0

    time_avg = MovingAverage()
    loss_avgs = get_default_log_avgs()

    epoch_size = len(dataset) // args.batch_size
    num_epochs = math.ceil(cfg.max_iter / epoch_size)
    print('Begin training!')
    print("num_epochs", num_epochs)

    for epoch in range(num_epochs):
        epoch_end = (epoch+1)*epoch_size

        # When resuming, skip the epochs that lie entirely before start_iter
        if epoch_end < iteration:
            continue

        for datum in data_loader:
            # Leave the epoch at its boundary, or for good once max_iter is reached
            if iteration == epoch_end or iteration == cfg.max_iter:
                break

            update_cfg_lr(iteration, optimizer, loss_avgs, args)

            # Linear warm-up from lr_warmup_init towards args.lr
            warming_up = cfg.lr_warmup_until > 0 and iteration <= cfg.lr_warmup_until
            if warming_up:
                warm_fraction = iteration / cfg.lr_warmup_until
                set_lr(optimizer, (args.lr - cfg.lr_warmup_init) * warm_fraction + cfg.lr_warmup_init)

            # Apply every decay step already passed (also catches up after a resume)
            while step_index < len(cfg.lr_steps) and iteration >= cfg.lr_steps[step_index]:
                step_index += 1
                set_lr(optimizer, args.lr * (args.gamma ** step_index))

            optimizer.zero_grad()

            # Forward pass and loss in one call; average each term because
            # the data-parallel wrapper returns one value per replica
            losses = {name: value.mean() for name, value in netloss(datum).items()}
            loss = sum(losses.values())

            loss.backward()

            # Only update the weights when the loss is finite
            if torch.isfinite(loss).item():
                optimizer.step()

            # Bookkeeping for the moving averages
            for name, value in losses.items():
                loss_avgs[name].add(value.item())

            now = time.time()
            elapsed = now - last_time
            last_time = now

            # Exclude graph setup from the timing information
            if iteration != args.start_iter:
                time_avg.add(elapsed)

            if iteration % 10 == 0:
                prompt_progress(epoch, iteration, elapsed, time_avg, loss_avgs, losses)

            if args.log:
                log_iteration(log, losses, loss, iteration, epoch, elapsed, args)

            iteration += 1

            if iteration % args.save_interval == 0 and iteration != args.start_iter:
                save_yolact_net(yolact_net, args, iteration, epoch)

        # Per-epoch validation
        if args.validation_epoch > 0 and epoch % args.validation_epoch == 0 and epoch > 0:
            compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None)

        if sig_num is not None:
            print("#r# break traning loop due to sig_num", sig_num)
            break

    # Final validation once training has stopped
    compute_validation_map(epoch, iteration, yolact_net, val_dataset, log if args.log else None)

    save_mode = "sig_num" if sig_num is not None else "finial"
    saved_pathname = save_yolact_net(yolact_net, args, iteration, epoch, mode=save_mode)
    return saved_pathname
class yolact_prediction(object):
    """ROS node that runs YOLACT on incoming camera images and republishes the result.

    Subscribes to "/camera/color/image_raw", runs the network on every
    message and publishes the annotated image on "Yolact_predict_img/".
    """

    def __init__(self):
        """Parse options, load the network weights and set up the ROS topics."""
        parser = argparse.ArgumentParser(description='YOLACT Predict in ROS')
        parser.add_argument('--visual_top_k', default=100, type=int,
                            help='Further restrict the number of predictions to parse')
        parser.add_argument('--traditional_nms', default=False, action='store_true',
                            help='Whether to use traditional nms.')
        parser.add_argument('--hide_mask', default=False, action='store_true',
                            help='Whether to display masks')
        # NOTE(review): the three store_true options below default to True,
        # so they cannot be switched off from the command line. Left as is
        # because changing the defaults would change the node's output.
        parser.add_argument('--hide_bbox', default=True, action='store_true',
                            help='Whether to display bboxes')
        parser.add_argument('--hide_score', default=True, action='store_true',
                            help='Whether to display scores')
        parser.add_argument('--show_lincomb', default=False, action='store_true',
                            help='Whether to show the generating process of masks.')
        parser.add_argument('--no_crop', default=False, action='store_true',
                            help='Do not crop output masks with the predicted bounding box.')
        parser.add_argument('--real_time', default=True, action='store_true',
                            help='Show the detection results real-timely.')
        parser.add_argument('--visual_thre', default=0.3, type=float,
                            help='Detections with a score under this threshold will be removed.')

        # rospy.myargv() removes ROS remapping arguments (e.g. "__name:=...")
        # that argparse would otherwise reject as unknown.
        self.args = parser.parse_args(rospy.myargv()[1:])

        r = rospkg.RosPack()
        self.bridge = CvBridge()
        self.path = r.get_path('yolact_prediction')

        # The config name is derived from the weight file name:
        # "..._res101_custom_610000.pth" -> "res101_custom_config"
        model_name = "/src/weights/best_89.48_res101_custom_610000.pth"
        strs = model_name.split('_')
        config = strs[-3] + "_" + strs[-2] + "_config"
        update_config(config)
        print("Using " + config + " according to the trained_model.")

        with torch.no_grad():
            self.cuda = torch.cuda.is_available()
            if self.cuda:
                cudnn.benchmark = True
                cudnn.fastest = True
                torch.set_default_tensor_type('torch.cuda.FloatTensor')
            else:
                torch.set_default_tensor_type('torch.FloatTensor')

            self.net = Yolact()
            self.net.load_weights(self.path + model_name, self.cuda)
            print('Model loaded.')

            if self.cuda:
                self.net = self.net.cuda()

        # Time of the previous prediction; None until the first one has run
        # so that the first frame does not add a bogus interval.
        self.time_here = None
        self.frame_times = MovingAverage()

        #### Publisher
        self.rgb_pub = rospy.Publisher("Yolact_predict_img/", Image, queue_size=1)

        # Keep a handle on the subscriber for the lifetime of the node
        self.image_sub = rospy.Subscriber("/camera/color/image_raw", Image,
                                          self.img_cb, queue_size=1)

        print("============ Ready ============")

    def img_cb(self, rgb_data):
        """Subscriber callback: convert the message, predict and publish the result."""
        self.rgb_data = rgb_data

        if self.rgb_data is not None:
            cv_image = self.bridge.imgmsg_to_cv2(self.rgb_data, "bgr8")
            predict_img = self.predict(cv_image)
            self.rgb_pub.publish(self.bridge.cv2_to_imgmsg(predict_img, "bgr8"))
            self.rgb_data = None

    def predict(self, img):
        """Run the network on one image and return the value produced by draw_img.

        Args:
            img: Image as converted from the ROS message with "bgr8" encoding;
                indexed as (height, width, ...).
        """
        # Grad mode is thread-local in PyTorch, so the no_grad() used in
        # __init__ does not apply to the thread that runs this callback.
        with torch.no_grad():
            img = torch.from_numpy(img.copy()).float()
            if self.cuda:
                img = img.cuda()

            img_h, img_w = img.shape[0], img.shape[1]
            img_trans = FastBaseTransform()(img.unsqueeze(0))

            net_outs = self.net(img_trans)
            nms_outs = NMS(net_outs, self.args.traditional_nms)

            results = after_nms(nms_outs, img_h, img_w,
                                crop_masks=not self.args.no_crop,
                                visual_thre=self.args.visual_thre)

            if self.cuda:
                # Wait for the GPU so the interval below covers the whole prediction
                torch.cuda.synchronize()

            # FPS is the inverse of the average interval between predictions
            fps = 0
            now = time.time()
            if self.time_here is not None:
                self.frame_times.add(now - self.time_here)
                avg_interval = self.frame_times.get_avg()
                if avg_interval > 0:
                    fps = 1 / avg_interval
            self.time_here = now

            frame_numpy = draw_img(results, img, self.args, class_color=True, fps=fps)

        return frame_numpy

    def onShutdown(self):
        """Log the shutdown and release cached GPU memory."""
        rospy.loginfo("Shutdown.")
        torch.cuda.empty_cache()
def evalvideo(net: Sewer, path: str, out_path: str = None):
    """Run the network on a video or webcam; display the result or write it to a file.

    Args:
        net: The detection network; wrapped in CustomDataParallel and moved
            to the GPU here.
        path: Path of a video file, or a string of digits that is parsed as a
            webcam index.
        out_path: None or an existing directory -> frames are shown with
            cv2.imshow; any other value -> frames are written to that path
            with cv2.VideoWriter. Whenever it is not None, the output folder
            information is also passed to prep_display_for_video.

    Folders named after the input (``<input without extension>``, plus
    ``#ori`` and ``#res`` inside it) are created, and `file_csv` is called
    with the module-level `result_list` when processing ends.
    Reads the module-level ``args`` (video_multiframe, emulate_playback,
    display_fps).
    """
    is_webcam = path.isdigit()

    cudnn.benchmark = True

    if is_webcam:
        vid = cv2.VideoCapture(int(path))
    else:
        vid = cv2.VideoCapture(path)

    if not vid.isOpened():
        print('Could not open video "%s"' % path)
        exit(-1)

    target_fps = round(vid.get(cv2.CAP_PROP_FPS))
    frame_width = round(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = round(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

    if is_webcam:
        num_frames = float('inf')
    else:
        num_frames = round(vid.get(cv2.CAP_PROP_FRAME_COUNT))

    net = CustomDataParallel(net).cuda()
    transform = torch.nn.DataParallel(FastBaseTransform()).cuda()
    frame_times = MovingAverage(100)
    fps = 0
    # A capture backend may report 0 FPS; do not divide by zero
    frame_time_target = 1 / target_fps if target_fps > 0 else 0
    running = True
    fps_str = ''
    vid_done = False
    frames_displayed = 0

    # splitext only strips the extension, so dots elsewhere in the path
    # (e.g. "./clip.mp4") no longer truncate the folder name
    out_images_path = os.path.splitext(path)[0]
    base_name = os.path.splitext(os.path.basename(path))[0]
    createFolder(out_images_path)
    ori_folder = os.path.join(out_images_path, '#ori')
    res_folder = os.path.join(out_images_path, '#res')
    createFolder(ori_folder)
    createFolder(res_folder)

    # Frames go to a video file only when out_path names something that is
    # not an existing directory; otherwise they are displayed.
    writing = out_path is not None and not os.path.isdir(out_path)
    displaying = not writing

    out = None
    if writing:
        out = cv2.VideoWriter(out_path, cv2.VideoWriter_fourcc(*'avc1'),
                              target_fps, (frame_width, frame_height))

    def cleanup_and_exit():
        # Despite the name this only releases resources; it does not exit.
        print()
        pool.terminate()
        vid.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

    def get_next_frame(vid):
        # Returns fewer than video_multiframe frames once reads start failing
        frames = []
        for idx in range(args.video_multiframe):
            frame = vid.read()[1]
            if frame is None:
                return frames
            frames.append(frame)
        return frames

    def transform_frame(frames):
        with torch.no_grad():
            frames = [torch.from_numpy(frame).cuda().float() for frame in frames]
            return frames, transform(torch.stack(frames, 0))

    def eval_network(inp):
        with torch.no_grad():
            frames, imgs = inp
            # Pad a short batch with copies of its first image, then drop
            # the predictions that belong to the padding
            num_extra = 0
            while imgs.size(0) < args.video_multiframe:
                imgs = torch.cat([imgs, imgs[0].unsqueeze(0)], dim=0)
                num_extra += 1
            preds = net(imgs)
            if num_extra > 0:
                preds = preds[:-num_extra]
            return frames, preds

    def prep_frame(inp, fps_str, save_folder=None):
        with torch.no_grad():
            frame, preds = inp
            return prep_display_for_video(preds, frame, save_folder=save_folder,
                                          undo_transform=False, class_color=True,
                                          fps_str=fps_str, override_args=args)

    frame_buffer = Queue()
    video_fps = 0

    def play_video():
        """Consume the frame buffer: show or write each frame at a regulated pace."""
        nonlocal frame_buffer, running, video_fps, is_webcam, num_frames, frames_displayed, vid_done

        try:
            video_frame_times = MovingAverage(100)
            frame_time_stabilizer = frame_time_target
            last_time = None
            stabilizer_step = 0.0005
            progress_bar = ProgressBar(30, num_frames)

            while running:
                frame_time_start = time.time()

                if not frame_buffer.empty():
                    next_time = time.time()
                    if last_time is not None:
                        video_frame_times.add(next_time - last_time)
                        avg_frame_time = video_frame_times.get_avg()
                        video_fps = 1 / avg_frame_time if avg_frame_time != 0 else 0

                    if displaying:
                        cv2.imshow(path, frame_buffer.get()[0])
                    else:
                        out.write(frame_buffer.get()[0])
                    frames_displayed += 1
                    last_time = next_time

                    if writing:
                        if video_frame_times.get_avg() == 0:
                            write_fps = 0
                        else:
                            write_fps = 1 / video_frame_times.get_avg()
                        # The frame count can be 0 when the container does not report it
                        progress = frames_displayed / num_frames * 100 if num_frames else 0
                        progress_bar.set_val(frames_displayed)

                        print('\rProcessing Frames %s %6d / %6d (%5.2f%%) %5.2f fps '
                              % (repr(progress_bar), frames_displayed, num_frames, progress, write_fps), end='')

                # Press Escape to close (only meaningful when a window is shown)
                if displaying and cv2.waitKey(1) == 27:
                    running = False

                if not vid_done:
                    # Slow down when the buffer runs low, speed up when it grows
                    buffer_size = frame_buffer.qsize()
                    if buffer_size < args.video_multiframe:
                        frame_time_stabilizer += stabilizer_step
                    elif buffer_size > args.video_multiframe:
                        frame_time_stabilizer -= stabilizer_step
                        if frame_time_stabilizer < 0:
                            frame_time_stabilizer = 0

                    new_target = frame_time_stabilizer if is_webcam else max(frame_time_stabilizer, frame_time_target)
                else:
                    new_target = frame_time_target

                next_frame_target = max(2 * new_target - video_frame_times.get_avg(), 0)
                target_time = frame_time_start + next_frame_target - 0.001

                if displaying or args.emulate_playback:
                    # Sleeping in small steps gives more accurate timing
                    while time.time() < target_time:
                        time.sleep(0.001)
                else:
                    # Writing to a file: no need to pace the output
                    time.sleep(0.001)
        except Exception:
            # The pool would swallow the exception silently; report it here
            import traceback
            traceback.print_exc()

    # Pick frame i (moved to the device of its detections, if any) together
    # with a one-element list holding its prediction.
    extract_frame = lambda x, i: (x[0][i] if x[1][i]['detection'] is None else x[0][i].to(x[1][i]['detection']['box'].device), [x[1][i]])

    print('Initializing model... ', end='')
    first_frames = get_next_frame(vid)
    if not first_frames:
        # Nothing to prime the network with; release what was opened and stop
        print('Could not read any frames from "%s"' % path)
        vid.release()
        if out is not None:
            out.release()
        return
    first_batch = eval_network(transform_frame(first_frames))
    print('Done.')

    # Stages a batch goes through, in reversed order
    sequence = [prep_frame, eval_network, transform_frame]
    pool = ThreadPool(processes=len(sequence) + args.video_multiframe + 2)
    player = pool.apply_async(play_video)

    # The priming batch is not wasted: its frames enter the last stage directly
    active_frames = [{'value': extract_frame(first_batch, i), 'idx': 0}
                     for i in range(len(first_batch[0]))]

    print()
    if displaying:
        print('Press Escape to close.')

    try:
        while vid.isOpened() and running:
            # Hard limit on the number of buffered frames
            while frame_buffer.qsize() > 100:
                time.sleep(0.001)

            start_time = time.time()

            # Start loading the next frames
            if not vid_done:
                next_frames = pool.apply_async(get_next_frame, args=(vid, ))
            else:
                next_frames = None

            if not (vid_done and len(active_frames) == 0):
                # Dispatch the current stage of every in-flight item
                for frame in active_frames:
                    _args = [frame['value']]
                    if frame['idx'] == 0:
                        _args.append(fps_str)
                        if out_path is not None:
                            _args.append([out_images_path, base_name, ori_folder,
                                          res_folder, frames_displayed])
                    frame['value'] = pool.apply_async(sequence[frame['idx']], args=_args)

                # Collect the items that just went through the last stage
                for frame in active_frames:
                    if frame['idx'] == 0:
                        frame_buffer.put(frame['value'].get())

                # Remove the finished frames from the processing queue
                active_frames = [x for x in active_frames if x['idx'] > 0]

                # Wait for the remaining items and advance them one stage
                for frame in list(reversed(active_frames)):
                    frame['value'] = frame['value'].get()
                    frame['idx'] -= 1

                    if frame['idx'] == 0:
                        # Split the batch into single frames for prep_frame
                        batch = frame['value']
                        active_frames += [{'value': extract_frame(batch, i), 'idx': 0}
                                          for i in range(1, len(batch[0]))]
                        frame['value'] = extract_frame(batch, 0)

                # Finish loading the next frames and queue them
                if next_frames is not None:
                    frames = next_frames.get()
                    if len(frames) == 0:
                        vid_done = True
                    else:
                        active_frames.append({'value': frames, 'idx': len(sequence) - 1})

                # Compute FPS
                frame_times.add(time.time() - start_time)
                if frame_times.get_avg() != 0:
                    fps = args.video_multiframe / frame_times.get_avg()
            else:
                fps = 0
                # Everything has been processed. Stop only once the playback
                # job has consumed the buffer (or is no longer running), so
                # that the last frames are not dropped.
                if frame_buffer.empty() or player.ready():
                    running = False
                else:
                    time.sleep(0.001)

            fps_str = 'Processing FPS: %.2f | Video Playback FPS: %.2f | Frames in Buffer: %d' % (
                fps, video_fps, frame_buffer.qsize())
            if not args.display_fps and displaying:
                print('\r' + fps_str + ' ', end='')
    except KeyboardInterrupt:
        print('\nStopping...')

    # Stop the playback job before the writer is released underneath it
    running = False
    player.wait(1)

    if os.path.isdir(out_images_path):
        file_csv(os.path.join(out_images_path, base_name + '.csv'), result_list)

    cleanup_and_exit()
def __init__(self):
    """Parse display options, load the YOLACT network and set up ROS I/O.

    Command-line flags control how predictions are drawn. The config name
    is derived from the hard-coded weight file name, the network is built
    and its weights loaded (on the GPU when one is available), and finally
    an image publisher and a camera-image subscriber are created.

    Note: ``--hide_bbox``, ``--hide_score`` and ``--real_time`` are
    ``store_true`` flags whose default is already ``True``, so passing
    them on the command line does not change their value.
    """
    parser = argparse.ArgumentParser(description='YOLACT Predict in ROS')
    parser.add_argument(
        '--visual_top_k', default=100, type=int,
        help='Further restrict the number of predictions to parse')
    parser.add_argument(
        '--traditional_nms', default=False, action='store_true',
        help='Whether to use traditional nms.')
    parser.add_argument(
        '--hide_mask', default=False, action='store_true',
        help='Whether to display masks')
    parser.add_argument(
        '--hide_bbox', default=True, action='store_true',
        help='Whether to display bboxes')
    parser.add_argument(
        '--hide_score', default=True, action='store_true',
        help='Whether to display scores')
    parser.add_argument(
        '--show_lincomb', default=False, action='store_true',
        help='Whether to show the generating process of masks.')
    parser.add_argument(
        '--no_crop', default=False, action='store_true',
        help='Do not crop output masks with the predicted bounding box.')
    parser.add_argument(
        '--real_time', default=True, action='store_true',
        help='Show the detection results real-timely.')
    parser.add_argument(
        '--visual_thre', default=0.3, type=float,
        help='Detections with a score under this threshold will be removed.')

    # rospy.myargv() returns sys.argv with ROS remapping arguments
    # (e.g. "__name:=..." added by roslaunch) removed; feeding raw sys.argv
    # to argparse would abort with "unrecognized arguments".
    self.args = parser.parse_args(rospy.myargv()[1:])

    r = rospkg.RosPack()
    self.bridge = CvBridge()
    self.path = r.get_path('yolact_prediction')

    # The config name is rebuilt from the weight file name:
    # "..._res101_custom_610000.pth" -> "res101_custom_config".
    model_name = "/src/weights/best_89.48_res101_custom_610000.pth"
    strs = model_name.split('_')
    config = strs[-3] + "_" + strs[-2] + "_config"
    update_config(config)
    print("Using " + config + " according to the trained_model.")

    with torch.no_grad():
        self.cuda = torch.cuda.is_available()
        if self.cuda:
            cudnn.benchmark = True
            cudnn.fastest = True
            torch.set_default_tensor_type('torch.cuda.FloatTensor')
        else:
            torch.set_default_tensor_type('torch.FloatTensor')

        self.net = Yolact()
        self.net.load_weights(self.path + model_name, self.cuda)
        print('Model loaded.')

        if self.cuda:
            self.net = self.net.cuda()

    self.time_here = 0
    self.frame_times = MovingAverage()

    #### Publisher
    self.rgb_pub = rospy.Publisher("Yolact_predict_img/", Image,
                                   queue_size=1)
    # Keep a reference on the instance so the subscription handle stays
    # reachable (e.g. for a later unregister()).
    self.image_sub = rospy.Subscriber("/camera/color/image_raw", Image,
                                      self.img_cb, queue_size=1)
    print("============ Ready ============")
net = net # cudnn.benchmark = True torch.set_default_tensor_type('torch.FloatTensor') x = torch.zeros((1, 3, cfg.max_size, cfg.max_size)) y = net(x) for p in net.prediction_layers: print(p.last_conv_size) print() for k, a in y.items(): print(k + ': ', a.size(), torch.sum(a)) exit() net(x) # timer.disable('pass2') avg = MovingAverage() try: while True: timer.reset() with timer.env('everything else'): net(x) avg.add(timer.total_time()) print('\033[2J') # Moves console cursor to 0,0 timer.print_stats() print('Avg fps: %.2f\tAvg ms: %.2f ' % (1 / avg.get_avg(), avg.get_avg() * 1000)) except KeyboardInterrupt: pass
def train():
    """Run the full YOLACT training loop configured by ``args`` and ``cfg``.

    Builds the training (and optionally validation) dataset, creates the
    network, optionally resumes from a checkpoint, then iterates until
    ``cfg.max_iter`` is reached. Checkpoints are written every
    ``args.save_interval`` iterations and once more at the end. Ctrl+C
    stops training; when ``args.interrupt`` is set an "_interrupt"
    checkpoint is saved first so training can be resumed with ``--resume``.
    """
    # 1: Create the folder the training results are saved to.
    os.makedirs(args.save_folder, exist_ok=True)

    # 2: Prepare the training dataset through the COCO-style loader.
    dataset = COCODetection(image_path=cfg.dataset.train_images,
                            info_file=cfg.dataset.train_info,
                            transform=SSDAugmentation(MEANS))

    # When periodic validation is enabled, prepare the eval dataset as well.
    if args.validation_epoch > 0:
        setup_eval()
        val_dataset = COCODetection(image_path=cfg.dataset.valid_images,
                                    info_file=cfg.dataset.valid_info,
                                    transform=BaseTransform(MEANS))

    # 3: Create the Yolact network and put it in train mode.
    # `net` and `yolact_net` refer to the SAME object here (an alias, not a
    # copy). `net` is rebound further down to a wrapper that combines the
    # network with the loss, so `yolact_net` is kept to reach the bare
    # network for saving/loading weights.
    yolact_net = Yolact()
    net = yolact_net
    net.train()

    # 4: Logging and resume support.
    if args.log:
        log = Log(cfg.name, args.log_folder, dict(args._get_kwargs()),
                  overwrite=(args.resume is None),
                  log_gpu_stats=args.log_gpu)

    # I don't use the timer during training (I use a different timing method).
    # Apparently there's a race condition with multiple GPUs, so disable it
    # just to be safe.
    timer.disable_all()

    # Both of these can set args.resume to None, so do them before the check
    if args.resume == 'interrupt':
        args.resume = SavePath.get_interrupt(args.save_folder)
    elif args.resume == 'latest':
        args.resume = SavePath.get_latest(args.save_folder, cfg.name)

    if args.resume is not None:
        print('Resuming training, loading {}...'.format(args.resume))
        yolact_net.load_weights(args.resume)

        if args.start_iter == -1:
            args.start_iter = SavePath.from_str(args.resume).iteration
    else:
        print('Initializing weights...')
        yolact_net.init_weights(
            backbone_path=args.save_folder + cfg.backbone.path)

    # 5: Optimizer and loss function.
    optimizer = optim.SGD(net.parameters(), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.decay)
    criterion = MultiBoxLoss(num_classes=cfg.num_classes,
                             pos_threshold=cfg.positive_iou_threshold,
                             neg_threshold=cfg.negative_iou_threshold,
                             negpos_ratio=cfg.ohem_negpos_ratio)

    # 6: With multiple GPUs, split the batch between devices. A split that
    # does not add up to the batch size is a configuration error.
    if args.batch_alloc is not None:
        args.batch_alloc = [int(x) for x in args.batch_alloc.split(',')]
        if sum(args.batch_alloc) != args.batch_size:
            print(
                'Error: Batch allocation (%s) does not sum to batch size (%s).'
                % (args.batch_alloc, args.batch_size))
            exit(-1)

    # 7: Wrap network + loss into one module (see CustomDataParallel and
    # NetLoss). Calling `net` from here on returns the losses.
    net = CustomDataParallel(NetLoss(net, criterion))
    if args.cuda:
        net = net.cuda()

    # 8: Initialize everything with one forward pass on an all-zero image.
    # Batch norm is frozen around the pass so we don't kill our means.
    if not cfg.freeze_bn:
        yolact_net.freeze_bn()
    init_input = torch.zeros(1, 3, cfg.max_size, cfg.max_size)
    if args.cuda:
        init_input = init_input.cuda()
    yolact_net(init_input)
    if not cfg.freeze_bn:
        yolact_net.freeze_bn(True)

    # 9: Iteration bookkeeping. Negative start iterations are clamped to 0.
    iteration = max(args.start_iter, 0)
    last_time = time.time()

    epoch_size = len(dataset) // args.batch_size
    num_epochs = math.ceil(cfg.max_iter / epoch_size)

    # 10: Which learning rate adjustment step are we on?
    # lr' = lr * gamma ^ step_index
    step_index = 0

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True,
                                  collate_fn=detection_collate,
                                  pin_memory=True)

    # 11: Checkpoint path helper and moving averages for time and losses.
    save_path = lambda epoch, iteration: SavePath(
        cfg.name, epoch, iteration).get_path(root=args.save_folder)
    time_avg = MovingAverage()

    global loss_types  # Forms the print order
    loss_avgs = {k: MovingAverage(100) for k in loss_types}

    # 12: Main training loop.
    print('Begin training!')
    print()

    # try-except so you can use ctrl+c to save early and stop training
    try:
        for epoch in range(num_epochs):
            # Resume from start_iter: skip epochs that are already done.
            if (epoch + 1) * epoch_size < iteration:
                continue

            for datum in data_loader:
                # Stop if we've reached an epoch if we're resuming from start_iter
                if iteration == (epoch + 1) * epoch_size:
                    break

                # Stop at the configured number of iterations even if mid-epoch
                if iteration == cfg.max_iter:
                    break

                # Change a config setting if we've reached the specified iteration
                changed = False
                for change in cfg.delayed_settings:
                    if iteration >= change[0]:
                        changed = True
                        cfg.replace(change[1])

                        # Reset the loss averages because things might have
                        # changed. Iterate the MovingAverage objects, not
                        # the dict keys.
                        for avg in loss_avgs.values():
                            avg.reset()

                # If a config setting was changed, remove it from the list so
                # we don't keep checking
                if changed:
                    cfg.delayed_settings = [
                        x for x in cfg.delayed_settings if x[0] > iteration
                    ]

                # Warm up by linearly interpolating the learning rate from
                # some smaller value
                if cfg.lr_warmup_until > 0 and iteration <= cfg.lr_warmup_until:
                    set_lr(optimizer,
                           (args.lr - cfg.lr_warmup_init) *
                           (iteration / cfg.lr_warmup_until) +
                           cfg.lr_warmup_init)

                # Adjust the learning rate at the given iterations, but also
                # if we resume from past that iteration
                while (step_index < len(cfg.lr_steps)
                       and iteration >= cfg.lr_steps[step_index]):
                    step_index += 1
                    set_lr(optimizer, args.lr * (args.gamma**step_index))

                # Zero the grad to get ready to compute gradients
                optimizer.zero_grad()

                # Forward pass + loss computation in one call
                # (see CustomDataParallel and NetLoss)
                losses = net(datum)

                # Mean here because Dataparallel
                losses = {k: (v).mean() for k, v in losses.items()}
                loss = sum([losses[k] for k in losses])

                # Backprop. Do this to free up vram even if loss is not finite
                loss.backward()
                if torch.isfinite(loss).item():
                    optimizer.step()

                # Add the loss to the moving average for bookkeeping
                for k in losses:
                    loss_avgs[k].add(losses[k].item())

                cur_time = time.time()
                elapsed = cur_time - last_time
                last_time = cur_time

                # Exclude graph setup from the timing information
                if iteration != args.start_iter:
                    time_avg.add(elapsed)

                if iteration % 10 == 0:
                    eta_str = str(
                        datetime.timedelta(
                            seconds=(cfg.max_iter - iteration) *
                            time_avg.get_avg())).split('.')[0]

                    total = sum([loss_avgs[k].get_avg() for k in losses])
                    loss_labels = sum([[k, loss_avgs[k].get_avg()]
                                       for k in loss_types if k in losses],
                                      [])

                    print(('[%3d] %7d ||' + (' %s: %.3f |' * len(losses)) +
                           ' T: %.3f || ETA: %s || timer: %.3f') %
                          tuple([epoch, iteration] + loss_labels +
                                [total, eta_str, elapsed]),
                          flush=True)

                # Write the log record for this iteration.
                if args.log:
                    precision = 5
                    loss_info = {
                        k: round(losses[k].item(), precision)
                        for k in losses
                    }
                    loss_info['T'] = round(loss.item(), precision)

                    if args.log_gpu:
                        # nvidia-smi is sloooow
                        log.log_gpu_stats = (iteration % 10 == 0)

                    log.log('train', loss=loss_info, epoch=epoch,
                            iter=iteration, lr=round(cur_lr, 10),
                            elapsed=elapsed)

                    log.log_gpu_stats = args.log_gpu

                iteration += 1

                # Periodic checkpoint.
                if iteration % args.save_interval == 0 and iteration != args.start_iter:
                    if args.keep_latest:
                        latest = SavePath.get_latest(args.save_folder,
                                                     cfg.name)

                    print('Saving state, iter:', iteration)
                    yolact_net.save_weights(save_path(epoch, iteration))

                    if args.keep_latest and latest is not None:
                        if args.keep_latest_interval <= 0 or iteration % args.keep_latest_interval != args.save_interval:
                            print('Deleting old save...')
                            os.remove(latest)

            # This is done per epoch
            if args.validation_epoch > 0:
                if epoch % args.validation_epoch == 0 and epoch > 0:
                    compute_validation_map(epoch, iteration, yolact_net,
                                           val_dataset,
                                           log if args.log else None)

        # Compute validation mAP after training is finished. val_dataset
        # only exists when validation is enabled, so guard the call.
        if args.validation_epoch > 0:
            compute_validation_map(epoch, iteration, yolact_net, val_dataset,
                                   log if args.log else None)
    # 13: On Ctrl+C, optionally save an interrupt checkpoint, then exit.
    except KeyboardInterrupt:
        if args.interrupt:
            print('Stopping early. Saving network...')

            # Delete previous copy of the interrupted network so we don't
            # spam the weights folder
            SavePath.remove_interrupt(args.save_folder)

            yolact_net.save_weights(
                save_path(epoch, repr(iteration) + '_interrupt'))
        exit()

    yolact_net.save_weights(save_path(epoch, iteration))
def evaluate(net: Yolact, dataset, train_mode=False):
    """Run the network in one of several evaluation modes chosen by ``args``.

    If ``args.image``, ``args.images`` or ``args.video`` is set, the matching
    helper (``evalimage`` / ``evalimages`` / ``evalvideo``) is called and the
    function returns ``None``. Otherwise the dataset is iterated and, per
    image, the predictions are displayed (``args.display``), benchmarked
    (``args.benchmark``) or accumulated for mAP computation.

    Args:
        net: network to evaluate; its ``detect`` NMS options are overwritten
            from ``args``.
        dataset: iterable dataset; must also provide ``len()`` and ``ids``.
        train_mode: when True, the AP data is not pickled to
            ``args.ap_data_file`` before the mAP is computed.

    Returns:
        The result of ``calc_map(ap_data)`` in metrics mode when
        ``args.output_coco_json`` is not set; ``None`` in every other case.
    """
    net.detect.use_fast_nms = args.fast_nms
    net.detect.use_cross_class_nms = args.cross_class_nms
    cfg.mask_proto_debug = args.mask_proto_debug

    # TODO Currently we do not support Fast Mask Re-scoring in evalimage, evalimages, and evalvideo
    # An "input:output" value is split into the two paths.
    if args.image is not None:
        if ':' in args.image:
            inp, out = args.image.split(':')
            evalimage(net, inp, out)
        else:
            evalimage(net, args.image)
        return
    elif args.images is not None:
        # Unlike the other two modes, this one requires "input:output".
        inp, out = args.images.split(':')
        evalimages(net, inp, out)
        return
    elif args.video is not None:
        if ':' in args.video:
            inp, out = args.video.split(':')
            evalvideo(net, inp, out)
        else:
            evalvideo(net, args.video)
        return

    frame_times = MovingAverage()
    # A negative max_images means "use the whole dataset".
    dataset_size = len(dataset) if args.max_images < 0 else min(
        args.max_images, len(dataset))
    progress_bar = ProgressBar(30, dataset_size)

    print()

    if not args.display and not args.benchmark:
        # For each class and iou, stores tuples (score, isPositive)
        # Index ap_data[type][iouIdx][classIdx]
        ap_data = {
            'box': [[APDataObject() for _ in cfg.dataset.class_names]
                    for _ in iou_thresholds],
            'mask': [[APDataObject() for _ in cfg.dataset.class_names]
                     for _ in iou_thresholds]
        }
        detections = Detections()
    else:
        timer.disable('Load Data')

    dataset_indices = list(range(len(dataset)))

    if args.shuffle:
        random.shuffle(dataset_indices)
    elif not args.no_sort:
        # Do a deterministic shuffle based on the image ids
        #
        # I do this because on python 3.5 dictionary key order is *random*, while in 3.6 it's
        # the order of insertion. That means on python 3.6, the images come in the order they are in
        # in the annotations file. For some reason, the first images in the annotations file are
        # the hardest. To combat this, I use a hard-coded hash function based on the image ids
        # to shuffle the indices we use. That way, no matter what python version or how pycocotools
        # handles the data, we get the same result every time.
        hashed = [badhash(x) for x in dataset.ids]
        dataset_indices.sort(key=lambda x: hashed[x])

    # dataset_size=1000
    # NOTE(review): dataset_indices is not read again in this function; the
    # loop below iterates `dataset` directly, so the ordering and the
    # max_images truncation computed here do not appear to take effect.
    dataset_indices = dataset_indices[:dataset_size]

    try:
        # Main eval loop
        dataset.batch_size = 1
        dataset.num_workers = 1
        for it, batch in enumerate(dataset):
            timer.reset()

            # Each yielded batch holds a single sample tuple.
            image_idx, img, gt, gt_masks, h, w, num_crowd = batch[0]

            if not args.benchmark:
                gt = gt.numpy()
                gt_masks = gt_masks.numpy()

            # Add a leading batch dimension of size 1.
            batch = img.reshape(1, img.shape[0], img.shape[1], img.shape[2])
            # batch = jt.array([img])

            with timer.env('Network Extra'):
                preds = net(batch)

            if args.display:
                img_numpy = prep_display(preds, img, h, w)
            elif args.benchmark:
                prep_benchmark(preds, h, w)
            else:
                prep_metrics(ap_data, preds, img, gt, gt_masks, h, w,
                             num_crowd, dataset.ids[image_idx], detections)

            # First couple of images take longer because we're constructing the graph.
            # Since that's technically initialization, don't include those in the FPS calculations.
            if it > 1:
                frame_times.add(timer.total_time())

            if args.display:
                if it > 1:
                    print('Avg FPS: %.4f' % (1 / frame_times.get_avg()))
                plt.imshow(img_numpy)
                plt.title(str(dataset.ids[image_idx]))
                plt.show()
            elif not args.no_bar:
                if it > 1:
                    fps = 1 / frame_times.get_avg()
                else:
                    fps = 0
                progress = (it + 1) / dataset_size * 100
                progress_bar.set_val(it + 1)
                print(
                    '\rProcessing Images %s %6d / %6d (%5.2f%%) %5.2f fps '
                    % (repr(progress_bar), it + 1, dataset_size, progress,
                       fps),
                    end='')
            # NOTE(review): presumably forces pending Jittor work to finish
            # before the next image is timed; confirm against the Jittor docs.
            jt.sync_all(True)

        if not args.display and not args.benchmark:
            print()
            if args.output_coco_json:
                print('Dumping detections..')
                if args.output_web_json:
                    detections.dump_web()
                else:
                    detections.dump()
            else:
                if not train_mode:
                    print('Saving data..')
                    with open(args.ap_data_file, 'wb') as f:
                        pickle.dump(ap_data, f)

                return calc_map(ap_data)
        elif args.benchmark:
            print()
            print()
            print('Stats for the last frame:')
            timer.print_stats()
            avg_seconds = frame_times.get_avg()
            print('Average: %5.2f fps, %5.2f ms' %
                  (1 / frame_times.get_avg(), 1000 * avg_seconds))

    except KeyboardInterrupt:
        # Ctrl+C ends the evaluation early; nothing is returned.
        print('Stopping..')
# detect videos elif args.video is not None: vid = cv2.VideoCapture('videos/' + args.video) target_fps = round(vid.get(cv2.CAP_PROP_FPS)) frame_width = round(vid.get(cv2.CAP_PROP_FRAME_WIDTH)) frame_height = round(vid.get(cv2.CAP_PROP_FRAME_HEIGHT)) num_frames = round(vid.get(cv2.CAP_PROP_FRAME_COUNT)) name = args.video.split('/')[-1] video_writer = cv2.VideoWriter(f'results/videos/{name}', cv2.VideoWriter_fourcc(*"mp4v"), target_fps, (frame_width, frame_height)) frame_times = MovingAverage() progress_bar = ProgressBar(40, num_frames) time_here = 0 fps = 0 for i in range(num_frames): frame_origin = torch.from_numpy(vid.read()[1]).cuda().float() img_h, img_w = frame_origin.shape[0], frame_origin.shape[1] frame_trans = FastBaseTransform()(frame_origin.unsqueeze(0)) net_outs = net(frame_trans) nms_outs = NMS(net_outs, args.traditional_nms) results = after_nms(nms_outs, img_h, img_w, crop_masks=not args.no_crop, visual_thre=args.visual_thre)
def play_video():
    """Consume frames from ``frame_buffer`` and display or write them.

    Runs until ``running`` becomes False. Every frame taken from the buffer
    is shown with ``cv2.imshow`` when ``out_path`` is None, otherwise it is
    written to ``out``. The loop paces itself towards a per-frame time
    target that is nudged up or down depending on how full the buffer is.
    All state is shared with the enclosing function through ``nonlocal``.
    """
    try:
        nonlocal frame_buffer, running, video_fps, is_webcam, num_frames, frames_displayed, vid_done

        video_frame_times = MovingAverage(100)
        frame_time_stabilizer = frame_time_target
        last_time = None
        # Amount the per-frame time target is adjusted by on each pass.
        stabilizer_step = 0.0005
        progress_bar = ProgressBar(30, num_frames)

        while running:
            frame_time_start = time.time()

            if not frame_buffer.empty():
                next_time = time.time()
                # The first frame has no predecessor, so there is no
                # interval to record yet.
                if last_time is not None:
                    video_frame_times.add(next_time - last_time)
                    video_fps = 1 / video_frame_times.get_avg()
                if out_path is None:
                    cv2.imshow(path, frame_buffer.get())
                else:
                    out.write(frame_buffer.get())
                frames_displayed += 1
                last_time = next_time

                # Progress is only printed when writing to a file.
                if out_path is not None:
                    if video_frame_times.get_avg() == 0:
                        fps = 0
                    else:
                        fps = 1 / video_frame_times.get_avg()
                    progress = frames_displayed / num_frames * 100
                    progress_bar.set_val(frames_displayed)

                    print(
                        '\rProcessing Frames %s %6d / %6d (%5.2f%%) %5.2f fps '
                        % (repr(progress_bar), frames_displayed, num_frames,
                           progress, fps),
                        end='')

            # This is split because you don't want savevideo to require cv2 display functionality (see #197)
            if out_path is None and cv2.waitKey(1) == 27:
                # Press Escape to close
                running = False
            # Stop once every frame has been shown or written.
            if not (frames_displayed < num_frames):
                running = False

            if not vid_done:
                # Raise the time target while the buffer holds fewer than
                # video_multiframe frames, lower it (never below zero)
                # while it holds more.
                buffer_size = frame_buffer.qsize()
                if buffer_size < args.video_multiframe:
                    frame_time_stabilizer += stabilizer_step
                elif buffer_size > args.video_multiframe:
                    frame_time_stabilizer -= stabilizer_step
                    if frame_time_stabilizer < 0:
                        frame_time_stabilizer = 0

                # Only a webcam source may use a target below frame_time_target.
                new_target = frame_time_stabilizer if is_webcam else max(
                    frame_time_stabilizer, frame_time_target)
            else:
                new_target = frame_time_target

            next_frame_target = max(
                2 * new_target - video_frame_times.get_avg(), 0)
            target_time = frame_time_start + next_frame_target - 0.001  # Let's just subtract a millisecond to be safe

            if out_path is None or args.emulate_playback:
                # This gives more accurate timing than if sleeping the whole amount at once
                while time.time() < target_time:
                    time.sleep(0.001)
            else:
                # Let's not starve the main thread, now
                time.sleep(0.001)
    except:
        # See issue #197 for why this is necessary
        import traceback
        traceback.print_exc()
def train():
    """Run the full YOLACT training loop configured by ``args`` and ``cfg``.

    Builds the datasets and the network, optionally resumes from a
    checkpoint, then trains until ``cfg.max_iter`` iterations. Weights are
    saved every ``args.save_interval`` iterations and once at the end.
    Ctrl+C stops training; with ``args.interrupt`` set, an "_interrupt"
    checkpoint is saved before exiting.
    """
    os.makedirs(args.save_folder, exist_ok=True)

    dataset = COCODetection(image_path=cfg.dataset.train_images,
                            info_file=cfg.dataset.train_info,
                            transform=SSDAugmentation(MEANS))
    # Debug output: prints the first training sample.
    print("dataset:", dataset[0])

    if args.validation_epoch > 0:
        setup_eval()
        val_dataset = COCODetection(image_path=cfg.dataset.valid_images,
                                    info_file=cfg.dataset.valid_info,
                                    transform=BaseTransform(MEANS))

    # Parallel wraps the underlying module, but when saving and loading we
    # don't want that, so keep a handle on the bare network.
    yolact_net = Yolact()
    net = yolact_net
    net.train()

    if args.log:
        log = Log(cfg.name, args.log_folder, dict(args._get_kwargs()),
                  overwrite=(args.resume is None),
                  log_gpu_stats=args.log_gpu)

    # I don't use the timer during training (I use a different timing method).
    # Apparently there's a race condition with multiple GPUs, so disable it
    # just to be safe.
    timer.disable_all()

    # Both of these can set args.resume to None, so do them before the check
    if args.resume == 'interrupt':
        args.resume = SavePath.get_interrupt(args.save_folder)
    elif args.resume == 'latest':
        args.resume = SavePath.get_latest(args.save_folder, cfg.name)

    if args.resume is not None:
        print('Resuming training, loading {}...'.format(args.resume))
        yolact_net.load_weights(args.resume)

        if args.start_iter == -1:
            args.start_iter = SavePath.from_str(args.resume).iteration
    else:
        print('Initializing weights...')
        yolact_net.init_weights(
            backbone_path=args.save_folder + cfg.backbone.path)

    optimizer = optim.SGD(net.parameters(), lr=args.lr,
                          momentum=args.momentum, weight_decay=args.decay)
    criterion = MultiBoxLoss(num_classes=cfg.num_classes,
                             pos_threshold=cfg.positive_iou_threshold,
                             neg_threshold=cfg.negative_iou_threshold,
                             negpos_ratio=cfg.ohem_negpos_ratio)

    if args.batch_alloc is not None:
        args.batch_alloc = [int(x) for x in args.batch_alloc.split(',')]
        if sum(args.batch_alloc) != args.batch_size:
            print(
                'Error: Batch allocation (%s) does not sum to batch size (%s).'
                % (args.batch_alloc, args.batch_size))
            exit(-1)

    # From here on `net` computes the losses (see CustomDataParallel and
    # NetLoss); `yolact_net` is still the bare network.
    net = CustomDataParallel(NetLoss(net, criterion))
    if args.cuda:
        net = net.cuda()

    # Initialize everything with one forward pass on an all-zero image.
    # Freeze bn around it so we don't kill our means.
    if not cfg.freeze_bn:
        yolact_net.freeze_bn()
    init_input = torch.zeros(1, 3, cfg.max_size, cfg.max_size)
    if args.cuda:
        init_input = init_input.cuda()
    yolact_net(init_input)
    if not cfg.freeze_bn:
        yolact_net.freeze_bn(True)

    iteration = max(args.start_iter, 0)
    last_time = time.time()

    epoch_size = len(dataset) // args.batch_size
    num_epochs = math.ceil(cfg.max_iter / epoch_size)

    # Which learning rate adjustment step are we on?
    # lr' = lr * gamma ^ step_index
    step_index = 0

    data_loader = data.DataLoader(dataset, args.batch_size,
                                  num_workers=args.num_workers,
                                  shuffle=True,
                                  collate_fn=detection_collate,
                                  pin_memory=True)

    save_path = lambda epoch, iteration: SavePath(
        cfg.name, epoch, iteration).get_path(root=args.save_folder)
    time_avg = MovingAverage()

    global loss_types  # Forms the print order
    loss_avgs = {k: MovingAverage(100) for k in loss_types}

    print('Begin training!')
    print()

    # try-except so you can use ctrl+c to save early and stop training
    try:
        for epoch in range(num_epochs):
            # Resume from start_iter
            if (epoch + 1) * epoch_size < iteration:
                continue

            for datum in data_loader:
                # Stop if we've reached an epoch if we're resuming from start_iter
                if iteration == (epoch + 1) * epoch_size:
                    break

                # Stop at the configured number of iterations even if mid-epoch
                if iteration == cfg.max_iter:
                    break

                # Change a config setting if we've reached the specified iteration
                changed = False
                for change in cfg.delayed_settings:
                    if iteration >= change[0]:
                        changed = True
                        cfg.replace(change[1])

                        # Reset the loss averages because things might have
                        # changed. Iterate the MovingAverage objects, not
                        # the dict keys.
                        for avg in loss_avgs.values():
                            avg.reset()

                # If a config setting was changed, remove it from the list so
                # we don't keep checking
                if changed:
                    cfg.delayed_settings = [
                        x for x in cfg.delayed_settings if x[0] > iteration
                    ]

                # Warm up by linearly interpolating the learning rate from
                # some smaller value
                if cfg.lr_warmup_until > 0 and iteration <= cfg.lr_warmup_until:
                    set_lr(optimizer,
                           (args.lr - cfg.lr_warmup_init) *
                           (iteration / cfg.lr_warmup_until) +
                           cfg.lr_warmup_init)

                # Adjust the learning rate at the given iterations, but also
                # if we resume from past that iteration
                while (step_index < len(cfg.lr_steps)
                       and iteration >= cfg.lr_steps[step_index]):
                    step_index += 1
                    set_lr(optimizer, args.lr * (args.gamma**step_index))

                # Zero the grad to get ready to compute gradients
                optimizer.zero_grad()

                # Forward Pass + Compute loss at the same time
                # (see CustomDataParallel and NetLoss)
                losses = net(datum)

                # Mean here because Dataparallel
                losses = {k: (v).mean() for k, v in losses.items()}
                loss = sum([losses[k] for k in losses])

                # Backprop. Do this to free up vram even if loss is not finite
                loss.backward()
                if torch.isfinite(loss).item():
                    optimizer.step()

                # Add the loss to the moving average for bookkeeping
                for k in losses:
                    loss_avgs[k].add(losses[k].item())

                cur_time = time.time()
                elapsed = cur_time - last_time
                last_time = cur_time

                # Exclude graph setup from the timing information
                if iteration != args.start_iter:
                    time_avg.add(elapsed)

                if iteration % 10 == 0:
                    eta_str = str(
                        datetime.timedelta(
                            seconds=(cfg.max_iter - iteration) *
                            time_avg.get_avg())).split('.')[0]

                    total = sum([loss_avgs[k].get_avg() for k in losses])
                    loss_labels = sum([[k, loss_avgs[k].get_avg()]
                                       for k in loss_types if k in losses],
                                      [])

                    print(('[%3d] %7d ||' + (' %s: %.3f |' * len(losses)) +
                           ' T: %.3f || ETA: %s || timer: %.3f') %
                          tuple([epoch, iteration] + loss_labels +
                                [total, eta_str, elapsed]),
                          flush=True)

                if args.log:
                    precision = 5
                    loss_info = {
                        k: round(losses[k].item(), precision)
                        for k in losses
                    }
                    # 'T' is the total loss of this iteration, not a single
                    # component.
                    loss_info['T'] = round(loss.item(), precision)

                    if args.log_gpu:
                        # nvidia-smi is sloooow
                        log.log_gpu_stats = (iteration % 10 == 0)

                    log.log('train', loss=loss_info, epoch=epoch,
                            iter=iteration, lr=round(cur_lr, 10),
                            elapsed=elapsed)

                    log.log_gpu_stats = args.log_gpu

                iteration += 1

                if iteration % args.save_interval == 0 and iteration != args.start_iter:
                    if args.keep_latest:
                        latest = SavePath.get_latest(args.save_folder,
                                                     cfg.name)

                    print('Saving state, iter:', iteration)
                    yolact_net.save_weights(save_path(epoch, iteration))

                    if args.keep_latest and latest is not None:
                        if args.keep_latest_interval <= 0 or iteration % args.keep_latest_interval != args.save_interval:
                            print('Deleting old save...')
                            os.remove(latest)

            # This is done per epoch
            if args.validation_epoch > 0:
                if epoch % args.validation_epoch == 0 and epoch > 0:
                    compute_validation_map(epoch, iteration, yolact_net,
                                           val_dataset,
                                           log if args.log else None)

        # Compute validation mAP after training is finished. val_dataset
        # only exists when validation is enabled, so guard the call.
        if args.validation_epoch > 0:
            compute_validation_map(epoch, iteration, yolact_net, val_dataset,
                                   log if args.log else None)
    except KeyboardInterrupt:
        if args.interrupt:
            print('Stopping early. Saving network...')

            # Delete previous copy of the interrupted network so we don't
            # spam the weights folder
            SavePath.remove_interrupt(args.save_folder)

            yolact_net.save_weights(
                save_path(epoch, repr(iteration) + '_interrupt'))
        exit()

    yolact_net.save_weights(save_path(epoch, iteration))