def _sample_next_frames(self, vid_files):
    views = []
    all_frames_sampled = False
    for f in vid_files:
        # get the next frames for the vid
        vid = VideoFrameSampler(f, resize_shape=self.frame_size, dtype=np.float32)
        max_len = self.sample_size
        if self.sample_size >= len(vid) or self.sample_size == -1:
            log.warning("vid {} with {} frames is shorter than the sample size".format(
                f, len(vid)))
            max_len = len(vid)
            all_frames_sampled = True
        elif self.sample_frames_index + self.sample_size >= len(vid):
            # the last sample is smaller
            max_len = len(vid) - self.sample_frames_index
            log.info("end {} with {} frames, end size {}, index {}".format(
                f, len(vid), max_len, self.sample_frames_index))
            all_frames_sampled = True
        imgs = np.empty((max_len, *self.frame_size, 3))
        for j in range(max_len):
            imgs[j] = vid.get_frame(j + self.sample_frames_index)
        t = torch.Tensor(flip_imgs(imgs))
        views.append(t.view(1, -1, 3, *self.frame_size))
    self.sample_frames_index += self.sample_size
    if all_frames_sampled:
        self.sample_frames_index = 0
    return all_frames_sampled, views
def main():
    args = get_args()
    input_imgs_file = args.input_imgs
    dt = np.float32
    n_views = args.num_views
    image_size = (args.img_height, args.img_width)
    imgs, titles = [], []
    total_frames, n_frame = 0, args.n_frame
    if os.path.isdir(input_imgs_file):
        input_imgs_file = get_view_pair_vid_files(n_views, input_imgs_file)
        input_imgs_file_shuffle = sklearn.utils.shuffle(input_imgs_file)
        input_imgs_file = []
        for f in input_imgs_file_shuffle:
            input_imgs_file.extend(f)
    else:
        input_imgs_file = input_imgs_file.split(",")
    # show at most max_vid_num videos
    if args.max_vid_num != -1 and args.max_vid_num < len(input_imgs_file):
        input_imgs_file = input_imgs_file[:args.max_vid_num]
    assert len(input_imgs_file), "no vids found"
    # read vids
    for f in input_imgs_file:
        imgs_vid = VideoFrameSampler(f, dtype=dt, resize_shape=image_size,
                                     to_rgb=False).get_all()
        total_frames = len(imgs_vid)
        log.info("file {} with frames: {}".format(f, total_frames))
        imgs.append(imgs_vid)
    show_sequence(imgs, delay=0, n_frame=n_frame,
                  save_name=args.output_img_name, to_rgb=False)
def __init__(self, D_in, H, z_dim, d_out):
    super().__init__()
    self.z_dim = z_dim
    log.info("Discriminator domain net in_channels: {} out: {} hidden: {}, z dim: {}".format(
        D_in, d_out, H, z_dim))
    self.encoder = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        nn.Dropout2d(0.25),
        nn.ReLU(),
        nn.Linear(H, H),
        nn.Dropout2d(0.25),
        nn.ReLU(),
    )
    self.l_mu = nn.Linear(H, z_dim)
    self.l_var = nn.Linear(H, z_dim)
    # one output classification head per label set in d_out
    self.out_layer = nn.ModuleList()
    for out_n in d_out:
        out = nn.Sequential(
            nn.Linear(z_dim, z_dim),
            nn.Dropout2d(0.1),
            nn.ReLU(),
            nn.Linear(z_dim, out_n),
        )
        self.out_layer.append(out)
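# The mu/variance heads above suggest a VAE-style latent bottleneck. A minimal,
# self-contained sketch of the reparameterization trick, assuming l_var predicts
# the log-variance (illustrative only; the forward pass of this class is not shown here):
import torch

def reparameterize(mu, logvar):
    # z = mu + sigma * eps with eps ~ N(0, I); keeps the sampling step differentiable
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)
    return mu + eps * std

# usage sketch: h = self.encoder(x); z = reparameterize(self.l_mu(h), self.l_var(h))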
def init_log_tb(save_folder):
    log.setLevel(logging.INFO)
    set_log_file(os.path.join(save_folder, "train.log"))
    log.info("asn commit hash: {}".format(get_git_commit_hash(asn.__file__)))
    tb_log_dir = os.path.join(os.path.expanduser(save_folder), "tensorboard_log")
    writer = SummaryWriter(tb_log_dir)
    return writer
def vid_writer(output_vid_name, fps, frame_shape, frame_count=None):
    """ context manager for a cv2 video writer """
    fourcc = get_fourcc(output_vid_name)
    # the frame shape is known after rotation
    out_shape_cv = np_shape_to_cv(frame_shape[:2])
    vid_writer = cv2.VideoWriter(output_vid_name, fourcc, fps, out_shape_cv)
    # vid_writer.write(montage_image)
    yield vid_writer
    if frame_count is not None:
        vid_writer.set(cv2.CAP_PROP_FRAME_COUNT, frame_count)
        vid_writer.set(cv2.CAP_PROP_FPS, fps)
    log.info("output vid: {}".format(output_vid_name))
    vid_writer.release()
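# A minimal, self-contained sketch of the same pattern: a generator wrapped with
# contextlib.contextmanager so the cv2 writer is always released. The file name,
# codec and frame source below are illustrative, not taken from the repo.
import contextlib
import cv2
import numpy as np

@contextlib.contextmanager
def managed_vid_writer(path, fps, width, height):
    # cv2 expects the frame size as (width, height)
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(path, fourcc, fps, (width, height))
    try:
        yield writer
    finally:
        writer.release()

# usage: write 10 black frames to an example file
with managed_vid_writer("example.mp4", fps=30, width=64, height=48) as w:
    for _ in range(10):
        w.write(np.zeros((48, 64, 3), dtype=np.uint8))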
def get_skill_dataloader(dir_vids,
                         num_views,
                         batch_size,
                         use_cuda,
                         img_size,
                         filter_func,
                         label_funcs,
                         num_domain_frames=1,
                         stride=1):
    # sampler with random frames from all tasks
    transformer_train = get_train_transformer(img_size=img_size)
    # sample different views
    transformed_dataset_train_domain = DoubleViewPairDataset(
        vid_dir=dir_vids,
        number_views=num_views,
        # std_similar_frame_margin_distribution=sim_frames,
        transform_frames=transformer_train,
        lable_funcs=label_funcs,
        filter_func=filter_func)
    sampler = None
    drop_last = True
    log.info('transformed_dataset_train_domain len: {}'.format(
        len(transformed_dataset_train_domain)))
    if num_domain_frames > 1:
        assert batch_size % num_domain_frames == 0, 'wrong batch size for multiple domain frames'
        sampler = SkillViewPairSequenceSampler(
            dataset=transformed_dataset_train_domain,
            stride=stride,
            allow_same_frames_in_seq=True,
            sequence_length=num_domain_frames,
            sequences_per_vid_in_batch=1,
            batch_size=batch_size)
        log.info('use multi frame dir {} len sampler: {}'.format(
            dir_vids, len(sampler)))
        drop_last = len(sampler) >= batch_size
    # randomly sample vids
    dataloader_train_domain = DataLoader(
        transformed_dataset_train_domain,
        drop_last=drop_last,
        batch_size=batch_size,
        shuffle=sampler is None,
        num_workers=4,
        sampler=sampler,
        pin_memory=use_cuda)
    if sampler is not None and len(sampler) <= batch_size:
        log.warn("sampler length {} is not larger than the batch size {}".format(
            len(sampler), batch_size))
    return dataloader_train_domain
def val_fit_task_label(vid_name_to_task, all_view_pair_names):
    """ returns a function that encodes a single video file name as a task label """
    all_view_pair_names = [vid_name_to_task(f) for f in all_view_pair_names]
    comm_name_to_lable = preprocessing.LabelEncoder()
    comm_name_to_lable.fit(all_view_pair_names)
    # lable_domain = comm_name_to_lable.transform(all_view_pair_names)  # test fit
    num_classes = len(comm_name_to_lable.classes_)
    name_classes = comm_name_to_lable.classes_
    log.info("number of vid domain tasks: {}".format(num_classes))
    log.info("vid domains in train set: {}".format(name_classes))

    def transform_comm_name(vid_file_comm, *args, **kwargs):
        return comm_name_to_lable.transform([vid_name_to_task(vid_file_comm)])[0]

    return transform_comm_name, num_classes, name_classes
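# For reference, a minimal self-contained sketch of the LabelEncoder pattern used above
# (the task names here are made up for illustration):
from sklearn import preprocessing

encoder = preprocessing.LabelEncoder()
encoder.fit(["stack", "push", "stack", "sort"])   # fit on all task names
print(encoder.classes_)                           # ['push' 'sort' 'stack']
print(encoder.transform(["stack"])[0])            # 2, the integer label for "stack"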
def main():
    args = get_args()
    input_imgs_file = os.path.expanduser(args.input_imgs)
    n_views = args.num_views
    image_size = (args.img_height, args.img_width)
    assert os.path.isdir(input_imgs_file), "input is not a dir"
    input_imgs_file = get_view_pair_vid_files(n_views, input_imgs_file)
    assert len(input_imgs_file), "no vids found"
    view_pair_idx = args.view_idx
    input_imgs_file = [
        view_pair_vid[view_pair_idx] for view_pair_vid in input_imgs_file
    ]
    input_imgs_file = sklearn.utils.shuffle(input_imgs_file)
    fourcc = get_fourcc(args.output_vid_name)
    log.info("output vid: {}".format(args.output_vid_name))
    fps = args.fps
    vid_writer = None
    for frames in tqdm(
            get_frames(input_imgs_file, args.mun_col * args.mun_row,
                       args.num_frames, image_size),
            desc="frame",
            total=args.num_frames,
    ):
        imgs = [[frames[y] for y in range(x, x + args.mun_row)]
                for x in range(0, len(frames), args.mun_row)]
        margin = 2
        montage_image = montage(
            imgs,
            margin_color_bgr=[0, 0, 0],
            margin_top=margin,
            margin_bottom=margin,
            margin_left=margin,
            margin_right=margin,
            margin_separate_vertical=margin,
            margin_separate_horizontal=margin,
        )
        montage_image = convert_to_uint8(montage_image)
        if vid_writer is None:
            # create the vid writer once the montage shape is known
            out_shape_cv = np_shape_to_cv(montage_image.shape[:2])
            vid_writer = cv2.VideoWriter(args.output_vid_name, fourcc, fps,
                                         out_shape_cv)
        vid_writer.write(montage_image)
    vid_writer.set(cv2.CAP_PROP_FRAME_COUNT, args.num_frames)
    vid_writer.set(cv2.CAP_PROP_FPS, fps)
    vid_writer.release()
def log_train(writer, mi, loss_metric, criterion_metric, entropy, global_step):
    """ log to tensorboard and print a log message """
    msg = "steps {}, dist: pos {:.2}, neg {:.2}, neg cos dist: pos {:.2}, cos_neg {:.2}, loss metric: {:.3}".format(
        global_step, mi["dist pos"], mi["dist neg"], mi["dist pos cos"],
        mi["dist neg cos"], loss_metric)
    log.info(msg)
    writer.add_scalar("train/loss" + criterion_metric.__class__.__name__,
                      loss_metric, global_step)
    writer.add_scalars("train/distance",
                       {"positive": mi["dist pos"], "negative": mi["dist neg"]},
                       global_step)
    writer.add_scalars("train/product",
                       {"positive": mi["dist pos dot"], "negative": mi["dist neg dot"]},
                       global_step)
    writer.add_scalars("train/negative_cosine_dist",
                       {"positive": mi["dist pos cos"], "negative": mi["dist neg cos"]},
                       global_step)
    writer.add_scalar("train/loss_entro", entropy, global_step)
def get_vid_aligment_loss_pair(embeddings, fill_frame_diff=True):
    """ embeddings (dict): key is the common view-pair name, value is a list of
        view embeddings for each video view """
    k = 1
    loss, nn_dist, dist_view_pairs = [], [], []
    # compute the nn for all permutations
    # TODO permutations would be better, but combinations are used as in the TF implementation
    for comm_name, view_pair_task_emb in embeddings.items():
        for emb1, emb2 in itertools.combinations(view_pair_task_emb, 2):
            if fill_frame_diff:
                # fill the frame difference with the last embedding,
                # similar to the tf implementation
                max_diff = len(emb1) - len(emb2)
                size_embedding = emb1.shape[1]
                if max_diff > 0:
                    emb2 = np.concatenate((emb2, np.full((max_diff, size_embedding), emb1[-1])))
                elif max_diff < 0:
                    emb1 = np.concatenate((emb1, np.full((-max_diff, size_embedding), emb2[-1])))
            knn_img_indexes = get_all_knn_indexes(emb1, [emb2], k=k)
            # loss under the view-pair assumption: frame i in one view should
            # match frame i in the other view
            n_frames = knn_img_indexes.shape[0]
            correct_index = np.arange(n_frames)
            # index of the nn with the smallest distance
            index_for_nn = knn_img_indexes[:, 0, 2]
            abs_frame_error = np.abs(correct_index - index_for_nn)
            loss_comp = np.mean(abs_frame_error / float(n_frames))
            loss.append(loss_comp)
            # histogram of the frame index error as bin counts
            error_hist_cnts = []
            for i, abs_err in enumerate(abs_frame_error):
                error_hist_cnts.extend([i] * int(abs_err))
            nn_dist.append(np.mean(knn_img_indexes[:, 0, 1]))
            # print infos
            view_pair_lens = "->".join([str(len(e)) for e in [emb1, emb2]])
            log.info(
                "alignment loss pair {:>30} with {} frames, loss {:>6.5}, mean nn dist {:>6.5}".format(
                    comm_name, view_pair_lens, loss_comp, np.mean(nn_dist)))
        # get the distances for all view pairs at the same frame index
        for emb1, emb2 in itertools.combinations(view_pair_task_emb, 2):
            min_frame_len = min(np.shape(emb1)[0], np.shape(emb2)[0])
            dist_view_i = [get_distances(e1, e2)
                           for e1, e2 in zip(emb1[:min_frame_len], emb2[:min_frame_len])]
            dist_view_pairs.append(np.mean(dist_view_i))
    loss, nn_dist, dist_view_pairs = [np.mean(i) for i in [loss, nn_dist, dist_view_pairs]]
    return loss, nn_dist, dist_view_pairs, error_hist_cnts
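# A minimal, self-contained sketch of the underlying idea (nearest-neighbour frame
# alignment between two embedding sequences) using plain Euclidean distance. This
# helper is illustrative and not the repo's get_all_knn_indexes.
import numpy as np

def alignment_loss(emb1, emb2):
    # emb1, emb2: (n_frames, emb_dim) embeddings of the same sequence from two views
    dists = np.linalg.norm(emb1[:, None, :] - emb2[None, :, :], axis=-1)
    nn_index = dists.argmin(axis=1)                     # nearest frame in the other view
    frame_error = np.abs(np.arange(len(emb1)) - nn_index)
    return np.mean(frame_error / float(len(emb1)))      # normalized absolute frame error

# usage with random embeddings: near 0 for almost-aligned views
rng = np.random.RandomState(0)
e1 = rng.randn(50, 32)
print(alignment_loss(e1, e1 + 0.01 * rng.randn(50, 32)))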
def get_dataloader_train(dir_vids,
                         num_views,
                         batch_size,
                         use_cuda,
                         img_size=299,
                         filter_func=None,
                         label_funcs=None,
                         examples_per_seq=None):
    transformer_train = get_train_transformer(img_size)
    sampler = None
    shuffle = True
    if examples_per_seq is None:
        # sample one vid per batch, used for the lifted loss
        # (default tcn setting for lifted and npair loss)
        examples_per_batch = batch_size
    else:
        examples_per_batch = batch_size // examples_per_seq
        log.info('train data loader examples per sequence: {}'.format(examples_per_seq))
        shuffle = False
    transformed_dataset_train = DoubleViewPairDataset(
        vid_dir=dir_vids,
        number_views=num_views,
        filter_func=filter_func,
        lable_funcs=label_funcs,
        # random_view_index=True,
        # std_similar_frame_margin_distribution=sim_frames,
        transform_frames=transformer_train)
    # sample so that only one view pair is in a batch
    sampler = ViewPairSequenceSampler(
        dataset=transformed_dataset_train,
        examples_per_sequence=examples_per_batch,
        # similar_frame_margin=3,  # TODO
        batch_size=batch_size)
    dataloader_train = DataLoader(transformed_dataset_train,
                                  drop_last=True,
                                  batch_size=batch_size,
                                  shuffle=shuffle,
                                  sampler=sampler,
                                  num_workers=4,
                                  pin_memory=use_cuda)
    return dataloader_train
def web_cam_samper(port):
    """ cv2 webcam context manager """
    video_capture = cv2.VideoCapture(port)
    if not video_capture.isOpened():
        log.error("could not open port {}".format(port))
        close_open_web_cams()
        video_capture = cv2.VideoCapture(port)
    # test vid sample
    assert sample_image(video_capture) is not None, "cam failed, port {}".format(port)
    try:
        yield video_capture
    finally:
        # when everything is done, release the capture
        video_capture.release()
        log.info("release video_capture: {} port {}".format(video_capture, port))
def save_webcam_frames(p_ranke, port, event_sync, result_frames_q):
    """ sample a frame each time the sync event is set """
    frame_cnt = 0
    with web_cam_samper(port) as camera:
        log.info("port: {}".format(port))
        adjust_brightness(camera)
        while True:
            frame = sample_image(camera)
            sample_time = time.time()
            # log.info('port {} sample_time: {}, frame_cnt {}'.format(port, sample_time, frame_cnt))
            result_frames_q.put({
                "frame": frame,
                "time": sample_time,
                "num": frame_cnt
            })
            frame_cnt += 1
            event_sync.wait()
            event_sync.clear()
def save_model(model, optimizer, training_args, is_best, model_folder, step):
    state = {
        "datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "step": step,
        "training_args": training_args,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }
    model_folder = os.path.expanduser(model_folder)
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    filename = os.path.join(model_folder, "model.pth.tar")
    torch.save(state, filename)
    log.info("Saved model to: {}, step {}".format(filename, step))
    if is_best:
        filename_copy = os.path.join(model_folder, "model_best.pth.tar")
        shutil.copyfile(filename, filename_copy)
        log.info("copied to model_best!")
def create_model(use_cuda, load_model_file=None, **kwargs):
    asn = define_model(use_cuda, **kwargs)
    start_step = 0
    optimizer_state_dict = None
    training_args = None
    if load_model_file:
        load_model_file = os.path.expanduser(load_model_file)
        assert os.path.isfile(load_model_file), "file not found {}".format(load_model_file)
        checkpoint = torch.load(load_model_file)
        start_step = checkpoint.get("step", 0)
        training_args = checkpoint.get("training_args", None)
        optimizer_state_dict = checkpoint["optimizer_state_dict"]
        asn.load_state_dict(checkpoint["model_state_dict"], strict=False)
        log.info("Restoring model from: {}, step {}, datetime {}".format(
            load_model_file, start_step, checkpoint.get("datetime")))
    if use_cuda:
        asn = asn.cuda()
    return asn, start_step, optimizer_state_dict, training_args
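# For reference, a minimal self-contained sketch of the checkpoint round trip used by
# save_model/create_model (torch.save a state dict, torch.load it back, restore model
# and optimizer). The tiny model and file name are illustrative.
import torch
import torch.nn as nn

model = nn.Linear(4, 2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# save: everything goes into one dict
torch.save({"step": 100,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict()}, "model.pth.tar")

# load: restore weights and optimizer state, continue at the stored step
checkpoint = torch.load("model.pth.tar")
model.load_state_dict(checkpoint["model_state_dict"])
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
start_step = checkpoint.get("step", 0)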
def save_model(model, optimizer, training_args, is_best, model_folder, step):
    state = {
        'datetime': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'step': step,
        'training_args': training_args,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }
    model_folder = os.path.expanduser(model_folder)
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    filename = os.path.join(model_folder, 'model.pth.tar')
    torch.save(state, filename)
    # sanity check that the saved checkpoint can be loaded again
    checkpoint = torch.load(filename)
    log.info("Saved model to: {}, step {}".format(filename, step))
    if is_best:
        filename_copy = os.path.join(model_folder, 'model_best.pth.tar')
        shutil.copyfile(filename, filename_copy)
        log.info("copied to model_best!")
def show_sequence(imgs, delay=0, n_frame=1, save_name=None, to_rgb=True):
    """ shows a 2d imgs array like [[img1, img2], ...] with a frame counter as titles """
    if n_frame != 1:
        imgs = [i_v[::n_frame] for i_v in imgs]
    titles = [["frame {}".format(n * n_frame) for n in range(len(i))] for i in imgs]
    # take the longest title row to show on top
    titles = [sorted(titles, key=len, reverse=True)[0]]
    montage_image = montage(imgs,
                            titles=titles,
                            margin_separate_vertical=0,
                            margin_separate_horizontal=5)
    montage_image = convert_to_uint8(montage_image)
    if to_rgb:
        montage_image = cv2.cvtColor(montage_image, cv2.COLOR_RGB2BGR)
    if save_name is not None:
        cv2.imwrite(save_name, montage_image)
    cv2.imshow('sequence', montage_image)
    log.info('click the image and then press a key to continue')
    cv2.waitKey(delay)  # 27 == ESC
    cv2.destroyAllWindows()
def log_train(writer, mi, loss_metric, criterion_metric, entropy, global_step):
    """ log to tensorboard and print a log message """
    msg = "steps {}, dist: pos {:.2}, neg {:.2}, neg cos dist: pos {:.2}, cos_neg {:.2}, loss metric: {:.3}".format(
        global_step, mi['dist pos'], mi['dist neg'], mi['dist pos cos'],
        mi['dist neg cos'], loss_metric)
    log.info(msg)
    writer.add_scalar('train/loss' + criterion_metric.__class__.__name__,
                      loss_metric, global_step)
    writer.add_scalars('train/distance', {
        'positive': mi['dist pos'],
        'negative': mi['dist neg']
    }, global_step)
    writer.add_scalars('train/product', {
        'positive': mi['dist pos dot'],
        'negative': mi['dist neg dot']
    }, global_step)
    writer.add_scalars('train/negative_cosine_dist', {
        'positive': mi['dist pos cos'],
        'negative': mi['dist neg cos']
    }, global_step)
    writer.add_scalar('train/loss_entro', entropy, global_step)
def close_open_web_cams():
    # TODO check for the port here
    # Try to find and kill hanging cv2 process_ids.
    try:
        output = subprocess.check_output(["lsof -t /dev/video*"], shell=True)
        log.info("Found hanging cv2 process_ids:")
        log.info(output)
        log.info("Killing hanging processes...")
        output = str(output)
        for process_id in output.split("\n")[:-1]:
            subprocess.call(["kill %s" % process_id], shell=True)
        time.sleep(3)
        # Recapture webcams.
    except subprocess.CalledProcessError:
        raise ValueError(
            "Cannot connect to cameras. Try running: \n"
            "ls -ltrh /dev/video* \n"
            "to see which ports your webcams are connected to. Then hand those "
            "ports as a comma-separated list to --webcam_ports, e.g. "
            "--webcam_ports 0,1")
    parser.add_argument('--load-model', type=str, required=False)
    parser.add_argument('--val-dir-metric', type=str, default='~/asn_data/val')
    parser.add_argument('--batch-size', type=int, default=1)
    parser.add_argument('--num-views', type=int, default=2)
    parser.add_argument(
        '--task',
        type=str,
        default="cstack",
        help='dataset, load tasks for real block data (cstack)')
    return parser.parse_args()


if __name__ == '__main__':
    args = get_args()
    log.info("args: {}".format(args))
    use_cuda = torch.cuda.is_available()
    print('use_cuda: {}'.format(use_cuda))
    asn, start_epoch, global_step, _ = create_model(use_cuda, args.load_model)
    log.info('start_epoch: {}'.format(start_epoch))
    log.info('asn: {}'.format(asn.__class__.__name__))
    img_size = 299
    vid_name_to_task_func = transform_vid_name_to_task(args.task)
    log.info('args.val_dir_metric: {}'.format(args.val_dir_metric))
    dataloader_val = get_dataloader_val(args.val_dir_metric, args.num_views,
                                        args.batch_size, use_cuda)
    if use_cuda:
        asn.cuda()

    def model_forward(frame_batch):
        if use_cuda:
def _print_dataset_info_txt(self):
    info_txt_file = os.path.join(self.vid_dir, "../../dataset_info.txt")
    if os.path.exists(info_txt_file):
        with open(info_txt_file, 'r') as f:
            log.info("dataset info:\n {}".format(f.read()))
def main():
    args = get_args()
    args.out_dir = os.path.expanduser(args.out_dir)
    ports = list(map(int, args.ports.split(",")))
    log.info("ports: {}".format(ports))
    sample_events = [multiprocessing.Event() for _ in ports]
    num_frames = args.max_frame
    if args.display:
        disp_q = multiprocessing.Queue()
        p = Process(target=display_worker, args=(disp_q, ), daemon=True)
        p.start()
    # process to save images to files
    im_data_q, im_file_q = multiprocessing.Queue(), multiprocessing.Queue()
    img_folder = os.path.join(args.out_dir, "images", args.set_name, args.tag)
    vid_folder = os.path.join(args.out_dir, "videos", args.set_name)
    img_args = (ports, img_folder, args.tag, im_data_q, im_file_q)
    p = Process(target=save_img_worker, args=img_args, daemon=True)
    p.start()
    log.info("img_folder: {}".format(img_folder))
    log.info("vid_folder: {}".format(vid_folder))
    log.info("fps: {}".format(args.fps))
    try:
        time_prev = time.time()
        # loop to sample frames, triggered with events
        for frame_cnt, port_data in enumerate(
                sample_frames(ports, sample_events, num_frames)):
            sample_time_dt = time.time() - time_prev
            if frame_cnt % 10 == 0:
                log.info("frame {} time_prev: {}".format(
                    frame_cnt, time.time() - time_prev))
            time_prev = time.time()
            # set events to trigger the cams
            for e in sample_events:
                e.set()
            if frame_cnt == 0:
                # skip the first frame because it is not synchronized with the event
                log.info("START: {}".format(frame_cnt))
                continue
            elif (sample_time_dt - 1.0 / args.fps) > 0.1:
                log.warn("sampling a frame takes too long for the requested fps")
            # check the sample time difference between the cameras
            if len(ports) > 1:
                dt = [
                    np.abs(p1["time"] - p2["time"])
                    for p1, p2 in combinations(port_data.values(), 2)
                ]
                # log.info('dt: {}'.format(np.mean(dt)))
                if np.max(dt) > 0.1:
                    log.warn(
                        "camera sample max time dt: {}, check light conditions and camera models"
                        .format(np.max(dt)))
            assert all(frame_cnt == d["num"]
                       for d in port_data.values()), "out of sync"
            im_data_q.put(port_data)
            if args.display:
                disp_q.put(port_data)
            time.sleep(1.0 / args.fps)
    except KeyboardInterrupt:
        # create vids from the images saved before
        im_shape = {p: d["frame"].shape for p, d in port_data.items()}
        img_files = defaultdict(list)
        for d in get_all_queue_result(im_file_q):
            for p, f in d.items():
                img_files[p].append(f)
        # TODO start a process for each and join
        for view_i, p in enumerate(port_data.keys()):
            save_vid_worker(img_files[p], view_i, vid_folder, args.tag,
                            im_shape[p], args.fps)
    cv2.destroyAllWindows()
def print_frame_len_info(self):
    max_len_vid = max(max(l) for l in self.frame_lengths)
    min_len_vid = min(min(l) for l in self.frame_lengths)
    mean_len_vid = int(np.mean(self.frame_lengths))
    log.info("{} videos frame len mean: {}, min: {}, max: {}".format(
        self.vid_dir, mean_len_vid, min_len_vid, max_len_vid))
def main():
    args = get_args()
    log.info("args: {}".format(args))
    writer = init_log_tb(args.save_folder)
    use_cuda = torch.cuda.is_available()
    print('use_cuda: {}'.format(use_cuda))
    criterion = {"lifted": LiftedStruct(), "liftedcombi": LiftedCombined()}[args.loss]
    log.info("criterion: {}".format(criterion.__class__.__name__))
    asn, global_step_start, _, _ = create_model(
        use_cuda, args.load_model, embedding_size=args.emb_dim)
    log.info('asn: {}'.format(asn.__class__.__name__))
    asn.train()
    # load the function which maps a video file name to a task, for different datasets
    vid_name_to_task = transform_vid_name_to_task(args.task)
    dataloader_val = get_dataloader_val(args.val_dir_metric, args.num_views,
                                        args.batch_size, use_cuda)
    train_filter_func = None
    if args.train_filter_tasks is not None:
        # filter out tasks by name for the training set
        train_filter_tasks = args.train_filter_tasks.split(',')
        log.info('train_filter_tasks: {}'.format(train_filter_tasks))

        def train_filter_func(name, n_frames):
            return all(task not in name for task in train_filter_tasks)  # ABD->C

    examples_per_seq = args.num_example_batch
    dataloader_train = get_dataloader_train(args.train_dir, args.num_views,
                                            args.batch_size, use_cuda,
                                            img_size=299,
                                            filter_func=train_filter_func,
                                            examples_per_seq=examples_per_seq)
    all_view_pair_names = dataloader_train.dataset.get_all_comm_view_pair_names()
    all_view_pair_frame_lengths = dataloader_train.dataset.frame_lengths
    # one label per task based on the video name,
    # not used to train the model
    transform_comm_name, num_domain_task_classes, task_names = val_fit_task_label(
        vid_name_to_task, all_view_pair_names)
    log.info('task names: {}'.format(task_names))
    # func to transform a video name to a task label
    label_funcs = {'domain task label': transform_comm_name}
    num_domain_frames = args.num_domain_frames
    # embedding class
    log.info('num_domain_frames: {}'.format(num_domain_frames))
    # Discriminator network; inputs and outputs depend on the args settings
    net_input = args.emb_dim * num_domain_frames
    d_net = Discriminator(net_input,
                          H=args.d_net_hidden_dim,
                          z_dim=args.d_net_z_dim,
                          d_out=[num_domain_task_classes])
    # DATA domain
    # filter out fake examples and filtered tasks for the D net
    stride = args.multi_domain_frames_stride
    if args.train_filter_tasks is not None:

        def filter_func_domain(name, frames_cnt):
            """ return no fake examples for filtered tasks """
            return "fake" not in name and all(task not in name
                                              for task in train_filter_tasks)
    else:

        def filter_func_domain(name, frames_cnt):
            """ return no fake examples """
            return "fake" not in name

    dataloader_train_domain = get_skill_dataloader(
        args.train_dir,
        args.num_views,
        args.batch_size,
        use_cuda,
        img_size=299,
        filter_func=filter_func_domain,
        label_funcs=label_funcs,
        num_domain_frames=num_domain_frames,
        stride=stride)
    if use_cuda:
        torch.cuda.seed()
        criterion.cuda()
        asn.cuda()
        d_net.cuda()
    model_forward_cuda = functools.partial(model_forward, mdl=asn,
                                           use_cuda=use_cuda, to_numpy=False)
    model_forward_np = functools.partial(model_forward, mdl=asn,
                                         use_cuda=use_cuda, to_numpy=True)
    # define the optimizers for the encoder (g) and the Discriminator (d)
    params_asn = filter(lambda p: p.requires_grad, asn.parameters())
    optimizer_g = optim.Adam(params_asn, lr=args.lr_d)
    optimizer_d = optim.Adam(d_net.parameters(), lr=args.lr_g)
    assert isinstance(criterion, (LiftedStruct, LiftedCombined))
    key_views = ["frames views {}".format(i) for i in range(args.num_views)]
    iter_metric = iter(data_loader_cycle(dataloader_train))
    iter_domain = iter(data_loader_cycle(dataloader_train_domain))
    loss_val_min = None
    loss_val_min_step = 0
    for global_step in range(global_step_start, args.steps):
        # =======================================================
        # update the encoder network
        sample_batched = next(iter_metric)
        # metric loss
        img = torch.cat([sample_batched[key_views[0]],
                         sample_batched[key_views[1]]])
        embeddings = model_forward_cuda(Variable(img))
        n = sample_batched[key_views[0]].size(0)
        anchor_emb, positive_emb = embeddings[:n], embeddings[n:]
        label_positive_pair = np.arange(n)
        labels = Variable(torch.Tensor(
            np.concatenate([label_positive_pair, label_positive_pair]))).cuda()
        # METRIC loss
        if examples_per_seq == 1:
            loss_metric = criterion(embeddings, labels)
        else:
            loss_metric = multi_vid_batch_loss(criterion, embeddings, labels,
                                               num_vid_example=examples_per_seq)
        # set inputs and targets for the domain batch
        sample_batched_domain = next(iter_domain)
        img_domain = torch.cat([sample_batched_domain[key_views[0]],
                                sample_batched_domain[key_views[1]]])
        emb_asn = model_forward_cuda(Variable(img_domain))
        if num_domain_frames != 1:
            # multiple frames as skills
            bl = emb_asn.size(0)
            emb_size = emb_asn.size(1)
            emb_asn = emb_asn.view(bl // num_domain_frames,
                                   num_domain_frames * emb_size)
        # mask out label for cat view
        kl_loss, d_out_gen = d_net(emb_asn)
        d_out_gen = d_out_gen[0]
        # minimize the entropy for different classes
        optimizer_g.zero_grad()
        optimizer_d.zero_grad()
        loss_g = loss_metric * 0.1
        # maximize the entropy
        entropy_fake = entropy(d_out_gen)
        entropy_fake.backward(retain_graph=True)
        entropy_margin = -1. * marginalized_entropy(d_out_gen)
        # ensure equal usage of fake samples
        entropy_margin.backward(retain_graph=True)
        # update the encoder network
        loss_g.backward(retain_graph=True)
        optimizer_g.step()
        optimizer_g.zero_grad()
        optimizer_d.zero_grad()
        # =======================================================
        # update the Discriminator
        # maximize marginalized entropy over real samples to ensure equal usage
        entropy_margin = -1. * marginalized_entropy(d_out_gen)
        entropy_margin.backward(retain_graph=True)
        # minimize entropy to make certain predictions on real samples
        entropy_real = -1. * entropy(d_out_gen)
        entropy_real.backward(retain_graph=True)
        kl_loss.backward()
        optimizer_d.step()
        if global_step % 100 == 0 or global_step == 1:
            # log training
            loss_metric = loss_g.data.cpu().numpy().item()
            mi = get_metric_info_multi_example(anchor_emb.data.cpu().numpy(),
                                               positive_emb.data.cpu().numpy())
            log_train(writer, mi, loss_metric, criterion, entropy_fake, global_step)
        # =======================================================
        # Validation
        if global_step % args.val_step == 0 and global_step > global_step_start:
            log.info("==============================")
            asn.eval()
            if args.plot_tsne and global_step % 20000 == 0:
                # save a tsne plot
                visualize_embeddings(model_forward_cuda,
                                     dataloader_val,
                                     summary_writer=None,
                                     global_step=global_step,
                                     save_dir=args.save_folder,
                                     label_func=vid_name_to_task)
            loss_val, nn_dist, dist_view_pais, frame_distribution_err_cnt = \
                view_pair_alignment_loss(model_forward_np, args.num_views,
                                         dataloader_val)
            asn.train()
            writer.add_histogram("val/frame_error_count",
                                 np.array(frame_distribution_err_cnt),
                                 global_step)
            writer.add_scalar('val/alignment_loss', loss_val, global_step)
            writer.add_scalar('val/nn_distance', nn_dist, global_step)
            writer.add_scalar('val/distance_view_pairs_same_frame',
                              dist_view_pais, global_step)
            is_best = False
            if loss_val_min is None or loss_val < loss_val_min:
                loss_val_min = loss_val
                loss_val_min_step = global_step
                is_best = True
            msg = "Validation alignment loss: {}, nn mean dist {:.3}, lowest loss {:.4} at {} steps".format(
                loss_val, nn_dist, loss_val_min, loss_val_min_step)
            log.info(msg)
            save_model(asn, optimizer_g, args, is_best, args.save_folder,
                       global_step)
    writer.close()
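# For reference, a minimal sketch of the two entropy terms used in the adversarial
# update above, assuming d_out_gen holds unnormalized logits. The repo's entropy and
# marginalized_entropy helpers are not shown here, so these definitions are illustrative.
import torch
import torch.nn.functional as F

def entropy(logits):
    # mean per-sample entropy of the predicted class distribution
    p = F.softmax(logits, dim=1)
    return -(p * torch.log(p + 1e-8)).sum(dim=1).mean()

def marginalized_entropy(logits):
    # entropy of the class distribution averaged over the batch
    # (maximizing it encourages equal usage of all classes)
    p = F.softmax(logits, dim=1).mean(dim=0)
    return -(p * torch.log(p + 1e-8)).sum()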
def __init__(
        self,
        inception,
        additional_conv_sizes=[512, 512],
        fc_hidden_sizes=[2048],
        embedding_size=32,
        dp_ratio_pretrained_act=0.2,
        dp_ratio_conv=1.0,
        dp_ratio_fc=0.2,
        rnn_type=None,
        mode_gaussian_dist=False,
        latent_z_dim=512,
        rnn_forward_seqarade=False,
        l2_normalize_output=False,
        finetune_inception=False,
):
    super().__init__()
    self.gaussian_mode = mode_gaussian_dist
    self.embedding_size = embedding_size
    log.info("finetune_inception: {}".format(finetune_inception))
    if not finetune_inception:
        # disable training for the inception v3 layers
        for child in inception.children():
            for param in child.parameters():
                param.requires_grad = False
    # see:
    # https://github.com/pytorch/vision/blob/master/torchvision/models/inception.py
    self.inception_end_point_mixed_5d = nn.ModuleList([
        inception.Conv2d_1a_3x3,
        inception.Conv2d_2a_3x3,
        inception.Conv2d_2b_3x3,
        nn.MaxPool2d(kernel_size=3, stride=2),
        inception.Conv2d_3b_1x1,
        inception.Conv2d_4a_3x3,
        nn.MaxPool2d(kernel_size=3, stride=2),
        inception.Mixed_5b,
        inception.Mixed_5c,
        inception.Mixed_5d,
    ])
    in_channels = 288
    self.Conv2d_6n_3x3 = nn.ModuleList()
    if dp_ratio_pretrained_act < 1.0:
        self.Conv2d_6n_3x3.append(nn.Dropout(p=dp_ratio_pretrained_act))
    # padding=1, like "SAME" padding in tf
    for i, out_channels in enumerate(additional_conv_sizes):
        self.Conv2d_6n_3x3.append(
            BNConv2d(in_channels, out_channels, padding=1, kernel_size=3, stride=1))
        if dp_ratio_conv < 1.0:
            self.Conv2d_6n_3x3.append(nn.Dropout(p=dp_ratio_conv))
        in_channels = out_channels
    # Take the spatial soft arg-max of the last convolutional layer.
    self.SpatialSoftmax = SpatialSoftmax(channel=512, height=35, width=35)  # nn.Softmax2d()
    self.FullyConnected7n = nn.ModuleList([Flatten()])
    in_channels = 1024  # output size of the SpatialSoftmax
    self.num_freatures = int(in_channels)
    for i, num_hidden in enumerate(fc_hidden_sizes):
        self.FullyConnected7n.append(
            Dense(in_channels, num_hidden, activation=F.relu))
        if dp_ratio_fc > 0.0:
            self.FullyConnected7n.append(nn.Dropout(p=dp_ratio_fc))
        in_channels = num_hidden
    if self.gaussian_mode:
        self.FullyConnected7n.append(Dense(in_channels, 512, activation=F.relu))
        self.l_mu = Dense(512, latent_z_dim)
        self.l_var = Dense(512, latent_z_dim)
        # output layers for the sampled latent variable
        self.lat_sampled_out_emb = nn.ModuleList([
            Dense(latent_z_dim, 512, activation=F.relu),
            nn.Dropout(p=0.2),
            Dense(512, 512, activation=F.relu),
            nn.Dropout(p=0.2),
            Dense(512, embedding_size),
        ])
        self._sequential_z_out = nn.Sequential(*self.lat_sampled_out_emb)
    else:
        self.FullyConnected7n.append(Dense(in_channels, embedding_size))
    self._all_sequential_feature = nn.Sequential(
        *self.inception_end_point_mixed_5d,
        *self.Conv2d_6n_3x3,
        self.SpatialSoftmax)
    self._all_sequential_emb = nn.Sequential(*self.FullyConnected7n)
    self.l2_normalize_output = l2_normalize_output
    # use l2 norm with the triplet loss
    if l2_normalize_output:
        log.info("TCN with l2 norm out")
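# For reference, a minimal sketch of what a spatial soft arg-max computes: a per-channel
# softmax over the spatial map, followed by the expected (x, y) position per channel.
# This is an illustration of the concept, not the repo's SpatialSoftmax class.
import torch
import torch.nn.functional as F

def spatial_soft_argmax(features):
    # features: (B, C, H, W) -> (B, C * 2) expected (x, y) per channel
    b, c, h, w = features.shape
    softmax = F.softmax(features.view(b, c, h * w), dim=-1).view(b, c, h, w)
    # coordinate grids in [-1, 1]
    ys = torch.linspace(-1.0, 1.0, h).view(1, 1, h, 1)
    xs = torch.linspace(-1.0, 1.0, w).view(1, 1, 1, w)
    exp_x = (softmax * xs).sum(dim=(2, 3))
    exp_y = (softmax * ys).sum(dim=(2, 3))
    return torch.cat([exp_x, exp_y], dim=1)

# e.g. a (2, 512, 35, 35) feature map maps to a (2, 1024) feature vector,
# which matches the in_channels = 1024 used after the SpatialSoftmax above
print(spatial_soft_argmax(torch.randn(2, 512, 35, 35)).shape)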
def plot_embedding(X, labels_str, title, imgs=None, save_dir=None,
                   frame_lable=None, max_frame=None, vid_lable=None):
    # http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    if imgs is not None:
        fig = plt.figure(figsize=(20, 20))
        ax = plt.subplot(221)
    else:
        fig = plt.figure()
        ax = fig.gca()
    # labels below the plot
    n_classes, y, colors, legend_elements = plt_labeled_data(ax, X, labels_str)
    plt.title(title)
    if imgs is not None:
        # plot again, but with an image overlay
        ax = plt.subplot(222)
        ax.set_title("image overlay")
        ax.scatter(X[:, 0], X[:, 1], color=colors)
        if hasattr(offsetbox, "AnnotationBbox"):
            # only print thumbnails with matplotlib > 1.0
            shown_images = np.array([[1.0, 1.0]])  # just something big
            for i in range(X.shape[0]):
                dist = np.sum((X[i] - shown_images)**2, 1)
                if np.min(dist) < 5e-3:
                    # don't show points that are too close
                    continue
                shown_images = np.r_[shown_images, [X[i]]]
                imagebox = offsetbox.AnnotationBbox(
                    offsetbox.OffsetImage(imgs[i], cmap=plt.cm.gray_r, zoom=0.75),
                    X[i], pad=0.0)
                ax.add_artist(imagebox)
        # plot the legend, same as before
        plt_labels_blow(ax, list(legend_elements.values()))
    if frame_lable is not None:
        # plot the frame index as class label,
        # show a legend color for every 50th frame
        ax = plt.subplot(223)
        plt_labeled_data(
            ax, X, frame_lable,
            label_filter_legend=lambda l: l % 50 == 0,
            plt_cm=plt.cm.Spectral,
            index_color_factor=max_frame,
        )
        ax.set_title("frames as label (color range normalized for every vid)")
    if vid_lable is not None:
        # plot the view pair as class label
        ax = plt.subplot(224)
        plt_labeled_data(ax, X, vid_lable, label_filter_legend=lambda x: False)
        ax.set_title("view pair as label")
    if save_dir is not None:
        create_dir_if_not_exists(save_dir)
        save_dir = os.path.expanduser(save_dir)
        title = os.path.join(save_dir, title)
        fig.savefig(title + ".pdf", bbox_inches="tight")
        log.info("saved TSNE plot to: {}".format(title))
    plt.close("all")
def visualize_embeddings(
        func_model_forward,
        data_loader,
        summary_writer=None,
        global_step=0,
        seq_len=None,
        stride=None,
        label_func=None,
        save_dir=None,
        tag="",
        emb_size=32,
):
    """visualize embeddings with tensorboardX

    Args:
        summary_writer(tensorboardX.SummaryWriter):
        data_loader(ViewPairDataset): with shuffle false
        label_func: function to label a frame, called with
            (vid_file_comm, frame_idx=None, vid_len=None, csv_file=None, state_label=None)

    Returns:
        None

    :param func_model_forward:
    :param global_step:
    :param seq_len:
    :param stride:
    :param save_dir:
    :param tag:
    :param emb_size:
    """
    assert isinstance(data_loader.dataset, ViewPairDataset), \
        "dataset must be of type ViewPairDataset"
    data_len = len(data_loader.dataset)
    vid_dir = data_loader.dataset.vid_dir
    if seq_len:
        assert stride is not None
        # cut off the first frames
        data_len -= seq_len * stride * len(data_loader.dataset.video_paths)
    embeddings = np.empty((data_len, emb_size))
    img_size = 50  # image size to plot
    frames = torch.empty((data_len, 3, img_size, img_size))
    # transform the image so it can be plotted later
    trans = transforms.Compose([
        transforms.ToPILImage(),  # expects rgb, moves channel to front
        transforms.Resize(img_size),
        transforms.ToTensor(),  # image 0-255 to 0.0-1.0
    ])
    cnt_data = 0
    labels = []
    view_pair_name_labels = []
    labels_frame_idx = []
    vid_len_frame_idx = []
    with tqdm(total=len(data_loader),
              desc="computing embeddings for {} frames".format(len(data_loader))) as pbar:
        for i, data in enumerate(data_loader):
            # compute the embeddings for a batch
            frames_batch = data["frame"]
            if seq_len is None:
                emb = func_model_forward(frames_batch)
                # add emb to dict and to queue if all frames
                # for e, name, view, last in zip(emb, data["common name"], data["view"].numpy(), data['is last frame'].numpy()):
                # transform all frames to a smaller image to plot later
                for e, frame in zip(emb, frames_batch):
                    embeddings[cnt_data] = e
                    # the transform works on one image at a time
                    frames[cnt_data] = trans(frame).cpu()
                    cnt_data += 1
                    if data_len == cnt_data:
                        break
                state_label = data.get("state lable", None)
                comm_name = data["common name"]
                frame_idx = data["frame index"]
                vid_len = data["video len"]
                labels_frame_idx.extend(frame_idx.numpy())
                vid_len_frame_idx.extend(vid_len.numpy())
                if label_func is not None:
                    state_label = len(comm_name) * [None] if state_label is None else state_label
                    state_label = [
                        label_func(c, i, v_len, get_video_csv_file(vid_dir, c), la)
                        for c, la, i, v_len in zip(comm_name, state_label,
                                                   frame_idx, vid_len)
                    ]
                else:
                    state_label = comm_name
                labels.extend(state_label)
                view_pair_name_labels.extend(comm_name)
                if data_len == cnt_data:
                    break
            else:
                raise NotImplementedError()
            pbar.update(1)
    log.info("number of found labels: {}".format(len(labels)))
    if len(labels) != len(embeddings):
        # in case of an rnn sequence cut off at the end, or drop_last
        log.warn("number of labels {} smaller than embeddings, changing embeddings size"
                 .format(len(labels)))
        embeddings = embeddings[:len(labels)]
        frames = frames[:len(labels)]
    if len(labels) == 0:
        log.warn("length of labels is zero!")
    else:
        log.info("start TSNE fit")
        labels = labels[:data_len]
        imgs = flip_imgs(frames.numpy(), rgb_to_front=False)
        rnn_tag = "_seq{}_stride{}".format(seq_len, stride) if seq_len is not None else ""
        X_tsne = TSNE_multi(n_jobs=4, perplexity=40).fit_transform(embeddings)  # perplexity=40, theta=0.5
        create_time_vid(X_tsne, labels_frame_idx, vid_len_frame_idx)
        plot_embedding(
            X_tsne,
            labels,
            title=tag + "multi-t-sne_perplexity40_theta0.5_step" + str(global_step) + rnn_tag,
            imgs=imgs,
            save_dir=save_dir,
            frame_lable=labels_frame_idx,
            max_frame=vid_len_frame_idx,
            vid_lable=view_pair_name_labels,
        )
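# For reference, a minimal self-contained sketch of the t-SNE step with scikit-learn.
# The repo uses a multicore TSNE_multi wrapper; the parameters and data here are illustrative.
import numpy as np
from sklearn.manifold import TSNE

embeddings = np.random.randn(200, 32)                        # (n_frames, emb_size)
X_tsne = TSNE(n_components=2, perplexity=40.0).fit_transform(embeddings)
print(X_tsne.shape)                                          # (200, 2), ready for a 2D scatter plot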