def _sample_next_frames(self, vid_files):
    views = []
    all_frames_sampled = False
    for f in vid_files:
        # get next frames for vid
        vid = VideoFrameSampler(f,
                                resize_shape=self.frame_size,
                                dtype=np.float32)
        max_len = self.sample_size
        if self.sample_size >= len(vid) or self.sample_size == -1:
            log.warning(
                "vid {} with {} frames is shorter than the sample size".
                format(f, len(vid)))
            max_len = len(vid)
            all_frames_sampled = True
        elif self.sample_frames_index + self.sample_size >= len(vid):
            # last sample is smaller
            max_len = len(vid) - self.sample_frames_index
            log.info("end {} with {} frames, end size {}, index {}".format(
                f, len(vid), max_len, self.sample_frames_index))
            all_frames_sampled = True
        imgs = np.empty((max_len, *self.frame_size, 3))
        for j in range(max_len):
            imgs[j] = vid.get_frame(j + self.sample_frames_index)
        t = torch.Tensor(flip_imgs(imgs))
        views.append(t.view(1, -1, 3, *self.frame_size))
    self.sample_frames_index += self.sample_size
    if all_frames_sampled:
        self.sample_frames_index = 0
    return all_frames_sampled, views
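A minimal driver sketch for the incremental sampler above (hypothetical: it assumes an object exposing `_sample_next_frames` together with the view-pair video files and a model forward function; none of these names come from the page itself):

# Hypothetical usage: consume a view pair in chunks of `sample_size` frames
# until every frame has been embedded once.
def embed_full_video(dataset, vid_files, model_forward):
    all_embeddings = [[] for _ in vid_files]
    done = False
    while not done:
        done, views = dataset._sample_next_frames(vid_files)
        for i, view_batch in enumerate(views):
            # view_batch: (1, n_frames, 3, H, W) float tensor
            all_embeddings[i].append(model_forward(view_batch))
    return all_embeddings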
Example #2
def main():
    args = get_args()
    input_imgs_file = args.input_imgs
    dt = np.float32
    n_views = args.num_views
    image_size = (args.img_height, args.img_width)
    imgs, titles = [], []
    total_frames, n_frame = 0, args.n_frame
    if os.path.isdir(input_imgs_file):
        input_imgs_file = get_view_pair_vid_files(n_views, input_imgs_file)
        input_imgs_file_shuffle = sklearn.utils.shuffle(input_imgs_file)
        input_imgs_file = []
        for f in input_imgs_file_shuffle:
            input_imgs_file.extend(f)
    else:
        input_imgs_file = input_imgs_file.split(",")
    # only show up to the maximum number of vids
    if args.max_vid_num != -1 and args.max_vid_num < len(input_imgs_file):
        input_imgs_file = input_imgs_file[:args.max_vid_num]
    assert len(input_imgs_file), "no vids found"
    # read vids
    for f in input_imgs_file:
        imgs_vid = VideoFrameSampler(f,
                                     dtype=dt,
                                     resize_shape=image_size,
                                     to_rgb=False).get_all()
        total_frames = len(imgs_vid)
        log.info("file {} with frames: {}".format(f, total_frames))
        imgs.append(imgs_vid)

    show_sequence(imgs,
                  delay=0,
                  n_frame=n_frame,
                  save_name=args.output_img_name,
                  to_rgb=False)
Example #3
def __init__(self, D_in, H, z_dim, d_out):
    super().__init__()
    self.z_dim = z_dim
    log.info(
        "Discriminator domain net in_channels: {} out: {} hidden {}, z dim {}"
        .format(D_in, d_out, H, z_dim))
    self.encoder = torch.nn.Sequential(
        torch.nn.Linear(D_in, H),
        nn.Dropout2d(0.25),
        nn.ReLU(),
        nn.Linear(H, H),
        nn.Dropout2d(0.25),
        nn.ReLU(),
    )
    self.l_mu = nn.Linear(H, z_dim)
    self.l_var = nn.Linear(H, z_dim)
    # one output head per entry in d_out
    self.out_layer = nn.ModuleList()
    for out_n in d_out:
        out = nn.Sequential(
            nn.Linear(z_dim, z_dim),
            nn.Dropout2d(0.1),
            nn.ReLU(),
            nn.Linear(z_dim, out_n),
        )
        self.out_layer.append(out)
Example #4
def init_log_tb(save_folder):
    log.setLevel(logging.INFO)
    set_log_file(os.path.join(save_folder, "train.log"))

    log.info("asn commit hash: {}".format(get_git_commit_hash(asn.__file__)))
    tb_log_dir = os.path.join(os.path.expanduser(save_folder), "tensorboard_log")
    writer = SummaryWriter(tb_log_dir)
    return writer
def vid_writer(output_vid_name, fps, frame_shape, frame_count=None):
    """ manage vid cam """
    fourcc = get_fourcc(output_vid_name)
    # vid writer if shape isknows after rot
    out_shape_cv = np_shape_to_cv(frame_shape[:2])
    vid_writer = cv2.VideoWriter(output_vid_name, fourcc, fps, out_shape_cv)
    # vid_writer.write(montage_image)
    yield vid_writer
    if frame_count is not None:
        vid_writer.set(cv2.CAP_PROP_FRAME_COUNT, frame_count)
    vid_writer.set(cv2.CAP_PROP_FPS, fps)
    log.info("output vid: {}".format(output_vid_name))
    vid_writer.release()
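Since `vid_writer` is a generator that yields once and then releases, it is presumably wrapped with `contextlib.contextmanager` in the original module. A self-contained sketch of that pattern (the function name, output path, and codec are placeholders, not the repo's API):

import contextlib

import cv2
import numpy as np

@contextlib.contextmanager  # assumed decorator; yield once, release on exit
def managed_vid_writer(output_vid_name, fps, frame_shape):
    h, w = frame_shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    writer = cv2.VideoWriter(output_vid_name, fourcc, fps, (w, h))
    try:
        yield writer
    finally:
        writer.release()

# smoke test: write 100 black frames
with managed_vid_writer("/tmp/out.mp4", fps=30, frame_shape=(240, 320, 3)) as w:
    for _ in range(100):
        w.write(np.zeros((240, 320, 3), dtype=np.uint8))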
def get_skill_dataloader(dir_vids,
                         num_views,
                         batch_size,
                         use_cuda,
                         img_size,
                         filter_func,
                         label_funcs,
                         num_domain_frames=1,
                         stride=1):
    # sampler drawing random frames from all tasks
    transformer_train = get_train_transformer(img_size=img_size)
    # sample different views
    transformed_dataset_train_domain = DoubleViewPairDataset(
        vid_dir=dir_vids,
        number_views=num_views,
        # std_similar_frame_margin_distribution=sim_frames,
        transform_frames=transformer_train,
        lable_funcs=label_funcs,
        filter_func=filter_func)

    sampler = None
    drop_last = True
    log.info('transformed_dataset_train_domain len: {}'.format(
        len(transformed_dataset_train_domain)))
    if num_domain_frames > 1:
        assert batch_size % num_domain_frames == 0, 'batch size must be divisible by num_domain_frames'
        sampler = SkillViewPairSequenceSampler(
            dataset=transformed_dataset_train_domain,
            stride=stride,
            allow_same_frames_in_seq=True,
            sequence_length=num_domain_frames,
            sequences_per_vid_in_batch=1,
            batch_size=batch_size)
        log.info('use multi frame dir {} len sampler: {}'.format(
            dir_vids, len(sampler)))
        drop_last = len(sampler) >= batch_size

    # randomly sample vids
    dataloader_train_domain = DataLoader(
        transformed_dataset_train_domain,
        drop_last=drop_last,
        batch_size=batch_size,
        shuffle=True if sampler is None else False,
        num_workers=4,
        sampler=sampler,
        pin_memory=use_cuda)

    if sampler is not None and len(sampler) <= batch_size:
        log.warn("dataset sampler batch size")
    return dataloader_train_domain
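A hypothetical iteration sketch for the loader above; the batch keys follow the `frames views {i}` / `domain task label` naming used further down this page, and the path and arguments are placeholders:

# Hypothetical usage of the skill/domain dataloader.
loader = get_skill_dataloader(dir_vids="~/asn_data/train",   # placeholder path
                              num_views=2,
                              batch_size=16,
                              use_cuda=False,
                              img_size=299,
                              filter_func=None,
                              label_funcs={"domain task label": lambda name, *a, **kw: 0},
                              num_domain_frames=4,
                              stride=1)
for batch in loader:
    view0 = batch["frames views 0"]          # transformed frames of the first view
    view1 = batch["frames views 1"]          # same frames seen from the second view
    task_label = batch["domain task label"]  # label produced by the label func (assumed key)
    break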
Example #7
def val_fit_task_label(vid_name_to_task, all_view_pair_names):
    """ returns func to encode a single video file name to labels """
    all_view_pair_names = [vid_name_to_task(f) for f in all_view_pair_names]
    comm_name_to_lable = preprocessing.LabelEncoder()
    comm_name_to_lable.fit(all_view_pair_names)
    # lable_domain = comm_name_to_lable.transform(all_view_pair_names)  # test fit
    num_classes = len(comm_name_to_lable.classes_)
    name_classes = comm_name_to_lable.classes_
    log.info("number of vid domains task: {}".format(len(comm_name_to_lable.classes_)))
    log.info("vid domains in train set: {}".format(name_classes))

    def transform_comm_name(vid_file_comm, *args, **kwargs):
        return comm_name_to_lable.transform([vid_name_to_task(vid_file_comm)])[0]

    return transform_comm_name, num_classes, name_classes
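A toy example of the encoding helper (the video names are hypothetical, and `vid_name_to_task` here simply strips the trailing view index):

vid_names = ["cstack_task1_vid0", "cstack_task1_vid1", "cstack_task2_vid0"]
vid_name_to_task = lambda name: name.rsplit("_", 1)[0]   # "cstack_task1_vid0" -> "cstack_task1"

to_label, num_classes, class_names = val_fit_task_label(vid_name_to_task, vid_names)
print(num_classes)                    # 2
print(to_label("cstack_task2_vid7"))  # 1, the index of "cstack_task2"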
Example #8
def main():
    args = get_args()
    input_imgs_file = os.path.expanduser(args.input_imgs)
    n_views = args.num_views
    image_size = (args.img_height, args.img_width)
    assert os.path.isdir(input_imgs_file), "input not a dir"
    input_imgs_file = get_view_pair_vid_files(n_views, input_imgs_file)
    assert len(input_imgs_file), "no vids found"
    view_pair_idx = args.view_idx
    input_imgs_file = [
        view_pair_vid[view_pair_idx] for view_pair_vid in input_imgs_file
    ]
    input_imgs_file = sklearn.utils.shuffle(input_imgs_file)
    fourcc = get_fourcc(args.output_vid_name)
    log.info("output vid: {}".format(args.output_vid_name))
    fps = args.fps
    vid_writer = None
    for frames in tqdm(
            get_frames(input_imgs_file, args.mun_col * args.mun_row,
                       args.num_frames, image_size),
            desc="frame",
            total=args.num_frames,
    ):

        imgs = [[frames[y] for y in range(x, x + args.mun_row)]
                for x in range(0, len(frames), args.mun_row)]

        margin = 2
        montage_image = montage(
            imgs,
            margin_color_bgr=[0, 0, 0],
            margin_top=margin,
            margin_bottom=margin,
            margin_left=margin,
            margin_right=margin,
            margin_separate_vertical=margin,
            margin_separate_horizontal=margin,
        )
        montage_image = convert_to_uint8(montage_image)
        if vid_writer is None:
            # create the writer once the shape is known (e.g. after rotation)
            out_shape_cv = np_shape_to_cv(montage_image.shape[:2])
            vid_writer = cv2.VideoWriter(args.output_vid_name, fourcc, fps,
                                         out_shape_cv)
        vid_writer.write(montage_image)
    vid_writer.set(cv2.CAP_PROP_FRAME_COUNT, args.num_frames)
    vid_writer.set(cv2.CAP_PROP_FPS, fps)
    vid_writer.release()
Example #9
def log_train(writer, mi, loss_metric, criterion_metric, entropy, global_step):
    """ log to tb and print log msg """

    msg = "steps {}, dist: pos {:.2},neg {:.2},neg cos dist: pos {:.2},cos_neg {:.2}, loss metric:{:.3}".format(
        global_step, mi["dist pos"], mi["dist neg"], mi["dist pos cos"], mi["dist neg cos"], loss_metric
    )
    log.info(msg)
    writer.add_scalar("train/loss" + criterion_metric.__class__.__name__, loss_metric, global_step)
    writer.add_scalars("train/distane", {"positive": mi["dist pos"], "negative": mi["dist neg"]}, global_step)
    writer.add_scalars("train/product", {"positive": mi["dist pos dot"], "negative": mi["dist pos dot"]}, global_step)

    writer.add_scalars(
        "train/negative_cosine_dist", {"positive": mi["dist pos cos"], "negative": mi["dist neg cos"]}, global_step
    )

    writer.add_scalar("train/loss_entro", entropy, global_step)
Example #10
def get_vid_aligment_loss_pair(embeddings, fill_frame_diff=True):
    """ embeddings(dict), key common view name and values list view embs for each video view"""
    k = 1
    loss, nn_dist, dist_view_pairs = [], [], []
    # compute the nn for all permutations
    # TODO: permutations would be better, but combinations are used as in the TF implementation
    for comm_name, view_pair_task_emb in embeddings.items():
        for emb1, emb2 in itertools.combinations(view_pair_task_emb, 2):
            if fill_frame_diff:
                # fill frame diff with the last embeddings
                # similar to the tf implementation
                max_diff = len(emb1) - len(emb2)
                size_embedding = emb1.shape[1]
                if max_diff > 0:
                    emb2 = np.concatenate((emb2, np.full((max_diff, size_embedding), emb1[-1])))
                elif max_diff < 0:
                    emb1 = np.concatenate((emb1, np.full((-max_diff, size_embedding), emb2[-1])))
            knn_img_indexes = get_all_knn_indexes(emb1, [emb2], k=k)
            # compute the loss under the view-pair assumption
            n_frames = knn_img_indexes.shape[0]
            correct_index = np.arange(n_frames)
            # index for nn with smallest distance
            index_for_nn = knn_img_indexes[:, 0, 2]

            abs_frame_error = np.abs(correct_index - index_for_nn)
            loss_comp = np.mean(abs_frame_error / float(n_frames))
            loss.append(loss_comp)
            # histogram error loss frames index bin count
            error_hist_cnts = []
            for i, abs_err in enumerate(abs_frame_error):
                error_hist_cnts.extend([i] * int(abs_err))
            nn_dist.append(np.mean(knn_img_indexes[:, 0, 1]))
            # print infos
            view_pair_lens = "->".join([str(len(e)) for e in [emb1, emb2]])
            log.info(
                "aligment loss pair {:>30} with {} frames, loss {:>6.5}, mean nn dist {:>6.5}".format(
                    comm_name, view_pair_lens, loss_comp, np.mean(nn_dist)
                )
            )

        # get the distances for all view pairs for the same frame
        for emb1, emb2 in itertools.combinations(view_pair_task_emb, 2):
            min_frame_len = min(np.shape(emb1)[0], np.shape(emb2)[0])
            dist_view_i = [get_distances(e1, e2) for e1, e2 in zip(emb1[:min_frame_len], emb2[:min_frame_len])]
            dist_view_pairs.append(np.mean(dist_view_i))
    loss, nn_dist, dist_view_pairs = [np.mean(i) for i in [loss, nn_dist, dist_view_pairs]]
    return loss, nn_dist, dist_view_pairs, error_hist_cnts
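The core idea of the alignment metric above: for each frame of one view, find its nearest-neighbour frame in the other view's embedding space and penalise the index gap. A minimal standalone sketch of that idea, using scikit-learn in place of the repo's `get_all_knn_indexes`:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def simple_alignment_loss(emb1, emb2):
    """Mean normalised frame-index error between two synchronised views."""
    n_frames = min(len(emb1), len(emb2))
    emb1, emb2 = emb1[:n_frames], emb2[:n_frames]
    nn = NearestNeighbors(n_neighbors=1).fit(emb2)
    _, nn_idx = nn.kneighbors(emb1)   # nearest view-2 frame for every view-1 frame
    abs_frame_error = np.abs(np.arange(n_frames) - nn_idx[:, 0])
    return np.mean(abs_frame_error / float(n_frames))

# perfectly aligned embeddings give a loss of 0
emb = np.random.rand(50, 32)
print(simple_alignment_loss(emb, emb))  # 0.0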
def get_dataloader_train(dir_vids,
                         num_views,
                         batch_size,
                         use_cuda,
                         img_size=299,
                         filter_func=None,
                         label_funcs=None,
                         examples_per_seq=None):
    transformer_train = get_train_transformer(img_size)
    sampler = None
    shuffle = True
    if examples_per_seq is None:
        # sample one vid per batch used for lifted loss
        # used for default tcn for lifted and npair loss
        examples_per_batch = batch_size
    else:

        examples_per_batch = batch_size // examples_per_seq
    log.info(
        'train data loader example per sequence: {}'.format(examples_per_seq))

    shuffle = False
    transformed_dataset_train = DoubleViewPairDataset(
        vid_dir=dir_vids,
        number_views=num_views,
        filter_func=filter_func,
        lable_funcs=label_funcs,
        # random_view_index=True,
        # std_similar_frame_margin_distribution=sim_frames,
        transform_frames=transformer_train)
    # sample so that only one view pair is in a batch
    sampler = ViewPairSequenceSampler(
        dataset=transformed_dataset_train,
        examples_per_sequence=examples_per_batch,
        # similar_frame_margin=3,# TODO
        batch_size=batch_size)

    dataloader_train = DataLoader(transformed_dataset_train,
                                  drop_last=True,
                                  batch_size=batch_size,
                                  shuffle=shuffle,
                                  sampler=sampler,
                                  num_workers=4,
                                  pin_memory=use_cuda)

    return dataloader_train
def web_cam_samper(port):
    """ cv2 webcam manger """
    video_capture = cv2.VideoCapture(port)
    if not video_capture.isOpened():
        log.error("port is open {}".format(port))
        close_open_web_cams()
        video_capture = cv2.VideoCapture(port)

    # test vid sample
    assert sample_image(
        video_capture) is not None, " cam failed port {} ".format(port)
    # When everything is done, release the capture
    try:
        yield video_capture
    except Exception:
        video_capture.release()
        log.info("release video_capture: {} port {}".format(
            video_capture, port))
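`web_cam_samper` is used as a context manager below (`with web_cam_samper(port) as camera:`), so it is presumably decorated with `contextlib.contextmanager`. A hedged, self-contained sketch of that pattern which also releases the capture on normal exit (the function name is a placeholder):

import contextlib

import cv2

@contextlib.contextmanager  # assumed decorator
def open_camera(port):
    cap = cv2.VideoCapture(port)
    try:
        yield cap
    finally:
        cap.release()  # release on both normal exit and error

with open_camera(0) as camera:
    ok, frame = camera.read()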
def save_webcam_frames(p_ranke, port, event_sync, result_frames_q):
    """ sample a frame each time the sync event is set """
    frame_cnt = 0
    with web_cam_samper(port) as camera:
        log.info("port: {}".format(port))
        adjust_brightness(camera)
        while True:
            frame = sample_image(camera)
            sample_time = time.time()
            # log.info('port {} sample_time: {}, frame_cnt {}'.format(port, sample_time, frame_cnt))
            result_frames_q.put({
                "frame": frame,
                "time": sample_time,
                "num": frame_cnt
            })
            frame_cnt += 1
            event_sync.wait()
            event_sync.clear()
Example #14
def save_model(model, optimizer, training_args, is_best, model_folder, step):
    state = {
        "datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "step": step,
        "training_args": training_args,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    }

    model_folder = os.path.expanduser(model_folder)
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    filename = os.path.join(model_folder, "model.pth.tar")
    torch.save(state, filename)

    log.info("Saved Model from: {}, step {}".format(filename, step))
    if is_best:
        filename_copy = os.path.join(model_folder, "model_best.pth.tar")
        shutil.copyfile(filename, filename_copy)
        log.info("copyed to model_best!")
Example #15
def create_model(use_cuda, load_model_file=None, **kwargs):
    asn = define_model(use_cuda, **kwargs)
    start_step = 0
    optimizer_state_dict = None
    training_args = None
    if load_model_file:
        load_model_file = os.path.expanduser(load_model_file)
        assert os.path.isfile(load_model_file), "file not found {}".format(
            load_model_file)
        checkpoint = torch.load(load_model_file)
        start_step = checkpoint.get("step", 0)
        training_args = checkpoint.get("training_args", None)
        optimizer_state_dict = checkpoint["optimizer_state_dict"]
        asn.load_state_dict(checkpoint["model_state_dict"], strict=False)
        log.info("Restoring Model from: {}, step {}, datetime {}".format(
            load_model_file, start_step, checkpoint.get("datetime")))

    if use_cuda:
        asn = asn.cuda()
    return asn, start_step, optimizer_state_dict, training_args
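A resume sketch under stated assumptions (the checkpoint path and learning rate are placeholders; the optimizer wiring mirrors what `main()` further down does with `optimizer_g`):

import torch
import torch.optim as optim

# hypothetical checkpoint path
asn_model, start_step, opt_state, train_args = create_model(
    use_cuda=torch.cuda.is_available(),
    load_model_file="~/asn_data/model.pth.tar",
    embedding_size=32)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, asn_model.parameters()), lr=1e-4)
if opt_state is not None:
    optimizer.load_state_dict(opt_state)  # continue training from start_step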
Example #16
def save_model(model, optimizer, training_args, is_best, model_folder, step):
    state = {
        'datetime': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'step': step,
        'training_args': training_args,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }

    model_folder = os.path.expanduser(model_folder)
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    filename = os.path.join(model_folder, 'model.pth.tar')
    torch.save(state, filename)
    checkpoint = torch.load(filename)  # reload the just-saved checkpoint (result unused)

    log.info("Saved Model from: {}, step {}".format(filename, step))
    if is_best:
        filename_copy = os.path.join(model_folder, 'model_best.pth.tar')
        shutil.copyfile(filename, filename_copy)
        log.info("copyed to model_best!")
Example #17
def show_sequence(imgs, delay=0, n_frame=1, save_name=None, to_rgb=True):
    """ shows a 2d imgs array like [[img1,img2], with frame counter as titles """
    if n_frame != 1:
        imgs = [i_v[::n_frame] for i_v in imgs]
    titles = [["frame {}".format(n * n_frame) for n in range(len(i))]
              for i in imgs]
    # take the longest title row to show on top
    titles = [sorted(titles, key=len, reverse=True)[0]]
    montage_image = montage(imgs,
                            titles=titles,
                            margin_separate_vertical=0,
                            margin_separate_horizontal=5)
    montage_image = convert_to_uint8(montage_image)
    if to_rgb:
        montage_image = cv2.cvtColor(montage_image, cv2.COLOR_RGB2BGR)
    if save_name is not None:
        cv2.imwrite(save_name, montage_image)
    cv2.imshow('sequence', montage_image)
    log.info('click image and then a key to continue')
    cv2.waitKey(delay)  # == 27:  # ESC
    cv2.destroyAllWindows()
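A small usage example (the video paths are hypothetical; every 10th frame of two synchronized views is tiled into one montage):

view0 = VideoFrameSampler("~/vids/task_view0.mp4", resize_shape=(128, 128)).get_all()
view1 = VideoFrameSampler("~/vids/task_view1.mp4", resize_shape=(128, 128)).get_all()
show_sequence([view0, view1], delay=0, n_frame=10, save_name="sequence.png")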
def log_train(writer, mi, loss_metric, criterion_metric, entropy, global_step):
    """ log to tb and print log msg """

    msg = "steps {}, dist: pos {:.2},neg {:.2},neg cos dist: pos {:.2},cos_neg {:.2}, loss metric:{:.3}".format(
        global_step, mi['dist pos'], mi["dist neg"], mi['dist pos cos'],
        mi['dist neg cos'], loss_metric)
    log.info(msg)
    writer.add_scalar('train/loss' + criterion_metric.__class__.__name__,
                      loss_metric, global_step)
    writer.add_scalars('train/distance', {
        'positive': mi['dist pos'],
        'negative': mi['dist neg']
    }, global_step)
    writer.add_scalars('train/product', {
        'positive': mi['dist pos dot'],
        'negative': mi['dist neg dot']
    }, global_step)

    writer.add_scalars('train/negative_cosine_dist', {
        'positive': mi['dist pos cos'],
        'negative': mi['dist neg cos']
    }, global_step)

    writer.add_scalar('train/loss_entro', entropy, global_step)
def close_open_web_cams():
    # TODO: check for the port here
    # Try to find and kill hanging cv2 process_ids.
    try:
        output = subprocess.check_output(["lsof -t /dev/video*"], shell=True)
        log.info("Found hanging cv2 process_ids:")
        log.info(output)
        log.info("Killing hanging processes...")
        output = str(output)
        for process_id in output.split("\n")[:-1]:
            subprocess.call(["kill %s" % process_id], shell=True)
            time.sleep(3)
        # Recapture webcams.
    except subprocess.CalledProcessError:
        raise ValueError(
            "Cannot connect to cameras. Try running: \n"
            "ls -ltrh /dev/video* \n "
            "to see which ports your webcams are connected to. Then hand those "
            "ports as a comma-separated list to --webcam_ports, e.g. "
            "--webcam_ports 0,1")
    parser.add_argument('--load-model', type=str, required=False)
    parser.add_argument('--val-dir-metric', type=str, default='~/asn_data/val')
    parser.add_argument('--batch-size', type=int, default=1)
    parser.add_argument('--num-views', type=int, default=2)
    parser.add_argument(
        '--task',
        type=str,
        default="cstack",
        help='dataset, load tasks for real block data (cstack)')
    return parser.parse_args()


if __name__ == '__main__':

    args = get_args()
    log.info("args: {}".format(args))
    use_cuda = torch.cuda.is_available()
    print('use_cuda: {}'.format(use_cuda))
    asn, start_epoch, global_step, _ = create_model(use_cuda, args.load_model)
    log.info('start_epoch: {}'.format(start_epoch))
    log.info('asn: {}'.format(asn.__class__.__name__))
    img_size = 299
    vid_name_to_task_func = transform_vid_name_to_task(args.task)
    log.info('args.val_dir_metric: {}'.format(args.val_dir_metric))
    dataloader_val = get_dataloader_val(args.val_dir_metric, args.num_views,
                                        args.batch_size, use_cuda)
    if use_cuda:
        asn.cuda()

    def model_forward(frame_batch):
        if use_cuda:
def _print_dataset_info_txt(self):
    info_txt_file = os.path.join(self.vid_dir, "../../dataset_info.txt")
    if os.path.exists(info_txt_file):
        with open(info_txt_file, 'r') as f:
            log.info("dataset info:\n {}".format(f.read()))
def main():
    args = get_args()
    args.out_dir = os.path.expanduser(args.out_dir)
    ports = list(map(int, args.ports.split(",")))
    log.info("ports: {}".format(ports))
    sample_events = [multiprocessing.Event() for _ in ports]
    num_frames = args.max_frame
    if args.display:
        disp_q = multiprocessing.Queue()
        p = Process(target=display_worker, args=(disp_q, ), daemon=True)
        p.start()
    # process to save images to files
    im_data_q, im_file_q = multiprocessing.Queue(), multiprocessing.Queue()
    img_folder = os.path.join(args.out_dir, "images", args.set_name, args.tag)
    vid_folder = os.path.join(args.out_dir, "videos", args.set_name)
    img_args = (ports, img_folder, args.tag, im_data_q, im_file_q)
    p = Process(target=save_img_worker, args=img_args, daemon=True)
    p.start()

    log.info("img_folder: {}".format(img_folder))
    log.info("vid_folder: {}".format(vid_folder))
    log.info("fps: {}".format(args.fps))

    try:
        time_prev = time.time()
        # loop to sample frames with events
        for frame_cnt, port_data in enumerate(
                sample_frames(ports, sample_events, num_frames)):
            sample_time_dt = time.time() - time_prev
            if frame_cnt % 10 == 0:
                log.info("frame {} time_prev: {}".format(
                    frame_cnt,
                    time.time() - time_prev))

            time_prev = time.time()
            # set events to trigger cams
            for e in sample_events:
                e.set()

            if frame_cnt == 0:
                # skip the first frame because it is not synchronized with the event
                log.info("START: {}".format(frame_cnt))
                continue
            elif (sample_time_dt - 1.0 / args.fps) > 0.1:
                log.warn("sampling a frame takes too long for the requested fps")
            # check sample time diff
            if len(ports) > 1:
                dt = [
                    np.abs(p1["time"] - p2["time"])
                    for p1, p2 in combinations(port_data.values(), 2)
                ]
                # log.info('dt: {}'.format(np.mean(dt)))
                if np.max(dt) > 0.1:
                    log.warn(
                        "camera sample max time dt: {}, check light condition and camera models"
                        .format(np.max(dt)))
            assert all(frame_cnt == d["num"]
                       for d in port_data.values()), "out of sync"

            im_data_q.put(port_data)
            if args.display:
                disp_q.put(port_data)

            time.sleep(1.0 / args.fps)
    except KeyboardInterrupt:
        # create vids from the images saved before
        im_shape = {p: d["frame"].shape for p, d in port_data.items()}
        img_files = defaultdict(list)
        for d in get_all_queue_result(im_file_q):
            for p, f in d.items():
                img_files[p].append(f)
        # TODO: start a process for each view and join
        for view_i, p in enumerate(port_data.keys()):
            save_vid_worker(img_files[p], view_i, vid_folder, args.tag,
                            im_shape[p], args.fps)

    cv2.destroyAllWindows()
def print_frame_len_info(self):
    max_len_vid = max(max(l) for l in self.frame_lengths)
    min_len_vid = min(min(l) for l in self.frame_lengths)
    mean_len_vid = int(np.mean(self.frame_lengths))
    log.info("{} videos frame len mean: {}, min: {}, max: {}".format(
        self.vid_dir, mean_len_vid, min_len_vid, max_len_vid))
def main():
    args = get_args()
    log.info("args: {}".format(args))
    writer = init_log_tb(args.save_folder)
    use_cuda = torch.cuda.is_available()
    print('use_cuda: {}'.format(use_cuda))
    criterion = {"lifted": LiftedStruct(), "liftedcombi": LiftedCombined()}[args.loss]
    log.info("criterion: for {} ".format(
        criterion.__class__.__name__))

    asn, global_step_start, _, _ = create_model(
        use_cuda, args.load_model, embedding_size=args.emb_dim)
    log.info('asn: {}'.format(asn.__class__.__name__))
    asn.train()

    # load function which maps video file name to task for different datasets
    vid_name_to_task = transform_vid_name_to_task(args.task)
    dataloader_val = get_dataloader_val(args.val_dir_metric,
                                        args.num_views, args.batch_size, use_cuda)

    train_filter_func = None
    if args.train_filter_tasks is not None:
        # filter out tasks by names for the training set
        train_filter_tasks = args.train_filter_tasks.split(',')
        log.info('train_filter_tasks: {}'.format(train_filter_tasks))

        def train_filter_func(name, n_frames):
            return all(task not in name for task in train_filter_tasks)  # ABD->C
    examples_per_seq = args.num_example_batch
    dataloader_train = get_dataloader_train(args.train_dir, args.num_views, args.batch_size,
                                            use_cuda,
                                            img_size=299,
                                            filter_func=train_filter_func,
                                            examples_per_seq=examples_per_seq)

    all_view_pair_names = dataloader_train.dataset.get_all_comm_view_pair_names()
    all_view_pair_frame_lengths = dataloader_train.dataset.frame_lengths

    # for every task one label based on video name
    # not used to train the models
    transform_comm_name, num_domain_task_classes, task_names = val_fit_task_label(vid_name_to_task, all_view_pair_names)
    log.info('task names: {}'.format(task_names))

    # func to transform video name to a task label
    label_funcs = {'domain task label': transform_comm_name}
    num_domain_frames = args.num_domain_frames

    # embedding class
    log.info('num_domain_frames: {}'.format(num_domain_frames))

    # Discriminator network with inputs/outputs depending on the args settings
    net_input = args.emb_dim * num_domain_frames
    d_net = Discriminator(net_input, H=args.d_net_hidden_dim, z_dim=args.d_net_z_dim, d_out=[num_domain_task_classes])

    # DATA domain
    # filter out fake examples and tasks for D net
    stride = args.multi_domain_frames_stride
    if args.train_filter_tasks is not None:
        def filter_func_domain(name, frames_cnt):
            """ filter out fake examples and the filtered training tasks """
            return "fake" not in name and all(task not in name for task in train_filter_tasks)
    else:
        def filter_func_domain(name, frames_cnt):
            """ filter out fake examples """
            return "fake" not in name

    dataloader_train_domain = get_skill_dataloader(args.train_dir,
                                                   args.num_views,
                                                   args.batch_size,
                                                   use_cuda,
                                                   img_size=299,
                                                   filter_func=filter_func_domain,
                                                   label_funcs=label_funcs,
                                                   num_domain_frames=num_domain_frames,
                                                   stride=stride)
    if use_cuda:
        torch.cuda.seed()
        criterion.cuda()
        asn.cuda()
        d_net.cuda()

    model_forward_cuda = functools.partial(model_forward, mdl=asn, use_cuda=use_cuda, to_numpy=False)
    model_forward_np = functools.partial(model_forward, mdl=asn, use_cuda=use_cuda, to_numpy=True)

    # define optimizer for encoder (g) and Discriminator (d)
    params_asn = filter(lambda p: p.requires_grad, asn.parameters())
    optimizer_g = optim.Adam(params_asn, lr=args.lr_d)
    optimizer_d = optim.Adam(d_net.parameters(), lr=args.lr_g)

    assert isinstance(criterion, (LiftedStruct, LiftedCombined))
    key_views = ["frames views {}".format(i) for i in range(args.num_views)]
    iter_metric = iter(data_loader_cycle(dataloader_train))
    iter_domain = iter(data_loader_cycle(dataloader_train_domain))
    loss_val_min = None
    loss_val_min_step = 0

    for global_step in range(global_step_start, args.steps):

        # =======================================================
        # update the encoder network
        sample_batched = next(iter_metric)
        # metric loss
        img = torch.cat([sample_batched[key_views[0]],
                         sample_batched[key_views[1]]])
        embeddings = model_forward_cuda(Variable(img))
        n = sample_batched[key_views[0]].size(0)
        anchor_emb, positive_emb = embeddings[:n], embeddings[n:]
        label_positive_pair = np.arange(n)
        labels = Variable(torch.Tensor(np.concatenate([label_positive_pair, label_positive_pair]))).cuda()

        # METRIC loss
        if examples_per_seq == 1:
            loss_metric = criterion(embeddings, labels)
        else:
            loss_metric = multi_vid_batch_loss(criterion, embeddings, labels,
                                               num_vid_example=examples_per_seq)

        # set input and targets
        sample_batched_domain = next(iter_domain)

        img_domain = torch.cat([sample_batched_domain[key_views[0]],
                                sample_batched_domain[key_views[1]]])
        emb_asn = model_forward_cuda(Variable(img_domain))

        if num_domain_frames != 1:
            # multiple frames as skills
            bl = emb_asn.size(0)
            emb_size = emb_asn.size(1)
            emb_asn = emb_asn.view(bl // num_domain_frames, num_domain_frames * emb_size)
            # mask out label for cat view

        kl_loss, d_out_gen = d_net(emb_asn)
        d_out_gen = d_out_gen[0]

        # min the entropy for different classes
        optimizer_g.zero_grad()
        optimizer_d.zero_grad()

        # ensure equal usage of fake samples
        loss_g = loss_metric * 0.1

        # maximize the entropy
        entropy_fake = entropy(d_out_gen)
        entropy_fake.backward(retain_graph=True)
        entropy_margin = -1. * marginalized_entropy(d_out_gen)
        # ensure equal usage of fake samples
        entropy_margin.backward(retain_graph=True)

        # update the encoder network
        loss_g.backward(retain_graph=True)
        optimizer_g.step()

        optimizer_g.zero_grad()
        optimizer_d.zero_grad()

        # =======================================================
        # update the Discriminator

        # maximize marginalized entropy over real samples to ensure equal usage
        entropy_margin = -1. * marginalized_entropy(d_out_gen)
        entropy_margin.backward(retain_graph=True)
        # minimize entropy to make certain prediction of real sample
        entropy_real = -1. * entropy(d_out_gen)
        entropy_real.backward(retain_graph=True)
        kl_loss.backward()
        optimizer_d.step()

        if global_step % 100 == 0 or global_step == 1:
            # log training
            loss_metric = loss_g.data.cpu().numpy().item()
            mi = get_metric_info_multi_example(anchor_emb.data.cpu().numpy(), positive_emb.data.cpu().numpy())
            log_train(writer, mi, loss_metric, criterion, entropy_fake, global_step)

        # =======================================================
        # Validation
        if global_step % args.val_step == 0 and global_step > global_step_start:
            log.info("==============================")
            asn.eval()

            if args.plot_tsne and global_step % 20000 == 0:
                # save a tsne plot
                visualize_embeddings(model_forward_cuda, dataloader_val, summary_writer=None,
                                     global_step=global_step, save_dir=args.save_folder,
                                     label_func=vid_name_to_task)
            loss_val, nn_dist, dist_view_pairs, frame_distribution_err_cnt = view_pair_alignment_loss(
                model_forward_np, args.num_views, dataloader_val)
            asn.train()

            writer.add_histogram("val/frame_error_count",
                                 np.array(frame_distribution_err_cnt), global_step)
            writer.add_scalar('val/alignment_loss',
                              loss_val, global_step)
            writer.add_scalar('val/nn_distance',
                              nn_dist, global_step)
            writer.add_scalar(
                'val/distance_view_pairs_same_frame', dist_view_pairs, global_step)

            is_best = False
            if loss_val_min is None or loss_val < loss_val_min:
                loss_val_min = loss_val
                loss_val_min_step = global_step
                is_best = True

            msg = "Validation alignment loss: {}, nn mean dist {:.3}, lowest loss {:.4} at {} steps".format(
                loss_val, nn_dist, loss_val_min, loss_val_min_step)
            log.info(msg)
            save_model(asn, optimizer_g, args, is_best,
                       args.save_folder, global_step)

    writer.close()
Example #25
    def __init__(
        self,
        inception,
        additional_conv_sizes=[512, 512],
        fc_hidden_sizes=[2048],
        embedding_size=32,
        dp_ratio_pretrained_act=0.2,
        dp_ratio_conv=1.0,
        dp_ratio_fc=0.2,
        rnn_type=None,
        mode_gaussian_dist=False,
        latent_z_dim=512,
        rnn_forward_seqarade=False,
        l2_normalize_output=False,
        finetune_inception=False,
    ):
        super().__init__()
        self.gaussian_mode = mode_gaussian_dist
        self.embedding_size = embedding_size
        log.info("finetune_inception: {}".format(finetune_inception))
        if not finetune_inception:
            # disable training for inception v3
            for child in inception.children():
                for param in child.parameters():
                    param.requires_grad = False

        # see:
        # https://github.com/pytorch/vision/blob/master/torchvision/models/inception.py
        self.inception_end_point_mixed_5d = nn.ModuleList([
            inception.Conv2d_1a_3x3,
            inception.Conv2d_2a_3x3,
            inception.Conv2d_2b_3x3,
            nn.MaxPool2d(kernel_size=3, stride=2),
            inception.Conv2d_3b_1x1,
            inception.Conv2d_4a_3x3,
            nn.MaxPool2d(kernel_size=3, stride=2),
            inception.Mixed_5b,
            inception.Mixed_5c,
            inception.Mixed_5d,
        ])

        in_channels = 288
        self.Conv2d_6n_3x3 = nn.ModuleList()
        if dp_ratio_pretrained_act < 1.0:
            self.Conv2d_6n_3x3.append(nn.Dropout(p=dp_ratio_pretrained_act))
        # padding=1 to match the TF 'SAME' padding
        for i, out_channels in enumerate(additional_conv_sizes):
            self.Conv2d_6n_3x3.append(
                BNConv2d(in_channels,
                         out_channels,
                         padding=1,
                         kernel_size=3,
                         stride=1))
            if dp_ratio_conv < 1.0:
                self.Conv2d_6n_3x3.append(nn.Dropout(p=dp_ratio_conv))
            in_channels = out_channels

        # Take the spatial soft arg-max of the last convolutional layer.
        self.SpatialSoftmax = SpatialSoftmax(channel=512, height=35,
                                             width=35)  # nn.Softmax2d()
        self.FullyConnected7n = nn.ModuleList([Flatten()])
        in_channels = 1024  # out of SpatialSoftmax

        self.num_freatures = int(in_channels)
        for i, num_hidden in enumerate(fc_hidden_sizes):
            self.FullyConnected7n.append(
                Dense(in_channels, num_hidden, activation=F.relu))
            if dp_ratio_fc > 0.0:
                self.FullyConnected7n.append(nn.Dropout(p=dp_ratio_fc))
            in_channels = num_hidden

        if self.gaussian_mode:
            self.FullyConnected7n.append(
                Dense(in_channels, 512, activation=F.relu))
            self.l_mu = Dense(512, latent_z_dim)
            self.l_var = Dense(512, latent_z_dim)
            # output layers for the sampled latent variable
            self.lat_sampled_out_emb = nn.ModuleList([
                Dense(latent_z_dim, 512, activation=F.relu),
                nn.Dropout(p=0.2),
                Dense(512, 512, activation=F.relu),
                nn.Dropout(p=0.2),
                Dense(512, embedding_size),
            ])
            self._sequential_z_out = nn.Sequential(*self.lat_sampled_out_emb)
        else:
            self.FullyConnected7n.append(Dense(in_channels, embedding_size))

        self._all_sequential_feature = nn.Sequential(
            *self.inception_end_point_mixed_5d, *self.Conv2d_6n_3x3,
            self.SpatialSoftmax)

        self._all_sequential_emb = nn.Sequential(*self.FullyConnected7n)
        self.l2_normalize_output = l2_normalize_output
        # use l2 norm with triplet loss
        if l2_normalize_output:
            log.info("TCN with l2 norm out")
def plot_embedding(X,
                   labels_str,
                   title,
                   imgs=None,
                   save_dir=None,
                   frame_lable=None,
                   max_frame=None,
                   vid_lable=None):
    # http://scikit-learn.org/stable/auto_examples/manifold/plot_lle_digits.html
    x_min, x_max = np.min(X, 0), np.max(X, 0)
    X = (X - x_min) / (x_max - x_min)
    if imgs is not None:
        fig = plt.figure(figsize=(20, 20))
        ax = plt.subplot(221)
    else:
        fig = plt.figure()
        ax = fig.gca()

    # labels below the plot
    n_classes, y, colors, legend_elements = plt_labeled_data(ax, X, labels_str)

    plt.title(title)
    if imgs is not None:
        # plt again but with image overlay
        ax = plt.subplot(222)
        ax.set_title("image overlay")
        ax.scatter(X[:, 0], X[:, 1], color=colors)
        if hasattr(offsetbox, "AnnotationBbox"):
            # only print thumbnails with matplotlib > 1.0
            shown_images = np.array([[1.0, 1.0]])  # just something big
            for i in range(X.shape[0]):
                dist = np.sum((X[i] - shown_images)**2, 1)
                if np.min(dist) < 5e-3:
                    # don't show points that are too close
                    continue
                shown_images = np.r_[shown_images, [X[i]]]
                imagebox = offsetbox.AnnotationBbox(offsetbox.OffsetImage(
                    imgs[i], cmap=plt.cm.gray_r, zoom=0.75),
                                                    X[i],
                                                    pad=0.0)
                ax.add_artist(imagebox)

        # plot legend, same as before
        plt_labels_blow(ax, list(legend_elements.values()))

    if frame_lable is not None:
        # plot the frame index as class
        # show a legend color for every 50th frame
        ax = plt.subplot(223)
        plt_labeled_data(
            ax,
            X,
            frame_lable,
            label_filter_legend=lambda l: l % 50 == 0,
            plt_cm=plt.cm.Spectral,
            index_color_factor=max_frame,
        )

        ax.set_title("frames as label (color range normalized for every vid)")
    if vid_lable is not None:
        # plot the view pair as class
        ax = plt.subplot(224)
        plt_labeled_data(ax, X, vid_lable, label_filter_legend=lambda x: False)

        ax.set_title("view pair as label")

    if save_dir is not None:
        create_dir_if_not_exists(save_dir)
        save_dir = os.path.expanduser(save_dir)
        title = os.path.join(save_dir, title)
    fig.savefig(title + ".pdf", bbox_inches="tight")
    log.info("save TSNE plt to: {}".format(title))
    plt.close("all")
def visualize_embeddings(
    func_model_forward,
    data_loader,
    summary_writer=None,
    global_step=0,
    seq_len=None,
    stride=None,
    label_func=None,
    save_dir=None,
    tag="",
    emb_size=32,
):
    """visualize embeddings with tensorboardX

    Args:
        summary_writer(tensorboardX.SummaryWriter):
        data_loader(ViewPairDataset): with shuffle false
        label_func: function to label a frame: input is (vid_file_comm,frame_idx=None,vid_len=None,csv_file=None,state_label=None)
    Returns:
        None
        :param func_model_forward:
        :param global_step:
        :param seq_len:
        :param stride:
        :param save_dir:
        :param tag:
        :param emb_size:

    """
    assert isinstance(
        data_loader.dataset,
        ViewPairDataset), "dataset must be of type ViewPairDataset"
    data_len = len(data_loader.dataset)
    vid_dir = data_loader.dataset.vid_dir

    if seq_len:
        assert stride is not None
        # cut off first frames
        data_len -= seq_len * stride * len(data_loader.dataset.video_paths)
    embeddings = np.empty((data_len, emb_size))
    img_size = 50  # image size to plot
    frames = torch.empty((data_len, 3, img_size, img_size))
    # transform the image to plot it later
    trans = transforms.Compose([
        transforms.ToPILImage(),  # expects rgb, moves channel to front
        transforms.Resize(img_size),
        transforms.ToTensor(),  # image 0-255 to 0. - 1.0
    ])
    cnt_data = 0
    labels = []
    view_pair_name_labels = []
    labels_frame_idx = []
    vid_len_frame_idx = []
    with tqdm(total=len(data_loader),
              desc="computing embeddings for {} frames".format(
                  len(data_loader))) as pbar:
        for i, data in enumerate(data_loader):
            # compute the emb for a batch
            frames_batch = data["frame"]
            if seq_len is None:
                emb = func_model_forward(frames_batch)
                # add emb to dict and to queue if all frames
                # for e, name, view, last in zip(emb, data["common name"], data["view"].numpy(), data['is last frame'].numpy()):
                # transform all frames to a smaller image to plt later
                for e, frame in zip(emb, frames_batch):
                    embeddings[cnt_data] = e
                    # transform is only possible for one img at a time
                    frames[cnt_data] = trans(frame).cpu()
                    cnt_data += 1
                    if data_len == cnt_data:
                        break
                state_label = data.get("state lable", None)
                comm_name = data["common name"]
                frame_idx = data["frame index"]
                vid_len = data["video len"]
                labels_frame_idx.extend(frame_idx.numpy())
                vid_len_frame_idx.extend(vid_len.numpy())
                if label_func is not None:
                    state_label = len(comm_name) * [
                        None
                    ] if state_label is None else state_label
                    state_label = [
                        label_func(c, i, v_len, get_video_csv_file(vid_dir, c),
                                   la) for c, la, i, v_len in
                        zip(comm_name, state_label, frame_idx, vid_len)
                    ]
                else:
                    state_label = comm_name
                labels.extend(state_label)
                view_pair_name_labels.extend(comm_name)
                if data_len == cnt_data:
                    break
            else:
                raise NotImplementedError()

            pbar.update(1)

    log.info("number of found labels: {}".format(len(labels)))
    if len(labels) != len(embeddings):
        # in case the rnn seq is cut off at the end, or in case of drop_last
        log.warn(
            "number of labels {} smaller than embeddings, changing embeddings size"
            .format(len(labels)))
        embeddings = embeddings[:len(labels)]
        frames = frames[:len(labels)]
    if len(labels) == 0:
        log.warn("length of labels is zero!")
    else:
        log.info("start TSNE fit")
        labels = labels[:data_len]
        imgs = flip_imgs(frames.numpy(), rgb_to_front=False)
        rnn_tag = "_seq{}_stride{}".format(
            seq_len, stride) if seq_len is not None else ""
        X_tsne = TSNE_multi(n_jobs=4, perplexity=40).fit_transform(
            embeddings)  # perplexity = 40, theta=0.5
        create_time_vid(X_tsne, labels_frame_idx, vid_len_frame_idx)
        plot_embedding(
            X_tsne,
            labels,
            title=tag + "multi-t-sne_perplexity40_theta0.5_step" +
            str(global_step) + rnn_tag,
            imgs=imgs,
            save_dir=save_dir,
            frame_lable=labels_frame_idx,
            max_frame=vid_len_frame_idx,
            vid_lable=view_pair_name_labels,
        )