Example #1
def train_model(train_batches, word2idx, epochs, valid_batches, model_save, params, use_gpu):
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_batches, desc="Epoch %d/%d"%(epoch+1, epochs)):
            model.zero_grad()
            model.hidden = model.init_hidden()
            
            X = utils.prepare_input(batch[:-1,:])
            y = utils.prepare_input(batch[1:,:])

            if use_gpu:
                X = X.cuda()
                y = y.cuda()

            output_scores = model(X)

            true_y = y.contiguous().view(-1, 1).squeeze()
            pred_y = output_scores.view(-1, len(word2idx))
            
            loss = loss_function(pred_y, true_y)
            total_loss += loss.item()
            
            loss.backward()
            optimizer.step()

        params["model"] = model.state_dict()
        params["optimizer"] = optimizer.state_dict()
        params["epoch"] = epoch
        torch.save(params, model_save+"_"+str(epoch))

        print ("Training loss: ", total_loss.data.cpu().numpy()/len(train_batches))
        model.eval()
        print ("Validation loss: ", utils.evaluate(model, loss_function, valid_batches)/len(valid_batches))
        model.train()
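The loop above relies on `model`, `loss_function`, and `optimizer` being defined at module level rather than passed in. A minimal, hypothetical sketch of that setup; the class name and all hyperparameters below are assumptions, not taken from the original repository:

# Hypothetical module-level setup assumed by train_model above (sketch only).
import torch
import torch.nn as nn
import torch.optim as optim

class WordLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim=128, hidden_dim=256, batch_size=32):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, vocab_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # fresh (hidden, cell) states shaped (num_layers, batch, hidden_dim)
        return (torch.zeros(1, self.batch_size, self.hidden_dim),
                torch.zeros(1, self.batch_size, self.hidden_dim))

    def forward(self, x):
        emb = self.embed(x)                         # (seq_len, batch, emb_dim)
        out, self.hidden = self.lstm(emb, self.hidden)
        return self.out(out)                        # (seq_len, batch, vocab_size)

word2idx = {"<pad>": 0, "<unk>": 1}                 # assumed; the real mapping is built elsewhere
model = WordLSTM(vocab_size=len(word2idx))
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)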
Example #2
def render(policy, embedding_net, device):
    from torchvision import transforms
    trans = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    env = MarioEnvironment()
    s = env.reset()
    images = [s]
    s = s.reshape(s.shape[1], s.shape[2], s.shape[0])
    # print(s.shape)
    s = trans(s)
    # temp = s[..., np.newaxis] * np.ones(3)
    # temp = temp.squeeze()
    # temp = temp.reshape(temp.shape[1], temp.shape[0], temp.shape[2])
    # print(temp.shape)
    coin = 0
    for _ in range(40):
        # env.render()
        # s = np.reshape(s, (s.shape[0]*s.shape[1]*s.shape[2]))
        input_state = prepare_input(s)
        input_state = embedding_net(input_state)

        action_dist, action = policy(input_state)
        action_dist, action = action_dist[0], action[
            0]  # Remove the batch dimension
        s_prime, r, t, coins = env.step(action)
        # print(r, t, coins)
        coin += coins
        if t:
            break
        s = s_prime
        images.append(s)
        s = s.reshape(s.shape[1], s.shape[2], s.shape[0])
        # s = s.reshape(s.shape[0] * s.shape[1] * s.shape[2])
        s = trans(s)

        # temp = s[..., np.newaxis] * np.ones(3)
        # temp = temp.squeeze()
        # temp = temp.reshape(temp.shape[1], temp.shape[0], temp.shape[2])

    # Create gifs
    print('total coins', coin)
    make_gif(images, '0.gif')
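`prepare_input` itself is not shown in this example; judging from the call site it only needs to turn the transformed frame into a batch the embedding network accepts. A hypothetical one-line helper under that assumption (the real helper in the source repository may differ, e.g. it may also move the tensor to a device):

# Hypothetical prepare_input for this example: assumes the helper only adds a
# batch dimension and casts to float. The real implementation is not shown above.
import torch

def prepare_input(state: torch.Tensor) -> torch.Tensor:
    return state.float().unsqueeze(0)  # (C, H, W) -> (1, C, H, W)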
Example #3
def safe_decoder(model, image_file, res_type):
    try:
        if (not os.path.isfile(image_file)):
            raise IOError('fail to locate image_file')
        # load image array from file
        image_array = imread(image_file)
        if (image_array is None):
            raise IOError('fail to decode image_file')
        # prepare image array as theano tensor
        image_shape = image_array.shape[:2]
        th_tensor = prepare_input(image_array, res_type)
        if (th_tensor.mean() < 0):
            th_tensor = -th_tensor
        # decode input using the pretrained PPT text detector
        res_map = decode_image(model, th_tensor, image_shape)
    except Exception as e:
        print("WARNING: something wrong during decoding", image_file, e)
        res_map = None
    return res_map
Example #4
    def run(self):
        while not rospy.is_shutdown():

            data = None
            while data is None:
                try:
                    data = rospy.wait_for_message("camera", Image, timeout=10)
                except rospy.ROSException:
                    # keep retrying until an image arrives on the "camera" topic
                    pass

            cv_image = utils.prepare_input(data, self.crop_size)
            prediction = self.model.predict(cv_image)
            segmentation_result, segmentation_overlay_result = utils.prepare_output(
                prediction[0, :, :, 1], cv_image[0], self.view_size,
                self.prediction_threshold)

            self.seg_pub.publish(utils.get_cv_msg(segmentation_result))
            self.seg_overlay_pub.publish(
                utils.get_cv_msg(segmentation_overlay_result))
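The `run` loop reads `self.model`, `self.crop_size`, `self.view_size`, `self.prediction_threshold`, and the two publishers, none of which are defined in the excerpt. A hypothetical constructor that would satisfy those references; the node name, topic names, and default values are assumptions:

# Hypothetical constructor for the node that owns run() above (sketch only).
import rospy
from sensor_msgs.msg import Image

class SegmentationNode:
    def __init__(self, model, crop_size=256, view_size=512, prediction_threshold=0.5):
        rospy.init_node('segmentation_node')
        self.model = model
        self.crop_size = crop_size
        self.view_size = view_size
        self.prediction_threshold = prediction_threshold
        # Publishers consumed at the end of run()
        self.seg_pub = rospy.Publisher('segmentation', Image, queue_size=1)
        self.seg_overlay_pub = rospy.Publisher('segmentation_overlay', Image, queue_size=1)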
Example #5
                  on='patient_id',
                  how='left')

    # df = df[df['signal_len'] >= 15000]

    patient_ids = df['patient_id'].to_numpy()
    to_explain = patient_ids[:background * 2]

    background_patient_ids = df.head(background)['patient_id'].to_numpy()

    background_inputs = [
        os.path.join(data_dir, patient_id)
        for patient_id in background_patient_ids
    ]
    background_inputs = torch.stack([
        torch.from_numpy(prepare_input(input)).float()
        for input in background_inputs
    ]).to(device)
    background_inputs = background_inputs[:, use_leads, :]

    e = shap.GradientExplainer(model, background_inputs)

    if not os.path.exists(result_path):
        svs = []
        y_scores = []
        for patient_id in tqdm(to_explain):
            input = os.path.join(data_dir, patient_id)
            inputs = torch.stack(
                [torch.from_numpy(prepare_input(input)).float()]).to(device)
            inputs = inputs[:, use_leads, :]
            y_scores.append(
Example #6
data.transform = Compose(
    [Lambda(lambda x: (np.array(x).reshape((28, 28)) - mean) / std)])

images = []
images_orig = []
boundaries = []

for i, (image, label) in tqdm(enumerate(data),
                              desc='Preparing dataset',
                              total=n_of_samples,
                              position=0,
                              leave=True):
    if i == n_of_samples:
        break
    images_orig.append(image)
    torch_image, bound = prepare_input(image)
    images.append(torch.from_numpy(torch_image))
    boundaries.append(bound)

dataset_size: int = len(images)
indices: list = list(range(dataset_size))
split: int = int(np.floor(val_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]

train_sampler: SubsetRandomSampler = SubsetRandomSampler(train_indices)
test_sampler: SubsetRandomSampler = SubsetRandomSampler(val_indices)

my_set = ImageSet(images, images_orig, boundaries)
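The two samplers and the `ImageSet` above are presumably fed into `DataLoader`s for training and validation; a minimal sketch of that wiring (the batch size is an assumption, not in the excerpt):

# Hypothetical DataLoader wiring for the samplers defined above (batch size assumed).
from torch.utils.data import DataLoader

batch_size = 64  # assumed
train_loader = DataLoader(my_set, batch_size=batch_size, sampler=train_sampler)
val_loader = DataLoader(my_set, batch_size=batch_size, sampler=test_sampler)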
Example #7
        os.path.join(options.data_path, 'color-input', input_file + '.png'))
    rgb_bg = Image.open(
        os.path.join(options.data_path, 'color-background',
                     input_file + '.png'))
    depth_in = Image.open(
        os.path.join(options.data_path, 'depth-input', input_file + '.png'))
    depth_bg = Image.open(
        os.path.join(options.data_path, 'depth-background',
                     input_file + '.png'))
    label = Image.open(
        os.path.join(options.data_path, 'label', input_file + '.png'))
    cam_intrinsic = np.loadtxt(
        os.path.join(options.data_path, 'camera-intrinsics',
                     input_file + '.txt'))

    img_input = prepare_input(rgb_in, depth_in, options.device)

    ## inference
    print('computing inference: ', input_file)
    t = time.time()
    output = model(img_input)

    cls_pred = np.squeeze(output.data.max(1)[1].cpu().numpy(),
                          axis=0).astype(np.float64)
    cls_pred = resize(cls_pred, (options.img_height, options.img_width),
                      anti_aliasing=True,
                      mode='reflect')

    pred = np.squeeze(output.data.cpu().numpy(), axis=0)[1, :, :]
    pred = resize(pred, (options.img_height, options.img_width),
                  anti_aliasing=True,
Example #8
    background = 100
    result_path = f'results/A{background * 2}.npy'

    df_labels = pd.read_csv(label_csv)
    df_reference = pd.read_csv(os.path.join(args.data_dir, 'reference.csv'))
    df = pd.merge(df_labels, df_reference[['patient_id', 'age', 'sex', 'signal_len']], on='patient_id', how='left')

    # df = df[df['signal_len'] >= 15000]

    patient_ids = df['patient_id'].to_numpy()
    to_explain = patient_ids[:background * 2]

    background_patient_ids = df.head(background)['patient_id'].to_numpy()
    background_inputs = [os.path.join(data_dir, patient_id) for patient_id in background_patient_ids]
    background_inputs = torch.stack([torch.from_numpy(prepare_input(input)).float() for input in background_inputs]).to(device)
    
    e = shap.GradientExplainer(model, background_inputs)

    if not os.path.exists(result_path):
        svs = []
        y_scores = []
        for patient_id in tqdm(to_explain):
            input = os.path.join(data_dir, patient_id)
            inputs = torch.stack([torch.from_numpy(prepare_input(input)).float()]).to(device)
            y_scores.append(torch.sigmoid(model(inputs)).detach().cpu().numpy())
            sv = np.array(e.shap_values(inputs)) # (n_classes, n_samples, n_leads, n_points)
            svs.append(sv)
        svs = np.concatenate(svs, axis=1)
        y_scores = np.concatenate(y_scores, axis=0)
        np.save(result_path, (svs, y_scores))
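The snippet only computes and saves when `result_path` does not yet exist; reloading the saved tuple for later analysis could look like the sketch below. `allow_pickle=True` is required because `np.save` stored a Python tuple as an object array.

# Reloading the SHAP values and prediction scores saved above (sketch).
import numpy as np

svs, y_scores = np.load(result_path, allow_pickle=True)
# svs:      (n_classes, n_samples, n_leads, n_points)
# y_scores: (n_samples, n_classes)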
Example #9
                         dtype=np.float64)  # [inference, post-processing]
    failed = []
    for n, fname in tqdm.tqdm(enumerate(test_img_list),
                              total=len(test_img_list),
                              desc='eval',
                              ncols=80,
                              leave=False):
        # print(fname, "%d/%d: " % (n, test_len), end='')

        color_in = Image.open(
            os.path.join(options.data_path, 'color-input', fname + '.png'))
        depth_in = Image.open(
            os.path.join(options.data_path, 'depth-input', fname + '.png'))
        label = Image.open(
            os.path.join(options.data_path, 'label', fname + '.png'))
        rgb_input, ddd_input = prepare_input(color_in, depth_in,
                                             options.device)

        ## forward pass
        with torch.no_grad():
            t = time.perf_counter()
            output = model(rgb_input, ddd_input)
            inf_time = time.perf_counter() - t

            ## moving out to cpu, so that does not impact time measurement
            torch.cuda.synchronize()
            time.sleep(0.1)

            t = time.perf_counter()
            cls_pred = output.data.argmax(1).detach().cpu().numpy().squeeze(0)
            ## get the first channel -> the probability of success
            pred = output.data.detach().cpu().numpy().squeeze(0)[1]
Example #10
def main(args):
    # const
    threshold_up = [
        .7,  # "road",
        .7,  # "sidewalk",
        .4,  # "building",
        .5,  # "wall",
        .6,  # "fence",
        .65,  # "pole",
        .65,  # "traffic light",
        .65,  # "traffic sign",
        .4,  # "vegetation",
        .7,  # "terrain",
        .4,  # "sky",
    ]

    threshold_down = [
        .4,  # "road",
        .4,  # "sidewalk",
        .7,  # "building",
        .5,  # "wall",
        .6,  # "fence",
        .65,  # "pole",
        .65,  # "traffic light",
        .65,  # "traffic sign",
        .7,  # "vegetation",
        .4,  # "terrain",
        .7,  # "sky",
    ]

    # model
    model = torch.hub.load('zhanghang1989/ResNeSt',
                           'resnest101',
                           pretrained=False)
    model.fc = nn.Linear(2048, 19, bias=True)
    model.load_state_dict(torch.load(args.weight)['state_dict'])
    model.eval()
    model = model.cuda()

    # CAM
    target_layer = model.layer4[2].conv3
    wrapped_model = GradCAMpp(model, target_layer)

    # dataset
    files = [
        f for f in glob(args.root + '/**', recursive=True) if os.path.isfile(f)
    ]
    for filename in tqdm(files):
        origin_img = Image.open(filename)
        inputs = prepare_input(np.array(origin_img))
        inputs = inputs.view(3, 1024, 4, 512).permute(2, 0, 1, 3).reshape(
            4, 3, 2, 512, 512).permute(0, 2, 1, 3, 4).reshape(8, 3, 512, 512)

        masks = []
        for i in range(inputs.shape[0]):
            if i % 2 == 0:  # up part
                threshold = threshold_up
            else:
                threshold = threshold_down
            tensor = inputs[i].unsqueeze(0).cuda()
            target = model(tensor).cpu().sigmoid()
            indices = (target[0][:11] > 0.5).nonzero().view(-1).tolist()
            cams = []
            for j in indices:
                cam, idx = wrapped_model(tensor, idx=j)
                cam = nn.functional.interpolate(cam.cpu(),
                                                size=tuple(tensor.size()[-2:]),
                                                mode='bilinear',
                                                align_corners=False)
                cam = cam.squeeze(0).squeeze(0).numpy()
                cams.append(cam)

            area = np.zeros(len(indices), dtype=np.uint32)
            for (idx, c) in enumerate(cams):
                area[idx] = (c >= threshold[indices[idx]]).sum()
            order = area.argsort()[::-1]
            mask = np.zeros((512, 512), dtype=np.uint8)
            mask.fill(255)
            for o in order:
                c, idx = cams[o], indices[o]
                mask[c >= threshold[idx]] = idx
            masks.append(mask)
        out_array = np.hstack([
            np.vstack((masks[i], masks[i + 1]))
            for i in range(0, inputs.shape[0], 2)
        ])
        out_img = Image.fromarray(out_array)
        out_img.putpalette(city_palette)
        out_img.save(os.path.join(args.output, filename.split('\\')[-1]))
Example #11
                         input_path + '.png'))
        color_bg = Image.open(
            os.path.join(options.data_path, 'color-background',
                         input_path + '.png'))
        depth_in = Image.open(
            os.path.join(options.data_path, 'depth-input',
                         input_path + '.png'))
        depth_bg = Image.open(
            os.path.join(options.data_path, 'depth-background',
                         input_path + '.png'))
        label = Image.open(
            os.path.join(options.data_path, 'label', input_path + '.png'))
        cam_intrinsic = np.loadtxt(
            os.path.join(options.data_path, 'camera-intrinsics',
                         input_path + '.txt'))
        img_input = prepare_input(color_in, depth_in, options.device)

        ## forward pass
        t = time.time()
        output = model(img_input)
        inf_t = time.time() - t

        ## get segmentation class prediction
        cls_pred = np.squeeze(output.data.max(1)[1].cpu().numpy(),
                              axis=0).astype(np.float64)
        cls_pred = resize(cls_pred, (options.img_height, options.img_width),
                          anti_aliasing=True,
                          mode='reflect')

        ## get the probability of suction area (index 1)
        pred = np.squeeze(output.data.cpu().numpy(), axis=0)[1, :, :]
Example #12
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import sys

if __name__ == '__main__':
    assert len(sys.argv) > 1

    model = Network().double().to(device)
    data: dict = torch.load(model_filename)
    model.load_state_dict(data['model_state_dict'])
    mean = data['mean']
    std = data['std']
    img = np.array(Image.open(sys.argv[1]).convert('L'))
    assert img.shape == (28, 28)
    image_orig = copy.deepcopy(img)
    img, boundaries = prepare_input((img - mean) / std)

    with torch.no_grad():
        model.train(False)
        res = model(to_tensor(img).to(device).view(1, *img.shape))
        a = (res[:, k:k + k * n].reshape(k, n) * std + mean).cpu().data.numpy()
        p = res[:, :k].reshape(-1).cpu().data.numpy()

        h = sum(p[i] * a[i] for i in range(k))
        h[h < 0] = 0
        h[h > 255] = 255

    ax = plt.gca()
    (hole_beg_x, hole_end_x), (hole_beg_y, hole_end_y) = boundaries
    rect = patches.Rectangle((hole_beg_y - 0.5, hole_beg_x - 0.5),
                             10,
Example #13
def main(args):
    # const
    threshold = [
        .5,  # "road",
        .5,  # "sidewalk",
        .5,  # "building",
        .5,  # "wall",
        .7,  # "fence",
        .7,  # "pole",
        .7,  # "traffic light",
        .7,  # "traffic sign",
        .5,  # "vegetation",
        .5,  # "terrain",
        .5,  # "sky",
    ]

    # dataset
    filename = "sample/aachen_000001_000019_leftImg8bit.png"
    origin_img = Image.open(filename)
    inputs = prepare_input(np.array(origin_img))

    # model
    model = torch.hub.load('zhanghang1989/ResNeSt',
                           'resnest101',
                           pretrained=False)
    model.fc = nn.Linear(2048, 19, bias=True)
    model.load_state_dict(torch.load(args.weight)['state_dict'])
    model.eval()
    model = model.cuda()

    # CAM
    target_layer = model.layer4[2].conv3
    wrapped_model = GradCAMpp(model, target_layer)

    tensor = inputs.cuda()
    target = model(tensor).cpu().sigmoid()
    indices = (target[0][:11] > 0.5).nonzero().view(-1).tolist()
    cams = []
    for j in indices:
        cam, idx = wrapped_model(tensor, idx=j)
        cam = nn.functional.interpolate(cam.cpu(),
                                        size=tuple(tensor.size()[-2:]),
                                        mode='bilinear',
                                        align_corners=False)
        cam = cam.squeeze(0).squeeze(0).numpy()
        cams.append(cam)

    area = np.zeros(len(indices), dtype=np.uint32)
    for (idx, c) in enumerate(cams):
        i = indices[idx]
        area[idx] = (c >= threshold[i]).sum()
    order = area.argsort()[::-1]
    mask = np.zeros_like(cams[0], dtype=np.uint8)
    mask.fill(255)
    for o in order:
        c, i = cams[o], indices[o]
        mask[c >= threshold[i]] = i

    out_img = Image.fromarray(mask)
    out_img.putpalette(city_palette)
    out_img.save(os.path.join(args.output, filename.split('/')[-1]))
Example #14
def main(
    checkpoint_path: Path,
    bsldict_metadata_path: Path,
    keyword: str,
    input_path: Path,
    viz: bool,
    output_path: Path,
    viz_with_dict: bool,
    gen_gif: bool,
    similarity_thres: float,
    batch_size: int,
    stride: int = 1,
    num_in_frames: int = 16,
    fps: int = 25,
    embd_dim: int = 256,
):
    """
    Run sign spotting demo:
    1) load the pre-extracted dictionary video features,
    2) load the pretrained model,
    3) read the input video, preprocess it into sliding windows, extract its features,
    4) compare the input video features at every time step with the dictionary features
        corresponding to the keyword
    5) select the location with the highest similarity as the spotting, provided it is above a threshold,
    6) (optional) visualize the similarity plots for each dictionary version corresponding to the keyword,
        save the visualization as video (and gif).
    
    The parameters are explained in the help string of each argument at the bottom of this file.

    :param checkpoint_path: default `../models/i3d_mlp.pth.tar` should be used
    :param bsldict_metadata_path: default `../bsldict/bsldict_v1.pkl` should be used
    :param keyword: a search keyword, by default "apple", should exist in the dictionary
    :param input_path: path to the continuous test video
    :param viz: if 1, saves .mp4 visualization video
    :param output_path: path to the .mp4 visualization (used if viz)
    :param viz_with_dict: if 1, adds the dictionary frames to the visualization (downloads dictionary videos and takes middle frames)
    :param similarity_thres: similarity threshold that determines when a spotting occurs, 0.7 is observed to be a good value
    :param batch_size: how many sliding window clips to group when applying the model, this depends on the hardware resources, but doesn't change the results
    :param stride: how many frames to stride when applying sliding windows to the input video (1 obtains best performance)
    :param num_in_frames: number of frames processed at a time by the model (I3D model is trained with 16 frames)
    :param fps: the frame rate at which to read the input video
    :param embd_dim: the video feature dimensionality, always 256 for the MLP model output.
    """
    msg = "Please download the BSLDict metadata at bsldict/download_bsldict_metadata.sh"
    assert bsldict_metadata_path.exists(), msg
    print(
        f"Loading BSLDict data (words & features) from {bsldict_metadata_path}"
    )
    with open(bsldict_metadata_path, "rb") as f:
        bsldict_metadata = pkl.load(f)

    msg = f"Search item '{keyword} does not exist in the sign dictionary."
    assert keyword in bsldict_metadata["words"], msg

    # Find dictionary videos whose sign corresponds to the search key
    dict_ix = np.where(
        np.array(bsldict_metadata["videos"]["word"]) == keyword)[0]
    print(f"Found {len(dict_ix)} dictionary videos for the keyword {keyword}.")
    dict_features = np.array(
        bsldict_metadata["videos"]["features"]["mlp"])[dict_ix]
    dict_video_urls = np.array(
        bsldict_metadata["videos"]["video_link_db"])[dict_ix]
    dict_youtube_ids = np.array(
        bsldict_metadata["videos"]["youtube_identifier_db"])[dict_ix]
    for vi, v in enumerate(dict_video_urls):
        print(f"v{vi + 1}: {v}")

    msg = "Please download the pretrained model at models/download_models.sh"
    assert checkpoint_path.exists(), msg
    print(f"Loading model from {checkpoint_path}")
    model = load_model(checkpoint_path=checkpoint_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Moving model to {device}")
    model = model.to(device)
    # Load the continuous RGB video input
    rgb_orig = load_rgb_video(
        video_path=input_path,
        fps=fps,
    )
    # Prepare: resize/crop/normalize
    rgb_input = prepare_input(rgb_orig)
    # Sliding window
    rgb_slides, t_mid = sliding_windows(
        rgb=rgb_input,
        stride=stride,
        num_in_frames=num_in_frames,
    )
    # Number of windows/clips
    num_clips = rgb_slides.shape[0]
    # Group the clips into batches
    num_batches = math.ceil(num_clips / batch_size)
    continuous_features = np.empty((0, embd_dim), dtype=float)
    for b in range(num_batches):
        inp = rgb_slides[b * batch_size:(b + 1) * batch_size]
        inp = inp.to(device)
        # Forward pass
        out = model(inp)
        continuous_features = np.append(continuous_features,
                                        out["embds"].cpu().detach().numpy(),
                                        axis=0)
    # Compute distance between continuous and dictionary features
    dst = pairwise_distances(continuous_features,
                             dict_features,
                             metric="cosine")
    # Convert to [0, 1] similarity. Dimensionality: [ContinuousTimes x DictionaryVersions]
    sim = 1 - dst / 2
    # Time where the similarity peaks
    peak_ix = sim.max(axis=1).argmax()
    # Dictionary version which responds with highest similarity
    version_ix = sim.argmax(axis=1)[peak_ix]
    max_sim = sim[peak_ix, version_ix]
    # If above a threshold: spotted
    if sim[peak_ix, version_ix] >= similarity_thres:
        print(
            f"Sign '{keyword}' spotted at timeframe {peak_ix} "
            f"with similarity {max_sim:.2f} for the dictionary version {version_ix + 1}."
        )
    else:
        print(f"Sign {keyword} not spotted.")

    # Visualize similarity plot
    if viz:
        output_path.parent.mkdir(exist_ok=True, parents=True)
        # Save visualization video
        viz_similarities(
            rgb=rgb_orig,
            t_mid=t_mid,
            sim=sim,
            similarity_thres=similarity_thres,
            keyword=keyword,
            output_path=output_path,
            viz_with_dict=viz_with_dict,
            dict_video_links=(dict_video_urls, dict_youtube_ids),
        )
        # Generate a gif
        if gen_gif:
            gif_path = output_path.with_suffix(".gif")
            cmd = f"ffmpeg -loglevel panic -y -i {output_path} -f gif {gif_path}"
            print(f"Generating gif of output at {gif_path}")
            os.system(cmd)
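The argparse block that normally drives `main` sits at the bottom of the source file and is not shown here; a hypothetical direct call using the defaults mentioned in the docstring (the paths and values marked as assumed are illustrative only):

# Hypothetical direct invocation of main() with the defaults named in the docstring;
# input_path, output_path and batch_size are assumed, illustrative values.
from pathlib import Path

main(
    checkpoint_path=Path("../models/i3d_mlp.pth.tar"),
    bsldict_metadata_path=Path("../bsldict/bsldict_v1.pkl"),
    keyword="apple",
    input_path=Path("sample_data/input.mp4"),   # assumed
    viz=True,
    output_path=Path("output/demo.mp4"),        # assumed
    viz_with_dict=True,
    gen_gif=True,
    similarity_thres=0.7,
    batch_size=4,                               # assumed
)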