def train_model(train_batches, word2idx, epochs, valid_batches, model_save, params, use_gpu):
    for epoch in range(epochs):
        total_loss = 0
        for batch in tqdm(train_batches, desc="Epoch %d/%d" % (epoch + 1, epochs)):
            model.zero_grad()
            model.hidden = model.init_hidden()
            X = utils.prepare_input(batch[:-1, :])
            y = utils.prepare_input(batch[1:, :])
            if use_gpu:
                X = X.cuda()
                y = y.cuda()
            output_scores = model(X)
            true_y = y.contiguous().view(-1, 1).squeeze()
            pred_y = output_scores.view(-1, len(word2idx))
            loss = loss_function(pred_y, true_y)
            total_loss += loss.data
            loss.backward()
            optimizer.step()
        params["model"] = model.state_dict()
        params["optimizer"] = optimizer.state_dict()
        params["epoch"] = epoch
        torch.save(params, model_save + "_" + str(epoch))
        print("Training loss: ", total_loss.cpu().numpy() / len(train_batches))
        model.eval()
        print("Validation loss: ",
              utils.evaluate(model, loss_function, valid_batches) / len(valid_batches))
        model.train()
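# utils.prepare_input is not shown in the snippet above. A minimal sketch of what
# it might do for this word-level language model (an assumption: batches arrive as
# numpy arrays of word indices shaped (seq_len, batch_size)):
import torch

def prepare_input(batch_array):
    # Wrap the index array in a LongTensor so it can feed an nn.Embedding layer.
    return torch.as_tensor(batch_array, dtype=torch.long)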
def render(policy, embedding_net, device):
    from torchvision import transforms
    trans = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
    env = MarioEnvironment()
    s = env.reset()
    images = [s]
    # Move channels last (H, W, C) so ToTensor() can convert the frame
    s = s.reshape(s.shape[1], s.shape[2], s.shape[0])
    s = trans(s)
    coin = 0
    for _ in range(40):
        input_state = prepare_input(s)
        input_state = embedding_net(input_state)
        action_dist, action = policy(input_state)
        # Remove the batch dimension
        action_dist, action = action_dist[0], action[0]
        s_prime, r, t, coins = env.step(action)
        coin += coins
        if t:
            break
        s = s_prime
        images.append(s)
        s = s.reshape(s.shape[1], s.shape[2], s.shape[0])
        s = trans(s)
    # Create a gif from the collected frames
    print('total coins', coin)
    make_gif(images, '0.gif')
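# make_gif is not defined in the snippet above. A rough sketch of one possible
# implementation (an assumption, not the original helper), using imageio:
import imageio
import numpy as np

def make_gif(frames, path):
    # frames: list of image arrays collected from the environment (cast to uint8)
    imageio.mimsave(path, [np.asarray(f, dtype=np.uint8) for f in frames])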
def safe_decoder(model, image_file, res_type):
    try:
        if not os.path.isfile(image_file):
            raise IOError('fail to locate image_file')
        # load image array from file
        image_array = imread(image_file)
        if image_array is None:
            raise IOError('fail to decode image_file')
        # prepare image array as theano tensor
        image_shape = image_array.shape[:2]
        th_tensor = prepare_input(image_array, res_type)
        if th_tensor.mean() < 0:
            th_tensor = -th_tensor
        # decode input using the pretrained PPT text detector
        res_map = decode_image(model, th_tensor, image_shape)
    except Exception as e:
        print("WARNING: something wrong during decoding", image_file, e)
        res_map = None
    return res_map
def run(self):
    while not rospy.is_shutdown():
        data = None
        while data is None:
            try:
                data = rospy.wait_for_message("camera", Image, timeout=10)
            except rospy.ROSException:
                pass
        cv_image = utils.prepare_input(data, self.crop_size)
        prediction = self.model.predict(cv_image)
        segmentation_result, segmentation_overlay_result = utils.prepare_output(
            prediction[0, :, :, 1], cv_image[0], self.view_size,
            self.prediction_threshold)
        self.seg_pub.publish(utils.get_cv_msg(segmentation_result))
        self.seg_overlay_pub.publish(utils.get_cv_msg(segmentation_overlay_result))
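# utils.prepare_input is not shown above. A plausible sketch for this ROS node
# (an assumption, not the project's helper): convert the ROS Image message to an
# OpenCV array with cv_bridge, resize it, scale it, and add a batch dimension.
import cv2
import numpy as np
from cv_bridge import CvBridge

_bridge = CvBridge()

def prepare_input(msg, crop_size):
    img = _bridge.imgmsg_to_cv2(msg, desired_encoding="bgr8")
    img = cv2.resize(img, (crop_size, crop_size))
    img = img.astype(np.float32) / 255.0
    return img[np.newaxis, ...]  # (1, crop_size, crop_size, 3) for model.predict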
              on='patient_id', how='left')
# df = df[df['signal_len'] >= 15000]
patient_ids = df['patient_id'].to_numpy()
to_explain = patient_ids[:background * 2]
background_patient_ids = df.head(background)['patient_id'].to_numpy()
background_inputs = [
    os.path.join(data_dir, patient_id)
    for patient_id in background_patient_ids
]
background_inputs = torch.stack([
    torch.from_numpy(prepare_input(input)).float()
    for input in background_inputs
]).to(device)
background_inputs = background_inputs[:, use_leads, :]
e = shap.GradientExplainer(model, background_inputs)
if not os.path.exists(result_path):
    svs = []
    y_scores = []
    for patient_id in tqdm(to_explain):
        input = os.path.join(data_dir, patient_id)
        inputs = torch.stack(
            [torch.from_numpy(prepare_input(input)).float()]).to(device)
        inputs = inputs[:, use_leads, :]
        y_scores.append(
data.transform = Compose(
    [Lambda(lambda x: (np.array(x).reshape((28, 28)) - mean) / std)])
images = []
images_orig = []
boundaries = []
for i, (image, label) in tqdm(enumerate(data),
                              desc='Preparing dataset',
                              total=n_of_samples,
                              position=0,
                              leave=True):
    if i == n_of_samples:
        break
    images_orig.append(image)
    torch_image, bound = prepare_input(image)
    images.append(torch.from_numpy(torch_image))
    boundaries.append(bound)
dataset_size: int = len(images)
indices: list = list(range(dataset_size))
split: int = int(np.floor(val_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]
train_sampler: SubsetRandomSampler = SubsetRandomSampler(train_indices)
test_sampler: SubsetRandomSampler = SubsetRandomSampler(val_indices)
my_set = ImageSet(images, images_orig, boundaries)
    os.path.join(options.data_path, 'color-input', input_file + '.png'))
rgb_bg = Image.open(
    os.path.join(options.data_path, 'color-background', input_file + '.png'))
depth_in = Image.open(
    os.path.join(options.data_path, 'depth-input', input_file + '.png'))
depth_bg = Image.open(
    os.path.join(options.data_path, 'depth-background', input_file + '.png'))
label = Image.open(
    os.path.join(options.data_path, 'label', input_file + '.png'))
cam_intrinsic = np.loadtxt(
    os.path.join(options.data_path, 'camera-intrinsics', input_file + '.txt'))
img_input = prepare_input(rgb_in, depth_in, options.device)

## inference
print('computing inference: ', input_file)
t = time.time()
output = model(img_input)
cls_pred = np.squeeze(output.data.max(1)[1].cpu().numpy(),
                      axis=0).astype(np.float64)
cls_pred = resize(cls_pred, (options.img_height, options.img_width),
                  anti_aliasing=True,
                  mode='reflect')
pred = np.squeeze(output.data.cpu().numpy(), axis=0)[1, :, :]
pred = resize(pred, (options.img_height, options.img_width),
              anti_aliasing=True,
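# prepare_input(rgb, depth, device) is not included in the suction snippets here.
# A rough sketch of one plausible implementation (an assumption, not the original
# helper): normalize the RGB image with ImageNet statistics, scale the depth map,
# and stack both into a single batched tensor on the target device.
import numpy as np
import torch

def prepare_input(rgb_img, depth_img, device):
    rgb = np.asarray(rgb_img, dtype=np.float32) / 255.0
    rgb = (rgb - [0.485, 0.456, 0.406]) / [0.229, 0.224, 0.225]
    depth = np.asarray(depth_img, dtype=np.float32) / 65535.0  # 16-bit depth PNG assumed
    stacked = np.concatenate([rgb, depth[..., None]], axis=2)  # (H, W, 4)
    tensor = torch.from_numpy(stacked.transpose(2, 0, 1)).float().unsqueeze(0)
    return tensor.to(device)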
background = 100
result_path = f'results/A{background * 2}.npy'
df_labels = pd.read_csv(label_csv)
df_reference = pd.read_csv(os.path.join(args.data_dir, 'reference.csv'))
df = pd.merge(df_labels,
              df_reference[['patient_id', 'age', 'sex', 'signal_len']],
              on='patient_id', how='left')
# df = df[df['signal_len'] >= 15000]
patient_ids = df['patient_id'].to_numpy()
to_explain = patient_ids[:background * 2]
background_patient_ids = df.head(background)['patient_id'].to_numpy()
background_inputs = [os.path.join(data_dir, patient_id)
                     for patient_id in background_patient_ids]
background_inputs = torch.stack([torch.from_numpy(prepare_input(input)).float()
                                 for input in background_inputs]).to(device)
e = shap.GradientExplainer(model, background_inputs)
if not os.path.exists(result_path):
    svs = []
    y_scores = []
    for patient_id in tqdm(to_explain):
        input = os.path.join(data_dir, patient_id)
        inputs = torch.stack(
            [torch.from_numpy(prepare_input(input)).float()]).to(device)
        y_scores.append(torch.sigmoid(model(inputs)).detach().cpu().numpy())
        sv = np.array(e.shap_values(inputs))  # (n_classes, n_samples, n_leads, n_points)
        svs.append(sv)
    svs = np.concatenate(svs, axis=1)
    y_scores = np.concatenate(y_scores, axis=0)
    np.save(result_path, (svs, y_scores))
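# prepare_input is not defined in the ECG/SHAP snippets above. A plausible sketch
# (an assumption about this pipeline, not the original helper): load one recording
# as a (n_leads, n_points) array and pad/truncate it to a fixed length so every
# patient tensor has the same shape. The PhysioNet-style '.mat' file with a 'val'
# field is also an assumption.
import numpy as np
import scipy.io as sio

def prepare_input(recording_path, n_points=15000):
    ecg = sio.loadmat(recording_path + '.mat')['val'].astype(np.float32)  # (n_leads, length)
    out = np.zeros((ecg.shape[0], n_points), dtype=np.float32)
    length = min(n_points, ecg.shape[1])
    out[:, :length] = ecg[:, :length]
    return out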
                   dtype=np.float64)

# [inference, post-processing]
failed = []
for n, fname in tqdm.tqdm(enumerate(test_img_list),
                          total=len(test_img_list),
                          desc='eval',
                          ncols=80,
                          leave=False):
    color_in = Image.open(
        os.path.join(options.data_path, 'color-input', fname + '.png'))
    depth_in = Image.open(
        os.path.join(options.data_path, 'depth-input', fname + '.png'))
    label = Image.open(
        os.path.join(options.data_path, 'label', fname + '.png'))
    rgb_input, ddd_input = prepare_input(color_in, depth_in, options.device)

    ## forward pass
    with torch.no_grad():
        t = time.perf_counter()
        output = model(rgb_input, ddd_input)
        inf_time = time.perf_counter() - t

    ## move results to cpu afterwards so it does not affect the time measurement
    torch.cuda.synchronize()
    time.sleep(0.1)
    t = time.perf_counter()
    cls_pred = output.data.argmax(1).detach().cpu().numpy().squeeze(0)
    ## channel index 1 -> the probability of success
    pred = output.data.detach().cpu().numpy().squeeze(0)[1]
def main(args):
    # const
    threshold_up = [
        .7,   # "road"
        .7,   # "sidewalk"
        .4,   # "building"
        .5,   # "wall"
        .6,   # "fence"
        .65,  # "pole"
        .65,  # "traffic light"
        .65,  # "traffic sign"
        .4,   # "vegetation"
        .7,   # "terrain"
        .4,   # "sky"
    ]
    threshold_down = [
        .4,   # "road"
        .4,   # "sidewalk"
        .7,   # "building"
        .5,   # "wall"
        .6,   # "fence"
        .65,  # "pole"
        .65,  # "traffic light"
        .65,  # "traffic sign"
        .7,   # "vegetation"
        .4,   # "terrain"
        .7,   # "sky"
    ]

    # model
    model = torch.hub.load('zhanghang1989/ResNeSt', 'resnest101', pretrained=False)
    model.fc = nn.Linear(2048, 19, bias=True)
    model.load_state_dict(torch.load(args.weight)['state_dict'])
    model.eval()
    model = model.cuda()

    # CAM
    target_layer = model.layer4[2].conv3
    wrapped_model = GradCAMpp(model, target_layer)

    # dataset
    files = [
        f for f in glob(args.root + '/**', recursive=True) if os.path.isfile(f)
    ]

    for filename in tqdm(files):
        origin_img = Image.open(filename)
        inputs = prepare_input(np.array(origin_img))
        # split the 1024x2048 image into eight 512x512 tiles (4 columns x 2 rows)
        inputs = inputs.view(3, 1024, 4, 512).permute(2, 0, 1, 3).reshape(
            4, 3, 2, 512, 512).permute(0, 2, 1, 3, 4).reshape(8, 3, 512, 512)
        masks = []
        for i in range(inputs.shape[0]):
            if i % 2 == 0:  # upper tile
                threshold = threshold_up
            else:
                threshold = threshold_down
            tensor = inputs[i].unsqueeze(0).cuda()
            target = model(tensor).cpu().sigmoid()
            indices = (target[0][:11] > 0.5).nonzero().view(-1).tolist()
            cams = []
            for j in indices:
                cam, idx = wrapped_model(tensor, idx=j)
                cam = nn.functional.interpolate(cam.cpu(),
                                                size=tuple(tensor.size()[-2:]),
                                                mode='bilinear',
                                                align_corners=False)
                cam = cam.squeeze(0).squeeze(0).numpy()
                cams.append(cam)
            area = np.zeros(len(indices), dtype=np.uint32)
            for (idx, c) in enumerate(cams):
                area[idx] = (c >= threshold[indices[idx]]).sum()
            order = area.argsort()[::-1]
            mask = np.zeros((512, 512), dtype=np.uint8)
            mask.fill(255)
            for o in order:
                c, idx = cams[o], indices[o]
                mask[c >= threshold[idx]] = idx
            masks.append(mask)
        out_array = np.hstack([
            np.vstack((masks[i], masks[i + 1]))
            for i in range(0, inputs.shape[0], 2)
        ])
        out_img = Image.fromarray(out_array)
        out_img.putpalette(city_palette)
        out_img.save(os.path.join(args.output, filename.split('\\')[-1]))
                 input_path + '.png'))
color_bg = Image.open(
    os.path.join(options.data_path, 'color-background', input_path + '.png'))
depth_in = Image.open(
    os.path.join(options.data_path, 'depth-input', input_path + '.png'))
depth_bg = Image.open(
    os.path.join(options.data_path, 'depth-background', input_path + '.png'))
label = Image.open(
    os.path.join(options.data_path, 'label', input_path + '.png'))
cam_intrinsic = np.loadtxt(
    os.path.join(options.data_path, 'camera-intrinsics', input_path + '.txt'))
img_input = prepare_input(color_in, depth_in, options.device)

## forward pass
t = time.time()
output = model(img_input)
inf_t = time.time() - t

## get segmentation class prediction
cls_pred = np.squeeze(output.data.max(1)[1].cpu().numpy(),
                      axis=0).astype(np.float64)
cls_pred = resize(cls_pred, (options.img_height, options.img_width),
                  anti_aliasing=True,
                  mode='reflect')

## get the probability of suction area (index 1)
pred = np.squeeze(output.data.cpu().numpy(), axis=0)[1, :, :]
from matplotlib import pyplot as plt
import matplotlib.patches as patches
import sys

if __name__ == '__main__':
    assert len(sys.argv) > 1
    model = Network().double().to(device)
    data: dict = torch.load(model_filename)
    model.load_state_dict(data['model_state_dict'])
    mean = data['mean']
    std = data['std']
    img = np.array(Image.open(sys.argv[1]).convert('L'))
    assert img.shape == (28, 28)
    image_orig = copy.deepcopy(img)
    img, boundaries = prepare_input((img - mean) / std)
    with torch.no_grad():
        model.train(False)
        res = model(to_tensor(img).to(device).view(1, *img.shape))
    a = (res[:, k:k + k * n].reshape(k, n) * std + mean).cpu().data.numpy()
    p = res[:, :k].reshape(-1).cpu().data.numpy()
    h = sum(p[i] * a[i] for i in range(k))
    h[h < 0] = 0
    h[h > 255] = 255
    ax = plt.gca()
    (hole_beg_x, hole_end_x), (hole_beg_y, hole_end_y) = boundaries
    rect = patches.Rectangle((hole_beg_y - 0.5, hole_beg_x - 0.5), 10,
def main(args):
    # const
    threshold = [
        .5,  # "road"
        .5,  # "sidewalk"
        .5,  # "building"
        .5,  # "wall"
        .7,  # "fence"
        .7,  # "pole"
        .7,  # "traffic light"
        .7,  # "traffic sign"
        .5,  # "vegetation"
        .5,  # "terrain"
        .5,  # "sky"
    ]

    # dataset
    filename = "sample/aachen_000001_000019_leftImg8bit.png"
    origin_img = Image.open(filename)
    inputs = prepare_input(np.array(origin_img))

    # model
    model = torch.hub.load('zhanghang1989/ResNeSt', 'resnest101', pretrained=False)
    model.fc = nn.Linear(2048, 19, bias=True)
    model.load_state_dict(torch.load(args.weight)['state_dict'])
    model.eval()
    model = model.cuda()

    # CAM
    target_layer = model.layer4[2].conv3
    wrapped_model = GradCAMpp(model, target_layer)

    tensor = inputs.cuda()
    target = model(tensor).cpu().sigmoid()
    indices = (target[0][:11] > 0.5).nonzero().view(-1).tolist()
    cams = []
    for j in indices:
        cam, idx = wrapped_model(tensor, idx=j)
        cam = nn.functional.interpolate(cam.cpu(),
                                        size=tuple(tensor.size()[-2:]),
                                        mode='bilinear',
                                        align_corners=False)
        cam = cam.squeeze(0).squeeze(0).numpy()
        cams.append(cam)
    area = np.zeros(len(indices), dtype=np.uint32)
    for (idx, c) in enumerate(cams):
        i = indices[idx]
        area[idx] = (c >= threshold[i]).sum()
    order = area.argsort()[::-1]
    mask = np.zeros_like(cams[0], dtype=np.uint8)
    mask.fill(255)
    for o in order:
        c, i = cams[o], indices[o]
        mask[c >= threshold[i]] = i
    out_img = Image.fromarray(mask)
    out_img.putpalette(city_palette)
    out_img.save(os.path.join(args.output, filename.split('/')[-1]))
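# prepare_input is not shown in the two Grad-CAM++ snippets above. A minimal
# sketch of what such a helper could look like (an assumption: ImageNet-style
# normalization and a leading batch dimension, matching the (1, 3, H, W) tensors
# the code expects):
import numpy as np
import torch

def prepare_input(image_array):
    mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
    std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
    img = image_array.astype(np.float32) / 255.0
    img = (img - mean) / std
    return torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0)  # (1, 3, H, W)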
def main(
    checkpoint_path: Path,
    bsldict_metadata_path: Path,
    keyword: str,
    input_path: Path,
    viz: bool,
    output_path: Path,
    viz_with_dict: bool,
    gen_gif: bool,
    similarity_thres: float,
    batch_size: int,
    stride: int = 1,
    num_in_frames: int = 16,
    fps: int = 25,
    embd_dim: int = 256,
):
    """
    Run the sign-spotting demo:
    1) load the pre-extracted dictionary video features,
    2) load the pretrained model,
    3) read the input video, preprocess it into sliding windows, and extract its features,
    4) compare the input video features at every time step with the dictionary features
       corresponding to the keyword,
    5) select the location with the highest similarity as the spotting, if it is above a threshold,
    6) (optional) visualize the similarity plots for each dictionary version corresponding
       to the keyword, and save the visualization as a video (and gif).

    The parameters are explained in the help text for each argument at the bottom of this file.
    :param checkpoint_path: default `../models/i3d_mlp.pth.tar` should be used
    :param bsldict_metadata_path: default `../bsldict/bsldict_v1.pkl` should be used
    :param keyword: a search keyword, by default "apple", which should exist in the dictionary
    :param input_path: path to the continuous test video
    :param viz: if 1, saves an .mp4 visualization video
    :param output_path: path to the .mp4 visualization (used if viz)
    :param viz_with_dict: if 1, adds the dictionary frames to the visualization
        (downloads the dictionary videos and takes their middle frames)
    :param gen_gif: if 1, additionally saves the visualization as a .gif
    :param similarity_thres: similarity threshold that determines when a spotting occurs;
        0.7 is observed to be a good value
    :param batch_size: how many sliding-window clips to group when applying the model;
        this depends on the hardware resources but does not change the results
    :param stride: how many frames to stride when applying sliding windows to the
        input video (1 obtains the best performance)
    :param num_in_frames: number of frames processed at a time by the model
        (the I3D model is trained with 16 frames)
    :param fps: the frame rate at which to read the input video
    :param embd_dim: the video feature dimensionality, always 256 for the MLP model output
    """
    msg = "Please download the BSLDict metadata at bsldict/download_bsldict_metadata.sh"
    assert bsldict_metadata_path.exists(), msg
    print(f"Loading BSLDict data (words & features) from {bsldict_metadata_path}")
    with open(bsldict_metadata_path, "rb") as f:
        bsldict_metadata = pkl.load(f)

    msg = f"Search item '{keyword}' does not exist in the sign dictionary."
    assert keyword in bsldict_metadata["words"], msg

    # Find dictionary videos whose sign corresponds to the search key
    dict_ix = np.where(np.array(bsldict_metadata["videos"]["word"]) == keyword)[0]
    print(f"Found {len(dict_ix)} dictionary videos for the keyword {keyword}.")
    dict_features = np.array(bsldict_metadata["videos"]["features"]["mlp"])[dict_ix]
    dict_video_urls = np.array(bsldict_metadata["videos"]["video_link_db"])[dict_ix]
    dict_youtube_ids = np.array(bsldict_metadata["videos"]["youtube_identifier_db"])[dict_ix]
    for vi, v in enumerate(dict_video_urls):
        print(f"v{vi + 1}: {v}")

    msg = "Please download the pretrained model at models/download_models.sh"
    assert checkpoint_path.exists(), msg
    print(f"Loading model from {checkpoint_path}")
    model = load_model(checkpoint_path=checkpoint_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Moving model to {device}")
    model = model.to(device)

    # Load the continuous RGB video input
    rgb_orig = load_rgb_video(
        video_path=input_path,
        fps=fps,
    )
    # Prepare: resize/crop/normalize
    rgb_input = prepare_input(rgb_orig)
    # Sliding window
    rgb_slides, t_mid = sliding_windows(
        rgb=rgb_input,
        stride=stride,
        num_in_frames=num_in_frames,
    )
    # Number of windows/clips
    num_clips = rgb_slides.shape[0]
    # Group the clips into batches
    num_batches = math.ceil(num_clips / batch_size)
    continuous_features = np.empty((0, embd_dim), dtype=float)
    for b in range(num_batches):
        inp = rgb_slides[b * batch_size:(b + 1) * batch_size]
        inp = inp.to(device)
        # Forward pass
        out = model(inp)
        continuous_features = np.append(
            continuous_features, out["embds"].cpu().detach().numpy(), axis=0)

    # Compute the distance between continuous and dictionary features
    dst = pairwise_distances(continuous_features, dict_features, metric="cosine")
    # Convert to [0, 1] similarity. Dimensionality: [ContinuousTimes x DictionaryVersions]
    sim = 1 - dst / 2
    # Time where the similarity peaks
    peak_ix = sim.max(axis=1).argmax()
    # Dictionary version which responds with the highest similarity
    version_ix = sim.argmax(axis=1)[peak_ix]
    max_sim = sim[peak_ix, version_ix]
    # If above a threshold: spotted
    if sim[peak_ix, version_ix] >= similarity_thres:
        print(
            f"Sign '{keyword}' spotted at timeframe {peak_ix} "
            f"with similarity {max_sim:.2f} for the dictionary version {version_ix + 1}."
        )
    else:
        print(f"Sign {keyword} not spotted.")

    # Visualize the similarity plot
    if viz:
        output_path.parent.mkdir(exist_ok=True, parents=True)
        # Save the visualization video
        viz_similarities(
            rgb=rgb_orig,
            t_mid=t_mid,
            sim=sim,
            similarity_thres=similarity_thres,
            keyword=keyword,
            output_path=output_path,
            viz_with_dict=viz_with_dict,
            dict_video_links=(dict_video_urls, dict_youtube_ids),
        )
        # Generate a gif
        if gen_gif:
            gif_path = output_path.with_suffix(".gif")
            cmd = f"ffmpeg -loglevel panic -y -i {output_path} -f gif {gif_path}"
            print(f"Generating gif of output at {gif_path}")
            os.system(cmd)
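# A hypothetical invocation of the demo above. The input/output paths, keyword,
# threshold, and batch size below are assumptions for illustration; the checkpoint
# and metadata paths follow the defaults mentioned in the docstring.
from pathlib import Path

if __name__ == "__main__":
    main(
        checkpoint_path=Path("../models/i3d_mlp.pth.tar"),
        bsldict_metadata_path=Path("../bsldict/bsldict_v1.pkl"),
        keyword="apple",
        input_path=Path("sample_data/input.mp4"),
        viz=True,
        output_path=Path("sample_data/output.mp4"),
        viz_with_dict=True,
        gen_gif=False,
        similarity_thres=0.7,
        batch_size=10,
    )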