def eval_split_by_lev(model, vocab, split):
    # Greedy (best-path) decode every video of the split, then report a
    # character-level WER: gloss ids are mapped to characters and a single
    # Levenshtein distance is taken over the concatenated sequences.
    df = get_split_df(split)
    pp = ProgressPrinter(df.shape[0], 5)
    hypes = []
    gts = []
    with torch.no_grad():
        for idx in range(df.shape[0]):
            row = df.iloc[idx]
            gt = vocab.encode(row.annotation)
            video_path, feat_path = get_video_path(row, split)
            tensor_video = torch.load(feat_path).unsqueeze(0).to(DEVICE)
            pred = model(tensor_video).squeeze(1).log_softmax(dim=1) \
                .argmax(dim=1).cpu().numpy()

            # CTC-style collapse: drop blanks (id 0) and consecutive repeats.
            hypo = []
            for i in range(len(pred)):
                if pred[i] == 0 or (i > 0 and pred[i] == pred[i - 1]):
                    continue
                hypo.append(pred[i])

            gts += gt
            hypes += hypo
            pp.show(idx)

        pp.end()

    # Map gloss ids to single characters so Levenshtein works on strings.
    hypes = "".join([chr(x) for x in hypes])
    gts = "".join([chr(x) for x in gts])
    wer = Lev.distance(hypes, gts) / len(gts) * 100
    print(wer)
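# Minimal standalone illustration (not part of the pipeline above) of the greedy
# CTC-style collapse used in eval_split_by_lev: the blank id 0 and consecutive
# duplicates are dropped from the best-path prediction before scoring.
# The helper name is hypothetical and exists only for this example.
def _collapse_ctc_path(pred):
    hypo = []
    for i in range(len(pred)):
        if pred[i] == 0 or (i > 0 and pred[i] == pred[i - 1]):
            continue
        hypo.append(pred[i])
    return hypo

# e.g. _collapse_ctc_path([0, 3, 3, 0, 5, 5, 2]) -> [3, 5, 2]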
def generate_openpose_features_split(pose_estimator, split):
    # Run the pose estimator over every video of the split and cache the
    # resulting keypoint features as .npy files, skipping already-extracted ones.
    with torch.no_grad():
        df = get_split_df(split)
        print(SOURCE, "Feature extraction:", STF_MODEL, split, "split")
        L = df.shape[0]
        pp = ProgressPrinter(L, 1)
        for idx in range(L):
            row = df.iloc[idx]
            video_dir, feat_path = get_video_path(row, split, feat_ext=".npy")
            if os.path.exists(feat_path):
                pp.omit()
                continue
            feat_dir = os.path.split(feat_path)[0]
            feats = pose_estimator.estimate_video_pose(video_dir)
            if not os.path.exists(feat_dir):
                os.makedirs(feat_dir)
            np.save(feat_path, feats)

            if SHOW_PROGRESS:
                pp.show(idx)

        if SHOW_PROGRESS:
            pp.end()

    print()
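# Example usage (a sketch only; the concrete estimator class name is an
# assumption, the function above just requires an object exposing
# estimate_video_pose(video_dir)):
#
#     pose_estimator = PoseEstimator()
#     for split in ["train", "dev", "test"]:
#         generate_openpose_features_split(pose_estimator, split)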
def _get_feat(self, row, glosses=None):
    # Variant that reads raw video frames; requires at least 4 frames per gloss.
    video_path, feat_path = get_video_path(row, self.split)
    feat = get_images(video_path)
    feat_len = len(feat)
    if feat_len < len(glosses) * 4:
        return None, None, None
    return video_path, feat, feat_len
def _get_feat(self, row, glosses=None):
    # Variant that loads precomputed spatio-temporal features saved with torch.save.
    video_path, feat_path = get_video_path(row, self.split)
    if not os.path.exists(feat_path):
        return None, None, None
    feat = torch.load(feat_path)
    feat_len = len(feat)
    if feat_len < len(glosses) or len(feat.shape) < 2:
        return None, None, None
    return feat_path, feat, feat_len
def _get_feat(self, row, glosses=None):
    # Variant that loads either pose keypoints (.npy) or image features (.pt),
    # depending on which STF model is configured.
    ext = ".npy" if STF_MODEL.startswith("pose") else ".pt"
    video_path, feat_path = get_video_path(row, self.split, feat_ext=ext, stf_feat=False)
    if not os.path.exists(feat_path):
        return None, None, None
    feat = np.load(feat_path) if STF_MODEL.startswith("pose") else torch.load(feat_path)
    feat_len = len(feat)
    if feat_len < len(glosses) or len(feat.shape) < 2:
        return None, None, None
    return feat_path, feat, feat_len
def clean_anno_KRSL(split, save=True):
    # Drop annotation rows whose video file is missing on disk and optionally
    # overwrite the split's CSV with the cleaned version.
    df = get_split_df(split)
    L = df.shape[0]
    to_remove = []
    for i in range(L):
        row = df.iloc[i]
        video_path, _ = get_video_path(row, split)
        if not os.path.exists(video_path):
            to_remove.append(i)

    df = df.drop(df.index[to_remove])
    if save:
        df.to_csv(os.path.join(ANNO_DIR, split + ".csv"), index=False)

    print("Cleaned", split, "dataset, from", L, "to", df.shape[0])
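# Example usage (a sketch; the split names are assumed to match the K-RSL
# annotation CSV files under ANNO_DIR):
#
#     for split in ["train", "dev", "test"]:
#         clean_anno_KRSL(split)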
def gen_img_feat_split(model, preprocess, split):
    # Extract per-frame 2D CNN features for every video of the split and cache
    # them with torch.save, skipping existing files unless FEAT_OVERRIDE is set.
    if SOURCE == "KRSL" and split == "dev":
        split = "val"

    df = get_split_df(split)
    print(SOURCE, STF_MODEL, "feature extraction:", split, "split")
    L = df.shape[0]
    pp = ProgressPrinter(L, 10)
    for idx in range(L):
        row = df.iloc[idx]
        video_path, feat_path = get_video_path(row, split, stf_feat=False)
        if os.path.exists(feat_path) and not FEAT_OVERRIDE:
            pp.omit()
            continue
        feat_dir = os.path.split(feat_path)[0]
        images = get_images(video_path)
        if len(images) < 4:
            continue
        tensor_video = get_tensor_video(images, preprocess, "2D")
        inp = tensor_video.to(DEVICE)
        feat = model(inp).cpu()
        if not os.path.exists(feat_dir):
            os.makedirs(feat_dir)
        torch.save(feat, feat_path)

        if SHOW_PROGRESS:
            pp.show(idx)

    if SHOW_PROGRESS:
        pp.end()
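# Sketch of how a 2D frame-feature extractor could be plugged into
# gen_img_feat_split. The backbone and normalization below are assumptions for
# illustration, not necessarily the repository's actual setup; DEVICE is the
# module-level device constant used above.
import torch.nn as nn
import torchvision


def _build_resnet18_extractor():
    backbone = torchvision.models.resnet18(pretrained=True)
    # Drop the classification head; keep the pooled 512-d feature per frame.
    extractor = nn.Sequential(*list(backbone.children())[:-1], nn.Flatten())
    extractor.to(DEVICE).eval()
    preprocess = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225]),
    ])
    return extractor, preprocess

# extractor, preprocess = _build_resnet18_extractor()
# gen_img_feat_split(extractor, preprocess, "train")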
def generate_gloss_dataset(vocab, stf_type=STF_TYPE, use_feat=USE_ST_FEAT):
    # Build the gloss-recognition dataset: split every training video into
    # fixed-stride chunks and label each chunk with the decoded prediction of
    # the end-to-end model.
    print("Generation of the Gloss-Recognition Dataset")
    model, loaded = get_end2end_model(vocab, True, stf_type, use_feat)
    mode = "3D" if stf_type else "2D"
    if not loaded:
        print("STF or SEQ2SEQ model doesn't exist")
        exit(0)

    model.eval()
    temp_stride = 4

    rerun_out_dir = os.path.join(GR_DATASET_DIR, "STF_RERUN")
    rerun_out_path = os.path.join(rerun_out_dir, STF_MODEL + ".bin")
    stf_rerun = use_feat and os.path.exists(rerun_out_path)
    if stf_rerun:
        # Reuse frame counts and gloss chunk paths cached by a previous run.
        with open(rerun_out_path, 'rb') as f:
            feats_rerun_data = pickle.load(f)
    else:
        feats_rerun_data = {"frame_n": [], "gloss_paths": [], "gloss_lens": []}

    df = get_split_df("train")
    Y = []
    X = []
    X_lens = []

    pp = ProgressPrinter(df.shape[0], 5)
    cur_n_gloss = 0
    for idx in range(df.shape[0]):
        row = df.iloc[idx]
        video_path, feat_path = get_video_path(row, "train")
        if stf_rerun:
            frame_n = feats_rerun_data["frame_n"][idx]
            if frame_n < temp_stride:
                pp.omit()
                continue

            gloss_paths = feats_rerun_data["gloss_paths"][idx]
            gloss_lens = feats_rerun_data["gloss_lens"][idx]

            with torch.no_grad():
                tensor_video = torch.load(feat_path).unsqueeze(0).to(DEVICE)
        else:
            images = get_images(video_path)
            frame_n = len(images)
            feats_rerun_data["frame_n"].append(frame_n)
            if frame_n < temp_stride:
                pp.omit()
                feats_rerun_data["gloss_paths"].append("")
                feats_rerun_data["gloss_lens"].append(0)
                continue

            gloss_paths, gloss_lens = get_gloss_paths(images, cur_n_gloss, temp_stride, mode)
            feats_rerun_data["gloss_paths"].append(gloss_paths)
            feats_rerun_data["gloss_lens"].append(gloss_lens)

            with torch.no_grad():
                if use_feat:
                    tensor_video = torch.load(feat_path).unsqueeze(0).to(DEVICE)
                else:
                    tensor_video = get_tensor_video(images, preprocess_3d, mode).unsqueeze(0).to(DEVICE)

        X += gloss_paths
        X_lens += gloss_lens
        Y += get_decoded_prediction(model, tensor_video, vocab.encode(row.annotation))

        assert len(Y) == len(X) == len(X_lens)

        cur_n_gloss = len(X)
        if SHOW_PROGRESS:
            pp.show(idx)

    shuffle_and_save_dataset(X, X_lens, Y)
    if use_feat and not stf_rerun:
        if not os.path.exists(rerun_out_dir):
            os.makedirs(rerun_out_dir)
        with open(rerun_out_path, 'wb') as f:
            pickle.dump(feats_rerun_data, f)

    if SHOW_PROGRESS:
        pp.end()
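# Example driver (a sketch; the vocabulary class name is an assumption, while
# STF_TYPE and USE_ST_FEAT are the module-level configuration constants used
# as defaults above):
#
#     vocab = Vocab()
#     generate_gloss_dataset(vocab, stf_type=STF_TYPE, use_feat=USE_ST_FEAT)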