def compute_metric(self, preds, data):
    '''compute f1 and exact match scores for output'''
    m = collections.defaultdict(list)
    for ex, feat in tqdm.tqdm(data, ncols=80, desc='compute_metric'):
        # if 'repeat_idx' in ex: ex = self.load_task_json(self.args, ex, None)[0]
        key = (ex['task_id'], ex['repeat_idx'])
        # feat should already contain the following, since all AlfredDatasets
        # fed into this function have test_mode=False:
        # feat = self.featurize(ex, self.args, False, load_mask=True, load_frames=True)

        # Evaluate low-level actions.
        label = ' '.join(self.vocab['action_low'].index2word(
            feat['action_low'].tolist()))
        pred = ' '.join(self.vocab['action_low'].index2word(
            preds[key]['action_low']))
        label_lower = label.lower()
        pred_lower = pred.lower()
        m['action_low_f1'].append(compute_f1(label_lower, pred_lower))
        m['action_low_em'].append(compute_exact(label_lower, pred_lower))
        m['action_low_gold_length'].append(len(label.split()))
        m['action_low_pred_length'].append(len(pred.split()))
        m['action_low_edit_distance'].append(
            compute_edit_distance(label_lower, pred_lower))

        # Evaluate high-level controller.
        # Get indexes of predicted transitions: every predicted <<stop>>
        # (token id 2) except the final one marks a switch to a new submodule.
        stop_idxs = np.argwhere(
            np.array(preds[key]['action_low'])[:-1] == 2).flatten()
        high_idxs = np.append([0], stop_idxs + 1).astype(np.int32)
        # Get predicted submodule transitions.
        pred_high_idx = np.array(preds[key]['controller_attn'])[high_idxs]
        label_high_idx = feat['module_idxs'][np.nonzero(
            feat['transition_mask'])]
        pred = ' '.join(self.vocab['high_level'].index2word(
            pred_high_idx.tolist()))
        label = ' '.join(self.vocab['high_level'].index2word(
            label_high_idx.tolist()))
        label_lower = label.lower()
        pred_lower = pred.lower()
        m['action_high_f1'].append(compute_f1(label_lower, pred_lower))
        m['action_high_em'].append(compute_exact(label_lower, pred_lower))
        m['action_high_gold_length'].append(len(label.split()))
        m['action_high_pred_length'].append(len(pred.split()))
        m['action_high_edit_distance'].append(
            compute_edit_distance(label_lower, pred_lower))
    return {k: sum(v) / len(v) for k, v in m.items()}
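
# The scoring helpers used above (compute_f1, compute_exact,
# compute_edit_distance) are not defined in this file. Below is a minimal
# sketch of plausible token-level implementations in the style of the SQuAD
# evaluation script; these are hypothetical stand-ins, not the repo's actual
# helpers.
import collections

def compute_exact(label, pred):
    # 1 if the two action strings match exactly, else 0.
    return int(label == pred)

def compute_f1(label, pred):
    # Token-level F1 over the two whitespace-tokenized action strings.
    label_toks = label.split()
    pred_toks = pred.split()
    common = collections.Counter(label_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(label_toks)
    return 2 * precision * recall / (precision + recall)

def compute_edit_distance(label, pred):
    # Levenshtein distance between the two token sequences, computed with a
    # single rolling row of the dynamic-programming table.
    a, b = label.split(), pred.split()
    dist = list(range(len(b) + 1))
    for i, tok_a in enumerate(a, start=1):
        prev, dist[0] = dist[0], i
        for j, tok_b in enumerate(b, start=1):
            cur = dist[j]
            dist[j] = min(dist[j] + 1,              # deletion
                          dist[j - 1] + 1,          # insertion
                          prev + (tok_a != tok_b))  # substitution
            prev = cur
    return dist[-1]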
def compute_metric(self, preds, data):
    '''compute f1 and exact match scores for output'''
    m = collections.defaultdict(list)
    for task in data:
        ex = self.load_task_json(task)
        i = self.get_task_and_ann_id(ex)
        label = ' '.join([
            a['discrete_action']['action']
            for a in ex['plan']['low_actions']
        ])
        m['action_low_f1'].append(
            compute_f1(label.lower(), preds[i]['action_low'].lower()))
        m['action_low_em'].append(
            compute_exact(label.lower(), preds[i]['action_low'].lower()))
    return {k: sum(v) / len(v) for k, v in m.items()}
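
# A minimal sketch of how this variant could be driven, assuming `preds` maps
# the id from get_task_and_ann_id to an already-decoded action string. The
# decode_actions helper is a hypothetical placeholder for the model's decoding
# step, not part of this repo's API; 'valid_seen' is one of the ALFRED split
# names.
def decode_actions(model, ex):
    # Placeholder: return the model's predicted low-level action string
    # for one example (greedy decoding in the real pipeline).
    raise NotImplementedError

def evaluate_low_actions(model, splits):
    preds = {}
    for task in splits['valid_seen']:
        ex = model.load_task_json(task)
        i = model.get_task_and_ann_id(ex)
        preds[i] = {'action_low': decode_actions(model, ex)}
    return model.compute_metric(preds, splits['valid_seen'])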
def compute_metric(self, preds, data):
    '''compute f1 and exact match scores for output'''
    m = collections.defaultdict(list)
    for ex in data:
        # if 'repeat_idx' in ex:
        ex = self.load_task_json(ex, None)[0]
        key = (ex['task_id'], ex['repeat_idx'])
        label = ' '.join([
            a['discrete_action']['action']
            for a in ex['plan']['low_actions']
        ])
        m['action_low_f1'].append(
            compute_f1(label.lower(), preds[key]['action_low'].lower()))
        m['action_low_em'].append(
            compute_exact(label.lower(), preds[key]['action_low'].lower()))
    return {k: sum(v) / len(v) for k, v in m.items()}
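
# Unlike the id-keyed variant above, this one keys `preds` on the
# (task_id, repeat_idx) tuple, so each language annotation of a task is
# scored separately. Illustrative entry only; the trial id and action
# string below are made up:
example_preds = {
    ('trial_T20190907_151852_427249', 0): {
        'action_low': 'LookDown_15 MoveAhead_25 PickupObject <<stop>>',
    },
}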