def _cache_entity_data(self, csv_file_path):
    """Load an annotation CSV into ``self.entity_data`` and ``self.speech_data``.

    The first row is dropped as the header. For every remaining row the
    video id is read from column 0, the timestamp from column 1, the entity
    id from the third-to-last column and the raw label from the
    second-to-last column. The raw label is post-processed twice: once into
    a speech label and once into an entity label.

    Side effects:
        * ``self.entity_data[video_id][entity_id]`` gains one
          ``(entity_id, timestamp, entity_label)`` tuple per row.
        * ``self.speech_data[video_id][timestamp]`` holds the maximum speech
          label seen for that timestamp.

    Args:
        csv_file_path: Path passed to ``io.csv_to_list``.

    Returns:
        Set of ``(video_id, entity_id)`` pairs for the entities that were
        first registered in ``self.entity_data`` by this call.
    """
    rows = io.csv_to_list(csv_file_path)
    rows.pop(0)  # discard the CSV header row

    new_entities = set()
    for row in rows:
        video_id = row[0]
        entity_id = row[-3]
        timestamp = row[1]
        raw_label = row[-2]
        speech_label = self._postprocess_speech_label(raw_label)
        entity_label = self._postprocess_entity_label(raw_label)

        # Per-entity track: append the minimal record, creating the
        # containers on first sight of the video / entity.
        video_entities = self.entity_data.setdefault(video_id, {})
        if entity_id not in video_entities:
            video_entities[entity_id] = []
            new_entities.add((video_id, entity_id))
        video_entities[entity_id].append((entity_id, timestamp, entity_label))

        # Per-timestamp speech flag: keep the largest label observed, so a
        # timestamp counts as "speaking" if any of its rows says so
        # (presumably labels are 0/1 -- confirm in _postprocess_speech_label).
        video_speech = self.speech_data.setdefault(video_id, {})
        label_so_far = video_speech.setdefault(timestamp, speech_label)
        video_speech[timestamp] = max(label_so_far, speech_label)

    return new_entities
def _cache_entity_data_forward(self, csv_file_path, target_video):
    """Load the rows of one video from an annotation CSV into ``self.entity_data``.

    The first row is dropped as the header. Rows whose video id (column 0)
    differs from ``target_video`` are ignored. For each matching row the
    timestamp is read from column 1, the entity id from the third-to-last
    column and the raw label from the second-to-last column.

    Side effects:
        ``self.entity_data[video_id][entity_id]`` gains one
        ``(entity_id, timestamp, entity_label)`` tuple per matching row.

    Args:
        csv_file_path: Path passed to ``io.csv_to_list``.
        target_video: Video id to keep; compared with ``!=`` against column 0.

    Returns:
        List of ``(video_id, entity_id, timestamp)`` tuples, one per matching
        row, in file order.
    """
    rows = io.csv_to_list(csv_file_path)
    rows.pop(0)  # discard the CSV header row

    matched = []
    for row in rows:
        video_id = row[0]
        if video_id == target_video:
            entity_id = row[-3]
            timestamp = row[1]
            entity_label = self._postprocess_entity_label(row[-2])
            matched.append((video_id, entity_id, timestamp))

            # The label is stored to keep the record layout; the original
            # author notes it is safe to ignore on this forward path.
            record = (entity_id, timestamp, entity_label)
            video_entities = self.entity_data.setdefault(video_id, {})
            video_entities.setdefault(entity_id, []).append(record)

    return matched
def _cache_feature_file(self, csv_file):
    """Read a feature CSV and index its rows three ways.

    Every row is consumed (no header row is skipped). Column 0 is the video
    id, column 1 the timestamp, column 2 the entity id, column 3 the label
    (parsed with ``int(float(...))``, so ``"1.0"`` becomes ``1``) and the
    last column the encoded feature payload, decoded by
    ``self._decode_feature_data_from_csv``.

    Args:
        csv_file: Path passed to ``io.csv_to_list``.

    Returns:
        A 3-tuple ``(entity_data, feature_list, ts_to_entity)``:

        * ``entity_data[video_id][entity_id][ts]`` is ``(features, label)``.
        * ``feature_list`` is a list of ``(video_id, entity_id, ts)`` tuples,
          one per row, in file order.
        * ``ts_to_entity[video_id][ts]`` is the list of entity ids seen at
          that timestamp, in file order.

    Raises:
        ValueError: If column 3 of a row cannot be parsed as a number.
        IndexError: If a row has fewer than four columns.
    """
    print('load feature data', csv_file)

    entity_data = {}
    feature_list = []
    ts_to_entity = {}

    for csv_row in io.csv_to_list(csv_file):
        video_id = csv_row[0]
        ts = csv_row[1]
        entity_id = csv_row[2]
        features = self._decode_feature_data_from_csv(csv_row[-1])
        label = int(float(csv_row[3]))

        # Store the (features, label) pair directly. No placeholder list is
        # created first, since it would be overwritten immediately.
        # NOTE(review): a repeated (video, entity, ts) row replaces the stored
        # pair but is still appended to the two lists below.
        per_entity = entity_data.setdefault(video_id, {}).setdefault(entity_id, {})
        per_entity[ts] = (features, label)

        feature_list.append((video_id, entity_id, ts))

        ts_to_entity.setdefault(video_id, {}).setdefault(ts, []).append(entity_id)

    print('loaded ', len(feature_list), ' features')
    return entity_data, feature_list, ts_to_entity
ava_ground_truth_dir = '.../AVA/csv/val' #AVA original ground truth files temporary_dir = '.../temp/activeSpeakers' #Just an empty temporary dir # The script will generate these two, use them for the official AVA evaluation dataset_predictions_csv = '.../Forwards/ActiveSpeakers/publish/final/STE.csv' #file with final predictions dataset_gt_csv = '...Forwards/ActiveSpeakers/publish/final/gt.csv' # Utility file to use the official evaluation tool #cleanup temp dir del_files = glob.glob(temporary_dir + '/*') for f in del_files: os.remove(f) pred_files, gt_files = select_files(forward_dir, ava_ground_truth_dir) for idx, (pf, gtf) in enumerate(zip(pred_files, gt_files)): prediction_data = csv_to_list(pf) gt_data = csv_to_list(gtf) print('Match', os.path.basename(pf), len(prediction_data), len(gt_data)) if len(prediction_data) != len(gt_data): raise Exception('Groundtruth and prediction dont match in lenght') post_processed_predictions = prediction_postprocessing( prediction_data, 1) #reformat into ava required style for idx in range(len(post_processed_predictions)): post_processed_predictions[idx] = [ gt_data[idx][0], gt_data[idx][1], gt_data[idx][2], gt_data[idx][3], gt_data[idx][4], gt_data[idx][5],