from itertools import islice


def test_bulk_read_write(store):
    data_as_dict = dict(make_entry() for _ in range(100))
    bulk_write(store, data_as_dict)
    assert bulk_read(store) == data_as_dict
    assert set(store.list()) == set(data_as_dict.keys())

    # Read back only half of the data
    subset = dict(islice(data_as_dict.items(), 0, len(data_as_dict) // 2))
    assert bulk_read(store, select=subset.keys()) == subset
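# The test above relies on helpers defined elsewhere in the project. Below
# is a minimal sketch of what they could look like, assuming a store object
# with read/write/list methods. make_entry, bulk_write, and bulk_read here
# are illustrative stand-ins, not the project's actual implementations.
from uuid import uuid4


def make_entry():
    # Hypothetical: produce a unique (key, value) pair for the test store.
    key = str(uuid4())
    return key, "value-" + key


def bulk_write(store, entries):
    # Write every (key, value) pair into the store.
    for key, value in entries.items():
        store.write(key, value)


def bulk_read(store, select=None):
    # Read all entries, or only the keys listed in `select`.
    keys = store.list() if select is None else select
    return {key: store.read(key) for key in keys}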
def Convert(config):
    # collect_files(get_ray_nodes())
    reps = ReprStorage(os.path.join(config.repr.directory))

    # Extract signatures from the video-level representations
    sm = SimilarityModel()
    vid_level_iterator = bulk_read(reps.video_level)
    print("Preparing to update the database! Number of videos: {}".format(len(vid_level_iterator)))

    if len(vid_level_iterator) > 0:
        # Get {ReprKey => signature} dict
        signatures = sm.predict(vid_level_iterator)

        if config.database.use:
            # Convert dict to a list of (path, sha256, url, signature) tuples
            entries = [(key.path, key.hash, key.url, sig) for key, sig in signatures.items()]

            # Connect to the database and ensure the schema exists
            database = Database(uri=config.database.uri)
            database.create_tables()

            try:
                # Save signatures
                result_storage = DBResultStorage(database)
                result_storage.add_signatures(entries)

                # After the signatures are written to the database, remove the files
                for key, sig in signatures.items():
                    remove_file("/project/data/representations/video_level/" + key.path + ".npy")
            except Exception as e:
                print("Failed to save signatures to the database!")
                print(e)
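# remove_file above is assumed to be a small wrapper around os.remove.
# A plausible sketch (hypothetical, not necessarily the project's helper):
import os


def remove_file(path):
    # Delete the file if it exists; tolerate paths that are already gone.
    if os.path.exists(path):
        os.remove(path)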
def test_intermediate_cnn_extractor(intermediate_cnn_results, repr_keys):
    assert set(intermediate_cnn_results.frame_level.list()) == set(repr_keys)

    frame_level_features = list(bulk_read(intermediate_cnn_results.frame_level).values())
    shapes_correct = sum(features.shape[1] == 4096 for features in frame_level_features)
    assert shapes_correct == len(repr_keys)
@pytest.fixture
def signatures(frame_to_video_results):
    """Get calculated signatures as a dict.

    Each test that depends on this fixture is guaranteed to be executed
    AFTER the signatures are calculated.

    Returns:
        Signatures dict: (orig_path, hash) => signature.
    """
    reprs = frame_to_video_results
    sm = SimilarityModel()
    signatures = sm.predict(bulk_read(reprs.video_level))
    for repr_key, sig_value in signatures.items():
        reprs.signature.write(repr_key, sig_value)
    return signatures
def test_saved_signatures(reprs, repr_keys):
    signatures = bulk_read(reprs.signature)
    assert set(signatures.keys()) == set(repr_keys)

    signatures_array = np.array(list(signatures.values()))
    assert signatures_array.shape == (NUMBER_OF_TEST_VIDEOS, 500)
def test_frame_to_video_converter(frame_to_video_results, repr_keys):
    assert set(frame_to_video_results.video_level.list()) == set(repr_keys)

    video_level_features = np.array(list(bulk_read(frame_to_video_results.video_level).values()))
    assert video_level_features.shape == (len(repr_keys), 1, 4096)
def main(config, list_of_files, frame_sampling, save_frames):
    config = resolve_config(config_path=config, frame_sampling=frame_sampling, save_frames=save_frames)
    reps = ReprStorage(os.path.join(config.repr.directory))
    reprkey = reprkey_resolver(config)

    print('Searching for dataset video files')
    if len(list_of_files) == 0:
        videos = scan_videos(config.sources.root, '**', extensions=config.sources.extensions)
    else:
        videos = scan_videos_from_txt(list_of_files, extensions=config.sources.extensions)
    print('Number of files found: {}'.format(len(videos)))

    remaining_videos_path = [path for path in videos if not reps.frame_level.exists(reprkey(path))]
    print('There are {} videos left to process'.format(len(remaining_videos_path)))

    VIDEOS_LIST = create_video_list(remaining_videos_path, config.proc.video_list_filename)
    print('Processed video list saved to: {}'.format(VIDEOS_LIST))

    if len(remaining_videos_path) > 0:
        # Instantiate the extractor
        model_path = default_model_path(config.proc.pretrained_model_local_path)
        extractor = IntermediateCnnExtractor(
            video_src=VIDEOS_LIST,
            reprs=reps,
            reprkey=reprkey,
            frame_sampling=config.proc.frame_sampling,
            save_frames=config.proc.save_frames,
            model=load_featurizer(model_path))
        # Start extracting frame-level features
        extractor.start(batch_size=16, cores=4)

    print('Converting frame-level representations to video-level representations')
    converter = FrameToVideoRepresentation(reps)
    converter.start()

    print('Extracting signatures from video-level representations')
    sm = SimilarityModel()
    vid_level_iterator = bulk_read(reps.video_level)
    assert len(vid_level_iterator) > 0, 'No signatures left to be processed'
    signatures = sm.predict(vid_level_iterator)  # Get {ReprKey => signature} dict

    print('Saving video signatures to: {}'.format(reps.signature.directory))
    if config.database.use:
        # Convert dict to a list of (path, sha256, url, signature) tuples
        entries = [(key.path, key.hash, key.url, sig) for key, sig in signatures.items()]

        # Connect to the database and ensure the schema exists
        database = Database(uri=config.database.uri)
        database.create_tables()

        # Save signatures
        result_storage = DBResultStorage(database)
        result_storage.add_signatures(entries)

    if config.save_files:
        bulk_write(reps.signature, signatures)
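# scan_videos and create_video_list above are assumed utilities. Rough
# sketches of the behavior the pipeline expects from them follow; these
# are hypothetical, not the project's actual implementations.
import glob


def scan_videos(root, wildcard, extensions=()):
    # Recursively collect files under `root` whose extension matches.
    paths = glob.glob(os.path.join(root, wildcard), recursive=True)
    suffixes = tuple('.' + ext.lstrip('.') for ext in extensions)
    return [path for path in paths if path.lower().endswith(suffixes)]


def create_video_list(paths, filename):
    # Persist the list of videos to process, one path per line.
    with open(filename, 'w') as file:
        file.write('\n'.join(paths))
    return os.path.abspath(filename)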
def main(config):
    print('Loading config file')
    config = resolve_config(config_path=config)
    reps = ReprStorage(config.repr.directory)

    # Get mapping (path, hash) => signature
    print('Extracting video signatures')
    signature_iterator = bulk_read(reps.signature)
    if len(signature_iterator) == 0:
        vid_level_iterator = bulk_read(reps.video_level)
        assert len(vid_level_iterator) > 0, "No video_level features were found"
        sm = SimilarityModel()
        signatures_dict = sm.predict(vid_level_iterator)
        # Unpack keys and signatures into separate sequences
        repr_keys, video_signatures = zip(*signatures_dict.items())
    else:
        repr_keys, video_signatures = zip(*signature_iterator.items())

    paths = np.array([key.path for key in repr_keys])
    hashes = np.array([key.hash for key in repr_keys])
    video_signatures = np.array(video_signatures)

    print('Finding matches...')
    # Handle small tests for which the number of videos < number of neighbors
    t0 = time.time()
    neighbors = min(20, video_signatures.shape[0])
    nn = NearestNeighbors(n_neighbors=neighbors, metric='euclidean', algorithm='kd_tree')
    nn.fit(video_signatures)
    distances, indices = nn.kneighbors(video_signatures)
    print('{} seconds spent finding matches'.format(time.time() - t0))

    results, results_distances = filter_results(config.proc.match_distance, distances, indices)

    # Sort result rows by the number of matches, largest first
    ss = sorted(zip(results, results_distances), key=lambda x: len(x[0]), reverse=True)
    results_sorted = [x[0] for x in ss]
    results_sorted_distance = [x[1] for x in ss]

    q = []
    m = []
    distance = []
    print('Generating report')
    for i, r in enumerate(results_sorted):
        for j, matches in enumerate(r):
            if j == 0:
                qq = matches
            q.append(qq)
            m.append(matches)
            distance.append(results_sorted_distance[i][j])

    match_df = pd.DataFrame({"query": q, "match": m, "distance": distance})
    match_df['query_video'] = paths[match_df['query']]
    match_df['query_sha256'] = hashes[match_df['query']]
    match_df['match_video'] = paths[match_df['match']]
    match_df['match_sha256'] = hashes[match_df['match']]
    match_df['self_match'] = match_df['query_video'] == match_df['match_video']

    # Remove self matches
    match_df = match_df.loc[~match_df['self_match'], :]
    # Create a unique index from (query, match)
    match_df['unique_index'] = match_df.apply(uniq, axis=1)
    # Remove duplicated entries (e.g. if A matches B, we don't need B matches A)
    match_df = match_df.drop_duplicates(subset=['unique_index'])

    REPORT_PATH = os.path.join(config.repr.directory, f'matches_at_{config.proc.match_distance}_distance.csv')
    print('Saving unfiltered report to {}'.format(REPORT_PATH))
    match_df.to_csv(REPORT_PATH)

    # Scene detection is currently disabled:
    # if config.proc.detect_scenes:
    #     frame_features_dict = bulk_read(reps.frame_level, select=None)
    #     assert len(frame_features_dict) > 0, 'No frame-level features were found.'
    #     scenes = extract_scenes(frame_features_dict)
    #     scene_metadata = pd.DataFrame(asdict(scenes))
    #
    #     if config.database.use:
    #         # Connect to database
    #         database = Database(uri=config.database.uri)
    #         database.create_tables()
    #
    #         # Save scenes
    #         result_storage = DBResultStorage(database)
    #         result_storage.add_scenes(zip(scenes.video_filename, scenes.video_sha256,
    #                                       scenes.scene_duration_seconds))
    #
    #     if config.save_files:
    #         SCENE_METADATA_OUTPUT_PATH = os.path.join(config.repr.directory, 'scene_metadata.csv')
    #         scene_metadata.to_csv(SCENE_METADATA_OUTPUT_PATH)
    #         print('Scene metadata saved to: {}'.format(SCENE_METADATA_OUTPUT_PATH))

    if config.proc.filter_dark_videos:
        print('Filtering dark and/or short videos')

        # Get original files for which we have both frames and frame-level features
        repr_keys = list(set(reps.video_level.list()))
        paths = [key.path for key in repr_keys]
        hashes = [key.hash for key in repr_keys]

        print('Extracting additional information from video files')
        brightness_estimation = np.array([get_brightness_estimation(reps, key) for key in tqdm(repr_keys)])
        print(brightness_estimation.shape)
        metadata_df = pd.DataFrame({
            "fn": paths,
            "sha256": hashes,
            "gray_max": brightness_estimation.reshape(brightness_estimation.shape[0]),
        })

        # Flag videos to be discarded
        metadata_df['video_dark_flag'] = metadata_df.gray_max < config.proc.filter_dark_videos_thr
        print('Videos discarded because of darkness: {}'.format(metadata_df['video_dark_flag'].sum()))
        metadata_df['flagged'] = metadata_df['video_dark_flag']

        # Discard flagged videos
        discarded_videos = metadata_df.loc[metadata_df['flagged'], :][['fn', 'sha256']]
        discarded_videos = set(tuple(row) for row in discarded_videos.to_numpy())

        # Check whether a (path, hash) row is in the discarded set
        def is_discarded(row):
            return tuple(row) in discarded_videos

        msk_1 = match_df[['query_video', 'query_sha256']].apply(is_discarded, axis=1)
        msk_2 = match_df[['match_video', 'match_sha256']].apply(is_discarded, axis=1)
        discard_msk = msk_1 | msk_2

        FILTERED_REPORT_PATH = os.path.join(config.repr.directory,
                                            f'matches_at_{config.proc.match_distance}_distance_filtered.csv')
        METADATA_REPORT_PATH = os.path.join(config.repr.directory, 'metadata_signatures.csv')

        match_df = match_df.loc[~discard_msk, :]

        if config.database.use:
            # Connect to the database and ensure the schema exists
            database = Database(uri=config.database.uri)
            database.create_tables()

            # Save metadata
            result_storage = DBResultStorage(database)
            if metadata_df is not None:
                metadata_entries = metadata_df[['fn', 'sha256']]
                metadata_entries['metadata'] = metadata_df.drop(columns=['fn', 'sha256']).to_dict('records')
                result_storage.add_metadata(metadata_entries.to_numpy())

            # Save matches
            match_columns = ['query_video', 'query_sha256', 'match_video', 'match_sha256', 'distance']
            result_storage.add_matches(match_df[match_columns].to_numpy())

        if config.save_files:
            print('Saving metadata to {}'.format(METADATA_REPORT_PATH))
            metadata_df.to_csv(METADATA_REPORT_PATH)

            print('Saving filtered matches report to {}'.format(FILTERED_REPORT_PATH))
            match_df.to_csv(FILTERED_REPORT_PATH)
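# filter_results, uniq, and get_brightness_estimation above are assumed
# helpers. Minimal sketches consistent with how main() uses them; the
# bodies below (including the reps.frames accessor) are illustrative
# assumptions, not the project's actual implementations.
import numpy as np


def filter_results(threshold, distances, indices):
    # Keep, for each query row, only the neighbors within `threshold`.
    results = []
    results_distances = []
    mask = distances < threshold
    for i, row_mask in enumerate(mask):
        results.append(indices[i, row_mask])
        results_distances.append(distances[i, row_mask])
    return results, results_distances


def uniq(row):
    # Order-independent identifier for a (query, match) pair so that
    # (A, B) and (B, A) collapse onto the same unique index.
    return ''.join(str(item) for item in sorted([row['query'], row['match']]))


def get_brightness_estimation(reps, key):
    # Load the frames sampled for a video and return their maximum gray
    # level as a cheap "is this video dark?" signal.
    frames = reps.frames.read(key)           # hypothetical accessor
    gray = np.asarray(frames).mean(axis=-1)  # naive RGB to gray conversion
    return np.array([gray.max()])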