def Convert(config):
    """Turn stored video-level representations into signatures and persist them.

    Every video-level representation found under ``config.repr.directory`` is
    read and passed to ``SimilarityModel.predict``.  When
    ``config.database.use`` is set, the resulting signatures are written to the
    database at ``config.database.uri`` and, once written, the matching
    ``.npy`` files are removed.  Any exception raised while saving or removing
    is printed and not propagated.

    Args:
        config: Resolved configuration object exposing ``repr.directory``,
            ``database.use`` and ``database.uri``.
    """
    storage = ReprStorage(os.path.join(config.repr.directory))
    model = SimilarityModel()
    video_level = bulk_read(storage.video_level)
    print("Prepare to update database! vid_num :" + str(len(video_level)))

    # Nothing to convert.
    if not len(video_level) > 0:
        return

    # {ReprKey => signature} mapping; computed even if the database is unused.
    signatures = model.predict(video_level)
    if not config.database.use:
        return

    # Flatten the mapping into (path, sha256, url, signature) tuples.
    entries = []
    for key, sig in signatures.items():
        entries.append((key.path, key.hash, key.url, sig))

    database = Database(uri=config.database.uri)
    database.create_tables()

    try:
        DBResultStorage(database).add_signatures(entries)
        # Only reached when the database write succeeded: drop the processed
        # representation files.
        # NOTE(review): the directory is hard-coded rather than derived from
        # config.repr.directory -- confirm this matches the deployment layout.
        for key, _ in signatures.items():
            npy_path = ("/project/data/representations/video_level/" +
                        key.path + ".npy")
            remove_file(npy_path)
    except Exception as error:
        print("save db ERROR!")
        print(error)
def main(override, template_dir):
    """Run template matching over stored representations and save the results.

    Loads the featurizer model, builds a ``SearchEngine`` over the templates
    and the representations in ``config.repr.directory``, creates the
    annotation report and then stores the template matches in the database
    (when ``config.database.use``) and/or as a CSV file (when
    ``config.save_files``).

    NOTE: ``config`` is not a parameter of this function; it is read from the
    enclosing (module) scope, as are ``DISTANCE`` and ``TEMPLATE_TEST_OUTPUT``.

    Args:
        override: Forwarded as ``override=`` to
            ``DBResultStorage.add_template_matches``.
        template_dir: Directory with templates.  When empty (or ``None``) the
            configured ``config.templates.source_path`` is used instead.
    """
    print('Loading model...')
    model_path = default_model_path(config.proc.pretrained_model_local_path)
    model = load_featurizer(model_path)

    templates_source = config.templates.source_path
    # Truthiness check instead of len(): also tolerates template_dir=None.
    if template_dir:
        templates_source = template_dir

    print(f'Initiating search engine using templates from: '
          f'{templates_source} and looking at '
          f'videos located in: {config.repr.directory}')

    reprs = ReprStorage(config.repr.directory)
    se = SearchEngine(templates_root=templates_source,
                      reprs=reprs,
                      model=model)

    template_matches = se.create_annotation_report(
        threshold=DISTANCE,
        fp=TEMPLATE_TEST_OUTPUT,
        frame_sampling=config.proc.frame_sampling)

    # Explicit copy: a new column is added below, and assigning to a column
    # slice of template_matches would be a chained assignment.
    tm_entries = template_matches[['fn', 'sha256']].copy()
    tm_entries['template_matches'] = template_matches.drop(
        columns=['fn', 'sha256']).to_dict('records')

    if config.database.use:
        # Connect to database
        database = Database(uri=config.database.uri)
        database.create_tables()

        # Save Template Matches
        result_storage = DBResultStorage(database)
        result_storage.add_template_matches(tm_entries.to_numpy(),
                                            override=override)

    if config.save_files:
        report_path = os.path.join(config.repr.directory,
                                   'template_matches.csv')
        template_matches.to_csv(report_path)

        print(f"Template Matches report exported to:{report_path}")

    print('Report saved to {}'.format(TEMPLATE_TEST_OUTPUT))
def main(config):
    """Extract EXIF metadata for all known videos and store it.

    The list of videos comes from the ``Files`` table when
    ``config.database.use`` is set, otherwise from scanning
    ``config.sources.root``.  The parsed metadata is written to
    ``exif_metadata.csv`` under ``config.repr.directory`` (when
    ``config.save_files``) and/or saved to the database.

    Args:
        config: Path to the configuration file; it is resolved with
            ``resolve_config`` and the name is rebound to the resolved object.

    Raises:
        AssertionError: If no videos are found, or if the parsed metadata
            frame does not have one row per extracted metadata entry.
    """
    print('Loading config file')
    config = resolve_config(config_path=config)
    storepath = path_resolver(config.sources.root)

    if config.database.use:
        database = Database(uri=config.database.uri)
        database.create_tables()

        with database.session_scope() as session:
            video_records = session.query(Files).yield_per(10**4)
            path_hash_pairs = [(join(config.sources.root,
                                     record.file_path), record.sha256)
                               for record in video_records]
            if path_hash_pairs:
                videos, hashes = zip(*path_hash_pairs)
            else:
                # zip(*[]) yields nothing, so unpacking it would raise a
                # ValueError before the "No videos found" check below.
                videos, hashes = (), ()
    else:
        videos = scan_videos(config.sources.root,
                             '**',
                             extensions=config.sources.extensions)
        hashes = [get_hash(video) for video in videos]

    assert len(videos) > 0, 'No videos found'

    print(f'{len(videos)} videos found')

    metadata = extract_from_list_of_videos(videos)

    df = convert_to_df(metadata)

    df_parsed = parse_and_filter_metadata_df(df)

    # Rows are zipped positionally with videos/hashes below, so the parsed
    # frame must not have dropped or added any entry.
    assert len(metadata) == len(df_parsed)

    if config.save_files:

        EXIF_REPORT_PATH = join(config.repr.directory, 'exif_metadata.csv')

        df_parsed.to_csv(EXIF_REPORT_PATH)

        print(f"Exif Metadata report exported to:{EXIF_REPORT_PATH}")

    if config.database.use:
        database = Database(uri=config.database.uri)
        result_store = DBResultStorage(database)
        # (stored path, sha256, exif record) triples.
        exif_entries = zip(map(storepath, videos), hashes,
                           df_parsed.to_dict('records'))
        result_store.add_exifs(exif_entries)
# Example no. 4
0
def store():
    """Fixture: result storage backed by a fresh in-memory database.

    The database is created with ``echo=False`` and its tables are created
    before it is wrapped in a ``DBResultStorage``.
    """
    db = Database.in_memory(echo=False)
    db.create_tables()
    storage = DBResultStorage(db)
    return storage
def find_matchs(config):
    """Match videos by the signatures stored in the database.

    Reads every ``Files`` row together with its signature, decodes the
    signature blobs into float tuples, runs a euclidean nearest-neighbour
    search over them and, when ``config.database.use`` is set, saves the
    resulting matches through ``DBResultStorage.add_matches``.

    Dark-video filtering and metadata saving are disabled in this function.

    Args:
        config: Resolved configuration object exposing ``database.uri``,
            ``database.use`` and ``proc.match_distance``.
    """
    print('Reading Video Signatures')
    database = Database(uri=config.database.uri)
    with database.session_scope() as session:
        query = session.query(Files).options(joinedload(Files.signature))
        files = query.filter().all()

        signature_iterator = dict()
        for file in files:
            if file.signature is None or not check_is_signature_valid(file):
                continue
            # Decode the blob in memory as a sequence of 4-byte floats.
            # (Previously round-tripped through a shared /tmp file, which was
            # needless I/O and unsafe for concurrent runs.)
            raw = bytes(file.signature.signature)
            sig = struct.unpack('%df' % (len(raw) // 4), raw)

            signature_iterator[ReprKey(path=file.file_path,
                                       hash=file.sha256,
                                       tag=file.meta,
                                       url=file.file_url)] = sig

        if not signature_iterator:
            # zip(*{}.items()) cannot be unpacked and there is nothing to fit.
            print('No valid signatures found, nothing to match')
            return

        repr_keys, video_signatures = zip(*signature_iterator.items())
        paths = np.array([key.path for key in repr_keys])
        hashes = np.array([key.hash for key in repr_keys])
        video_signatures = np.array(video_signatures)

    print('Finding Matches...')
    # Handles small tests for which number of videos <  number of neighbors
    t0 = time.time()
    neighbors = min(20, video_signatures.shape[0])
    nn = NearestNeighbors(n_neighbors=neighbors,
                          metric='euclidean',
                          algorithm='kd_tree')
    nn.fit(video_signatures)
    distances, indices = nn.kneighbors(video_signatures)
    print('{} seconds spent finding matches '.format(time.time() - t0))
    results, results_distances = filter_results(config.proc.match_distance,
                                                distances, indices)

    # Largest match groups first.
    ranked = sorted(zip(results, results_distances),
                    key=lambda x: len(x[0]),
                    reverse=True)

    q = []
    m = []
    distance = []

    print('Generating Report')
    for matched, matched_distances in ranked:
        for j, match in enumerate(matched):
            # The first entry of each group is used as the query index.
            q.append(matched[0])
            m.append(match)
            distance.append(matched_distances[j])

    match_df = pd.DataFrame({"query": q, "match": m, "distance": distance})
    match_df['query_video'] = paths[match_df['query']]
    match_df['query_sha256'] = hashes[match_df['query']]
    match_df['match_video'] = paths[match_df['match']]
    match_df['match_sha256'] = hashes[match_df['match']]
    match_df['self_match'] = match_df['query_video'] == match_df['match_video']
    # Remove self matches; copy so the column added next is not written to a
    # slice of the original frame.
    match_df = match_df.loc[~match_df['self_match'], :].copy()
    # Creates unique index from query, match
    match_df['unique_index'] = match_df.apply(uniq, axis=1)
    # Removes duplicated entries (eg if A matches B, we don't need B matches A)
    match_df = match_df.drop_duplicates(subset=['unique_index'])

    if config.database.use:
        # Connect to database and ensure schema
        database = Database(uri=config.database.uri)
        database.create_tables()

        result_storage = DBResultStorage(database)

        # Save matches
        match_columns = [
            'query_video', 'query_sha256', 'match_video', 'match_sha256',
            'distance'
        ]

        result_storage.add_matches(match_df[match_columns].to_numpy())
def main(config, list_of_files, frame_sampling, save_frames):
    """Run the feature-extraction pipeline up to video signatures.

    Steps, in order: resolve the configuration, locate video files, extract
    frame-level features for videos that do not have them yet, convert
    frame-level to video-level representations, compute signatures with
    ``SimilarityModel`` and save them to the database
    (``config.database.use``) and/or to the signature storage
    (``config.save_files``).

    Args:
        config: Path to the configuration file, resolved via
            ``resolve_config``.
        list_of_files: Source passed to ``scan_videos_from_txt``; when its
            length is zero, ``config.sources.root`` is scanned instead.
        frame_sampling: Forwarded to ``resolve_config``.
        save_frames: Forwarded to ``resolve_config``.

    Raises:
        AssertionError: If no video-level representations are available.
    """
    config = resolve_config(config_path=config,
                            frame_sampling=frame_sampling,
                            save_frames=save_frames)

    storage = ReprStorage(os.path.join(config.repr.directory))
    key_for = reprkey_resolver(config)

    print('Searching for Dataset Video Files')

    if len(list_of_files) != 0:
        videos = scan_videos_from_txt(list_of_files,
                                      extensions=config.sources.extensions)
    else:
        videos = scan_videos(config.sources.root,
                             '**',
                             extensions=config.sources.extensions)

    print('Number of files found: {}'.format(len(videos)))

    # Keep only the videos without stored frame-level features.
    pending = []
    for video_path in videos:
        if not storage.frame_level.exists(key_for(video_path)):
            pending.append(video_path)

    print('There are {} videos left'.format(len(pending)))

    video_list_file = create_video_list(pending,
                                        config.proc.video_list_filename)

    print('Processed video List saved on :{}'.format(video_list_file))

    if len(pending) > 0:
        # Build the extractor and start extracting frame-level features.
        model_path = default_model_path(
            config.proc.pretrained_model_local_path)
        extractor = IntermediateCnnExtractor(
            video_src=video_list_file,
            reprs=storage,
            reprkey=key_for,
            frame_sampling=config.proc.frame_sampling,
            save_frames=config.proc.save_frames,
            model=(load_featurizer(model_path)))
        extractor.start(batch_size=16, cores=4)

    print('Converting Frame by Frame representations to Video Representations')

    FrameToVideoRepresentation(storage).start()

    print('Extracting Signatures from Video representations')

    similarity_model = SimilarityModel()
    video_level = bulk_read(storage.video_level)

    assert len(video_level) > 0, 'No Signatures left to be processed'

    # {ReprKey => signature} mapping
    signatures = similarity_model.predict(video_level)

    print('Saving Video Signatures on :{}'.format(storage.signature.directory))

    if config.database.use:
        # Flatten the mapping into (path, sha256, url, signature) tuples.
        entries = []
        for key, sig in signatures.items():
            entries.append((key.path, key.hash, key.url, sig))

        # Connect to the database and make sure the schema exists.
        database = Database(uri=config.database.uri)
        database.create_tables()

        DBResultStorage(database).add_signatures(entries)

    if config.save_files:
        bulk_write(storage.signature, signatures)
def main(config):
    """Find matching videos from stored signatures and save the reports.

    Signatures are read from the signature storage; when none exist they are
    computed from the video-level representations.  A euclidean
    nearest-neighbour search produces the matches, which are always written to
    an unfiltered CSV report.  When ``config.proc.filter_dark_videos`` is set,
    videos whose brightness estimate is below
    ``config.proc.filter_dark_videos_thr`` are removed from the matches.
    Results are then saved to the database (``config.database.use``) and/or to
    CSV files (``config.save_files``).

    Scene detection is currently disabled in this function.

    Args:
        config: Path to the configuration file, resolved via
            ``resolve_config``.

    Raises:
        AssertionError: If neither signatures nor video-level features exist.
    """
    print('Loading config file')
    config = resolve_config(config_path=config)
    reps = ReprStorage(config.repr.directory)

    # Get mapping (path,hash) => sig.
    print('Extracting Video Signatures')
    signature_iterator = bulk_read(reps.signature)

    if len(signature_iterator) == 0:
        vid_level_iterator = bulk_read(reps.video_level)
        assert len(vid_level_iterator) > 0, "No video_level features were found"
        sm = SimilarityModel()
        # Reuse the representations already read instead of reading twice.
        signatures_dict = sm.predict(vid_level_iterator)
        # Unpack keys and signatures as separate sequences
        repr_keys, video_signatures = zip(*signatures_dict.items())
    else:
        repr_keys, video_signatures = zip(*signature_iterator.items())

    paths = np.array([key.path for key in repr_keys])
    hashes = np.array([key.hash for key in repr_keys])
    video_signatures = np.array(video_signatures)

    print('Finding Matches...')
    # Handles small tests for which number of videos <  number of neighbors
    t0 = time.time()
    neighbors = min(20, video_signatures.shape[0])
    nn = NearestNeighbors(n_neighbors=neighbors,
                          metric='euclidean',
                          algorithm='kd_tree')
    nn.fit(video_signatures)
    distances, indices = nn.kneighbors(video_signatures)
    print('{} seconds spent finding matches '.format(time.time() - t0))
    results, results_distances = filter_results(config.proc.match_distance,
                                                distances, indices)

    # Largest match groups first.
    ranked = sorted(zip(results, results_distances),
                    key=lambda x: len(x[0]),
                    reverse=True)

    q = []
    m = []
    distance = []

    print('Generating Report')
    for matched, matched_distances in ranked:
        for j, match in enumerate(matched):
            # The first entry of each group is used as the query index.
            q.append(matched[0])
            m.append(match)
            distance.append(matched_distances[j])

    match_df = pd.DataFrame({"query": q, "match": m, "distance": distance})
    match_df['query_video'] = paths[match_df['query']]
    match_df['query_sha256'] = hashes[match_df['query']]
    match_df['match_video'] = paths[match_df['match']]
    match_df['match_sha256'] = hashes[match_df['match']]
    match_df['self_match'] = match_df['query_video'] == match_df['match_video']
    # Remove self matches; copy so the column added next is not written to a
    # slice of the original frame.
    match_df = match_df.loc[~match_df['self_match'], :].copy()
    # Creates unique index from query, match
    match_df['unique_index'] = match_df.apply(uniq, axis=1)
    # Removes duplicated entries (eg if A matches B, we don't need B matches A)
    match_df = match_df.drop_duplicates(subset=['unique_index'])

    REPORT_PATH = os.path.join(
        config.repr.directory,
        f'matches_at_{config.proc.match_distance}_distance.csv')
    FILTERED_REPORT_PATH = os.path.join(
        config.repr.directory,
        f'matches_at_{config.proc.match_distance}_distance_filtered.csv')
    METADATA_REPORT_PATH = os.path.join(config.repr.directory,
                                        'metadata_signatures.csv')

    print('Saving unfiltered report to {}'.format(REPORT_PATH))

    match_df.to_csv(REPORT_PATH)

    # Stays None unless dark-video filtering runs; the database and file
    # saving steps below check it instead of hitting an unbound name.
    metadata_df = None

    if config.proc.filter_dark_videos:

        print('Filtering dark and/or short videos')

        # Keys of all stored video-level representations
        video_keys = list(set(reps.video_level.list()))
        video_paths = [key.path for key in video_keys]
        video_hashes = [key.hash for key in video_keys]

        print('Extracting additional information from video files')
        brightness_estimation = np.array(
            [get_brightness_estimation(reps, key) for key in tqdm(video_keys)])
        print(brightness_estimation.shape)
        metadata_df = pd.DataFrame({
            "fn": video_paths,
            "sha256": video_hashes,
            "gray_max": brightness_estimation.reshape(
                brightness_estimation.shape[0]),
        })

        # Flag videos to be discarded
        metadata_df['video_dark_flag'] = (
            metadata_df.gray_max < config.proc.filter_dark_videos_thr)

        print('Videos discarded because of darkness:{}'.format(
            metadata_df['video_dark_flag'].sum()))

        metadata_df['flagged'] = metadata_df['video_dark_flag']

        # Discard videos
        discarded_videos = metadata_df.loc[metadata_df['flagged'], :][
            ['fn', 'sha256']]
        discarded_videos = set(
            tuple(row) for row in discarded_videos.to_numpy())

        # Function to check if the (path,hash) row is in the discarded set
        def is_discarded(row):
            return tuple(row) in discarded_videos

        msk_1 = match_df[['query_video', 'query_sha256']].apply(is_discarded,
                                                                axis=1)
        msk_2 = match_df[['match_video', 'match_sha256']].apply(is_discarded,
                                                                axis=1)
        discard_msk = msk_1 | msk_2

        match_df = match_df.loc[~discard_msk, :]

    if config.database.use:
        # Connect to database and ensure schema
        database = Database(uri=config.database.uri)
        database.create_tables()

        result_storage = DBResultStorage(database)

        # Save metadata (only produced by the dark-video filtering step)
        if metadata_df is not None:
            metadata_entries = metadata_df[['fn', 'sha256']].copy()
            metadata_entries['metadata'] = metadata_df.drop(
                columns=['fn', 'sha256']).to_dict('records')
            result_storage.add_metadata(metadata_entries.to_numpy())

        # Save matches
        match_columns = ['query_video', 'query_sha256', 'match_video',
                         'match_sha256', 'distance']

        result_storage.add_matches(match_df[match_columns].to_numpy())

    # Without filtering there is no metadata and the matches equal the
    # unfiltered report already written above.
    if config.save_files and metadata_df is not None:
        print('Saving metadata to {}'.format(METADATA_REPORT_PATH))
        metadata_df.to_csv(METADATA_REPORT_PATH)
        print('Saving Filtered Matches report to {}'.format(
            FILTERED_REPORT_PATH))
        match_df.to_csv(FILTERED_REPORT_PATH)