Example 1
def batch_features(x, batch_size=1):
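    """Yield feature vectors from `x` one at a time, materializing only
    `batch_size` rows at once (Dask slices are computed per batch)."""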
    for i in range(0, x.shape[0], batch_size):
        xb = x[i:i + batch_size]
        xb = utils.compute_if_dask(xb, progress=False)
        for xi in xb:
            yield xi
        del xb
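
Example usage (a hypothetical Dask-backed feature matrix; the shapes and chunking are purely illustrative):

import dask.array as da

x = da.random.random((10_000, 2048), chunks=(2500, 2048))
for xi in batch_features(x, batch_size=2500):
    ...  # xi is a single NumPy feature vector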
Example 2
def features_to_str(x, batch_size=1, boost=False):
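    """Yield the surrogate-text string of each feature vector in `x`,
    computing (and discarding) `batch_size` rows at a time."""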
    for i in range(0, x.shape[0], batch_size):
        xb = x[i:i + batch_size]
        xb = utils.compute_if_dask(xb, progress=False)
        for xi in xb:
            yield surrogate_text(xi, boost=boost)
        del xb
Example 3
def generate_index_actions(es, index_name, x, x_ids, thr, s, batch_size=1):
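    """Yield Elasticsearch bulk-index actions: each feature vector is
    thresholded/quantized with `thr_sq` and stored as a surrogate-text
    document (`repr` field) under its image ID."""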
    for i in range(0, x.shape[0], batch_size):
        xb = x[i:i + batch_size]
        xb = thr_sq(xb, thr, s)
        xb = utils.compute_if_dask(xb, progress=False)
        id_b = x_ids[i:i + batch_size]
        for xi_id, xi in zip(id_b, xb):
            # if es.exists(index_name, xi_id):
                # tqdm.write(f'Skipping: {xi_id}')
            #    continue
            yield {'_index': index_name, '_id': xi_id, 'repr': surrogate_text(xi)}
        del xb
Example 4
def surrogate_text(x, boost=False):
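    """Encode a quantized feature vector as a surrogate text document:
    every non-zero dimension index becomes a term, repeated `freq` times,
    or emitted once as `term^freq` when `boost` is True."""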
    surrogate = []
    x = utils.compute_if_dask(x, progress=False)
    for term, freq in enumerate(x):
        if freq:
            if boost:
                surrogate.append('{}^{}'.format(str(term), freq))
            else:
                try:
                    surrogate.extend([str(term)] * freq)
                except TypeError:
                    # freq is not an integer (e.g., a float); report and skip this term
                    print(freq, type(freq))

    return ' '.join(surrogate)
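
For reference, a minimal sketch of what `surrogate_text` produces for a small integer vector (values purely illustrative; this assumes `utils.compute_if_dask` passes plain NumPy arrays through unchanged):

import numpy as np

xi = np.array([0, 2, 0, 1])        # hypothetical term frequencies
surrogate_text(xi)                 # -> '1 1 3'   (term 1 twice, term 3 once)
surrogate_text(xi, boost=True)     # -> '1^2 3^1' (term^freq boost syntax)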
Example 5
def main(args):
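    """Build (or load) a FAISS IVF-PQ index over the database features,
    add vectors in batches, and for several `nprobe` values compute the
    full ranking of the queries, query times, mAP, nDCG and nDCG@25."""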
    dataset, q, x = utils.load_benchmark(args.dataset, args.features)

    q = utils.load_features(q, chunks=(2500, 2048))
    x = utils.load_features(x, chunks=(2500, 2048))

    if args.limit:
        x = x[:args.limit]

    n_points, dim = x.shape

    if args.n_cells is None:
        step_k = 2500
        min_points_per_centroid = 39.0
        max_points_per_centroid = 256.0

        # n_train_points = min(n_points, 120000) # train index with less points or it crashes..
        min_k = np.ceil(
            n_points / (step_k * max_points_per_centroid)).astype(int) * step_k
        max_k = np.floor(
            n_points / (step_k * min_points_per_centroid)).astype(int) * step_k
        args.n_cells = min_k
        print('Using min suggested cells:', args.n_cells)

    exp = Experiment(args, root=args.output, ignore=('output', 'pretrained'))
    print(exp)

    # create or load faiss index
    index_file = exp.path_to('index.faiss')
    if not os.path.exists(index_file):
        if args.pretrained:
            print('Loading pre-trained empty index ...')
            index = faiss.read_index(args.pretrained)
            train_time = None
        else:
            tmp = utils.compute_if_dask(x)
            print('Creating index: training ...')
            index = faiss.index_factory(
                dim, 'IVF{},PQ{}'.format(args.n_cells, args.code_size))
            # index = faiss.index_factory(dim, 'IVF{},Flat'.format(args.n_cells))
            start = time.time()
            index.train(tmp)
            train_time = time.time() - start
            del tmp

        print('Creating index: adding ...')
        start = time.time()
        bs = 2**14
        for i in trange(0, x.shape[0], bs):
            batch = utils.compute_if_dask(x[i:i + bs])
            index.add(batch)
        add_time = time.time() - start

        faiss.write_index(index, index_file)
        size = os.path.getsize(index_file)
        index_stats_file = exp.path_to('index_stats.csv')
        index_stats = pd.DataFrame(
            {
                'size': size,
                'train_time': train_time,
                'add_time': add_time
            },
            index=[0])
        index_stats.to_csv(index_stats_file, index=False)
    else:
        print('Loading pre-built index ...')
        index = faiss.read_index(index_file)

    n_probes = (1, 2, 5, 10, 25)  # , 50, 100, 250, 500, 1000, 2500, 5000)
    n_probes = [p for p in n_probes if p <= args.n_cells]
    params = vars(args)
    progress = tqdm(n_probes)
    for p in progress:
        index.nprobe = p
        params['nprobe'] = p
        progress.set_postfix(
            {k: v
             for k, v in params.items() if k != 'output'})

        scores = None
        scores_file = exp.path_to(f'scores_np{p}.h5')
        if not os.path.exists(scores_file):
            print('Computing scores:', scores_file)
            q = utils.compute_if_dask(q)
            # execute kNN search using k = dataset size
            ranked_sim, ranked_ids = index.search(q, n_points)
            # we need a similarity matrix, we construct it from the ranked results.
            # we fill it initially with the lowest score (not recovered IDs has infinity score)
            if False:  # XXX OPTIMIZED VERSION NOT WORKING!!!!
                ranked_ids = np.ma.array(ranked_ids, mask=(ranked_ids < 0))
                id_order = ranked_ids.argsort(axis=1)
                scores = -ranked_sim[np.arange(q.shape[0]).reshape(-1, 1),
                                     id_order]
                del ranked_sim, ranked_ids, id_order
            else:
                scores = np.full((q.shape[0], n_points), np.inf)
                for i, (rsims, rids) in enumerate(zip(ranked_sim, ranked_ids)):
                    for rsim, rid in zip(rsims, rids):
                        if rid >= 0:  # faiss marks missing results with -1
                            scores[i, rid] = rsim
                scores = -scores

            utils.save_as_hdf5(scores, scores_file, progress=True)

        query_times, query_times_file = exp.require_csv('query_times.csv',
                                                        index='n_probes')
        for i in trange(1, 6):
            if utils.value_missing(query_times, p, f'query_time_run{i}'):
                q = utils.compute_if_dask(q)
                start = time.time()
                index.search(q, n_points)
                query_time = time.time() - start
                query_times.at[p, f'query_time_run{i}'] = query_time
                query_times.to_csv(query_times_file)

        metrics, metrics_file = exp.require_csv(f'metrics_np{p}.csv')

        if 'ap' not in metrics:
            if scores is None:
                print('Loading scores...')
                scores = utils.load_features(scores_file)
            print('Computing mAP...')
            metrics['ap'] = dataset.score(scores[...],
                                          reduction=False,
                                          progress=True)
            metrics.to_csv(metrics_file, index=False)

        if 'ndcg' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                print('Loading scores...')
                scores = utils.load_features(scores_file)
            print('Computing nDCG...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true

            bs = 5
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(
                    dcg(y_true[i:i + bs], scores[i:i + bs], normalized=True))
            ndcg = np.concatenate(ndcg)

            # metrics['ndcg'] = dcg(y_true, scores, normalized=True)
            metrics['ndcg'] = ndcg
            metrics.to_csv(metrics_file, index=False)

        if 'ndcg@25' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG@25...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            bs = 50
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(
                    dcg(y_true[i:i + bs],
                        scores[i:i + bs],
                        p=25,
                        normalized=True))

            metrics['ndcg@25'] = np.concatenate(ndcg)
            # metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG@25: {metrics["ndcg@25"].mean()}')

        metrics['n_probes'] = p
        metrics.to_csv(metrics_file, index=False)
Example 6
def main(args):
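    """For each (threshold, sq_factor) pair: measure the density of the
    quantized features, bulk-index their surrogate texts into
    Elasticsearch, run the queries as full-text searches, and compute
    query times, mAP and nDCG."""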
    es = Elasticsearch(timeout=30, max_retries=10, retry_on_timeout=True)
    dataset, q, x = utils.load_benchmark(args.dataset, args.features)

    q = utils.load_features(q, chunks=(5000, 2048))
    x = utils.load_features(x, chunks=(5000, 2048))
    n_queries, n_samples = q.shape[0], x.shape[0]

    if args.limit:
        x = x[:args.limit]

    if args.crelu:
        q = crelu(q)
        x = crelu(x)

    params = vars(args)
    ignore = ('output', 'force')
    progress = tqdm(zip(args.threshold, args.sq_factor), total=len(args.threshold))
    for thr, s in progress:
        params['threshold'] = thr
        params['sq_factor'] = s
        progress.set_postfix({k: v for k, v in params.items() if k not in ignore})
        exp = Experiment(params, root=args.output, ignore=ignore)

        density, density_file = exp.require_csv('density.csv')
        if 'query_density' not in density:
            progress.write('Computing query density ...')
            q_sq = thr_sq(q, thr, s)
            q_density = (q_sq != 0).mean(axis=0)
            q_density = utils.compute_if_dask(q_density)
            density['query_density'] = q_density
            density.to_csv(density_file, index=False)

        if 'database_density' not in density:
            progress.write('Computing database density ...')
            x_sq = thr_sq(x, thr, s)
            x_density = (x_sq != 0).mean(axis=0)
            x_density = utils.compute_if_dask(x_density)
            density['database_density'] = x_density
            density.to_csv(density_file, index=False)

        index_name = exp.name.lower()
        if not es.indices.exists(index_name) or es.count(index=index_name)['count'] < n_samples or args.force:
            # x_sq = thr_sq(x, thr, s)
            x_ids, _ = dataset.images()

            index_actions = generate_index_actions(es, index_name, x, x_ids, thr, s, 50)
            # index_actions = tqdm(index_actions, total=n_samples)

            progress.write(f'Indexing: {index_name}')

            index_config = {
                "mappings": {
                    "_source": {"enabled": False},  # do not store STR
                    "properties": {"repr": {"type": "text"}}  # FULLTEXT
                },
                "settings": {
                    "index": {"number_of_shards": 1, "number_of_replicas": 0},
                    "analysis": {"analyzer": {"first": {"type": "whitespace"}}}
                }
            }
            
            # es.indices.delete(index_name, ignore=(400, 404))
            es.indices.create(index_name, index_config, ignore=400)
            es.indices.put_settings({"index": {"refresh_interval": "-1", "number_of_replicas": 0}}, index_name)

            indexing = parallel_bulk(es, index_actions, thread_count=4, chunk_size=150, max_chunk_bytes=2**26)
            indexing = tqdm(indexing, total=n_samples)
            start = time.time()            
            deque(indexing, maxlen=0)
            add_time = time.time() - start
            progress.write(f'Index time: {add_time}')

            es.indices.put_settings({"index": {"refresh_interval": "1s"}}, index_name)
            es.indices.refresh()

            index_stats_file = exp.path_to('index_stats.csv')
            index_stats = pd.DataFrame({'add_time': add_time}, index=[0])
            index_stats.to_csv(index_stats_file, index=False)

        metrics, metrics_file = exp.require_csv('metrics.csv')

        scores = None
        scores_file = exp.path_to('scores.h5')
        if not os.path.exists(scores_file):
            progress.write('Computing scores...')

            xid2idx = {k: i for i, k in enumerate(dataset.images()[0])}
            q_sq = thr_sq(q, thr, s)
            q_sq = utils.compute_if_dask(q_sq, progress=False)

            scores = np.zeros((n_queries, n_samples), dtype=np.float32)
            query_times = []
            
            for i, qi in enumerate(tqdm(q_sq)):
                query = {
                    "query": {"query_string": {"default_field": "repr", "query": surrogate_text(qi, boost=True)}},
                    # "from": 0, "size": n_samples
                }
                start = time.time()
                for hit in tqdm(scan(es, query, index=index_name, preserve_order=True), total=n_samples):
                    j = xid2idx[hit['_id']]
                    scores[i, j] = hit['_score']

                query_times.append(time.time() - start)
            metrics['query_time'] = query_times
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'Query time: {metrics.query_time.sum()}')
            utils.save_as_hdf5(scores, scores_file, progress=True)

        if 'ap' not in metrics:
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing mAP...')
            metrics['ap'] = dataset.score(scores, reduction=False, progress=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'mAP: {metrics.ap.mean()}')

        if 'ndcg' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG...')
            metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG: {metrics.ndcg.mean()}')
Example 7
def main(args):
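    """Same surrogate-text experiment on a Lucene index (via PyLucene):
    measure density, indexing time and size, run the queries (sequentially
    or with a thread pool), and compute query times, mAP, nDCG and nDCG@25."""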
    lucene_vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    lucene_vm.attachCurrentThread()

    dataset, q, x = utils.load_benchmark(args.dataset, args.features)

    q = utils.load_features(q, chunks=(5000, 2048))
    x = utils.load_features(x, chunks=(5000, 2048))

    if args.limit:
        x = x[:args.limit]

    n_queries, n_samples = q.shape[0], x.shape[0]

    if args.crelu:
        q = crelu(q)
        x = crelu(x)

    params = vars(args)
    ignore = ('output', 'force')
    progress = tqdm(zip(args.threshold, args.sq_factor),
                    total=len(args.threshold))
    for thr, s in progress:
        params['threshold'] = thr
        params['sq_factor'] = s
        progress.set_postfix(
            {k: v
             for k, v in params.items() if k not in ignore})
        exp = Experiment(params, root=args.output, ignore=ignore)

        density, density_file = exp.require_csv('density.csv')
        if 'query_density' not in density:
            progress.write('Computing query density ...')
            q_re = q.rechunk({0: -1, 1: 'auto'}) if utils.is_dask(q) else q
            q_sq = threshold(q_re, thr, s)
            q_density = (q_sq != 0).mean(axis=0)
            q_density = utils.compute_if_dask(q_density)
            density['query_density'] = q_density
            density.to_csv(density_file, index=False)

        if 'database_density' not in density:
            progress.write('Computing database density ...')
            x_re = x.rechunk({0: -1, 1: 'auto'}) if utils.is_dask(x) else x
            x_sq = threshold(x_re, thr, s)
            x_density = (x_sq != 0).mean(axis=0)
            x_density = utils.compute_if_dask(x_density)
            density['database_density'] = x_density
            density.to_csv(density_file, index=False)

        index_stats, index_stats_file = exp.require_csv('index_stats.csv')

        index_name = exp.name.lower()
        index_path = exp.path_to('lucene_index')
        with LuceneIndex(index_path) as idx:
            if idx.count() < n_samples:
                x_sq = threshold(x, thr, s)
                x_sq = batch_features(x_sq, 5000)
                # x_str = features_to_str(x_sq, 5000)

                progress.write(f'Indexing: {index_name}')

                start = time.time()
                for i, xi in enumerate(tqdm(x_sq, total=n_samples)):
                    idx.add(str(i), xi)

                add_time = time.time() - start
                progress.write(f'Index time: {add_time}')

                index_stats.at[0, 'add_time'] = add_time

            if 'size' not in index_stats.columns:
                index_stats.at[0, 'size'] = utils.get_folder_size(index_path)

            index_stats.to_csv(index_stats_file, index=False)

        metrics, metrics_file = exp.require_csv('metrics.csv')

        scores = None
        scores_file = exp.path_to('scores.h5')
        if not os.path.exists(scores_file):
            progress.write('Computing scores...')

            q_sq = threshold(q, thr, s)
            q_sq = utils.compute_if_dask(q_sq, progress=False)
            # q_str = features_to_str(q_sq, n_queries, boost=True)

            scores = np.zeros((n_queries, n_samples), dtype=np.float32)
            query_times = []

            if True:  # sequential version
                for i, qi in enumerate(tqdm(q_sq, total=n_queries)):
                    start = time.time()
                    if qi.any():
                        for j, score in tqdm(idx.query(qi, n_samples),
                                             total=n_samples):
                            scores[i, int(j)] = score
                        query_times.append(time.time() - start)
                    else:
                        query_times.append(None)

            else:  # Parallel version (currently slower)
                idx._init_searcher()

                def _search(i, qi):
                    lucene_vm.attachCurrentThread()
                    scores_i = np.zeros(n_samples, dtype=np.float32)
                    start = time.time()
                    if qi.any():
                        for j, score in idx.query(qi, n_samples):
                            scores_i[int(j)] = score
                        query_time = time.time() - start
                    else:
                        query_time = None

                    return scores_i, query_time

                queries = enumerate(tqdm(q_sq, total=n_queries))
                scores_n_times = Parallel(n_jobs=6, prefer="threads")(
                    delayed(_search)(i, qi) for i, qi in queries)
                scores, query_times = zip(*scores_n_times)
                scores = np.vstack(scores)

            metrics['query_time'] = query_times
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'Query time: {metrics.query_time.sum()}')
            utils.save_as_hdf5(scores, scores_file, progress=True)

        if 'ap' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing mAP...')
            metrics['ap'] = dataset.score(scores,
                                          reduction=False,
                                          progress=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'mAP: {metrics.ap.mean()}')

        if 'ndcg' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            bs = 50
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(
                    dcg(y_true[i:i + bs], scores[i:i + bs], normalized=True))

            metrics['ndcg'] = np.concatenate(ndcg)
            # metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG: {metrics.ndcg.mean()}')

        if 'ndcg@25' not in metrics:
            dataset._load()  # TODO in y_true getter
            if scores is None:
                progress.write('Loading scores...')
                scores = utils.load_features(scores_file)[...]
            progress.write('Computing nDCG@25...')
            y_true = dataset.y_true[:, :args.limit] if args.limit else dataset.y_true
            bs = 50
            ndcg = []
            for i in trange(0, y_true.shape[0], bs):
                ndcg.append(
                    dcg(y_true[i:i + bs],
                        scores[i:i + bs],
                        p=25,
                        normalized=True))

            metrics['ndcg@25'] = np.concatenate(ndcg)
            # metrics['ndcg'] = dcg(dataset.y_true, scores, normalized=True)
            metrics.to_csv(metrics_file, index=False)
            progress.write(f'nDCG@25: {metrics["ndcg@25"].mean()}')
Example 8
    dataset, q, x = utils.load_benchmark(args.dataset, args.features)
    x = utils.load_features(x, chunks=(1000, 2048))
    q = utils.load_features(q, chunks=(1000, 2048))

    x /= da.sqrt((x**2).sum(axis=1, keepdims=True))
    q /= da.sqrt((q**2).sum(axis=1, keepdims=True))

    if args.rotate:
        R = np.load(args.rotate)
        q = q.dot(R.T)
        x = x.dot(R.T)
        x -= x.mean(axis=0)
    
    scores = q.dot(x.T)
    scores = utils.compute_if_dask(scores)
    dataset._load()
    
    mean_ap = dataset.score(scores)
    print(mean_ap)
    
    """ CONFIRMED THAT compute_ap WORKS
    eval_bin = 'eval_bin/compute_ap'

    aps = []
    for i, scores_i in enumerate(tqdm(scores)):
        tmp_rnk = f'tmp/{dataset.query_ids[i]}.rnk'
        rank = scores_i.argsort()[::-1]
        
        with open(tmp_rnk, 'w') as f:
            f.write('\n'.join(dataset.image_ids[rank]))