Esempio n. 1
0
def plot_tradeoffs(db, allres, allstats, code_size, recall_rank):
    stat0 = next(iter(allstats.values()))
    d = stat0["d"]
    n_threads = stat0["n_threads"]
    recall_idx = stat0["ranks"].index(recall_rank)
    # times come after the perf measure
    times_idx = len(stat0["ranks"])

    if type(code_size) == int:
        if code_size == 0:
            code_size = [0, 1e50]
            code_size_name = "any code size"
        else:
            code_size_name = "code_size=%d" % code_size
            code_size = [code_size, code_size]
    elif type(code_size) == tuple:
        code_size_name = "code_size in [%d, %d]" % code_size
    else:
        assert False

    names_maxperf = []

    for k in sorted(allres):
        v = allres[k]
        if v.ndim != 2: continue
        us = unitsize(d, k)
        if not code_size[0] <= us <= code_size[1]: continue
        names_maxperf.append((v[-1, recall_idx], k))

    # sort from lowest to highest topline accuracy
    names_maxperf.sort()
    names = [name for mp, name in names_maxperf]

    selected_methods, optimal_points =  \
        extract_pareto_optimal(allres, names, recall_idx, times_idx)

    not_selected = list(set(names) - set(selected_methods))

    print("methods without an optimal OP: ", not_selected)

    pyplot.title('database ' + db + ' ' + code_size_name)

    # grayed out lines

    for k in not_selected:
        v = allres[k]
        if v.ndim != 2: continue
        us = unitsize(d, k)
        if not code_size[0] <= us <= code_size[1]: continue

        linestyle = (':' if 'PQ' in k else
                     '-.' if 'SQ4' in k else '--' if 'SQ8' in k else '-')

        pyplot.semilogy(v[:, recall_idx],
                        1000 / v[:, times_idx],
                        label=None,
                        linestyle=linestyle,
                        marker='o' if 'HNSW' in k else '+',
                        color='#cccccc',
                        linewidth=0.2)

    plot_subset(allres, allstats, selected_methods, recall_idx, times_idx)

    if len(not_selected) == 0:
        om = ''
    else:
        om = '\nomitted:'
        nc = len(om)
        for m in not_selected:
            if nc > 80:
                om += '\n'
                nc = 0
            om += ' ' + m
            nc += len(m) + 1

    # pyplot.semilogy(optimal_points[1, :], optimal_points[2, :], marker="s")
    # print(optimal_points[0, :])
    pyplot.xlabel('1-recall at %d %s' % (recall_rank, om))
    pyplot.ylabel('QPS (%d threads)' % n_threads)
    pyplot.legend()
    pyplot.grid()
    return selected_methods, not_selected
Esempio n. 2
0
def plot_subset(allres,
                allstats,
                selected_methods,
                recall_idx,
                times_idx=3,
                report=["overhead", "build time"]):

    # important methods
    for k in selected_methods:
        v = allres[k]

        stats = allstats[k]
        d = stats["d"]
        dbsize = stats["nb"]
        if "index_size" in stats and "tables_size" in stats:
            tot_size = stats['index_size'] + stats['tables_size']
        else:
            tot_size = -1
        id_size = 8  # 64 bit

        addt = ''
        if 'add_time' in stats:
            add_time = stats['add_time']
            if add_time > 7200:
                add_min = add_time / 60
                addt = ', %dh%02d' % (add_min / 60, add_min % 60)
            else:
                add_sec = int(add_time)
                addt = ', %dm%02d' % (add_sec / 60, add_sec % 60)

        code_size = unitsize(d, k)

        label = k

        if "code_size" in report:
            label += " %d bytes" % code_size

        tight_size = (code_size + id_size) * dbsize

        if tot_size < 0 or "overhead" not in report:
            pass  # don't know what the index size is
        elif tot_size > 10 * tight_size:
            label += " overhead x%.1f" % (tot_size / tight_size)
        else:
            label += " overhead+%.1f%%" % (tot_size / tight_size * 100 - 100)

        if "build time" in report:
            label += " " + addt

        linestyle = (':' if 'Refine' in k or 'RFlat' in k else
                     '-.' if 'SQ' in k else '-' if '4fs' in k else '-')
        print(k, linestyle)
        pyplot.semilogy(v[:, recall_idx],
                        1000 / v[:, times_idx],
                        label=label,
                        linestyle=linestyle,
                        marker='o' if '4fs' in k else '+')

    recall_rank = stats["ranks"][recall_idx]
    if stats["measure"] == "recall":
        pyplot.xlabel('1-recall at %d' % recall_rank)
    elif stats["measure"] == "inter":
        pyplot.xlabel('inter @ %d' % recall_rank)
    else:
        assert False
    pyplot.ylabel('QPS (%d threads)' % stats["n_threads"])