Example #1
def summary_performance(prefix, data_reduced, data, post_cn):
    if post_cn:
        data_reduced = center_data(data_reduced)
        data_reduced = norm_data(data_reduced)

    val_l2 = rprec_a_l2(
        data_reduced["queries"],
        data_reduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True, report=False
    )
    if post_cn:
        # after centering + normalization, IP and L2 rankings coincide,
        # so the inner-product evaluation can reuse the L2 result
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            data_reduced["queries"],
            data_reduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True, report=False
        )
    print(f'{prefix} rprec_ip: {val_ip:.3f}, rprec_l2: {val_l2:.3f}')
    return val_ip, val_l2
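These snippets share a post-processing convention: when post_cn is set, the reduced vectors are centered and L2-normalized, and for unit vectors ||q - d||^2 = 2 - 2 (q . d), so inner-product and L2 rankings coincide and val_ip can simply reuse val_l2. The center_data and norm_data helpers are not shown in any of the examples; a minimal sketch of what they presumably do, assuming "queries" and "docs" are 2-D NumPy arrays of row vectors:

import numpy as np

def center_data(data):
    # assumption: shift both splits by the mean document vector
    mean = np.asarray(data["docs"]).mean(axis=0)
    data["queries"] = np.asarray(data["queries"]) - mean
    data["docs"] = np.asarray(data["docs"]) - mean
    return data

def norm_data(data):
    # assumption: rescale every vector to unit L2 norm
    for key in ("queries", "docs"):
        vecs = np.asarray(data[key])
        data[key] = vecs / np.linalg.norm(vecs, axis=1, keepdims=True)
    return data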
Example #2
def summary_performance(dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    val_l2 = rprec_a_l2(dataReduced["queries"],
                        dataReduced["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True,
                        report=False)
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(dataReduced["queries"],
                            dataReduced["docs"],
                            data["relevancy"],
                            data["relevancy_articles"],
                            data["docs_articles"],
                            fast=True,
                            report=False)

    if not args.skip_loss:
        loss_q = sklearn.metrics.mean_squared_error(
            data["queries"], dataReconstructed["queries"])
        # compute the loss only on the first 10k documents,
        # because the array has to be copied
        loss_d = sklearn.metrics.mean_squared_error(
            data["docs"][:10000], dataReconstructed["docs"][:10000])
        return val_ip, val_l2, loss_q.item(), loss_d.item()
    else:
        return val_ip, val_l2, None, None
Example #3
def summary_performance(dataReduced):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    val_l2 = rprec_a_l2(
        dataReduced["queries"],
        dataReduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    return val_ip, val_l2
Example #4
def report(prefix, encoded, data, post_cn):
    if post_cn:
        encoded = center_data(encoded)
        encoded = norm_data(encoded)
    val_l2 = rprec_l2(
        encoded["queries"], encoded["docs"],
        data["relevancy"], fast=True, report=False)
    if post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_ip(
            encoded["queries"], encoded["docs"],
            data["relevancy"], fast=True, report=False)
    print(f'{prefix} rprec_ip: {val_ip:.3f}, rprec_l2: {val_l2:.3f}')
    return val_ip, val_l2
Example #5
def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    val_ip = rprec_ip(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
    val_l2 = rprec_l2(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
    loss_q = torch.nn.MSELoss()(torch.Tensor(data["queries"]),
                                torch.Tensor(dataReconstructed["queries"]))
    loss_d = torch.nn.MSELoss()(torch.Tensor(data["docs"]),
                                torch.Tensor(dataReconstructed["docs"]))
    return val_ip, val_l2, loss_q.item(), loss_d.item()
Example #6
def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    val_ip = rprec_a_ip(dataReduced["queries"],
                        dataReduced["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True)
    val_l2 = rprec_a_l2(dataReduced["queries"],
                        dataReduced["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True)
    name = name.replace("float", "f")  # shorten dtype names so the columns align
    print(f"{name:<21} {val_ip:>5.3f} {val_l2:>5.3f}")
    return val_ip, val_l2
Example #7
def random_projection_performance(dim):
    model = DropRandomProjection()

    dataReduced = {
        "queries": model.transform(data["queries"], dim, IMPR_L2),
        "docs": model.transform(data["docs"], dim, IMPR_L2)
    }
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    # copy to make the arrays C-contiguous
    val_l2 = rprec_a_l2(
        dataReduced["queries"].copy(),
        dataReduced["docs"].copy(),
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if not args.post_cn:
        val_ip = rprec_a_ip(
            dataReduced["queries"].copy(),
            dataReduced["docs"].copy(),
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    else:
        val_ip = val_l2

    data_log.append({"del_dim": dim, "val_ip": val_ip, "val_l2": val_l2})

    # continuously overwrite the log file
    with open(args.logfile, "w") as f:
        f.write(str(data_log))

    print(f"Delete {dim} dims: {val_l2:<8.5f}")
Example #8
def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)

    val_ip = rprec_ip(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
    val_l2 = rprec_l2(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
    loss_q = torch.nn.MSELoss()(torch.Tensor(data["queries"]),
                                torch.Tensor(dataReconstructed["queries"]))
    loss_d = torch.nn.MSELoss()(torch.Tensor(data["docs"]),
                                torch.Tensor(dataReconstructed["docs"]))
    name = name.replace("float", "f")  # shorten dtype names so the columns align
    print(
        f"{name:<21} {loss_d:>7.5f} {loss_q:>7.5f} {val_ip:>5.3f} {val_l2:>5.3f}"
    )
    return val_ip, val_l2, loss_q.item(), loss_d.item()
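Instantiating torch.nn.MSELoss() inline works; the functional form is a more direct stylistic alternative (not what the source uses):

import torch
import torch.nn.functional as F

loss_q = F.mse_loss(
    torch.as_tensor(data["queries"], dtype=torch.float32),
    torch.as_tensor(dataReconstructed["queries"], dtype=torch.float32))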
Example #9
if args.docs_small_random:
    print(f"Downsizing to {args.docs_small_random} docs")
    data = {
        "docs": random.sample(data["docs"], args.docs_small_random),
        "queries": []
    }

if args.center:
    print("Centering")
    data = center_data(data)

if args.std:
    print("Dividing by std")
    data = zscore_data(data, center=False)

if args.norm:
    print("Normalizing")
    data = norm_data(data)

if len(data["queries"]) != 0:
    print("First query element[:4]", data["queries"][0][:4], "Norm:",
          np.linalg.norm(data["queries"][0]))

print("Output shape", data["docs"][0].shape)
# the pickler does not support serializing objects over 4 GB,
# so store the embeddings as lists of vectors
data["docs"] = [x for x in data["docs"]]
data["queries"] = [x for x in data["queries"]]

print("Saving")
save_pickle(args.data_out, data)
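read_pickle and save_pickle are project helpers whose implementation is not included here; a plausible sketch consistent with the call sites (note that pickle protocol 4+ lifts the 4 GB limit mentioned above):

import pickle

def save_pickle(path, obj):
    # the highest protocol handles objects larger than 4 GB
    with open(path, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def read_pickle(path):
    with open(path, "rb") as f:
        return pickle.load(f)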
Example #10
def random_projection_performance(components, model_name, runs=3):
    if model_name == "gauss":
        Model = GaussianRandomProjection
    elif model_name == "sparse":
        Model = SparseRandomProjection
    elif model_name == "crop":
        Model = CropRandomProjection
    else:
        raise ValueError(f"Unknown model: {model_name}")

    random.seed(args.seed)
    vals_ip = []
    vals_l2 = []
    for i in range(runs):
        data = read_pickle(args.data)
        # take only dev queries
        data = sub_data(data, train=False, in_place=True)
        # make sure the vectors are np arrays
        data["queries"] = np.array(data["queries"])
        data["docs"] = np.array(data["docs"])

        model = Model(n_components=components,
                      random_state=random.randint(0, 2**8 - 1))
        model.fit(data["docs"])

        dataReduced = {
            "queries": safe_transform(model, data["queries"]),
            "docs": safe_transform(model, data["docs"])
        }
        del data["queries"]
        del data["docs"]

        if args.post_cn:
            dataReduced = center_data(dataReduced)
            dataReduced = norm_data(dataReduced)

        # copying to make the arrays C-contiguous is skipped here
        val_l2 = rprec_a_l2(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            report=False,
            fast=True,
        )
        vals_l2.append(val_l2)

        # skip IP computation because the vectors are normalized
        if not args.post_cn:
            val_ip = rprec_a_ip(
                dataReduced["queries"],
                dataReduced["docs"],
                data["relevancy"],
                data["relevancy_articles"],
                data["docs_articles"],
                report=False,
                fast=True,
            )
            vals_ip.append(val_ip)
        else:
            vals_ip.append(val_l2)

    logdata.append({
        "dim": components,
        "vals_ip": vals_ip,
        "vals_l2": vals_l2,
        "model": model_name
    })

    # continuously overwrite the log file
    with open(args.logfile, "w") as f:
        f.write(str(logdata))
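Writing str(logdata) stores a Python literal that can be read back with ast.literal_eval; a JSON-based variant (an alternative sketch, not what the source does) would round-trip with the standard library and stay language-agnostic:

import json

# drop-in replacement for f.write(str(logdata))
with open(args.logfile, "w") as f:
    json.dump(logdata, f, indent=2)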
Example #11
parser.add_argument('--data-small', default=None)
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--post-cn', action="store_true")
parser.add_argument('--center', action="store_true")
parser.add_argument('--norm', action="store_true")
parser.add_argument('--skip-loss', action="store_true")
parser.add_argument('--dims', default="custom")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
data = read_pickle(args.data)

if args.data_small is None:
    if args.center:
        data = center_data(data)
    if args.norm:
        data = norm_data(data)
    print(
        "Because args.data_small is not provided, I'm copying the whole structure"
    )
    data_small = copy.deepcopy(data)

    data = sub_data(data, train=False, in_place=True)
    data_small = sub_data(data_small, train=True, in_place=True)
else:
    data_small = read_pickle(args.data_small)
    if args.center:
        data = center_data(data)
        data_small = center_data(data_small)
    if args.norm:
        data = norm_data(data)
        data_small = norm_data(data_small)
Example #12
print("Fitting model")
model.fit(data["docs"])
dataNew = {
    "docs": model.transform(data["docs"]),
    "queries": model.transform(data["queries"]),
}

val_ip_pca = rprec_a_ip(dataNew["queries"],
                        dataNew["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True)
val_l2_pca = rprec_a_l2(dataNew["queries"],
                        dataNew["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True)
print(f"ip: {val_ip_pca:.4f}, l2: {val_l2_pca:.4f}")

dataNew = center_data(dataNew)
dataNew = norm_data(dataNew)

val_ip_pca = rprec_a_ip(dataNew["queries"],
                        dataNew["docs"],
                        data["relevancy"],
                        data["relevancy_articles"],
                        data["docs_articles"],
                        fast=True)
print(f"ip: {val_ip_pca:.4f} (cn)")
Example #13
print("\nPerformance")
val_ip_pca = rprec_ip(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
val_l2_pca = rprec_l2(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
print(f"IP: {val_ip_pca}")
print(f"L2: {val_l2_pca}")

print("\nRenormalized performance")
dataReduced = center_data(dataReduced)
dataReduced = norm_data(dataReduced)
val_ip_pca = rprec_ip(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
val_l2_pca = rprec_l2(dataReduced["queries"],
                      dataReduced["docs"],
                      data["relevancy"],
                      fast=True)
print(f"IP: {val_ip_pca}")
print(f"L2: {val_l2_pca}")

print("\nOverlap of retrievals")
val_order_l2 = list(
    order_l2(dataReduced["queries"],
             dataReduced["docs"], [len(x) for x in data["relevancy"]],
Example #14
            metric_ip(data["queries"],
                      data["docs"],
                      data["relevancy"],
                      data["relevancy_articles"],
                      data["docs_articles"],
                      fast=True)))
    print(
        f"{args.metric}_l2:", "{:.4f}".format(
            metric_l2(data["queries"],
                      data["docs"],
                      data["relevancy"],
                      data["relevancy_articles"],
                      data["docs_articles"],
                      fast=True)))

    data = norm_data(data_b)
    print(
        f"{args.metric}_ip (norm):", "{:.4f}".format(
            metric_ip(data["queries"],
                      data["docs"],
                      data["relevancy"],
                      data["relevancy_articles"],
                      data["docs_articles"],
                      fast=True)))
    print(
        f"{args.metric}_l2 (norm):", "{:.4f}".format(
            metric_l2(data["queries"],
                      data["docs"],
                      data["relevancy"],
                      data["relevancy_articles"],
                      data["docs_articles"],