def summary_performance(prefix, data_reduced, data, post_cn):
    if post_cn:
        data_reduced = center_data(data_reduced)
        data_reduced = norm_data(data_reduced)
    val_l2 = rprec_a_l2(
        data_reduced["queries"], data_reduced["docs"],
        data["relevancy"], data["relevancy_articles"],
        data["docs_articles"],
        fast=True, report=False,
    )
    if post_cn:
        # after centering and normalization, IP and L2 rankings coincide
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            data_reduced["queries"], data_reduced["docs"],
            data["relevancy"], data["relevancy_articles"],
            data["docs_articles"],
            fast=True, report=False,
        )
    print(f"{prefix} rprec_ip: {val_ip:.3f}, rprec_l2: {val_l2:.3f}")
    return val_ip, val_l2
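# center_data and norm_data are used throughout this section but not shown.
# A minimal sketch of what they are expected to do, assuming both operate on a
# dict with "queries" and "docs" matrices and that the document centroid is
# used for centering (an assumption, not confirmed by the original code):
import numpy as np

def center_data(data):
    # subtract the document centroid from both docs and queries
    center = np.mean(data["docs"], axis=0)
    return {
        "queries": data["queries"] - center,
        "docs": data["docs"] - center,
    }

def norm_data(data):
    # scale every vector to unit L2 norm so that IP and L2 retrieval agree
    return {
        "queries": data["queries"] / np.linalg.norm(data["queries"], axis=1, keepdims=True),
        "docs": data["docs"] / np.linalg.norm(data["docs"], axis=1, keepdims=True),
    }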
def summary_performance(dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    val_l2 = rprec_a_l2(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], data["relevancy_articles"],
        data["docs_articles"],
        fast=True, report=False,
    )
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            dataReduced["queries"], dataReduced["docs"],
            data["relevancy"], data["relevancy_articles"],
            data["docs_articles"],
            fast=True, report=False,
        )
    if not args.skip_loss:
        loss_q = sklearn.metrics.mean_squared_error(
            data["queries"], dataReconstructed["queries"])
        # compute the loss only on the first 10k documents, because the
        # full matrix would otherwise have to be copied
        loss_d = sklearn.metrics.mean_squared_error(
            data["docs"][:10000], dataReconstructed["docs"][:10000])
        return val_ip, val_l2, loss_q.item(), loss_d.item()
    else:
        return val_ip, val_l2, None, None
def summary_performance(dataReduced):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    val_l2 = rprec_a_l2(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            dataReduced["queries"], dataReduced["docs"],
            data["relevancy"], data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    return val_ip, val_l2
def report(prefix, encoded, data, post_cn):
    if post_cn:
        encoded = center_data(encoded)
        encoded = norm_data(encoded)
    val_l2 = rprec_l2(
        encoded["queries"], encoded["docs"], data["relevancy"],
        fast=True, report=False)
    if post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_ip(
            encoded["queries"], encoded["docs"], data["relevancy"],
            fast=True, report=False)
    print(f"{prefix} rprec_ip: {val_ip:.3f}, rprec_l2: {val_l2:.3f}")
    return val_ip, val_l2
def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    val_ip = rprec_ip(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], fast=True)
    val_l2 = rprec_l2(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], fast=True)
    # reconstruction losses of queries and documents
    loss_q = torch.nn.MSELoss()(
        torch.Tensor(data["queries"]),
        torch.Tensor(dataReconstructed["queries"]))
    loss_d = torch.nn.MSELoss()(
        torch.Tensor(data["docs"]),
        torch.Tensor(dataReconstructed["docs"]))
    return val_ip, val_l2, loss_q.item(), loss_d.item()
def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    val_ip = rprec_a_ip(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], data["relevancy_articles"],
        data["docs_articles"], fast=True)
    val_l2 = rprec_a_l2(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], data["relevancy_articles"],
        data["docs_articles"], fast=True)
    # shorten dtype names (e.g. "float32" -> "f32") so the printed table stays aligned
    name = name.replace("float", "f")
    print(f"{name:<21} {val_ip:>5.3f} {val_l2:>5.3f}")
    return val_ip, val_l2
def random_projection_performance(dim):
    model = DropRandomProjection()
    dataReduced = {
        "queries": model.transform(data["queries"], dim, IMPR_L2),
        "docs": model.transform(data["docs"], dim, IMPR_L2),
    }
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    # copy to make the arrays C-contiguous
    val_l2 = rprec_a_l2(
        dataReduced["queries"].copy(), dataReduced["docs"].copy(),
        data["relevancy"], data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if not args.post_cn:
        val_ip = rprec_a_ip(
            dataReduced["queries"].copy(), dataReduced["docs"].copy(),
            data["relevancy"], data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    else:
        # after center+norm, IP and L2 rankings coincide
        val_ip = val_l2
    data_log.append({"del_dim": dim, "val_ip": val_ip, "val_l2": val_l2})
    # continuously overwrite the log file so partial results survive a crash
    with open(args.logfile, "w") as f:
        f.write(str(data_log))
    print(f"Delete {dim} dims: {val_l2:<8.5f}")
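# DropRandomProjection and IMPR_L2 are not defined in this excerpt. A plausible
# sketch, assuming the "projection" simply deletes `dim` random coordinates and
# keeps the rest (which would match the "Delete {dim} dims" log line above);
# IMPR_L2 is treated as an opaque mode flag here:
import numpy as np

class DropRandomProjection:
    def __init__(self):
        self.keep = None

    def transform(self, vectors, dim, mode):
        vectors = np.asarray(vectors)
        if self.keep is None:
            # fix the dropped coordinates on the first call so that queries
            # and docs are reduced consistently
            drop = np.random.choice(vectors.shape[1], size=dim, replace=False)
            self.keep = np.setdiff1d(np.arange(vectors.shape[1]), drop)
        return vectors[:, self.keep]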
def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    val_ip = rprec_ip(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], fast=True)
    val_l2 = rprec_l2(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], fast=True)
    loss_q = torch.nn.MSELoss()(
        torch.Tensor(data["queries"]),
        torch.Tensor(dataReconstructed["queries"]))
    loss_d = torch.nn.MSELoss()(
        torch.Tensor(data["docs"]),
        torch.Tensor(dataReconstructed["docs"]))
    # shorten dtype names (e.g. "float32" -> "f32") so the printed table stays aligned
    name = name.replace("float", "f")
    print(
        f"{name:<21} {loss_d:>7.5f} {loss_q:>7.5f} {val_ip:>5.3f} {val_l2:>5.3f}"
    )
    return val_ip, val_l2, loss_q.item(), loss_d.item()
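# A hedged usage sketch for the variant above, assuming a scikit-learn style
# reducer; the choice of PCA and the target dimension (128) are illustrative,
# not taken from the original code:
from sklearn.decomposition import PCA

model = PCA(n_components=128, random_state=args.seed)
model.fit(data["docs"])
dataReduced = {
    "queries": model.transform(data["queries"]),
    "docs": model.transform(data["docs"]),
}
dataReconstructed = {
    "queries": model.inverse_transform(dataReduced["queries"]),
    "docs": model.inverse_transform(dataReduced["docs"]),
}
summary_performance("PCA-float32", dataReduced, dataReconstructed)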
if args.docs_small_random:
    print(f"Downsizing to {args.docs_small_random} docs")
    data = {
        "docs": random.sample(data["docs"], args.docs_small_random),
        "queries": [],
    }
if args.center:
    print("Centering")
    data = center_data(data)
if args.std:
    print("Dividing by std")
    data = zscore_data(data, center=False)
if args.norm:
    print("Normalizing")
    data = norm_data(data)
if len(data["queries"]) != 0:
    print(
        "First query element[:4]", data["queries"][0][:4],
        "Norm:", np.linalg.norm(data["queries"][0]))
print("Output shape", data["docs"][0].shape)
# the pickler does not support serializing objects over 4 GB,
# so store the matrices as lists of row vectors
data["docs"] = [x for x in data["docs"]]
data["queries"] = [x for x in data["queries"]]
print("Saving")
save_pickle(args.data_out, data)
def random_projection_performance(components, model_name, runs=3):
    if model_name == "gauss":
        Model = GaussianRandomProjection
    elif model_name == "sparse":
        Model = SparseRandomProjection
    elif model_name == "crop":
        Model = CropRandomProjection
    else:
        raise Exception("Unknown model")
    random.seed(args.seed)
    vals_ip = []
    vals_l2 = []
    for i in range(runs):
        data = read_pickle(args.data)
        # take only dev queries
        data = sub_data(data, train=False, in_place=True)
        # make sure the vectors are np arrays
        data["queries"] = np.array(data["queries"])
        data["docs"] = np.array(data["docs"])
        model = Model(
            n_components=components,
            random_state=random.randint(0, 2**8 - 1))
        model.fit(data["docs"])
        dataReduced = {
            "queries": safe_transform(model, data["queries"]),
            "docs": safe_transform(model, data["docs"]),
        }
        # free the original vectors; only the relevancy structures are
        # needed from here on
        del data["queries"]
        del data["docs"]
        if args.post_cn:
            dataReduced = center_data(dataReduced)
            dataReduced = norm_data(dataReduced)
        # (copying to make the arrays C-contiguous is skipped here)
        val_l2 = rprec_a_l2(
            dataReduced["queries"], dataReduced["docs"],
            data["relevancy"], data["relevancy_articles"],
            data["docs_articles"],
            report=False, fast=True,
        )
        vals_l2.append(val_l2)
        # skip the IP computation when the vectors are normalized,
        # because it would yield the same ranking as L2
        if not args.post_cn:
            val_ip = rprec_a_ip(
                dataReduced["queries"], dataReduced["docs"],
                data["relevancy"], data["relevancy_articles"],
                data["docs_articles"],
                report=False, fast=True,
            )
            vals_ip.append(val_ip)
        else:
            vals_ip.append(val_l2)
    logdata.append({
        "dim": components,
        "vals_ip": vals_ip,
        "vals_l2": vals_l2,
        "model": model_name,
    })
    # continuously overwrite the log file so partial results survive a crash
    with open(args.logfile, "w") as f:
        f.write(str(logdata))
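# A possible driver for the function above; the model list matches the
# dispatch at the top of the function, but the dimension grid is an
# assumption, not taken from the original code:
logdata = []
for model_name in ["gauss", "sparse", "crop"]:
    for components in [32, 64, 128, 256, 512]:
        random_projection_performance(components, model_name)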
parser.add_argument('--data-small', default=None)
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--post-cn', action="store_true")
parser.add_argument('--center', action="store_true")
parser.add_argument('--norm', action="store_true")
parser.add_argument('--skip-loss', action="store_true")
parser.add_argument('--dims', default="custom")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()

data = read_pickle(args.data)
if args.data_small is None:
    if args.center:
        data = center_data(data)
    if args.norm:
        data = norm_data(data)
    print("Because args.data_small is not provided, I'm copying the whole structure")
    data_small = copy.deepcopy(data)
    data = sub_data(data, train=False, in_place=True)
    data_small = sub_data(data_small, train=True, in_place=True)
else:
    data_small = read_pickle(args.data_small)
    if args.center:
        data = center_data(data)
        data_small = center_data(data_small)
    if args.norm:
        data = norm_data(data)
        data_small = norm_data(data_small)
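# Example invocation under the flags above; the script name and data path are
# illustrative only (the --data argument itself is parsed before this excerpt):
#   python3 reduce_dim.py --data computed/embeddings.pkl --center --norm --post-cn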
print("Fitting model") model.fit(data["docs"]) dataNew = { "docs": model.transform(data["docs"]), "queries": model.transform(data["queries"]), } val_ip_pca = rprec_a_ip(dataNew["queries"], dataNew["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"], fast=True) val_l2_pca = rprec_a_l2(dataNew["queries"], dataNew["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"], fast=True) print(f"ip: {val_ip_pca:.4f}, l2: {val_l2_pca:.4f}") dataNew = center_data(dataNew) dataNew = norm_data(dataNew) val_ip_pca = rprec_a_ip(dataNew["queries"], dataNew["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"], fast=True) print(f"ip: {val_ip_pca:.4f} (cn)")
print("\nPerformance") val_ip_pca = rprec_ip(dataReduced["queries"], dataReduced["docs"], data["relevancy"], fast=True) val_l2_pca = rprec_l2(dataReduced["queries"], dataReduced["docs"], data["relevancy"], fast=True) print(f"IP: {val_ip_pca}") print(f"L2: {val_l2_pca}") print("\nRenormalized performance") dataReduced = center_data(dataReduced) dataReduced = norm_data(dataReduced) val_ip_pca = rprec_ip(dataReduced["queries"], dataReduced["docs"], data["relevancy"], fast=True) val_l2_pca = rprec_l2(dataReduced["queries"], dataReduced["docs"], data["relevancy"], fast=True) print(f"IP: {val_ip_pca}") print(f"L2: {val_l2_pca}") print("\nOverlap of retrievals") val_order_l2 = list( order_l2(dataReduced["queries"], dataReduced["docs"], [len(x) for x in data["relevancy"]],
metric_ip(data["queries"], data["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"], fast=True))) print( f"{args.metric}_l2:", "{:.4f}".format( metric_l2(data["queries"], data["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"], fast=True))) data = norm_data(data_b) print( f"{args.metric}_ip (norm):", "{:.4f}".format( metric_ip(data["queries"], data["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"], fast=True))) print( f"{args.metric}_l2 (norm):", "{:.4f}".format( metric_l2(data["queries"], data["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"],