def summary_performance(dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    val_l2 = rprec_a_l2(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], data["relevancy_articles"], data["docs_articles"],
        fast=True, report=False)
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            dataReduced["queries"], dataReduced["docs"],
            data["relevancy"], data["relevancy_articles"], data["docs_articles"],
            fast=True, report=False)
    if not args.skip_loss:
        loss_q = sklearn.metrics.mean_squared_error(
            data["queries"], dataReconstructed["queries"])
        # loss of only the first 10k documents because it has to get copied
        loss_d = sklearn.metrics.mean_squared_error(
            data["docs"][:10000], dataReconstructed["docs"][:10000])
        return val_ip, val_l2, loss_q.item(), loss_d.item()
    else:
        return val_ip, val_l2, None, None
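# Hypothetical sketch (not part of the original source) of the center_data /
# norm_data helpers used above; the assumed behaviour is centering by the document
# mean and scaling every vector to unit L2 norm.
import numpy as np


def center_data(data):
    # assumed: shift queries and docs by the document mean
    mean = data["docs"].mean(axis=0)
    return {"queries": data["queries"] - mean, "docs": data["docs"] - mean}


def norm_data(data):
    # assumed: scale every vector to unit L2 norm
    return {
        k: v / np.linalg.norm(v, axis=1, keepdims=True)
        for k, v in data.items()
    }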
def summary_performance(dataReduced):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    val_l2 = rprec_a_l2(
        dataReduced["queries"],
        dataReduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    return val_ip, val_l2
def summary_performance(prefix, data_reduced, data, post_cn):
    if post_cn:
        data_reduced = center_data(data_reduced)
        data_reduced = norm_data(data_reduced)
    val_l2 = rprec_a_l2(
        data_reduced["queries"],
        data_reduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
        report=False,
    )
    if post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            data_reduced["queries"],
            data_reduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
            report=False,
        )
    print(f'{prefix} rprec_ip: {val_ip:.3f}, rprec_l2: {val_l2:.3f}')
    return val_ip, val_l2
def summary_performance(name, dataReduced, dataReconstructed):
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    val_ip = rprec_a_ip(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], data["relevancy_articles"], data["docs_articles"],
        fast=True)
    val_l2 = rprec_a_l2(
        dataReduced["queries"], dataReduced["docs"],
        data["relevancy"], data["relevancy_articles"], data["docs_articles"],
        fast=True)
    name = name.replace("float", "f")
    print(f"{name:<21} {val_ip:>5.3f} {val_l2:>5.3f}")
    return val_ip, val_l2
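# Hypothetical usage (not in the original source) of the printing variant above,
# assuming sklearn's PCA as the reduction model and the global `data` dict of
# query/document vectors.
from sklearn.decomposition import PCA

model = PCA(n_components=128).fit(data["docs"])
dataReduced = {
    "queries": model.transform(data["queries"]),
    "docs": model.transform(data["docs"]),
}
dataReconstructed = {
    "queries": model.inverse_transform(dataReduced["queries"]),
    "docs": model.inverse_transform(dataReduced["docs"]),
}
summary_performance("PCA-128 (float32)", dataReduced, dataReconstructed)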
def random_projection_performance(dim):
    model = DropRandomProjection()
    dataReduced = {
        "queries": model.transform(data["queries"], dim, IMPR_L2),
        "docs": model.transform(data["docs"], dim, IMPR_L2),
    }
    if args.post_cn:
        dataReduced = center_data(dataReduced)
        dataReduced = norm_data(dataReduced)
    # copy to make the arrays C-contiguous
    val_l2 = rprec_a_l2(
        dataReduced["queries"].copy(),
        dataReduced["docs"].copy(),
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if not args.post_cn:
        val_ip = rprec_a_ip(
            dataReduced["queries"].copy(),
            dataReduced["docs"].copy(),
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    else:
        val_ip = val_l2
    data_log.append({"del_dim": dim, "val_ip": val_ip, "val_l2": val_l2})
    # continuously overwrite the logfile
    with open(args.logfile, "w") as f:
        f.write(str(data_log))
    print(f"Delete {dim} dims: {val_l2:<8.5f}")
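# Hypothetical driver loop (not in the original source): sweep over the number of
# dropped dimensions and evaluate after each step; the 768-dimensional embedding
# size and the step of 32 are assumptions.
for dim in range(0, 768, 32):
    random_projection_performance(dim)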
def summary_performance(dataReduced, dataReconstructed):
    # when scaling was applied, the reconstructed data is not in the original space,
    # so undo the scaling (note the reverse order)
    if args.norm:
        dataReconstructed = norm_model.inverse_transform(dataReconstructed)
    if args.center:
        dataReconstructed = center_model.inverse_transform(dataReconstructed)
    if args.post_cn:
        dataReduced = CenterScaler().transform(dataReduced)
        dataReduced = NormScaler().transform(dataReduced)
    val_l2 = rprec_a_l2(
        dataReduced["queries"],
        dataReduced["docs"],
        data["relevancy"],
        data["relevancy_articles"],
        data["docs_articles"],
        fast=True,
    )
    if args.post_cn:
        val_ip = val_l2
    else:
        val_ip = rprec_a_ip(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            fast=True,
        )
    loss_q = sklearn.metrics.mean_squared_error(
        data_orig["queries"], dataReconstructed["queries"])
    # loss of only the first 10k documents because they have to be copied
    loss_d = sklearn.metrics.mean_squared_error(
        data_orig["docs"][:10000], dataReconstructed["docs"][:10000])
    return val_ip, val_l2, loss_q, loss_d
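# Hypothetical sketch (not in the original source) of the scaler interface assumed
# above: both scalers operate on {"queries": ..., "docs": ...} dicts and remember
# their statistics so that inverse_transform can undo the scaling for the
# reconstruction loss.
import numpy as np


class CenterScaler:
    def transform(self, data):
        # assumed: center both splits by the document mean
        self.mean = data["docs"].mean(axis=0)
        return {k: v - self.mean for k, v in data.items()}

    def inverse_transform(self, data):
        return {k: v + self.mean for k, v in data.items()}


class NormScaler:
    def transform(self, data):
        # assumed: scale every vector to unit L2 norm, remembering the norms
        self.norms = {
            k: np.linalg.norm(v, axis=1, keepdims=True) for k, v in data.items()
        }
        return {k: v / self.norms[k] for k, v in data.items()}

    def inverse_transform(self, data):
        return {k: v * self.norms[k] for k, v in data.items()}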
print("Fitting model") # dataNew = model.fit_transform(similarities) dataNew = model.fit_transform(dataNew) dataNew = { "docs": dataNew[:len(data["docs"])].copy(), "queries": dataNew[len(data["docs"]):].copy(), } print(len(dataNew["docs"])) print(len(dataNew["queries"])) val_ip_pca = rprec_a_ip(dataNew["queries"], dataNew["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"], fast=False) val_l2_pca = rprec_a_l2(dataNew["queries"], dataNew["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"], fast=False) print(f"ip: {val_ip_pca:.4f}, l2: {val_l2_pca:.4f} (MDS)") val_ip_pca = rprec_a_ip(data["queries"], data["docs"], data["relevancy"], data["relevancy_articles"], data["docs_articles"],
def random_projection_performance(components, model_name, runs=3):
    if model_name == "gauss":
        Model = GaussianRandomProjection
    elif model_name == "sparse":
        Model = SparseRandomProjection
    elif model_name == "crop":
        Model = CropRandomProjection
    else:
        raise Exception("Unknown model")

    random.seed(args.seed)
    vals_ip = []
    vals_l2 = []
    for i in range(runs):
        data = read_pickle(args.data)
        # take only dev queries
        data = sub_data(data, train=False, in_place=True)
        # make sure the vectors are np arrays
        data["queries"] = np.array(data["queries"])
        data["docs"] = np.array(data["docs"])

        model = Model(
            n_components=components,
            random_state=random.randint(0, 2**8 - 1))
        model.fit(data["docs"])
        dataReduced = {
            "queries": safe_transform(model, data["queries"]),
            "docs": safe_transform(model, data["docs"]),
        }
        del data["queries"]
        del data["docs"]

        if args.post_cn:
            dataReduced = center_data(dataReduced)
            dataReduced = norm_data(dataReduced)

        # copying to make the arrays C-contiguous is skipped here
        val_l2 = rprec_a_l2(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            report=False,
            fast=True,
        )
        vals_l2.append(val_l2)

        # if the vectors are normalized (post_cn), IP equals L2, so skip the IP computation
        if not args.post_cn:
            val_ip = rprec_a_ip(
                dataReduced["queries"],
                dataReduced["docs"],
                data["relevancy"],
                data["relevancy_articles"],
                data["docs_articles"],
                report=False,
                fast=True,
            )
            vals_ip.append(val_ip)
        else:
            vals_ip.append(val_l2)

    logdata.append({
        "dim": components,
        "vals_ip": vals_ip,
        "vals_l2": vals_l2,
        "model": model_name,
    })
    # continuously overwrite the logfile
    with open(args.logfile, "w") as f:
        f.write(str(logdata))
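# Hypothetical sketch (not in the original source) of the safe_transform helper used
# above; one plausible reading is that it densifies a possibly sparse projection
# output and returns a C-contiguous float32 array.
import numpy as np
import scipy.sparse


def safe_transform(model, vectors):
    out = model.transform(vectors)
    if scipy.sparse.issparse(out):
        out = out.toarray()
    return np.ascontiguousarray(out, dtype=np.float32)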