def eval_retrieval(scaler, model):
    # This loads the data from disk every time.
    # To speed this up, we could load it once and then "only" make copies.
    # The reason we can't reuse a single copy is that the model predictions can differ.
    data = read_pickle(args.data)
    data = sub_data(data, train=False, in_place=True)

    # keep only the documents the classifier predicts as relevant (label 1)
    prediction = model.predict(scaler.transform(data["docs"]))
    data = prune_docs(
        data, None,
        [i for i, _x in enumerate(data["docs"]) if prediction[i] == 1],
        verbose=False,
    )

    acc = acc_ip(data["queries"], data["docs"], data["relevancy"], n=10)
    return acc
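
# Hedged usage sketch (an assumption, not part of the original script): the
# caller is expected to fit a scaler over document embeddings and a binary
# keep/drop classifier, then hand both to eval_retrieval. StandardScaler,
# LogisticRegression, train_docs and train_labels below are illustrative
# stand-ins, not the pipeline's actual components.
#
#   from sklearn.preprocessing import StandardScaler
#   from sklearn.linear_model import LogisticRegression
#
#   scaler = StandardScaler().fit(train_docs)
#   clf = LogisticRegression().fit(scaler.transform(train_docs), train_labels)
#   print("acc_ip@10 after filtering:", eval_retrieval(scaler, clf))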
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, sub_data
import argparse

import numpy as np

parser = argparse.ArgumentParser()
parser.add_argument('--data')
parser.add_argument('--logfile-single', default="computed/tmp.log")
parser.add_argument('--post-cn', action="store_true")
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--dims', default="custom")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()

data = read_pickle(args.data)
# take only dev queries
data = sub_data(data, train=False, in_place=True)
# make sure the vectors are np arrays
data["queries"] = np.array(data["queries"])
data["docs"] = np.array(data["docs"])

# the single-run logfile stores a Python literal (a list of dicts), hence eval
with open(args.logfile_single, "r") as f:
    DATA_SINGLE = eval(f.read())

# baseline entry: the run without dimensionality reduction ("dim" is False)
DATA_BASE = [x for x in DATA_SINGLE if x["dim"] == False][0]
# dimensions ordered by their L2 retrieval performance (best first)
IMPR_L2 = [
    x["dim"]
    for x in sorted(DATA_SINGLE, key=lambda x: x["val_l2"], reverse=True)
]
print(
    "l2_impr count",
    len([x for x in DATA_SINGLE if x["val_l2"] >= DATA_BASE["val_l2"]]),
)
def random_projection_performance(components, model_name, runs=3):
    if model_name == "gauss":
        Model = GaussianRandomProjection
    elif model_name == "sparse":
        Model = SparseRandomProjection
    elif model_name == "crop":
        Model = CropRandomProjection
    else:
        raise Exception("Unknown model")

    random.seed(args.seed)
    vals_ip = []
    vals_l2 = []
    for i in range(runs):
        data = read_pickle(args.data)
        # take only dev queries
        data = sub_data(data, train=False, in_place=True)
        # make sure the vectors are np arrays
        data["queries"] = np.array(data["queries"])
        data["docs"] = np.array(data["docs"])

        model = Model(
            n_components=components,
            random_state=random.randint(0, 2**8 - 1),
        )
        model.fit(data["docs"])
        dataReduced = {
            "queries": safe_transform(model, data["queries"]),
            "docs": safe_transform(model, data["docs"]),
        }
        del data["queries"]
        del data["docs"]

        if args.post_cn:
            dataReduced = center_data(dataReduced)
            dataReduced = norm_data(dataReduced)

        # copy to make it C-contiguous
        # (skipped)

        val_l2 = rprec_a_l2(
            dataReduced["queries"],
            dataReduced["docs"],
            data["relevancy"],
            data["relevancy_articles"],
            data["docs_articles"],
            report=False,
            fast=True,
        )
        vals_l2.append(val_l2)

        # skip the IP computation when the vectors are centered and normalized:
        # for unit vectors, IP and L2 produce the same ranking
        if not args.post_cn:
            val_ip = rprec_a_ip(
                dataReduced["queries"],
                dataReduced["docs"],
                data["relevancy"],
                data["relevancy_articles"],
                data["docs_articles"],
                report=False,
                fast=True,
            )
            vals_ip.append(val_ip)
        else:
            vals_ip.append(val_l2)

    logdata.append({
        "dim": components,
        "vals_ip": vals_ip,
        "vals_l2": vals_l2,
        "model": model_name,
    })
    # continuously overwrite the logfile so partial results are not lost
    with open(args.logfile, "w") as f:
        f.write(str(logdata))
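
# Minimal driver sketch (assumed; the original loop is not shown in this
# excerpt): iterate over the requested dimensions and projection models and
# let random_projection_performance append into the module-level `logdata`.
# `process_dims` mirrors the helper the other scripts in this repo use to
# expand args.dims into a list of target dimensionalities.
#
#   logdata = []
#   for model_name in ["gauss", "sparse", "crop"]:
#       for dim in process_dims(args.dims):
#           random_projection_performance(int(dim), model_name)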
parser.add_argument('--logfile', default="computed/tmp.log")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()

torch.manual_seed(args.seed)

data = read_pickle(args.data)
if args.center:
    data = center_data(data)
if args.norm:
    data = norm_data(data)

print("Because args.data_small is not provided, I'm copying the whole structure")
data_train = copy.deepcopy(data)
data = sub_data(data, train=False, in_place=True)
data_train = sub_data(data_train, train=True, in_place=True)

DIMS = process_dims(args.dims)

logdata = []
# fail first
for dim in DIMS:
    for train_key in ["dq"]:
        dim = int(dim)
        # training
        train_time = time.time()
        model = AutoencoderModel(model=args.model, bottleneck_width=dim)
        model.train_routine(
            data,
parser.add_argument('--dims', default="custom")
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()

data = read_pickle(args.data)
if args.data_small is None:
    if args.center:
        data = center_data(data)
    if args.norm:
        data = norm_data(data)
    print(
        "Because args.data_small is not provided, I'm copying the whole structure"
    )
    data_small = copy.deepcopy(data)
    data = sub_data(data, train=False, in_place=True)
    data_small = sub_data(data_small, train=True, in_place=True)
else:
    data_small = read_pickle(args.data_small)
    if args.center:
        data = center_data(data)
        data_small = center_data(data_small)
    if args.norm:
        data = norm_data(data)
        data_small = norm_data(data_small)
    data = sub_data(data, train=False, in_place=True)
    data_small = sub_data(data_small, train=True, in_place=True)


def summary_performance(dataReduced, dataReconstructed):
    if args.post_cn:
import sys
sys.path.append("src")
from misc.load_utils import read_pickle, save_json, save_pickle, small_data, sub_data
from misc.retrieval_utils import retrieved_ip, acc_ip
from filtering_utils import filter_step
import argparse, json, pickle
import time

parser = argparse.ArgumentParser()
parser.add_argument('--data', default="/data/hp/dpr-c-pruned.embd_cn")
parser.add_argument('--logfile', default="computed/autofilter.json")
parser.add_argument('--traindata', default="computed/autofilter_traindata.pkl")
args = parser.parse_args()

data = read_pickle(args.data)
data = sub_data(data, train=False, in_place=True)
data_dev = read_pickle(args.data)
data_dev = sub_data(data_dev, train=False, in_place=True)

print(len(data["queries"]), "train queries")
print(len(data_dev["queries"]), "dev queries")
print(len(data["docs"]), "docs")
print(max([max(x) for x in data["relevancy"]]), "max train relevancy")
print(max([max(x) for x in data_dev["relevancy"]]), "max dev relevancy")
print("", flush=True)

# drop fields that are not needed for filtering to save memory
del data["relevancy_articles"]
del data["docs_articles"]
del data_dev["docs_articles"]
del data_dev["relevancy_articles"]
del data_dev["docs"]