if args.docs_small_random:
    print(f"Downsizing to {args.docs_small_random} docs")
    data = {
        "docs": random.sample(data["docs"], args.docs_small_random),
        "queries": [],
    }
if args.center:
    print("Centering")
    data = center_data(data)
if args.std:
    print("Dividing by std")
    data = zscore_data(data, center=False)
if args.norm:
    print("Normalizing")
    data = norm_data(data)

if len(data["queries"]) != 0:
    print(
        "First query element[:4]", data["queries"][0][:4],
        "Norm:", np.linalg.norm(data["queries"][0]),
    )
print("Output shape", data["docs"][0].shape)

# pickler does not support serialization for objects over 4GB,
# so store docs and queries as plain lists of vectors
data["docs"] = [x for x in data["docs"]]
data["queries"] = [x for x in data["queries"]]

print("Saving")
save_pickle(args.data_out, data)
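# The helpers center_data / zscore_data / norm_data are defined elsewhere in the
# repository; the sketches below only illustrate the preprocessing they are
# assumed to perform (hypothetical *_sketch implementations, not the project's own code).
def center_data_sketch(data):
    # shift docs and queries by the document centroid
    center = np.mean(data["docs"], axis=0)
    return {
        "docs": [x - center for x in data["docs"]],
        "queries": [x - center for x in data["queries"]],
    }

def zscore_data_sketch(data, center=True):
    # divide every dimension by the per-dimension std of the docs,
    # optionally centering first
    if center:
        data = center_data_sketch(data)
    std = np.std(data["docs"], axis=0)
    return {
        "docs": [x / std for x in data["docs"]],
        "queries": [x / std for x in data["queries"]],
    }

def norm_data_sketch(data):
    # rescale every vector to unit L2 norm
    return {
        "docs": [x / np.linalg.norm(x) for x in data["docs"]],
        "queries": [x / np.linalg.norm(x) for x in data["queries"]],
    }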
    sum([len(x) for x in data_relevancy]), "relevancies total",
    np.average([len(x) for x in data_relevancy]), "relevancies average",
    sum([len(x) for x in data_relevancy_articles]), "articles total",
    np.average([len(x) for x in data_relevancy_articles]), "articles average",
)

save_pickle(
    args.data_out,
    {
        "queries": data_query,
        "docs": data_docs,
        "relevancy": data_relevancy,
        "relevancy_articles": data_relevancy_articles,
        "docs_articles": data_docs_articles,
        "boundaries": {
            "train": query_train_max,
            "dev": query_dev_max,
            "test": query_test_max,
        },
    },
)

print("data_query[0]:")
print(data_query[0])
print("\ndata_docs[0]:")
print(data_docs[0])
print("\ndata_relevancy[0]:")
print(data_relevancy[0])
print("\ndata_relevancy_articles[0]:")
print(data_relevancy_articles[0])
print("\ndata_docs_articles[0]:")
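# save_pickle is a small project-local I/O helper; a minimal sketch of its
# assumed behaviour (path first, object second, matching the calls above):
import pickle

def save_pickle_sketch(path, obj):
    # protocol 4 and above lifts the 4 GB per-object limit of older protocols
    with open(path, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle_sketch(path):
    with open(path, "rb") as f:
        return pickle.load(f)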
"relevancy": data["relevancy"], } model = SimDistilModel( args.model, args.dimension, batchSize=args.batch_size, learningRate=args.learning_rate, dataOrganization=args.data_organization, merge=not args.not_merge, similarityModel=args.similarity_model, similarityGold=args.similarity_gold, ) # model = SimDistillationFromAutoencoderModel( # args.model, args.dimension, # batchSize=args.batch_size, # learningRate=args.learning_rate, # ) print(model) model.trainModel(data, args.epochs, args.post_cn) model.train(False) # encode data with torch.no_grad(): encoded = { "queries": model.encode1(data["queries"]).cpu().numpy(), "docs": model.encode2(data["docs"]).cpu().numpy(), "relevancy": data["relevancy"], } report(f"Final:", encoded, data.cpu()) save_pickle(encoded, args.data_out)
def comp_acc(data, data_dev):
    val_acc = acc_ip(
        data_dev["queries"], data["docs"], data_dev["relevancy"],
        n=10,
    )
    return val_acc


cur_time = time.time()
val_acc = comp_acc(data, data_dev)
logdata_all = []
logdata_all.append({"acc": val_acc})
traindata_all = []

for i in range(10):
    print(f"#### Pass {i+1}", flush=True)
    traindata, logitem = filter_step(data, data_dev, cur_time=cur_time)
    cur_time = time.time()
    val_acc = comp_acc(data, data_dev)
    logitem["acc"] = val_acc
    logdata_all.append(logitem)
    traindata_all.append(traindata)

    # continuously overwrite logfile
    save_json(args.logfile, logdata_all)
    save_pickle(args.traindata, traindata_all)
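# acc_ip is assumed to be an inner-product retrieval accuracy: for each dev
# query, retrieve the n documents with the highest dot product and count the
# query as a hit if any of its relevant documents is among them. The sketch
# below (acc_ip_sketch) is an assumption about that metric, not the project's code.
def acc_ip_sketch(queries, docs, relevancy, n=10):
    docs = np.asarray(docs)
    hits = 0
    for query, relevant in zip(queries, relevancy):
        scores = docs @ np.asarray(query)         # inner-product similarity
        top_n = np.argsort(-scores)[:n]           # indices of the n best docs
        hits += bool(set(top_n) & set(relevant))  # hit if any relevant doc retrieved
    return hits / len(queries)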