Example #1
if args.docs_small_random:
    print(f"Downsizing to {args.docs_small_random} docs")
    data = {
        "docs": random.sample(data["docs"], args.docs_small_random),
        "queries": []
    }

if args.center:
    print("Centering")
    data = center_data(data)

if args.std:
    print("Dividing by std")
    data = zscore_data(data, center=False)

if args.norm:
    print("Normalizing")
    data = norm_data(data)

if len(data["queries"]) != 0:
    print("First query element[:4]", data["queries"][0][:4], "Norm:",
          np.linalg.norm(data["queries"][0]))

print("Output shape", data["docs"][0].shape)
# pickle (protocol < 4) cannot serialize a single object over 4 GB,
# so store the embeddings as a list of per-item arrays instead of one large array
data["docs"] = list(data["docs"])
data["queries"] = list(data["queries"])

print("Saving")
save_pickle(args.data_out, data)
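
The helpers used above (center_data, zscore_data, norm_data, save_pickle) are defined elsewhere in the project; a minimal sketch of what they might look like, assuming docs and queries are NumPy vectors and that save_pickle simply wraps pickle.dump (protocol 4 and later lifts the 4 GB single-object limit mentioned in the comment):

import pickle
import numpy as np

def center_data(data):
    # hypothetical helper: subtract the per-dimension mean of the docs
    mean = np.mean(np.array(data["docs"]), axis=0)
    data["docs"] = [x - mean for x in data["docs"]]
    data["queries"] = [x - mean for x in data["queries"]]
    return data

def zscore_data(data, center=True):
    # hypothetical helper: divide by the per-dimension std, optionally centering first
    docs = np.array(data["docs"])
    mean = np.mean(docs, axis=0) if center else 0.0
    std = np.std(docs, axis=0)
    data["docs"] = [(x - mean) / std for x in data["docs"]]
    data["queries"] = [(x - mean) / std for x in data["queries"]]
    return data

def norm_data(data):
    # hypothetical helper: rescale every vector to unit L2 norm
    data["docs"] = [x / np.linalg.norm(x) for x in data["docs"]]
    data["queries"] = [x / np.linalg.norm(x) for x in data["queries"]]
    return data

def save_pickle(path, obj):
    # hypothetical helper: protocol 4+ can serialize objects larger than 4 GB
    with open(path, "wb") as f:
        pickle.dump(obj, f, protocol=4)
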
Example #2
print(
    sum([len(x) for x in data_relevancy]),
    "relevancies total",
    np.average([len(x) for x in data_relevancy]),
    "relevancies average",
    sum([len(x) for x in data_relevancy_articles]),
    "articles total",
    np.average([len(x) for x in data_relevancy_articles]),
    "articles average",
)
save_pickle(
    args.data_out, {
        "queries": data_query,
        "docs": data_docs,
        "relevancy": data_relevancy,
        "relevancy_articles": data_relevancy_articles,
        "docs_articles": data_docs_articles,
        "boundaries": {
            "train": query_train_max,
            "dev": query_dev_max,
            "test": query_test_max
        }
    })
 print("data_query[0]:")
 print(data_query[0])
 print("\ndata_docs[0]:")
 print(data_docs[0])
 print("\ndata_relevancy[0]:")
 print(data_relevancy[0])
 print("\ndata_relevancy_articles[0]:")
 print(data_relevancy_articles[0])
 print("\ndata_docs_articles[0]:")
Example #3
        "relevancy": data["relevancy"],
    }
    model = SimDistilModel(
        args.model,
        args.dimension,
        batchSize=args.batch_size,
        learningRate=args.learning_rate,
        dataOrganization=args.data_organization,
        merge=not args.not_merge,
        similarityModel=args.similarity_model,
        similarityGold=args.similarity_gold,
    )
    # model = SimDistillationFromAutoencoderModel(
    #     args.model, args.dimension,
    #     batchSize=args.batch_size,
    #     learningRate=args.learning_rate,
    # )
    print(model)
    model.trainModel(data, args.epochs, args.post_cn)
    model.train(False)  # switch to eval mode (disable dropout etc.) before encoding

    # encode data
    with torch.no_grad():
        encoded = {
            "queries": model.encode1(data["queries"]).cpu().numpy(),
            "docs": model.encode2(data["docs"]).cpu().numpy(),
            "relevancy": data["relevancy"],
        }
    report(f"Final:", encoded, data.cpu())
    save_pickle(encoded, args.data_out)
Example #4
def comp_acc(data, data_dev):
    val_acc = acc_ip(
        data_dev["queries"], data["docs"], data_dev["relevancy"], n=10
    )
    return val_acc
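
acc_ip is not shown in this snippet; a minimal sketch of a top-n inner-product retrieval accuracy, assuming queries and docs are NumPy arrays and relevancy maps each query index to a collection of relevant doc indices:

import numpy as np

def acc_ip(queries, docs, relevancy, n=10):
    # hypothetical implementation: fraction of queries whose top-n
    # inner-product neighbours contain at least one relevant doc
    queries = np.asarray(queries)
    docs = np.asarray(docs)
    scores = queries @ docs.T                   # (num_queries, num_docs)
    top_n = np.argsort(-scores, axis=1)[:, :n]  # highest-scoring docs first
    hits = [
        len(set(top_n[i]) & set(relevancy[i])) > 0
        for i in range(len(queries))
    ]
    return float(np.mean(hits))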


cur_time = time.time()
val_acc = comp_acc(data, data_dev)

logdata_all = []
logdata_all.append({"acc": val_acc})
traindata_all = []

for i in range(10):
    print(f"#### Pass {i+1}", flush=True)
    traindata, logitem = filter_step(data, data_dev, cur_time=cur_time)
    
    cur_time = time.time()

    val_acc = comp_acc(data, data_dev)
    logitem["acc"] = val_acc

    logdata_all.append(logitem)
    traindata_all.append(traindata)

    # continuously overwrite logfile
    save_json(args.logfile, logdata_all)
    save_pickle(args.traindata, traindata_all)
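
save_json is another project helper not shown here; a minimal sketch, assuming it mirrors save_pickle and rewrites the whole file so the latest state survives an interrupted run:

import json

def save_json(path, obj):
    # hypothetical helper: overwrite the log file with the full history each pass
    with open(path, "w") as f:
        json.dump(obj, f, indent=2)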