# --- Evaluation-script preamble: point at a finished experiment directory,
# --- recover its training options from the JSON log, and reload the corpus
# --- and saved embeddings.
# NOTE(review): fragment — `parser`, `corpora`, `logger`, `os` and `torch`
# are defined/imported outside the visible source.
parser.add_argument('--file', dest="file", type=str, default="RESULTS/football-5D-KMEANS-1/", help="embeddings location file")
args = parser.parse_args()

# Map of dataset name -> loader function; the name stored in the experiment
# log selects which loader is called below.
dataset_dict = {
    "karate": corpora.load_karate,
    "football": corpora.load_football,
    "flickr": corpora.load_flickr,
    "dblp": corpora.load_dblp,
    "books": corpora.load_books,
    "blogCatalog": corpora.load_blogCatalog,
    "polblog": corpora.load_polblogs,
    "adjnoun": corpora.load_adjnoun
}

# Re-open the experiment's log in "continue" mode to read back the options
# the embeddings were trained with.
log_in = logger.JSONLogger(os.path.join(args.file, "log.json"), mod="continue")
dataset_name = log_in["dataset"]
print(dataset_name)
n_gaussian = log_in["n_gaussian"]

# Abort early if the logged dataset name has no registered loader.
if (dataset_name not in dataset_dict):
    print("Dataset " + dataset_name + " does not exist, please select one of the following : ")
    print(list(dataset_dict.keys()))
    quit()

print("Loading Corpus ")
# D is the corpus object; presumably X/Y are features and ground-truth
# labels — TODO confirm against the loader implementations.
D, X, Y = dataset_dict[dataset_name]()
results = []
std_kmeans = []
# The saved file holds a tuple/list; element 0 is the embedding tensor.
representations = torch.load(os.path.join(args.file, "embeddings.t7"))[0]
if (args.init_beta < 0): args.init_beta = args.beta # set the seed for random sampling alpha, beta = args.init_alpha, args.init_beta print("Loading Corpus ") D, X, Y = dataset_dict[args.dataset]() print("Creating dataset") # index of examples dataset dataset_index = corpora_tools.from_indexable( torch.arange(0, len(D), 1).unsqueeze(-1)) print("Dataset Size -> ", len(D)) if (args.save): os.makedirs(os.path.join(saving_folder, args.id + "/"), exist_ok=True) logger_object = logger.JSONLogger( os.path.join(saving_folder, args.id + "/log.json")) logger_object.append(vars(args)) D.set_path(False) # negative sampling distribution frequency = D.getFrequency()**(3 / 4) frequency[:, 1] /= frequency[:, 1].sum() frequency = pytorch_categorical.Categorical(frequency[:, 1]) # random walk dataset d_rw = D.light_copy() rw_log = logger.JSONLogger("ressources/random_walk.conf", mod="continue") if (args.force_rw): key = args.dataset + "_" + str(args.context_size) + "_" + str( args.walk_lenght) + "_" + str(args.seed) if (key in rw_log):
print("Creating dataset") # index of examples dataset dataset_index = corpora_tools.from_indexable( torch.arange(0, len(D), 1).unsqueeze(-1)) print("Dataset Size -> ", len(D)) D.set_path(False) # negative sampling distribution frequency = D.getFrequency()**(3 / 4) frequency[:, 1] /= frequency[:, 1].sum() frequency = pytorch_categorical.Categorical(frequency[:, 1]) # random walk dataset d_rw = D.light_copy() rw_log = logger.JSONLogger("ressources/random_walk.conf", mod="continue") if (args.force_rw): key = args.dataset + "_" + str(args.context_size) + "_" + str( args.walk_lenght) + "_" + str(args.seed) if (key in rw_log): try: print('Loading random walks from files') d_rw = torch.load(rw_log[key]["file"]) print('Loaded') except: os.makedirs("/local/gerald/KMEANS_RESULTS/", exist_ok=True) d_rw.set_walk(args.walk_lenght, 1.0) d_rw.set_path(True) d_rw = corpora.ContextCorpus(d_rw, context_size=args.context_size,
# --- Fragment: tail of a dataset-loader map (its `dataset_dict = {` opening
# --- is outside this chunk), optimizer registry, option logging, and CLI
# --- validation.
"dblp": corpora.load_dblp,
    "books": corpora.load_books,
    "blogCatalog": corpora.load_blogCatalog
}

# Map of optimizer name -> Poincaré-ball optimizer class (classes defined in
# the external `optimizer` module).
optimizer_dict = {
    "addhsgd": optimizer.PoincareBallSGDAdd,
    "exphsgd": optimizer.PoincareBallSGDExp,
    "hsgd": optimizer.PoincareBallSGD,
    "exphsga": optimizer.PoincareBallSGAExp
}

# Persist the experiment options under RESULTS/<id>/log.json.
if (args.save):
    print("The following options are use for the current experiment ", args)
    os.makedirs("RESULTS/" + args.id + "/", exist_ok=True)
    logger_object = logger.JSONLogger("RESULTS/" + args.id + "/log.json")
    logger_object.append(vars(args))

# check if dataset exists
if (args.dataset not in dataset_dict):
    print("Dataset " + args.dataset + " does not exist, please select one of the following : ")
    print(list(dataset_dict.keys()))
    quit()

# Validate the requested embedding optimizer the same way.
if (args.embedding_optimizer not in optimizer_dict):
    print("Optimizer " + args.embedding_optimizer + " does not exist, please select one of the following : ")
    print(list(optimizer_dict.keys()))
    quit()