def main():
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Use double precision by default.
    torch.set_default_dtype(torch.float64)

    for ds_dir in fullpath_list(args.root_dir):
        ds_name = os.path.basename(ds_dir)
        if args.datasets and ds_name not in args.datasets:
            continue

        # Load the dataset graph in order to compute F1 scores.
        _, g = load_graph_pdists(
                os.path.join('../data', ds_name + '.edges.gz'),
                cache_dir='.cached_pdists')
        n_nodes = g.number_of_nodes()
        with Timer('constructing FastPrecision'):
            fp = FastPrecision(g)
        nodes_per_layer = fp.nodes_per_layer()[1:]
        nodes_per_layer = nodes_per_layer / np.sum(nodes_per_layer)

        for flipp_dir in fullpath_list(ds_dir):
            flipp = os.path.basename(flipp_dir).split('_')[1]
            if args.flip_probabilities and flipp not in args.flip_probabilities:
                continue

            for loss_fn_dir in fullpath_list(flipp_dir):
                loss_fn_str = os.path.basename(loss_fn_dir)
                if args.loss_fns and loss_fn_str not in args.loss_fns:
                    continue

                # Create one plot per (dataset, flipp, loss_fn) combination.
                width = 6 if args.leftmost or args.rightmost else 5
                fig, ax = plt.subplots(figsize=(width, 5))

                plot_id = 0
                for man_dir in fullpath_list(loss_fn_dir):
                    man_name = os.path.basename(man_dir)
                    if args.manifolds and man_name not in args.manifolds:
                        continue
                    factor_names = manifold_factors_from_path_label(man_name)
                    man_factors = build_manifold(*factor_names)
                    man_label = manifold_label_for_display(*factor_names)
                    dim = sum(m.dim for m in man_factors)
                    if args.dims and dim not in args.dims:
                        continue

                    # Compute the metric.
                    means, stds = comp_metric(ds_name, n_nodes, fp, flipp,
                                              loss_fn_str, man_dir,
                                              man_factors, man_label)
                    if means is None:
                        continue

                    # Add it to the plot.
                    plot_f1_scores(ax, means, stds, plot_id, label=man_label)
                    plot_id += 1

                # Save the figure.
                configure_and_save_plots(ax, fig, ds_name, flipp, loss_fn_str,
                                         nodes_per_layer)
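# The directory traversal above assumes a helper like the following: yield the
# full paths of a directory's entries, so the nested loops walk the
# root_dir/<dataset>/flipp_<p>/<loss_fn>/<manifold> hierarchy. A hypothetical
# sketch, not necessarily the actual implementation:
def fullpath_list(dir_path):
    for entry in sorted(os.listdir(dir_path)):
        yield os.path.join(dir_path, entry)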
def _load(flipp):
    # Note: ds_path and args are taken from the enclosing scope.
    gpdists, g = load_graph_pdists(ds_path,
                                   cache_dir='.cached_pdists',
                                   flip_probability=flipp)
    assert args.no_fp or g is not None
    n_nodes = nnm1d2_to_n(len(gpdists))
    ds = GraphDataset(gpdists if n_nodes < 5000 else gpdists.to('cpu'))
    fp = None if args.no_fp else FastPrecision(g)
    return g, ds, fp
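# nnm1d2_to_n presumably inverts m = n * (n - 1) / 2 to recover the number of
# nodes n from the length of the condensed pairwise-distance vector, i.e.
# n = (1 + sqrt(1 + 8 * m)) / 2. A minimal sketch under that assumption:
def nnm1d2_to_n(num_pdists):
    n = (1 + math.isqrt(1 + 8 * num_pdists)) // 2
    assert n * (n - 1) // 2 == num_pdists, 'not a valid condensed length'
    return n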
def main():
    if args.verbose:
        logging.getLogger().setLevel(logging.INFO)

    # Default torch settings.
    torch.set_default_dtype(torch.float64)
    if torch.cuda.is_available():
        torch.set_default_tensor_type(torch.cuda.DoubleTensor)

    with ThreadPoolExecutor(max_workers=2) as tpool:
        for ds_path in args.datasets:
            ds_name = os.path.basename(ds_path).split('.')[0]

            # Load the graph.
            gpdists, g = load_graph_pdists(ds_path,
                                           cache_dir='.cached_pdists')
            n_nodes = g.number_of_nodes()
            ds = GraphDataset(gpdists if n_nodes < 5000 else gpdists.to('cpu'))
            fp = FastPrecision(g)

            for loss_fn_str in args.loss_fns:
                loss_fn, alpha = build_loss_fn(loss_fn_str)

                # Run the Euclidean baseline.
                if args.run_baseline:
                    for run_id in range(args.n_runs):
                        # Set the random seeds.
                        set_seeds(run_id)
                        # Create the output directory.
                        output_dir = make_exp_dir(args.save_dir, ds_name,
                                                  loss_fn_str, 'baseline',
                                                  str(run_id))
                        # Run the experiment.
                        exp_run_eucl(ds_name, ds, loss_fn, alpha, tpool, fp,
                                     output_dir)

                # Add the curvature regularizer if needed.
                # TODO(ccruceru): Put this into the naming of the loss function.
                if args.lambda_reg is not None:
                    loss_fn = Sum(loss_fn,
                                  CurvatureRegularizer(g, args.lambda_reg))

                # Run the products.
                for n_fact in args.factors:
                    for run_id in range(args.n_runs):
                        # Set the random seeds.
                        set_seeds(run_id)
                        # Create the output directory.
                        output_dir = make_exp_dir(args.save_dir, ds_name,
                                                  loss_fn_str, str(n_fact),
                                                  str(run_id))
                        # Run the experiment.
                        exp_run(ds_name, ds, loss_fn, alpha, n_fact, tpool,
                                fp, output_dir)
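# set_seeds is assumed to do the usual three-way seeding so that runs are
# reproducible; a minimal sketch under that assumption:
def set_seeds(seed):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)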
def main():
    args = parse_args()

    # Fix the random seeds.
    set_seeds(args.random_seed)

    # Default torch settings.
    torch.set_default_dtype(torch.float64)
    if torch.cuda.is_available():
        torch.set_default_tensor_type(torch.cuda.DoubleTensor)

    # Load the data.
    gpdists, g = load_graph_pdists(args.input_graph,
                                   cache_dir='.cached_pdists')
    n_nodes = g.number_of_nodes()
    ds = GraphDataset(gpdists)
    fp = FastPrecision(g)

    # Run hyp2.
    hyp = Lorentz(3)
    emb = ManifoldEmbedding(n_nodes, [hyp] * args.n_factors)
    for i in range(args.n_factors):
        emb.scales[i] = torch.nn.Parameter(torch.tensor(2.0))
    man_name = '_'.join('hyp2' for _ in range(args.n_factors))
    save_dir = os.path.join(args.save_dir, man_name)
    if args.hyp_snapshot or args.hyp_pretrained:
        logging.info('Loading embedding for %s', man_name)
        load_embedding(emb, save_dir)
    if not args.hyp_pretrained:
        train(ds, fp, emb, args.n_epochs, save_dir)

    # Map it to SPD.
    spd = SPD(2 * args.n_factors)
    spd_emb = ManifoldEmbedding(n_nodes, [spd])
    save_dir = os.path.join(args.save_dir, 'spd{}'.format(spd.dim))
    if args.spd_snapshot:
        logging.info('Loading embedding for SPD%d', spd.dim)
        load_embedding(spd_emb, save_dir)
    else:
        with torch.no_grad():
            spd_emb.xs[0] = ManifoldParameter(block_diag([
                    h2_to_sspd2(emb.xs[i].mul(math.sqrt(2)))
                    for i in range(args.n_factors)
            ]), manifold=spd)
        # Sanity check: the mapping must preserve the pairwise distances.
        hyp_dists = emb.to('cpu').compute_dists(None)
        spd_dists = spd_emb.compute_dists(None).to('cpu')
        assert torch.allclose(hyp_dists, spd_dists, atol=1e-4)

    # Run spd2.
    train(ds, fp, spd_emb, args.n_epochs, save_dir, args.n_epochs)
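# block_diag is assumed to assemble, per node, one (2k x 2k) block-diagonal
# matrix from the k per-factor (2 x 2) SPD matrices, embedding a product of k
# copies of H^2 into SPD(2k). A hypothetical batched sketch (torch.block_diag
# itself does not broadcast over the leading node axis):
def block_diag(mats):
    n_points = mats[0].shape[0]
    dims = [m.shape[-1] for m in mats]
    total = sum(dims)
    out = mats[0].new_zeros(n_points, total, total)
    offset = 0
    for mat, d in zip(mats, dims):
        out[:, offset:offset + d, offset:offset + d] = mat
        offset += d
    return out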
def main():
    torch.set_default_dtype(torch.float64)

    csv_file = open(args.results_file, 'w')
    csv_writer = csv.DictWriter(csv_file, delimiter=';', fieldnames=HEADERS)
    csv_writer.writeheader()

    for ds_dir in fullpath_list(args.root_dir):
        ds_name = os.path.basename(ds_dir)
        if args.datasets and ds_name not in args.datasets:
            continue

        # Load the dataset graph in order to compute F1 scores.
        gpdists, g = load_graph_pdists(
                os.path.join('../data', ds_name + '.edges.gz'),
                cache_dir='.cached_pdists')
        gpdists.div_(gpdists.max())  # Normalization matters for distortion!
        with Timer('constructing FastPrecision'):
            fp = FastPrecision(g)

        for flipp_dir in fullpath_list(ds_dir):
            flipp = os.path.basename(flipp_dir).split('_')[1]
            for loss_fn_dir in fullpath_list(flipp_dir):
                loss_fn_str = os.path.basename(loss_fn_dir)
                for man_dir in fullpath_list(loss_fn_dir):
                    man_name = os.path.basename(man_dir)
                    factor_names = manifold_factors_from_path_label(man_name)
                    man_factors = build_manifold(*factor_names)
                    man_label = manifold_label_for_display(*factor_names)

                    def add_dimensions(partial_entry):
                        partial_entry.update({
                                'dataset': ds_name,
                                'flip_probability': flipp,
                                'loss_fn': loss_fn_str,
                                'manifold': man_label,
                                'dim': sum(m.dim for m in man_factors),
                        })  # yapf: disable
                        return partial_entry

                    # Note: this is OK w.r.t. flipp because when it is 0 there
                    # is only an 'orig' directory.
                    try:
                        partial_entry = process_exp_dir(
                                man_dir, man_factors, gpdists, fp)
                    except Exception as e:
                        logging.error('Failed to run for (%s): %s', man_dir,
                                      str(e))
                        continue
                    entry = add_dimensions(partial_entry)
                    csv_writer.writerow(entry)

    csv_file.close()
def main():
    torch.set_default_dtype(torch.float64)

    csv_file = open('results.csv', 'w')
    csv_writer = csv.DictWriter(csv_file, delimiter=';', fieldnames=HEADERS)
    csv_writer.writeheader()

    for ds_dir in fullpath_list(args.root_dir):
        ds_name = os.path.basename(ds_dir)
        if args.datasets and ds_name not in args.datasets:
            continue

        # Load the dataset pairwise dissimilarities (the graph itself is not
        # needed here).
        gpdists, _ = load_graph_pdists(
                os.path.join('../data/dissimilarities', ds_name + '.npy'))
        gpdists.div_(gpdists.max())  # Normalization matters for distortion!

        flipp_dir = os.path.join(ds_dir, 'flipp_0.0000')
        for loss_fn_dir in fullpath_list(flipp_dir):
            loss_fn_str = os.path.basename(loss_fn_dir)
            for man_dir in fullpath_list(loss_fn_dir):
                man_name = os.path.basename(man_dir)
                factor_names = manifold_factors_from_path_label(man_name)
                man_factors = build_manifold(*factor_names)
                man_label = manifold_label_for_display(*factor_names)

                def add_dimensions(partial_entry):
                    partial_entry.update({
                            'dataset': ds_name,
                            'loss_fn': loss_fn_str,
                            'manifold': man_label,
                            'dim': sum(m.dim for m in man_factors),
                    })  # yapf: disable
                    return partial_entry

                partial_entry = process_exp_dir(os.path.join(man_dir, 'orig'),
                                                man_factors, gpdists)
                entry = add_dimensions(partial_entry)
                csv_writer.writerow(entry)

    csv_file.close()
def main():
    args = parse_args()

    # Fix the random seeds.
    set_seeds(args.random_seed)

    # Default torch settings.
    torch.set_default_dtype(torch.float64)
    if torch.cuda.is_available():
        torch.set_default_tensor_type(torch.cuda.DoubleTensor)

    # Load the data.
    gpdists, g = load_graph_pdists(args.input_graph,
                                   cache_dir='.cached_pdists')
    n_nodes = g.number_of_nodes()
    ds = GraphDataset(gpdists)
    fp = FastPrecision(g)

    # Run hyp2.
    emb = ManifoldEmbedding(n_nodes, [Lorentz(3)])
    path = os.path.join(args.save_dir, 'hyp2')
    train(ds, fp, emb, args.n_epochs, path)
    curvature_sq = 1 / emb.scales[0]

    # Map it to SSPD.
    sspd_emb = ManifoldEmbedding(n_nodes, [SPD(2)])
    sspd_emb.xs[0] = ManifoldParameter(
            h2_to_sspd2(emb.xs[0] / curvature_sq.sqrt()),
            manifold=sspd_emb.manifolds[0])
    sspd_emb.scales[0] = torch.nn.Parameter(1 / curvature_sq / 2)
    assert torch.allclose(emb.compute_dists(None),
                          sspd_emb.compute_dists(None),
                          atol=1e-4)

    # Run spd2.
    path = os.path.join(args.save_dir, 'spd2')
    train(ds, fp, sspd_emb, args.n_epochs, path, args.n_epochs)
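# h2_to_sspd2 presumably realizes the standard identification of the
# hyperboloid model of H^2 with the unit-determinant 2x2 SPD matrices
# (SSPD(2)): a Lorentz point x = (x0, x1, x2) with x0^2 - x1^2 - x2^2 = 1 maps
# to [[x0 + x1, x2], [x2, x0 - x1]], whose determinant is the Lorentz form,
# i.e. 1. The actual helper may differ in conventions or scaling; a sketch
# over a batch of Lorentz points:
def h2_to_sspd2(x):
    x0, x1, x2 = x[..., 0], x[..., 1], x[..., 2]
    row1 = torch.stack((x0 + x1, x2), dim=-1)
    row2 = torch.stack((x2, x0 - x1), dim=-1)
    return torch.stack((row1, row2), dim=-2)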
def main():
    torch.set_default_dtype(torch.float64)

    csv_file = open('results.csv', 'w')
    csv_writer = csv.DictWriter(csv_file, delimiter=';', fieldnames=HEADERS)
    csv_writer.writeheader()

    for ds_dir in fullpath_list(args.root_dir):
        ds_name = os.path.basename(ds_dir)
        if args.datasets and ds_name not in args.datasets:
            continue

        # Load the dataset graph in order to compute F1 scores.
        gpdists, g = load_graph_pdists(
                os.path.join('../data', ds_name + '.edges.gz'),
                cache_dir='.cached_pdists')
        gpdists.div_(gpdists.max())  # Normalization matters for distortion!
        n_nodes = g.number_of_nodes()
        with Timer('constructing FastPrecision'):
            fp = FastPrecision(g)

        for loss_fn_dir in fullpath_list(ds_dir):
            loss_fn = os.path.basename(loss_fn_dir)
            for n_factors_dir in fullpath_list(loss_fn_dir):
                n_factors = os.path.basename(n_factors_dir)
                n_factors = 0 if n_factors == 'baseline' else int(n_factors)

                run_dirs = list(fullpath_list(n_factors_dir))
                num_pdists = n_nodes * (n_nodes - 1) // 2
                n_runs = len(run_dirs)
                all_pdists = np.empty(num_pdists * n_runs)
                all_cs = np.zeros(shape=(n_runs, MAX_NUM_FACTORS))
                pearson_rs = np.empty(n_runs)
                distortions = np.empty(n_runs)

                for i, run_dir in enumerate(run_dirs):
                    # Load the embedding.
                    pattern = os.path.join(run_dir, 'embedding_*.pth')
                    path = latest_path_by_basename_numeric_order(pattern)
                    emb = load_embedding(path)

                    # The sorted curvatures.
                    if isinstance(emb, UniversalEmbedding):
                        cs = np.sort([
                                -c.item() for c in emb.curvature_params
                        ]).flatten()
                        all_cs[i, :n_factors] = cs

                    # Compute the manifold pairwise distances.
                    mpdists = emb.compute_dists(None)
                    mpdists.sqrt_()
                    indices = np.arange(i * num_pdists, (i + 1) * num_pdists)
                    all_pdists[indices] = mpdists.numpy()

                    # Compute the Pearson R.
                    pearson_rs[i] = pearsonr(gpdists, mpdists)

                    # Compute the average distortion.
                    distortions[i] = average_distortion(mpdists, gpdists)

                # Compute the F1 scores.
                with Timer('computing F1 scores'):
                    f1_means, f1_stds = fp.layer_mean_f1_scores(
                            all_pdists, n_runs)

                # Aggregate the metrics:
                # - the Pearson R,
                r_mean = np.mean(pearson_rs)
                r_std = np.std(pearson_rs)
                # - the average distortion,
                dist_mean = np.mean(distortions)
                dist_std = np.std(distortions)
                # - the average curvatures.
                c_means = np.mean(all_cs, axis=0)
                c_stds = np.std(all_cs, axis=0)

                entry = {
                        'dataset': ds_name,
                        'loss_fn': loss_fn,
                        'n_factors': n_factors,
                        'c1_mean': c_means[0], 'c1_std': c_stds[0],
                        'c2_mean': c_means[1], 'c2_std': c_stds[1],
                        'c3_mean': c_means[2], 'c3_std': c_stds[2],
                        'c4_mean': c_means[3], 'c4_std': c_stds[3],
                        'c5_mean': c_means[4], 'c5_std': c_stds[4],
                        'c6_mean': c_means[5], 'c6_std': c_stds[5],
                        'f1_1_mean': f1_means[0], 'f1_1_std': f1_stds[0],
                        'f1_2_mean': f1_means[1], 'f1_2_std': f1_stds[1],
                        'f1_3_mean': f1_means[2], 'f1_3_std': f1_stds[2],
                        'f1_4_mean': f1_means[3], 'f1_4_std': f1_stds[3],
                        'f1_5_mean': f1_means[4], 'f1_5_std': f1_stds[4],
                        'f1_10_mean': f1_means[9], 'f1_10_std': f1_stds[9],
                        'auc_5': area_under_curve(f1_means[:5])[0],
                        'auc_10': area_under_curve(f1_means[:10])[0],
                        'auc_total': area_under_curve(f1_means)[0],
                        'r_mean': r_mean, 'r_std': r_std,
                        'dist_mean': dist_mean, 'dist_std': dist_std,
                }  # yapf: disable
                csv_writer.writerow(entry)

    csv_file.close()
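# average_distortion is assumed to follow the usual definition: the mean over
# all node pairs of |d_emb - d_graph| / d_graph, computed here on condensed
# (flat) distance vectors. An illustrative sketch, not necessarily the exact
# helper used above:
def average_distortion(mpdists, gpdists):
    return torch.mean(torch.abs(mpdists - gpdists) / gpdists).item()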
def main():
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    config = parse_config(args.config)
    set_seeds(args.random_seed)
    save_dir = check_mkdir(config['save_dir_root'], increment=True)
    copyfile(args.config, os.path.join(save_dir, 'config.yaml'))

    # Torch settings.
    torch.set_default_dtype(torch.float64)  # use double precision
    if torch.cuda.is_available():  # place everything on CUDA
        # NOTE: We rely on this in several parts of the code.
        torch.set_default_tensor_type(torch.cuda.DoubleTensor)
    if args.detect_anomaly:
        torch.autograd.set_detect_anomaly(True)

    # Prepare the data.
    gpdists, g = load_graph_pdists(config['input_graph'],
                                   cache_dir=config.get('cache_dir'))
    n_nodes = nnm1d2_to_n(len(gpdists))
    if 'preprocess' in config:
        gpdists = config['preprocess'](gpdists)
    dataset = GraphDataset(gpdists if n_nodes < 5000 else gpdists.to('cpu'))

    # The embedding.
    embedding = config['embedding'](n_nodes)

    # The optimizers.
    optimizers = []
    lr_schedulers = []
    if 'embedding_optimizer' in config:
        emb_optim = config['embedding_optimizer'](embedding.xs)
        optimizers.append(emb_optim)
        if 'embedding_lr_scheduler' in config:
            lr_schedulers.append(config['embedding_lr_scheduler'](emb_optim))
    if 'curvature_optimizer' in config:
        curv_optim = config['curvature_optimizer'](embedding.curvature_params)
        optimizers.append(curv_optim)
        if 'curvature_lr_scheduler' in config:
            lr_schedulers.append(config['curvature_lr_scheduler'](curv_optim))

    # Prepare the training arguments.
    training_args = dict(embedding=embedding,
                         optimizer=optimizers,
                         lr_scheduler=lr_schedulers,
                         objective_fn=config['objective_fn'],
                         save_dir=save_dir)
    training_args.update(config['training_params'])

    # Use the right training engine.
    if isinstance(embedding, ProductManifoldEmbedding):
        from graphembed.products import TrainingEngine
    elif 'min_alpha' in training_args or 'max_alpha' in training_args:
        from graphembed.train_da import TrainingEngine
    else:
        from graphembed.train import TrainingEngine

    # Use a with-block to make sure the threads are closed even if the process
    # is killed.
    with ThreadPoolExecutor(max_workers=args.num_workers) as pool:
        if g is not None:
            with Timer('constructing FastPrecision', loglevel=logging.INFO):
                fp = FastPrecision(g)
            training_args['lazy_metrics'] = {
                    'Layer_Mean_F1': \
                            lambda p: pool.submit(fp.layer_mean_f1_scores, p),
            }  # yapf: disable
        training_engine = TrainingEngine(**training_args)

        # Train.
        with Timer('training', loglevel=logging.INFO):
            training_engine(dataset)
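# For reference, a hypothetical config illustrating the keys that main() reads
# above; parse_config presumably evaluates a YAML file into such a dict, with
# the factory entries being callables. Every concrete value below is made up:
EXAMPLE_CONFIG = {
        'save_dir_root': 'runs/example',
        'input_graph': '../data/example.edges.gz',
        'cache_dir': '.cached_pdists',
        'embedding': lambda n_nodes: ManifoldEmbedding(n_nodes, [Lorentz(3)]),
        'embedding_optimizer': lambda xs: torch.optim.SGD(xs, lr=0.05),
        'objective_fn': None,  # placeholder for an actual loss callable
        'training_params': {'n_epochs': 1000},
}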