Example #1
def main():
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # use double precision by default
    torch.set_default_dtype(torch.float64)

    for ds_dir in fullpath_list(args.root_dir):
        ds_name = os.path.basename(ds_dir)
        if args.datasets and ds_name not in args.datasets:
            continue
        # Load the dataset graph in order to compute F1 scores.
        _, g = load_graph_pdists(os.path.join('../data',
                                              ds_name + '.edges.gz'),
                                 cache_dir='.cached_pdists')
        n_nodes = g.number_of_nodes()
        with Timer('constructing FastPrecision'):
            fp = FastPrecision(g)
            nodes_per_layer = fp.nodes_per_layer()[1:]
            nodes_per_layer = nodes_per_layer / np.sum(nodes_per_layer)

        for flipp_dir in fullpath_list(ds_dir):
            flipp = os.path.basename(flipp_dir).split('_')[1]
            if args.flip_probabilities and flipp not in args.flip_probabilities:
                continue

            for loss_fn_dir in fullpath_list(flipp_dir):
                loss_fn_str = os.path.basename(loss_fn_dir)
                if args.loss_fns and loss_fn_str not in args.loss_fns:
                    continue
                # create one plot per (dataset, flipp, loss_fn) combination
                width = 6 if args.leftmost or args.rightmost else 5
                fig, ax = plt.subplots(figsize=(width, 5))
                plot_id = 0

                for man_dir in fullpath_list(loss_fn_dir):
                    man_name = os.path.basename(man_dir)
                    if args.manifolds and man_name not in args.manifolds:
                        continue
                    factor_names = manifold_factors_from_path_label(man_name)
                    man_factors = build_manifold(*factor_names)
                    man_label = manifold_label_for_display(*factor_names)
                    dim = sum([m.dim for m in man_factors])
                    if args.dims and dim not in args.dims:
                        continue

                    # compute the metric
                    means, stds = comp_metric(ds_name, n_nodes, fp, flipp,
                                              loss_fn_str, man_dir,
                                              man_factors, man_label)
                    if means is None:
                        continue

                    # add them to the plot
                    plot_f1_scores(ax, means, stds, plot_id, label=man_label)
                    plot_id += 1

                # save the figure
                configure_and_save_plots(ax, fig, ds_name, flipp, loss_fn_str,
                                         nodes_per_layer)
Example #2
def _load(flipp):
    gpdists, g = load_graph_pdists(ds_path,
                                   cache_dir='.cached_pdists',
                                   flip_probability=flipp)
    assert args.no_fp or g is not None
    n_nodes = nnm1d2_to_n(len(gpdists))
    ds = GraphDataset(gpdists if n_nodes < 5000 else gpdists.to('cpu'))
    fp = None if args.no_fp else FastPrecision(g)
    return g, ds, fp
def main():
    if args.verbose:
        logging.getLogger().setLevel(logging.INFO)

    # Default torch settings.
    torch.set_default_dtype(torch.float64)
    if torch.cuda.is_available():
        torch.set_default_tensor_type(torch.cuda.DoubleTensor)

    with ThreadPoolExecutor(max_workers=2) as tpool:
        for ds_path in args.datasets:
            ds_name = os.path.basename(ds_path).split('.')[0]

            # Load the graph.
            gpdists, g = load_graph_pdists(ds_path, cache_dir='.cached_pdists')
            n_nodes = g.number_of_nodes()
            ds = GraphDataset(gpdists if n_nodes < 5000 else gpdists.to('cpu'))
            fp = FastPrecision(g)

            for loss_fn_str in args.loss_fns:
                loss_fn, alpha = build_loss_fn(loss_fn_str)

                # Run the Euclidean baseline.
                if args.run_baseline:
                    for run_id in range(args.n_runs):
                        # Set the random seeds.
                        set_seeds(run_id)
                        # Create the output directory.
                        output_dir = make_exp_dir(args.save_dir, ds_name,
                                                  loss_fn_str, 'baseline',
                                                  str(run_id))
                        # Run the experiment.
                        exp_run_eucl(ds_name, ds, loss_fn, alpha, tpool, fp,
                                     output_dir)

                # Add the curvature regularizer if needed.
                # TODO(ccruceru): Put this into the naming of the loss function.
                if args.lambda_reg is not None:
                    loss_fn = Sum(loss_fn,
                                  CurvatureRegularizer(g, args.lambda_reg))

                # Run the products.
                for n_fact in args.factors:
                    for run_id in range(args.n_runs):
                        # Set the random seeds.
                        set_seeds(run_id)
                        # Create the output directory.
                        output_dir = make_exp_dir(args.save_dir, ds_name,
                                                  loss_fn_str, str(n_fact),
                                                  str(run_id))
                        # Run the experiment.
                        exp_run(ds_name, ds, loss_fn, alpha, n_fact, tpool, fp,
                                output_dir)
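`nnm1d2_to_n` in `_load` above recovers the node count n from the length n(n-1)/2 of a condensed pairwise-distance vector. A sketch that inverts the quadratic, assuming that is all the helper does:

import math

def nnm1d2_to_n(m):
    # Solve n * (n - 1) / 2 == m for n: n = (1 + sqrt(1 + 8m)) / 2.
    n = int(round((1 + math.sqrt(1 + 8 * m)) / 2))
    assert n * (n - 1) // 2 == m, 'not a valid condensed length'
    return n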
Example #4
def main():
    args = parse_args()

    # Fix the random seeds.
    set_seeds(args.random_seed)

    # Default torch settings.
    torch.set_default_dtype(torch.float64)
    if torch.cuda.is_available():
        torch.set_default_tensor_type(torch.cuda.DoubleTensor)

    # load data
    gpdists, g = load_graph_pdists(args.input_graph,
                                   cache_dir='.cached_pdists')
    n_nodes = g.number_of_nodes()
    ds = GraphDataset(gpdists)
    fp = FastPrecision(g)

    # run hyp2
    hyp = Lorentz(3)
    emb = ManifoldEmbedding(n_nodes, [hyp] * args.n_factors)
    for i in range(args.n_factors):
        emb.scales[i] = torch.nn.Parameter(torch.tensor(2.0))
    man_name = '_'.join('hyp2' for _ in range(args.n_factors))
    save_dir = os.path.join(args.save_dir, man_name)
    if args.hyp_snapshot or args.hyp_pretrained:
        logging.info('Loading embedding for %s', man_name)
        load_embedding(emb, save_dir)
    if not args.hyp_pretrained:
        train(ds, fp, emb, args.n_epochs, save_dir)

    # map it to SPD
    spd = SPD(2 * args.n_factors)
    spd_emb = ManifoldEmbedding(n_nodes, [spd])
    save_dir = os.path.join(args.save_dir, 'spd{}'.format(spd.dim))
    if args.spd_snapshot:
        logging.info('Loading embedding for SPD%d', spd.dim)
        load_embedding(spd_emb, save_dir)
    else:
        with torch.no_grad():
            factors = [
                h2_to_sspd2(emb.xs[i].mul(math.sqrt(2)))
                for i in range(args.n_factors)
            ]
            spd_emb.xs[0] = ManifoldParameter(block_diag(factors),
                                              manifold=spd)
        hyp_dists = emb.to('cpu').compute_dists(None)
        spd_dists = spd_emb.compute_dists(None).to('cpu')
        assert torch.allclose(hyp_dists, spd_dists, atol=1e-4)

    # run spd2
    train(ds, fp, spd_emb, args.n_epochs, save_dir, args.n_epochs)
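Two helpers above are not shown: `block_diag`, which assembles a batch of square blocks into block-diagonal matrices, and `h2_to_sspd2`, which maps hyperboloid points to 2x2 SPD matrices. For the latter, one standard identification (an assumption here, consistent with the `allclose` check on distances) sends x = (x0, x1, x2) with x0^2 - x1^2 - x2^2 = 1 to [[x0 + x2, x1], [x1, x0 - x2]], which has determinant one:

import torch

def block_diag(mats):
    # A list of (..., k_i, k_i) tensors becomes one (..., K, K) tensor
    # with the blocks placed along the diagonal, K = sum(k_i).
    sizes = [m.shape[-1] for m in mats]
    total = sum(sizes)
    out = mats[0].new_zeros(mats[0].shape[:-2] + (total, total))
    offset = 0
    for m, k in zip(mats, sizes):
        out[..., offset:offset + k, offset:offset + k] = m
        offset += k
    return out

def h2_to_sspd2(x):
    # Rows of `x` are hyperboloid points (x0, x1, x2); each is mapped to
    # a symmetric positive-definite 2x2 matrix with unit determinant:
    # det = (x0 + x2)(x0 - x2) - x1^2 = x0^2 - x1^2 - x2^2 = 1.
    x0, x1, x2 = x[..., 0], x[..., 1], x[..., 2]
    row0 = torch.stack([x0 + x2, x1], dim=-1)
    row1 = torch.stack([x1, x0 - x2], dim=-1)
    return torch.stack([row0, row1], dim=-2)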
Example #5
def main():
    torch.set_default_dtype(torch.float64)

    csv_file = open(args.results_file, 'w')
    csv_writer = csv.DictWriter(csv_file, delimiter=';', fieldnames=HEADERS)
    csv_writer.writeheader()

    for ds_dir in fullpath_list(args.root_dir):
        ds_name = os.path.basename(ds_dir)
        if args.datasets and ds_name not in args.datasets:
            continue
        # Load the dataset graph in order to compute F1 scores.
        gpdists, g = load_graph_pdists(os.path.join('../data',
                                                    ds_name + '.edges.gz'),
                                       cache_dir='.cached_pdists')
        gpdists.div_(gpdists.max())  # Normalization important for distortion!
        with Timer('constructing FastPrecision'):
            fp = FastPrecision(g)
        for flipp_dir in fullpath_list(ds_dir):
            flipp = os.path.basename(flipp_dir).split('_')[1]
            for loss_fn_dir in fullpath_list(flipp_dir):
                loss_fn_str = os.path.basename(loss_fn_dir)
                for man_dir in fullpath_list(loss_fn_dir):
                    man_name = os.path.basename(man_dir)
                    factor_names = manifold_factors_from_path_label(man_name)
                    man_factors = build_manifold(*factor_names)
                    man_label = manifold_label_for_display(*factor_names)

                    def add_dimensions(partial_entry):
                        partial_entry.update({
                                'dataset': ds_name,
                                'flip_probability': flipp,
                                'loss_fn': loss_fn_str,
                                'manifold': man_label,
                                'dim': sum([m.dim for m in man_factors]),
                        })  # yapf: disable
                        return partial_entry

                    # So I don't get confused again, note that this is OK
                    # w.r.t. flipp because, when it is 0, there's only an
                    # 'orig' dir.
                    try:
                        partial_entry = process_exp_dir(
                            man_dir, man_factors, gpdists, fp)
                    except Exception as e:
                        logging.error('Failed to run for (%s): %s', man_dir,
                                      str(e))
                        continue
                    entry = add_dimensions(partial_entry)
                    csv_writer.writerow(entry)
    csv_file.close()
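`Timer` is used throughout as a context manager, taking a label and (in Example #9) a `loglevel` keyword. A minimal sketch matching that interface:

import logging
import time

class Timer:
    # Logs how long the managed block took.
    def __init__(self, name, loglevel=logging.DEBUG):
        self.name = name
        self.loglevel = loglevel

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc_info):
        elapsed = time.perf_counter() - self.start
        logging.log(self.loglevel, '%s took %.2fs', self.name, elapsed)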
Example #6
def main():
    torch.set_default_dtype(torch.float64)

    csv_file = open('results.csv', 'w')
    csv_writer = csv.DictWriter(csv_file, delimiter=';', fieldnames=HEADERS)
    csv_writer.writeheader()

    for ds_dir in fullpath_list(args.root_dir):
        ds_name = os.path.basename(ds_dir)
        if args.datasets and ds_name not in args.datasets:
            continue
        # Load the dataset graph in order to compute F1 scores.
        gpdists, _ = load_graph_pdists(
            os.path.join('../data/dissimilarities', ds_name + '.npy'))
        gpdists.div_(gpdists.max())  # Normalization important for distortion!
        flipp_dir = os.path.join(ds_dir, 'flipp_0.0000')
        for loss_fn_dir in fullpath_list(flipp_dir):
            loss_fn_str = os.path.basename(loss_fn_dir)
            for man_dir in fullpath_list(loss_fn_dir):
                man_name = os.path.basename(man_dir)
                factor_names = manifold_factors_from_path_label(man_name)
                man_factors = build_manifold(*factor_names)
                man_label = manifold_label_for_display(*factor_names)

                def add_dimensions(partial_entry):
                    partial_entry.update({
                            'dataset': ds_name,
                            'loss_fn': loss_fn_str,
                            'manifold': man_label,
                            'dim': sum([m.dim for m in man_factors]),
                    })  # yapf: disable
                    return partial_entry

                partial_entry = process_exp_dir(os.path.join(man_dir, 'orig'),
                                                man_factors, gpdists)
                entry = add_dimensions(partial_entry)
                csv_writer.writerow(entry)
    csv_file.close()
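The manifold directory names are decoded by `manifold_factors_from_path_label` and then unpacked into `build_manifold` and `manifold_label_for_display`. Assuming labels join factor names with underscores (e.g. 'hyp2_hyp2', as constructed in Example #4), a plausible sketch:

def manifold_factors_from_path_label(label):
    # 'hyp2_hyp2' -> ('hyp2', 'hyp2'); callers unpack the result with *.
    return tuple(label.split('_'))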
Example #7
def main():
    args = parse_args()

    # Fix the random seeds.
    set_seeds(args.random_seed)

    # Default torch settings.
    torch.set_default_dtype(torch.float64)
    if torch.cuda.is_available():
        torch.set_default_tensor_type(torch.cuda.DoubleTensor)

    # load data
    gpdists, g = load_graph_pdists(args.input_graph,
                                   cache_dir='.cached_pdists')
    n_nodes = g.number_of_nodes()
    ds = GraphDataset(gpdists)
    fp = FastPrecision(g)

    # run hyp2
    emb = ManifoldEmbedding(n_nodes, [Lorentz(3)])
    path = os.path.join(args.save_dir, 'hyp2')
    train(ds, fp, emb, args.n_epochs, path)
    curvature_sq = 1 / emb.scales[0]

    # map it to SSPD
    sspd_emb = ManifoldEmbedding(n_nodes, [SPD(2)])
    sspd_emb.xs[0] = ManifoldParameter(h2_to_sspd2(emb.xs[0] /
                                                   curvature_sq.sqrt()),
                                       manifold=sspd_emb.manifolds[0])
    sspd_emb.scales[0] = torch.nn.Parameter(1 / curvature_sq / 2)
    assert torch.allclose(emb.compute_dists(None),
                          sspd_emb.compute_dists(None),
                          atol=1e-4)

    # run spd2
    path = os.path.join(args.save_dir, 'spd2')
    train(ds, fp, sspd_emb, args.n_epochs, path, args.n_epochs)
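`set_seeds`, used here and in Example #4, presumably seeds every RNG involved. A typical sketch:

import random
import numpy as np
import torch

def set_seeds(seed):
    # Seed Python, NumPy, and all torch RNGs for reproducible runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)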
Example #8
def main():
    torch.set_default_dtype(torch.float64)

    csv_file = open('results.csv', 'w')
    csv_writer = csv.DictWriter(csv_file, delimiter=';', fieldnames=HEADERS)
    csv_writer.writeheader()

    for ds_dir in fullpath_list(args.root_dir):
        ds_name = os.path.basename(ds_dir)
        if args.datasets and ds_name not in args.datasets:
            continue
        # Load the dataset graph in order to compute F1 scores.
        gpdists, g = load_graph_pdists(os.path.join('../data',
                                                    ds_name + '.edges.gz'),
                                       cache_dir='.cached_pdists')
        gpdists.div_(gpdists.max())  # Normalization important for distortion!
        n_nodes = g.number_of_nodes()
        with Timer('constructing FastPrecision'):
            fp = FastPrecision(g)
        for loss_fn_dir in fullpath_list(ds_dir):
            loss_fn = os.path.basename(loss_fn_dir)
            for n_factors_dir in fullpath_list(loss_fn_dir):
                n_factors = os.path.basename(n_factors_dir)
                n_factors = 0 if n_factors == 'baseline' else int(n_factors)
                run_dirs = list(fullpath_list(n_factors_dir))

                num_pdists = n_nodes * (n_nodes - 1) // 2
                n_runs = len(run_dirs)
                all_pdists = np.empty(num_pdists * n_runs)
                all_cs = np.zeros((n_runs, MAX_NUM_FACTORS))
                pearson_rs = np.empty(n_runs)
                distortions = np.empty(n_runs)

                for i, run_dir in enumerate(run_dirs):
                    # Load the embedding.
                    pattern = os.path.join(run_dir, 'embedding_*.pth')
                    path = latest_path_by_basename_numeric_order(pattern)
                    emb = load_embedding(path)

                    # The sorted curvatures.
                    if isinstance(emb, UniversalEmbedding):
                        cs = np.sort([-c.item()
                                      for c in emb.curvature_params]).flatten()
                        all_cs[i, :n_factors] = cs

                    # Compute the manifold pairwise distances.
                    mpdists = emb.compute_dists(None)
                    mpdists.sqrt_()
                    indices = np.arange(i * num_pdists, (i + 1) * num_pdists)
                    all_pdists[indices] = mpdists.numpy()

                    # Compute the Pearson r (scipy returns (r, p-value)).
                    pearson_rs[i] = pearsonr(gpdists, mpdists)[0]

                    # Compute the average distortion
                    distortions[i] = average_distortion(mpdists, gpdists)

                # Compute the F1 scores.
                with Timer('computing F1 scores'):
                    f1_means, f1_stds = fp.layer_mean_f1_scores(
                        all_pdists, n_runs)

                # Aggregate the metrics
                # - Pearson R
                r_mean = np.mean(pearson_rs)
                r_std = np.std(pearson_rs)
                # - average distortion
                dist_mean = np.mean(distortions)
                dist_std = np.std(distortions)

                # Average the curvatures.
                c_means = np.mean(all_cs, axis=0)
                c_stds = np.std(all_cs, axis=0)

                entry = {
                        'dataset': ds_name,
                        'loss_fn': loss_fn,
                        'n_factors': n_factors,

                        'c1_mean': c_means[0], 'c1_std': c_stds[0],
                        'c2_mean': c_means[1], 'c2_std': c_stds[1],
                        'c3_mean': c_means[2], 'c3_std': c_stds[2],
                        'c4_mean': c_means[3], 'c4_std': c_stds[3],
                        'c5_mean': c_means[4], 'c5_std': c_stds[4],
                        'c6_mean': c_means[5], 'c6_std': c_stds[5],

                        'f1_1_mean': f1_means[0], 'f1_1_std': f1_stds[0],
                        'f1_2_mean': f1_means[1], 'f1_2_std': f1_stds[1],
                        'f1_3_mean': f1_means[2], 'f1_3_std': f1_stds[2],
                        'f1_4_mean': f1_means[3], 'f1_4_std': f1_stds[3],
                        'f1_5_mean': f1_means[4], 'f1_5_std': f1_stds[4],
                        'f1_10_mean': f1_means[9], 'f1_10_std': f1_stds[9],

                        'auc_5': area_under_curve(f1_means[:5])[0],
                        'auc_10': area_under_curve(f1_means[:10])[0],
                        'auc_total': area_under_curve(f1_means)[0],

                        'r_mean': r_mean, 'r_std': r_std,

                        'dist_mean': dist_mean, 'dist_std': dist_std,
                }  # yapf: disable
                csv_writer.writerow(entry)

    csv_file.close()
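`average_distortion` and `area_under_curve` above are not shown. Average distortion is conventionally the mean relative error between embedded and graph distances, and the AUC result is indexed with `[0]`, so it presumably returns a tuple; hedged sketches of both:

import numpy as np

def average_distortion(mpdists, gpdists):
    # Mean relative deviation of embedded distances from graph distances.
    m, g = np.asarray(mpdists), np.asarray(gpdists)
    return float(np.mean(np.abs(m - g) / g))

def area_under_curve(ys):
    # Trapezoidal area under the F1-vs-layer curve, normalized by the
    # x-range; wrapped in a 1-tuple because callers index it with [0].
    ys = np.asarray(ys, dtype=float)
    return (np.trapz(ys) / max(len(ys) - 1, 1),)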
Example #9
def main():
    args = parse_args()
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    config = parse_config(args.config)
    set_seeds(args.random_seed)
    save_dir = check_mkdir(config['save_dir_root'], increment=True)
    copyfile(args.config, os.path.join(save_dir, 'config.yaml'))

    # torch settings
    torch.set_default_dtype(torch.float64)  # use double precision
    if torch.cuda.is_available():  # place everything on CUDA
        # NOTE: We rely on this in several parts of the code.
        torch.set_default_tensor_type(torch.cuda.DoubleTensor)
    if args.detect_anomaly:
        torch.autograd.set_detect_anomaly(True)

    # prepare data
    gpdists, g = load_graph_pdists(config['input_graph'],
                                   cache_dir=config.get('cache_dir'))
    n_nodes = nnm1d2_to_n(len(gpdists))
    if 'preprocess' in config:
        gpdists = config['preprocess'](gpdists)
    dataset = GraphDataset(gpdists if n_nodes < 5000 else gpdists.to('cpu'))

    # the embedding
    embedding = config['embedding'](n_nodes)

    # the optimizers
    optimizers = []
    lr_schedulers = []
    if 'embedding_optimizer' in config:
        emb_optim = config['embedding_optimizer'](embedding.xs)
        optimizers.append(emb_optim)
        if 'embedding_lr_scheduler' in config:
            lr_schedulers.append(config['embedding_lr_scheduler'](emb_optim))
    if 'curvature_optimizer' in config:
        curv_optim = config['curvature_optimizer'](embedding.curvature_params)
        optimizers.append(curv_optim)
        if 'curvature_lr_scheduler' in config:
            lr_schedulers.append(config['curvature_lr_scheduler'](curv_optim))

    # prepare training
    training_args = dict(embedding=embedding,
                         optimizer=optimizers,
                         lr_scheduler=lr_schedulers,
                         objective_fn=config['objective_fn'],
                         save_dir=save_dir)
    training_args.update(config['training_params'])

    # use the right training engine
    if isinstance(embedding, ProductManifoldEmbedding):
        from graphembed.products import TrainingEngine
    elif 'min_alpha' in training_args or 'max_alpha' in training_args:
        from graphembed.train_da import TrainingEngine
    else:
        from graphembed.train import TrainingEngine

    # Use a with-block to make sure the threads are closed even if we kill
    # the process.
    with ThreadPoolExecutor(max_workers=args.num_workers) as pool:
        if g is not None:
            with Timer('constructing FastPrecision', loglevel=logging.INFO):
                fp = FastPrecision(g)
            training_args['lazy_metrics'] = {
                'Layer_Mean_F1': \
                    lambda p: pool.submit(fp.layer_mean_f1_scores, p),
            }  # yapf: disable
        training_engine = TrainingEngine(**training_args)

        # train
        with Timer('training', loglevel=logging.INFO):
            training_engine(dataset)
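`check_mkdir(..., increment=True)` above suggests a helper that creates a fresh run directory without clobbering earlier ones. A self-contained sketch of that behavior (the real helper may use a different suffix scheme):

import os

def check_mkdir(root, increment=False):
    # With increment=True, pick the first of root, root_1, root_2, ...
    # that does not exist yet, so repeated runs never overwrite each other.
    path, k = root, 0
    while increment and os.path.exists(path):
        k += 1
        path = '{}_{}'.format(root, k)
    os.makedirs(path, exist_ok=True)
    return path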