Example #1
def test_get_nb_edit_operations_symbolic_cml():
	"""Test get_nb_edit_operations_symbolic_cml().
	"""
	"""**1.   Get dataset.**"""

	import numpy as np
	from gklearn.utils import Dataset
	
	# Predefined dataset name, use dataset "MUTAG".
	ds_name = 'MUTAG'
	
	# Initialize a Dataset.
	dataset = Dataset()
	# Load predefined dataset "MUTAG".
	dataset.load_predefined_dataset(ds_name)
	graph1 = dataset.graphs[0]
	graph2 = dataset.graphs[1]
	
	"""**2.  Compute graph edit distance.**"""
	
	# Initialize label costs randomly.
	node_label_costs, edge_label_costs = _initialize_label_costs(dataset)
	
	# Compute GEDs.
	pi_forward, pi_backward, dis, node_labels, edge_labels = _compute_ged(dataset, node_label_costs, edge_label_costs)
	
	
	# Compute numbers of edit operations.
	
	from gklearn.ged.util.util import get_nb_edit_operations_symbolic_cml
	
	n_edit_operations = get_nb_edit_operations_symbolic_cml(graph1, graph2, pi_forward, pi_backward, node_labels, edge_labels)
	
	assert np.abs((np.dot(np.concatenate((node_label_costs, edge_label_costs)), n_edit_operations) - dis) / dis) < 10e-6
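
The assertion above checks that the returned GED decomposes linearly: the distance equals the dot product of the edit costs with the counts of the corresponding edit operations. A toy numpy illustration of that identity, with made-up numbers (not from MUTAG):

import numpy as np

node_label_costs = np.array([3.0, 3.0, 1.0])  # made-up node edit costs.
edge_label_costs = np.array([3.0, 3.0, 1.0])  # made-up edge edit costs.
n_edit_operations = np.array([2, 1, 4, 0, 3, 2])  # made-up operation counts.

costs = np.concatenate((node_label_costs, edge_label_costs))
dis = np.dot(costs, n_edit_operations)  # 6 + 3 + 4 + 0 + 9 + 2 = 24.0
assert np.abs((np.dot(costs, n_edit_operations) - dis) / dis) < 1e-9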
Example #2
def xp_check_results_of_GEDEnv():
    """Compare results of GEDEnv to GEDLIB.
	"""
    """**1.   Get dataset.**"""

    from gklearn.utils import Dataset

    # Predefined dataset name, use dataset "MUTAG".
    ds_name = 'MUTAG'

    # Initialize a Dataset.
    dataset = Dataset()
    # Load predefined dataset "MUTAG".
    dataset.load_predefined_dataset(ds_name)

    results1 = compute_geds_by_GEDEnv(dataset)
    results2 = compute_geds_by_GEDLIB(dataset)

    # Show results.
    import pprint
    pp = pprint.PrettyPrinter(indent=4)  # pretty print
    print('Results using GEDEnv:')
    pp.pprint(results1)
    print()
    print('Results using GEDLIB:')
    pp.pprint(results2)

    return results1, results2
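
The helpers compute_geds_by_GEDEnv and compute_geds_by_GEDLIB are defined elsewhere in the source. A minimal sketch of what the GEDEnv-based variant might look like, reusing the GEDEnv calls from test_GEDEnv below; the returned dictionary format is an assumption, not the original helper's:

def compute_geds_by_GEDEnv(dataset):
    from gklearn.ged.env import GEDEnv

    ged_env = GEDEnv()  # initialize GED environment.
    ged_env.set_edit_cost('CONSTANT', edit_cost_constants=[3, 3, 1, 3, 3, 1])
    for g in dataset.graphs[0:2]:  # sketch: only the first pair of graphs.
        ged_env.add_nx_graph(g, '')
    listID = ged_env.get_all_graph_ids()
    ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES')
    ged_env.set_method('BIPARTITE', {'initialization_method': 'RANDOM', 'threads': 1})
    ged_env.init_method()
    ged_env.run_method(listID[0], listID[1])
    return {
        'upper_bound': ged_env.get_upper_bound(listID[0], listID[1]),
        'forward_map': ged_env.get_forward_map(listID[0], listID[1]),
        'backward_map': ged_env.get_backward_map(listID[0], listID[1]),
    }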
def get_infos(graphs):
	import numpy as np
	from gklearn.utils import Dataset
	ds = Dataset()
	ds.load_graphs(graphs)
	infos = ds.get_dataset_infos(keys=['all_degree_entropy', 'ave_node_degree'])
	infos['ave_degree_entropy'] = np.mean(infos['all_degree_entropy'])
	print(infos['ave_degree_entropy'], ',', infos['ave_node_degree'])
	return infos
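
For example, assuming a predefined dataset has been loaded as in the snippets above, the helper can be applied to its graph list:

from gklearn.utils import Dataset

dataset = Dataset()
dataset.load_predefined_dataset('MUTAG')
infos = get_infos(dataset.graphs)  # prints ave_degree_entropy and ave_node_degree.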
def test_GEDEnv():
    """Test GEDEnv.
	"""
    """**1.   Get dataset.**"""

    from gklearn.utils import Dataset

    # Predefined dataset name, use dataset "MUTAG".
    ds_name = 'MUTAG'

    # Initialize a Dataset.
    dataset = Dataset()
    # Load predefined dataset "MUTAG".
    dataset.load_predefined_dataset(ds_name)
    graph1 = dataset.graphs[0]
    graph2 = dataset.graphs[1]
    """**2.  Compute graph edit distance.**"""

    try:
        from gklearn.ged.env import GEDEnv

        ged_env = GEDEnv()  # initialize GED environment.
        ged_env.set_edit_cost(
            'CONSTANT',  # GED cost type.
            edit_cost_constants=[3, 3, 1, 3, 3, 1]  # edit costs.
        )
        ged_env.add_nx_graph(graph1, '')  # add graph1
        ged_env.add_nx_graph(graph2, '')  # add graph2
        listID = ged_env.get_all_graph_ids()  # get list IDs of graphs
        ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES'
                     )  # initialize GED environment.
        options = {
            'initialization_method': 'RANDOM',  # or 'NODE', etc.
            'threads': 1  # parallel threads.
        }
        ged_env.set_method(
            'BIPARTITE',  # GED method.
            options  # options for GED method.
        )
        ged_env.init_method()  # initialize GED method.

        ged_env.run_method(listID[0], listID[1])  # run.

        pi_forward = ged_env.get_forward_map(listID[0],
                                             listID[1])  # forward map.
        pi_backward = ged_env.get_backward_map(listID[0],
                                               listID[1])  # backward map.
        dis = ged_env.get_upper_bound(listID[0],
                                      listID[1])  # GED between two graphs.

        import networkx as nx
        assert len(pi_forward) == nx.number_of_nodes(graph1)
        assert len(pi_backward) == nx.number_of_nodes(graph2)

    except Exception as exception:
        assert False, exception
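
The forward map above assigns to each node of graph1 a node of graph2 (or a dummy value for deleted nodes). A small helper to inspect such a map; the convention that dummy values are indices >= the number of nodes of graph2 is an assumption here, not confirmed by this snippet:

def print_node_map(pi_forward, graph2):
    # Position i of the map holds the image of node i of graph1 in graph2;
    # values >= graph2's node count are taken to denote node deletion.
    n2 = graph2.number_of_nodes()
    for i, j in enumerate(pi_forward):
        print('node', i, '->', 'deleted' if j >= n2 else 'node %d' % j)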
Example #5
def compute_gram_matrices_by_class(ds_name,
                                   kernel_options,
                                   save_results=True,
                                   dir_save='',
                                   irrelevant_labels=None,
                                   edge_required=False):
    import os
    import numpy as np
    from gklearn.utils import Dataset, split_dataset_by_target
    from gklearn.utils.utils import get_graph_kernel_by_name

    # 1. get dataset.
    print('1. getting dataset...')
    dataset_all = Dataset()
    dataset_all.load_predefined_dataset(ds_name)
    dataset_all.trim_dataset(edge_required=edge_required)
    if irrelevant_labels is not None:
        dataset_all.remove_labels(**irrelevant_labels)


    # dataset_all.cut_graphs(range(0, 10))
    datasets = split_dataset_by_target(dataset_all)

    gram_matrix_unnorm_list = []
    run_time_list = []

    print('start computing Gram matrices for each class of target...')
    for idx, dataset in enumerate(datasets):
        target = dataset.targets[0]
        print('\ntarget =', target, '\n')

        # 2. initialize graph kernel.
        print('2. initializing graph kernel and setting parameters...')
        graph_kernel = get_graph_kernel_by_name(
            kernel_options['name'],
            node_labels=dataset.node_labels,
            edge_labels=dataset.edge_labels,
            node_attrs=dataset.node_attrs,
            edge_attrs=dataset.edge_attrs,
            ds_infos=dataset.get_dataset_infos(keys=['directed']),
            kernel_options=kernel_options)

        # 3. compute gram matrix.
        print('3. computing gram matrix...')
        gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
                                                     **kernel_options)
        gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm

        gram_matrix_unnorm_list.append(gram_matrix_unnorm)
        run_time_list.append(run_time)

    # 4. save results.
    print()
    print('4. saving results...')
    if save_results:
        os.makedirs(dir_save, exist_ok=True)
        np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' +
                 kernel_options['name'] + '.gm',
                 gram_matrix_unnorm_list=gram_matrix_unnorm_list,
                 run_time_list=run_time_list)

    print('\ncomplete.')
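
A hypothetical invocation of the function above; the PathUpToH options follow the pattern used later in this collection, and 'outputs/' is a placeholder directory:

kernel_options = {'name': 'PathUpToH', 'depth': 3, 'k_func': 'MinMax',
                  'compute_method': 'trie', 'parallel': None, 'verbose': 2}
compute_gram_matrices_by_class('MUTAG', kernel_options,
                               save_results=True, dir_save='outputs/')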
"""
https://colab.research.google.com/drive/1Wfgn7WVuyOQQgwOvdUQBz0BzEVdp0YM3

**This script demonstrates how to compute a graph edit distance.**
---

**0.   Install `graphkit-learn`.**
"""
"""**1.   Get dataset.**"""

from gklearn.utils import Dataset

# Predefined dataset name, use dataset "MUTAG".
ds_name = 'MUTAG'

# Initialize a Dataset.
dataset = Dataset()
# Load predefined dataset "MUTAG".
dataset.load_predefined_dataset(ds_name)
graph1 = dataset.graphs[0]
graph2 = dataset.graphs[1]
print(graph1, graph2)
"""**2.  Compute graph edit distance.**"""

from gklearn.ged.env import GEDEnv

ged_env = GEDEnv()  # initialize GED environment.
ged_env.set_edit_cost(
    'CONSTANT',  # GED cost type.
    edit_cost_constants=[3, 3, 1, 3, 3, 1]  # edit costs.
)
ged_env.add_nx_graph(graph1, '')  # add graph1
ged_env.add_nx_graph(graph2, '')  # add graph2
listID = ged_env.get_all_graph_ids()  # get list IDs of graphs
ged_env.init(init_type='LAZY_WITHOUT_SHUFFLED_COPIES')  # initialize GED environment.
options = {
    'initialization_method': 'RANDOM',  # or 'NODE', etc.
    'threads': 1  # parallel threads.
}
ged_env.set_method(
    'BIPARTITE',  # GED method.
    options  # options for GED method.
)
ged_env.init_method()  # initialize GED method.
ged_env.run_method(listID[0], listID[1])  # run.

pi_forward = ged_env.get_forward_map(listID[0], listID[1])  # forward map.
pi_backward = ged_env.get_backward_map(listID[0], listID[1])  # backward map.
dis = ged_env.get_upper_bound(listID[0], listID[1])  # GED between the two graphs.
print(pi_forward)
print(pi_backward)
print(dis)
def kernel_knn_cv(ds_name,
                  train_examples,
                  knn_options,
                  mpg_options,
                  kernel_options,
                  ged_options,
                  mge_options,
                  save_results=True,
                  load_gm='auto',
                  dir_save='',
                  irrelevant_labels=None,
                  edge_required=False,
                  cut_range=None):

    from gklearn.utils import Dataset

    # 1. get dataset.
    print('1. getting dataset...')
    dataset_all = Dataset()
    dataset_all.load_predefined_dataset(ds_name)
    dataset_all.trim_dataset(edge_required=edge_required)
    if irrelevant_labels is not None:
        dataset_all.remove_labels(**irrelevant_labels)
    if cut_range is not None:
        dataset_all.cut_graphs(cut_range)

    if save_results:
        # create result files.
        print('creating output files...')
        fn_output_detail, fn_output_summary = __init_output_file_knn(
            ds_name, kernel_options['name'], mpg_options['fit_method'],
            dir_save)
    else:
        fn_output_detail, fn_output_summary = None, None

    # 2. compute/load Gram matrix a priori.
    print('2. computing/loading Gram matrix...')
    gram_matrix_unnorm, time_precompute_gm = __get_gram_matrix(
        load_gm, dir_save, ds_name, kernel_options, dataset_all)

    # 3. perform k-nn CV.
    print('3. performing k-nn CV...')
    if train_examples in ('k-graphs', 'expert', 'random'):
        __kernel_knn_cv_median(dataset_all, ds_name, knn_options, mpg_options,
                               kernel_options, mge_options, ged_options,
                               gram_matrix_unnorm, time_precompute_gm,
                               train_examples, save_results, dir_save,
                               fn_output_detail, fn_output_summary)

    elif train_examples == 'best-dataset':
        __kernel_knn_cv_best_ds(dataset_all, ds_name, knn_options,
                                kernel_options, gram_matrix_unnorm,
                                time_precompute_gm, train_examples,
                                save_results, dir_save, fn_output_detail,
                                fn_output_summary)

    elif train_examples == 'trainset':
        __kernel_knn_cv_trainset(dataset_all, ds_name, knn_options,
                                 kernel_options, gram_matrix_unnorm,
                                 time_precompute_gm, train_examples,
                                 save_results, dir_save, fn_output_detail,
                                 fn_output_summary)

    print('\ncomplete.\n')
Example #8
def generate_median_preimages_by_class(ds_name,
                                       mpg_options,
                                       kernel_options,
                                       ged_options,
                                       mge_options,
                                       save_results=True,
                                       save_medians=True,
                                       plot_medians=True,
                                       load_gm='auto',
                                       dir_save='',
                                       irrelevant_labels=None,
                                       edge_required=False,
                                       cut_range=None):
    import os.path
    import csv
    import numpy as np
    from gklearn.preimage import MedianPreimageGenerator
    from gklearn.utils import Dataset, split_dataset_by_target
    from gklearn.utils.graphfiles import saveGXL

    # 1. get dataset.
    print('1. getting dataset...')
    dataset_all = Dataset()
    dataset_all.load_predefined_dataset(ds_name)
    dataset_all.trim_dataset(edge_required=edge_required)
    if irrelevant_labels is not None:
        dataset_all.remove_labels(**irrelevant_labels)
    if cut_range is not None:
        dataset_all.cut_graphs(cut_range)
    datasets = split_dataset_by_target(dataset_all)

    if save_results:
        # create result files.
        print('creating output files...')
        fn_output_detail, fn_output_summary = _init_output_file_preimage(
            ds_name, kernel_options['name'], mpg_options['fit_method'],
            dir_save)

    sod_sm_list = []
    sod_gm_list = []
    dis_k_sm_list = []
    dis_k_gm_list = []
    dis_k_gi_min_list = []
    time_optimize_ec_list = []
    time_generate_list = []
    time_total_list = []
    itrs_list = []
    converged_list = []
    num_updates_ecc_list = []
    mge_decrease_order_list = []
    mge_increase_order_list = []
    mge_converged_order_list = []
    nb_sod_sm2gm = [0, 0, 0]
    nb_dis_k_sm2gm = [0, 0, 0]
    nb_dis_k_gi2sm = [0, 0, 0]
    nb_dis_k_gi2gm = [0, 0, 0]
    dis_k_max_list = []
    dis_k_min_list = []
    dis_k_mean_list = []
    if load_gm == 'auto':
        gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options[
            'name'] + '.gm.npz'
        gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
        if gmfile_exist:
            gmfile = np.load(gm_fname,
                             allow_pickle=True)  # @todo: may not be safe.
            gram_matrix_unnorm_list = [
                item for item in gmfile['gram_matrix_unnorm_list']
            ]
            time_precompute_gm_list = gmfile['run_time_list'].tolist()
        else:
            gram_matrix_unnorm_list = []
            time_precompute_gm_list = []
    elif not load_gm:
        gram_matrix_unnorm_list = []
        time_precompute_gm_list = []
    else:
        gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options[
            'name'] + '.gm.npz'
        gmfile = np.load(gm_fname,
                         allow_pickle=True)  # @todo: may not be safe.
        gram_matrix_unnorm_list = [
            item for item in gmfile['gram_matrix_unnorm_list']
        ]
        time_precompute_gm_list = gmfile['run_time_list'].tolist()


#	repeats_better_sod_sm2gm = []
#	repeats_better_dis_k_sm2gm = []
#	repeats_better_dis_k_gi2sm = []
#	repeats_better_dis_k_gi2gm = []

    print('start generating preimages for each class of target...')
    idx_offset = 0
    for idx, dataset in enumerate(datasets):
        target = dataset.targets[0]
        print('\ntarget =', target, '\n')
        #		if target != 1:
        # 			continue

        num_graphs = len(dataset.graphs)
        if num_graphs < 2:
            print('\nnumber of graphs = ', num_graphs, ', skip.\n')
            idx_offset += 1
            continue

        # 2. set parameters.
        print('2. initializing mpg and setting parameters...')
        if load_gm:
            if gmfile_exist:
                mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[
                    idx - idx_offset]
                mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[
                    idx - idx_offset]
        mpg = MedianPreimageGenerator()
        mpg.dataset = dataset
        mpg.set_options(**mpg_options.copy())
        mpg.kernel_options = kernel_options.copy()
        mpg.ged_options = ged_options.copy()
        mpg.mge_options = mge_options.copy()

        # 3. compute median preimage.
        print('3. computing median preimage...')
        mpg.run()
        results = mpg.get_results()

        # 4. compute pairwise kernel distances.
        print('4. computing pairwise kernel distances...')
        _, dis_k_max, dis_k_min, dis_k_mean = mpg.graph_kernel.compute_distance_matrix(
        )
        dis_k_max_list.append(dis_k_max)
        dis_k_min_list.append(dis_k_min)
        dis_k_mean_list.append(dis_k_mean)

        # 5. save results (and median graphs).
        print('5. saving results (and median graphs)...')
        # write result detail.
        if save_results:
            print('writing results to files...')
            sod_sm2gm = get_relations(
                np.sign(results['sod_gen_median'] - results['sod_set_median']))
            dis_k_sm2gm = get_relations(
                np.sign(results['k_dis_gen_median'] -
                        results['k_dis_set_median']))
            dis_k_gi2sm = get_relations(
                np.sign(results['k_dis_set_median'] -
                        results['k_dis_dataset']))
            dis_k_gi2gm = get_relations(
                np.sign(results['k_dis_gen_median'] -
                        results['k_dis_dataset']))

            f_detail = open(dir_save + fn_output_detail, 'a')
            csv.writer(f_detail).writerow([
                ds_name,
                kernel_options['name'],
                ged_options['edit_cost'],
                ged_options['method'],
                ged_options['attr_distance'],
                mpg_options['fit_method'],
                num_graphs,
                target,
                1,
                results['sod_set_median'],
                results['sod_gen_median'],
                results['k_dis_set_median'],
                results['k_dis_gen_median'],
                results['k_dis_dataset'],
                sod_sm2gm,
                dis_k_sm2gm,
                dis_k_gi2sm,
                dis_k_gi2gm,
                results['edit_cost_constants'],
                results['runtime_precompute_gm'],
                results['runtime_optimize_ec'],
                results['runtime_generate_preimage'],
                results['runtime_total'],
                results['itrs'],
                results['converged'],
                results['num_updates_ecc'],
                results['mge']['num_decrease_order'] >
                0,  # @todo: not suitable for multi-start mge
                results['mge']['num_increase_order'] > 0,
                results['mge']['num_converged_descents'] > 0
            ])
            f_detail.close()

            # compute result summary.
            sod_sm_list.append(results['sod_set_median'])
            sod_gm_list.append(results['sod_gen_median'])
            dis_k_sm_list.append(results['k_dis_set_median'])
            dis_k_gm_list.append(results['k_dis_gen_median'])
            dis_k_gi_min_list.append(results['k_dis_dataset'])
            time_precompute_gm_list.append(results['runtime_precompute_gm'])
            time_optimize_ec_list.append(results['runtime_optimize_ec'])
            time_generate_list.append(results['runtime_generate_preimage'])
            time_total_list.append(results['runtime_total'])
            itrs_list.append(results['itrs'])
            converged_list.append(results['converged'])
            num_updates_ecc_list.append(results['num_updates_ecc'])
            mge_decrease_order_list.append(
                results['mge']['num_decrease_order'] > 0)
            mge_increase_order_list.append(
                results['mge']['num_increase_order'] > 0)
            mge_converged_order_list.append(
                results['mge']['num_converged_descents'] > 0)
            # # SOD SM -> GM
            if results['sod_set_median'] > results['sod_gen_median']:
                nb_sod_sm2gm[0] += 1
    #			repeats_better_sod_sm2gm.append(1)
            elif results['sod_set_median'] == results['sod_gen_median']:
                nb_sod_sm2gm[1] += 1
            elif results['sod_set_median'] < results['sod_gen_median']:
                nb_sod_sm2gm[2] += 1
            # # dis_k SM -> GM
            if results['k_dis_set_median'] > results['k_dis_gen_median']:
                nb_dis_k_sm2gm[0] += 1
    #			repeats_better_dis_k_sm2gm.append(1)
            elif results['k_dis_set_median'] == results['k_dis_gen_median']:
                nb_dis_k_sm2gm[1] += 1
            elif results['k_dis_set_median'] < results['k_dis_gen_median']:
                nb_dis_k_sm2gm[2] += 1
            # # dis_k gi -> SM
            if results['k_dis_dataset'] > results['k_dis_set_median']:
                nb_dis_k_gi2sm[0] += 1
    #			repeats_better_dis_k_gi2sm.append(1)
            elif results['k_dis_dataset'] == results['k_dis_set_median']:
                nb_dis_k_gi2sm[1] += 1
            elif results['k_dis_dataset'] < results['k_dis_set_median']:
                nb_dis_k_gi2sm[2] += 1
            # # dis_k gi -> GM
            if results['k_dis_dataset'] > results['k_dis_gen_median']:
                nb_dis_k_gi2gm[0] += 1
    #			repeats_better_dis_k_gi2gm.append(1)
            elif results['k_dis_dataset'] == results['k_dis_gen_median']:
                nb_dis_k_gi2gm[1] += 1
            elif results['k_dis_dataset'] < results['k_dis_gen_median']:
                nb_dis_k_gi2gm[2] += 1

            # write result summary for each class.
            f_summary = open(dir_save + fn_output_summary, 'a')
            csv.writer(f_summary).writerow([
                ds_name,
                kernel_options['name'],
                ged_options['edit_cost'],
                ged_options['method'],
                ged_options['attr_distance'],
                mpg_options['fit_method'],
                num_graphs,
                target,
                results['sod_set_median'],
                results['sod_gen_median'],
                results['k_dis_set_median'],
                results['k_dis_gen_median'],
                results['k_dis_dataset'],
                sod_sm2gm,
                dis_k_sm2gm,
                dis_k_gi2sm,
                dis_k_gi2gm,
                results['runtime_precompute_gm'],
                results['runtime_optimize_ec'],
                results['runtime_generate_preimage'],
                results['runtime_total'],
                results['itrs'],
                results['converged'],
                results['num_updates_ecc'],
                results['mge']['num_decrease_order'] >
                0,  # @todo: not suitable for multi-start mge
                results['mge']['num_increase_order'] > 0,
                results['mge']['num_converged_descents'] > 0,
                nb_sod_sm2gm,
                nb_dis_k_sm2gm,
                nb_dis_k_gi2sm,
                nb_dis_k_gi2gm
            ])
            f_summary.close()

        # save median graphs.
        if save_medians:
            os.makedirs(dir_save + 'medians/', exist_ok=True)
            print('Saving median graphs to files...')
            fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options[
                'fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(
                    target) + '.repeat' + str(1)
            saveGXL(mpg.set_median,
                    fn_pre_sm + '.gxl',
                    method='default',
                    node_labels=dataset.node_labels,
                    edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs,
                    edge_attrs=dataset.edge_attrs)
            fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options[
                'fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(
                    target) + '.repeat' + str(1)
            saveGXL(mpg.gen_median,
                    fn_pre_gm + '.gxl',
                    method='default',
                    node_labels=dataset.node_labels,
                    edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs,
                    edge_attrs=dataset.edge_attrs)
            fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options[
                'fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(
                    target) + '.repeat' + str(1)
            saveGXL(mpg.best_from_dataset,
                    fn_best_dataset + '.gxl',
                    method='default',
                    node_labels=dataset.node_labels,
                    edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs,
                    edge_attrs=dataset.edge_attrs)

        # plot median graphs.
        if plot_medians and save_medians:
            if ged_options['edit_cost'] == 'LETTER2' or ged_options[
                    'edit_cost'] == 'LETTER' or ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low':
                draw_Letter_graph(mpg.set_median, fn_pre_sm)
                draw_Letter_graph(mpg.gen_median, fn_pre_gm)
                draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)

        if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
            gram_matrix_unnorm_list.append(mpg.gram_matrix_unnorm)

    # write result summary for each class.
    if save_results:
        sod_sm_mean = np.mean(sod_sm_list)
        sod_gm_mean = np.mean(sod_gm_list)
        dis_k_sm_mean = np.mean(dis_k_sm_list)
        dis_k_gm_mean = np.mean(dis_k_gm_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
        time_precompute_gm_mean = np.mean(time_precompute_gm_list)
        time_optimize_ec_mean = np.mean(time_optimize_ec_list)
        time_generate_mean = np.mean(time_generate_list)
        time_total_mean = np.mean(time_total_list)
        itrs_mean = np.mean(itrs_list)
        num_converged = np.sum(converged_list)
        num_updates_ecc_mean = np.mean(num_updates_ecc_list)
        num_mge_decrease_order = np.sum(mge_decrease_order_list)
        num_mge_increase_order = np.sum(mge_increase_order_list)
        num_mge_converged = np.sum(mge_converged_order_list)
        sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean -
                                                 dis_k_sm_mean))
        dis_k_gi2sm_mean = get_relations(
            np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = get_relations(
            np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        f_summary = open(dir_save + fn_output_summary, 'a')
        csv.writer(f_summary).writerow([
            ds_name, kernel_options['name'], ged_options['edit_cost'],
            ged_options['method'], ged_options['attr_distance'],
            mpg_options['fit_method'], num_graphs, 'all', sod_sm_mean,
            sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, dis_k_gi_min_mean,
            sod_sm2gm_mean, dis_k_sm2gm_mean, dis_k_gi2sm_mean,
            dis_k_gi2gm_mean, time_precompute_gm_mean, time_optimize_ec_mean,
            time_generate_mean, time_total_mean, itrs_mean, num_converged,
            num_updates_ecc_mean, num_mge_decrease_order,
            num_mge_increase_order, num_mge_converged
        ])
        f_summary.close()

    # save total pairwise kernel distances.
    dis_k_max = np.max(dis_k_max_list)
    dis_k_min = np.min(dis_k_min_list)
    dis_k_mean = np.mean(dis_k_mean_list)
    print('The maximum pairwise distance in kernel space:', dis_k_max)
    print('The minimum pairwise distance in kernel space:', dis_k_min)
    print('The average pairwise distance in kernel space:', dis_k_mean)

    # write Gram matrices to file.
    if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
        np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' +
                 kernel_options['name'] + '.gm',
                 gram_matrix_unnorm_list=gram_matrix_unnorm_list,
                 run_time_list=time_precompute_gm_list)

    print('\ncomplete.\n')
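
A hypothetical invocation of the function above; the option dictionaries are copied from xp_simple_preimage later in this collection, and 'outputs/' is a placeholder directory:

import multiprocessing

mpg_options = {'fit_method': 'k-graphs', 'init_ecc': [4, 4, 2, 1, 1, 1],
               'ds_name': 'MAO', 'parallel': True, 'time_limit_in_sec': 0,
               'max_itrs': 10, 'max_itrs_without_update': 3,
               'epsilon_residual': 0.01, 'epsilon_ec': 0.1, 'verbose': 2}
kernel_options = {'name': 'PathUpToH', 'depth': 9, 'k_func': 'MinMax',
                  'compute_method': 'trie', 'parallel': 'imap_unordered',
                  'n_jobs': multiprocessing.cpu_count(), 'normalize': True,
                  'verbose': 2}
ged_options = {'method': 'IPFP', 'initialization_method': 'RANDOM',
               'initial_solutions': 10, 'edit_cost': 'CONSTANT',
               'attr_distance': 'euclidean',
               'ratio_runs_from_initial_solutions': 1,
               'threads': multiprocessing.cpu_count(),
               'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'}
mge_options = {'init_type': 'MEDOID', 'random_inits': 10, 'time_limit': 600,
               'verbose': 2, 'refine': False}

generate_median_preimages_by_class(
    'MAO', mpg_options, kernel_options, ged_options, mge_options,
    save_results=True, save_medians=True, plot_medians=False,
    load_gm='auto', dir_save='outputs/',
    irrelevant_labels={'node_attrs': ['x', 'y', 'z'],
                       'edge_labels': ['bond_stereo']})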
Example #9
def remove_best_graph(ds_name,
                      mpg_options,
                      kernel_options,
                      ged_options,
                      mge_options,
                      save_results=True,
                      save_medians=True,
                      plot_medians=True,
                      load_gm='auto',
                      dir_save='',
                      irrelevant_labels=None,
                      edge_required=False,
                      cut_range=None):
    """Remove the best graph from the median set w.r.t. distance in kernel space, and to see if it is possible to generate the removed graph using the graphs left in the median set.
	"""
    # 1. get dataset.
    print('1. getting dataset...')
    dataset_all = Dataset()
    dataset_all.load_predefined_dataset(ds_name)
    dataset_all.trim_dataset(edge_required=edge_required)
    if irrelevant_labels is not None:
        dataset_all.remove_labels(**irrelevant_labels)
    if cut_range is not None:
        dataset_all.cut_graphs(cut_range)
    datasets = split_dataset_by_target(dataset_all)

    if save_results:
        # create result files.
        print('creating output files...')
        fn_output_detail, fn_output_summary = __init_output_file(
            ds_name, kernel_options['name'], mpg_options['fit_method'],
            dir_save)
    else:
        fn_output_detail, fn_output_summary = None, None

    # 2. compute/load Gram matrix a priori.
    print('2. computing/loading Gram matrix...')
    gram_matrix_unnorm_list, time_precompute_gm_list = __get_gram_matrix(
        load_gm, dir_save, ds_name, kernel_options, datasets)

    sod_sm_list = []
    sod_gm_list = []
    dis_k_sm_list = []
    dis_k_gm_list = []
    dis_k_gi_min_list = []
    time_optimize_ec_list = []
    time_generate_list = []
    time_total_list = []
    itrs_list = []
    converged_list = []
    num_updates_ecc_list = []
    mge_decrease_order_list = []
    mge_increase_order_list = []
    mge_converged_order_list = []
    nb_sod_sm2gm = [0, 0, 0]
    nb_dis_k_sm2gm = [0, 0, 0]
    nb_dis_k_gi2sm = [0, 0, 0]
    nb_dis_k_gi2gm = [0, 0, 0]
    dis_k_max_list = []
    dis_k_min_list = []
    dis_k_mean_list = []
    best_dis_list = []
    print('start the experiment for each class of target...')
    idx_offset = 0
    for idx, dataset in enumerate(datasets):
        target = dataset.targets[0]
        print('\ntarget =', target, '\n')
        #		if target != 1:
        # 			continue

        num_graphs = len(dataset.graphs)
        if num_graphs < 2:
            print('\nnumber of graphs = ', num_graphs, ', skip.\n')
            idx_offset += 1
            continue

        # 3. get the best graph and remove it from median set.
        print('3. getting and removing the best graph...')
        gram_matrix_unnorm = gram_matrix_unnorm_list[idx - idx_offset]
        best_index, best_dis, best_graph = __get_best_graph(
            [g.copy() for g in dataset.graphs],
            normalize_gram_matrix(gram_matrix_unnorm.copy()))
        median_set_new = [
            dataset.graphs[i] for i in range(len(dataset.graphs))
            if i != best_index
        ]
        num_graphs -= 1
        if num_graphs == 1:
            continue
        best_dis_list.append(best_dis)

        dataset.load_graphs(median_set_new, targets=None)
        gram_matrix_unnorm_new = np.delete(gram_matrix_unnorm,
                                           best_index,
                                           axis=0)
        gram_matrix_unnorm_new = np.delete(gram_matrix_unnorm_new,
                                           best_index,
                                           axis=1)

        # 4. set parameters.
        print('4. initializing mpg and setting parameters...')
        mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_new
        mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[
            idx - idx_offset]
        mpg = MedianPreimageGenerator()
        mpg.dataset = dataset
        mpg.set_options(**mpg_options.copy())
        mpg.kernel_options = kernel_options.copy()
        mpg.ged_options = ged_options.copy()
        mpg.mge_options = mge_options.copy()

        # 5. compute median preimage.
        print('5. computing median preimage...')
        mpg.run()
        results = mpg.get_results()

        # 6. compute pairwise kernel distances.
        print('6. computing pairwise kernel distances...')
        _, dis_k_max, dis_k_min, dis_k_mean = mpg.graph_kernel.compute_distance_matrix(
        )
        dis_k_max_list.append(dis_k_max)
        dis_k_min_list.append(dis_k_min)
        dis_k_mean_list.append(dis_k_mean)

        # 7. save results (and median graphs).
        print('7. saving results (and median graphs)...')
        # write result detail.
        if save_results:
            print('writing results to files...')
            sod_sm2gm = get_relations(
                np.sign(results['sod_gen_median'] - results['sod_set_median']))
            dis_k_sm2gm = get_relations(
                np.sign(results['k_dis_gen_median'] -
                        results['k_dis_set_median']))
            dis_k_gi2sm = get_relations(
                np.sign(results['k_dis_set_median'] -
                        results['k_dis_dataset']))
            dis_k_gi2gm = get_relations(
                np.sign(results['k_dis_gen_median'] -
                        results['k_dis_dataset']))

            f_detail = open(dir_save + fn_output_detail, 'a')
            csv.writer(f_detail).writerow([
                ds_name,
                kernel_options['name'],
                ged_options['edit_cost'],
                ged_options['method'],
                ged_options['attr_distance'],
                mpg_options['fit_method'],
                num_graphs,
                target,
                1,
                results['sod_set_median'],
                results['sod_gen_median'],
                results['k_dis_set_median'],
                results['k_dis_gen_median'],
                results['k_dis_dataset'],
                best_dis,
                best_index,
                sod_sm2gm,
                dis_k_sm2gm,
                dis_k_gi2sm,
                dis_k_gi2gm,
                results['edit_cost_constants'],
                results['runtime_precompute_gm'],
                results['runtime_optimize_ec'],
                results['runtime_generate_preimage'],
                results['runtime_total'],
                results['itrs'],
                results['converged'],
                results['num_updates_ecc'],
                results['mge']['num_decrease_order'] >
                0,  # @todo: not suitable for multi-start mge
                results['mge']['num_increase_order'] > 0,
                results['mge']['num_converged_descents'] > 0
            ])
            f_detail.close()

            # compute result summary.
            sod_sm_list.append(results['sod_set_median'])
            sod_gm_list.append(results['sod_gen_median'])
            dis_k_sm_list.append(results['k_dis_set_median'])
            dis_k_gm_list.append(results['k_dis_gen_median'])
            dis_k_gi_min_list.append(results['k_dis_dataset'])
            time_precompute_gm_list.append(results['runtime_precompute_gm'])
            time_optimize_ec_list.append(results['runtime_optimize_ec'])
            time_generate_list.append(results['runtime_generate_preimage'])
            time_total_list.append(results['runtime_total'])
            itrs_list.append(results['itrs'])
            converged_list.append(results['converged'])
            num_updates_ecc_list.append(results['num_updates_ecc'])
            mge_decrease_order_list.append(
                results['mge']['num_decrease_order'] > 0)
            mge_increase_order_list.append(
                results['mge']['num_increase_order'] > 0)
            mge_converged_order_list.append(
                results['mge']['num_converged_descents'] > 0)
            # # SOD SM -> GM
            if results['sod_set_median'] > results['sod_gen_median']:
                nb_sod_sm2gm[0] += 1
    #			repeats_better_sod_sm2gm.append(1)
            elif results['sod_set_median'] == results['sod_gen_median']:
                nb_sod_sm2gm[1] += 1
            elif results['sod_set_median'] < results['sod_gen_median']:
                nb_sod_sm2gm[2] += 1
            # # dis_k SM -> GM
            if results['k_dis_set_median'] > results['k_dis_gen_median']:
                nb_dis_k_sm2gm[0] += 1
    #			repeats_better_dis_k_sm2gm.append(1)
            elif results['k_dis_set_median'] == results['k_dis_gen_median']:
                nb_dis_k_sm2gm[1] += 1
            elif results['k_dis_set_median'] < results['k_dis_gen_median']:
                nb_dis_k_sm2gm[2] += 1
            # # dis_k gi -> SM
            if results['k_dis_dataset'] > results['k_dis_set_median']:
                nb_dis_k_gi2sm[0] += 1
    #			repeats_better_dis_k_gi2sm.append(1)
            elif results['k_dis_dataset'] == results['k_dis_set_median']:
                nb_dis_k_gi2sm[1] += 1
            elif results['k_dis_dataset'] < results['k_dis_set_median']:
                nb_dis_k_gi2sm[2] += 1
            # # dis_k gi -> GM
            if results['k_dis_dataset'] > results['k_dis_gen_median']:
                nb_dis_k_gi2gm[0] += 1
    #			repeats_better_dis_k_gi2gm.append(1)
            elif results['k_dis_dataset'] == results['k_dis_gen_median']:
                nb_dis_k_gi2gm[1] += 1
            elif results['k_dis_dataset'] < results['k_dis_gen_median']:
                nb_dis_k_gi2gm[2] += 1

            # write result summary for each class.
            f_summary = open(dir_save + fn_output_summary, 'a')
            csv.writer(f_summary).writerow([
                ds_name,
                kernel_options['name'],
                ged_options['edit_cost'],
                ged_options['method'],
                ged_options['attr_distance'],
                mpg_options['fit_method'],
                num_graphs,
                target,
                results['sod_set_median'],
                results['sod_gen_median'],
                results['k_dis_set_median'],
                results['k_dis_gen_median'],
                results['k_dis_dataset'],
                best_dis,
                best_index,
                sod_sm2gm,
                dis_k_sm2gm,
                dis_k_gi2sm,
                dis_k_gi2gm,
                results['runtime_precompute_gm'],
                results['runtime_optimize_ec'],
                results['runtime_generate_preimage'],
                results['runtime_total'],
                results['itrs'],
                results['converged'],
                results['num_updates_ecc'],
                results['mge']['num_decrease_order'] >
                0,  # @todo: not suitable for multi-start mge
                results['mge']['num_increase_order'] > 0,
                results['mge']['num_converged_descents'] > 0,
                nb_sod_sm2gm,
                nb_dis_k_sm2gm,
                nb_dis_k_gi2sm,
                nb_dis_k_gi2gm
            ])
            f_summary.close()

        # save median graphs.
        if save_medians:
            os.makedirs(dir_save + 'medians/', exist_ok=True)
            print('Saving median graphs to files...')
            fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options[
                'fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(
                    target) + '.repeat' + str(1)
            saveGXL(mpg.set_median,
                    fn_pre_sm + '.gxl',
                    method='default',
                    node_labels=dataset.node_labels,
                    edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs,
                    edge_attrs=dataset.edge_attrs)
            fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options[
                'fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(
                    target) + '.repeat' + str(1)
            saveGXL(mpg.gen_median,
                    fn_pre_gm + '.gxl',
                    method='default',
                    node_labels=dataset.node_labels,
                    edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs,
                    edge_attrs=dataset.edge_attrs)
            fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options[
                'fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(
                    target) + '.repeat' + str(1)
            saveGXL(best_graph,
                    fn_best_dataset + '.gxl',
                    method='default',
                    node_labels=dataset.node_labels,
                    edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs,
                    edge_attrs=dataset.edge_attrs)
            fn_best_median_set = dir_save + 'medians/g_best_median_set.' + mpg_options[
                'fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(
                    target) + '.repeat' + str(1)
            saveGXL(mpg.best_from_dataset,
                    fn_best_median_set + '.gxl',
                    method='default',
                    node_labels=dataset.node_labels,
                    edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs,
                    edge_attrs=dataset.edge_attrs)

        # plot median graphs.
        if plot_medians and save_medians:
            if ged_options['edit_cost'] == 'LETTER2' or ged_options[
                    'edit_cost'] == 'LETTER' or ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low':
                draw_Letter_graph(mpg.set_median, fn_pre_sm)
                draw_Letter_graph(mpg.gen_median, fn_pre_gm)
                draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)

    # write result summary for each class.
    if save_results:
        sod_sm_mean = np.mean(sod_sm_list)
        sod_gm_mean = np.mean(sod_gm_list)
        dis_k_sm_mean = np.mean(dis_k_sm_list)
        dis_k_gm_mean = np.mean(dis_k_gm_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
        best_dis_mean = np.mean(best_dis_list)
        time_precompute_gm_mean = np.mean(time_precompute_gm_list)
        time_optimize_ec_mean = np.mean(time_optimize_ec_list)
        time_generate_mean = np.mean(time_generate_list)
        time_total_mean = np.mean(time_total_list)
        itrs_mean = np.mean(itrs_list)
        num_converged = np.sum(converged_list)
        num_updates_ecc_mean = np.mean(num_updates_ecc_list)
        num_mge_decrease_order = np.sum(mge_decrease_order_list)
        num_mge_increase_order = np.sum(mge_increase_order_list)
        num_mge_converged = np.sum(mge_converged_order_list)
        sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean -
                                                 dis_k_sm_mean))
        dis_k_gi2sm_mean = get_relations(
            np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = get_relations(
            np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        f_summary = open(dir_save + fn_output_summary, 'a')
        csv.writer(f_summary).writerow([
            ds_name, kernel_options['name'], ged_options['edit_cost'],
            ged_options['method'], ged_options['attr_distance'],
            mpg_options['fit_method'], num_graphs, 'all', sod_sm_mean,
            sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean, dis_k_gi_min_mean,
            best_dis_mean, '-', sod_sm2gm_mean, dis_k_sm2gm_mean,
            dis_k_gi2sm_mean, dis_k_gi2gm_mean, time_precompute_gm_mean,
            time_optimize_ec_mean, time_generate_mean, time_total_mean,
            itrs_mean, num_converged, num_updates_ecc_mean,
            num_mge_decrease_order, num_mge_increase_order, num_mge_converged
        ])
        f_summary.close()

    # save total pairwise kernel distances.
    dis_k_max = np.max(dis_k_max_list)
    dis_k_min = np.min(dis_k_min_list)
    dis_k_mean = np.mean(dis_k_mean_list)
    print('The maximum pairwise distance in kernel space:', dis_k_max)
    print('The minimum pairwise distance in kernel space:', dis_k_min)
    print('The average pairwise distance in kernel space:', dis_k_mean)

    print('\ncomplete.\n')
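
The helper __get_best_graph is not shown in this snippet. A minimal sketch of what it plausibly computes, assuming "best" means the graph whose image is closest in kernel space to the mean of all images (the same distance that compute_k_dis evaluates in xp_simple_preimage below); the exact signature is an assumption:

import numpy as np

def get_best_graph_sketch(graphs, gram_matrix):
    # For each graph G_i, compute the kernel-space distance between \Phi(G_i)
    # and the mean of all images:
    # d_i^2 = k(G_i, G_i) - (2 / n) * sum_j k(G_i, G_j) + (1 / n^2) * sum_{j,l} k(G_j, G_l),
    # then return the index, distance and graph minimizing d_i.
    n = len(graphs)
    term3 = np.sum(gram_matrix) / (n ** 2)
    dis = [np.sqrt(max(gram_matrix[i, i] - 2 * np.sum(gram_matrix[i, :]) / n + term3, 0))
           for i in range(n)]
    best_index = int(np.argmin(dis))
    return best_index, dis[best_index], graphs[best_index]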
"""
@author: ljia

**This script demonstrates how to generate a graph preimage using Boria's method with cost matrices learning.**
"""

"""**1.   Get dataset.**"""

from gklearn.utils import Dataset, split_dataset_by_target

# Predefined dataset name, use dataset "MAO".
ds_name = 'MAO'
# The node/edge labels that will not be used in the computation.
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']}

# Initialize a Dataset.
dataset_all = Dataset()
# Load predefined dataset "MAO".
dataset_all.load_predefined_dataset(ds_name)
# Remove irrelevant labels.
dataset_all.remove_labels(**irrelevant_labels)
# Split the whole dataset according to the classification targets.
datasets = split_dataset_by_target(dataset_all)
# Get the first class of graphs, whose median preimage will be computed.
dataset = datasets[0]
len(dataset.graphs)

"""**2.  Set parameters.**"""

import multiprocessing

# Parameters for MedianPreimageGenerator (our method).
mpg_options = {
    'fit_method': 'k-graphs',  # how to fit edit costs. "k-graphs" means use all graphs in the median set when fitting.
    'init_ecc': [4, 4, 2, 1, 1, 1],  # initial edit costs.
    'ds_name': ds_name,  # name of the dataset.
    'parallel': True,  # whether the parallel scheme is to be used.
    'time_limit_in_sec': 0,  # maximum time limit to compute the preimage. If set to 0 then no limit.
    'max_itrs': 10,  # maximum iteration limit to optimize edit costs. If set to 0 then no limit.
    'max_itrs_without_update': 3,  # if the edit costs are not updated more times than this, the optimization stops.
    'epsilon_residual': 0.01,  # in optimization, the residual is only considered changed if the change is bigger than this number.
    'epsilon_ec': 0.1,  # in optimization, the edit costs are only considered changed if the changes are bigger than this number.
    'verbose': 2  # whether to print out results.
}
Example #11
def generate_random_preimages_by_class(ds_name,
                                       rpg_options,
                                       kernel_options,
                                       save_results=True,
                                       save_preimages=True,
                                       load_gm='auto',
                                       dir_save='',
                                       irrelevant_labels=None,
                                       edge_required=False,
                                       cut_range=None):
    import os
    import csv
    import numpy as np
    from gklearn.preimage import RandomPreimageGenerator
    from gklearn.utils import Dataset, split_dataset_by_target
    from gklearn.utils.graphfiles import saveGXL

    # 1. get dataset.
    print('1. getting dataset...')
    dataset_all = Dataset()
    dataset_all.load_predefined_dataset(ds_name)
    dataset_all.trim_dataset(edge_required=edge_required)
    if irrelevant_labels is not None:
        dataset_all.remove_labels(**irrelevant_labels)
    if cut_range is not None:
        dataset_all.cut_graphs(cut_range)
    datasets = split_dataset_by_target(dataset_all)

    if save_results:
        # create result files.
        print('creating output files...')
        fn_output_detail, fn_output_summary = _init_output_file_preimage(
            ds_name, kernel_options['name'], dir_save)

    dis_k_dataset_list = []
    dis_k_preimage_list = []
    time_precompute_gm_list = []
    time_generate_list = []
    time_total_list = []
    itrs_list = []
    num_updates_list = []
    if load_gm == 'auto':
        gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options[
            'name'] + '.gm.npz'
        gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
        if gmfile_exist:
            gmfile = np.load(gm_fname,
                             allow_pickle=True)  # @todo: may not be safe.
            gram_matrix_unnorm_list = [
                item for item in gmfile['gram_matrix_unnorm_list']
            ]
            time_precompute_gm_list = gmfile['run_time_list'].tolist()
        else:
            gram_matrix_unnorm_list = []
            time_precompute_gm_list = []
    elif not load_gm:
        gram_matrix_unnorm_list = []
        time_precompute_gm_list = []
    else:
        gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options[
            'name'] + '.gm.npz'
        gmfile = np.load(gm_fname,
                         allow_pickle=True)  # @todo: may not be safe.
        gram_matrix_unnorm_list = [
            item for item in gmfile['gram_matrix_unnorm_list']
        ]
        time_precompute_gm_list = gmfile['run_time_list'].tolist()

    print('start generating preimages for each class of target...')
    idx_offset = 0
    for idx, dataset in enumerate(datasets):
        target = dataset.targets[0]
        print('\ntarget =', target, '\n')
        #		if target != 1:
        # 			continue

        num_graphs = len(dataset.graphs)
        if num_graphs < 2:
            print('\nnumber of graphs = ', num_graphs, ', skip.\n')
            idx_offset += 1
            continue

        # 2. set parameters.
        print('2. initializing rpg and setting parameters...')
        if load_gm:
            if gmfile_exist:
                rpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[
                    idx - idx_offset]
                rpg_options['runtime_precompute_gm'] = time_precompute_gm_list[
                    idx - idx_offset]
        rpg = RandomPreimageGenerator()
        rpg.dataset = dataset
        rpg.set_options(**rpg_options.copy())
        rpg.kernel_options = kernel_options.copy()

        # 3. compute preimage.
        print('3. computing preimage...')
        rpg.run()
        results = rpg.get_results()

        # 4. save results (and median graphs).
        print('4. saving results (and preimages)...')
        # write result detail.
        if save_results:
            print('writing results to files...')

            f_detail = open(dir_save + fn_output_detail, 'a')
            csv.writer(f_detail).writerow([
                ds_name, kernel_options['name'], num_graphs, target, 1,
                results['k_dis_dataset'], results['k_dis_preimage'],
                results['runtime_precompute_gm'],
                results['runtime_generate_preimage'], results['runtime_total'],
                results['itrs'], results['num_updates']
            ])
            f_detail.close()

            # compute result summary.
            dis_k_dataset_list.append(results['k_dis_dataset'])
            dis_k_preimage_list.append(results['k_dis_preimage'])
            time_precompute_gm_list.append(results['runtime_precompute_gm'])
            time_generate_list.append(results['runtime_generate_preimage'])
            time_total_list.append(results['runtime_total'])
            itrs_list.append(results['itrs'])
            num_updates_list.append(results['num_updates'])

            # write result summary for each class.
            f_summary = open(dir_save + fn_output_summary, 'a')
            csv.writer(f_summary).writerow([
                ds_name, kernel_options['name'], num_graphs, target,
                results['k_dis_dataset'], results['k_dis_preimage'],
                results['runtime_precompute_gm'],
                results['runtime_generate_preimage'], results['runtime_total'],
                results['itrs'], results['num_updates']
            ])
            f_summary.close()

        # save preimages.
        if save_preimages:
            os.makedirs(dir_save + 'preimages/', exist_ok=True)
            print('Saving preimages to files...')
            fn_best_dataset = dir_save + 'preimages/g_best_dataset.' + 'nbg' + str(
                num_graphs) + '.y' + str(target) + '.repeat' + str(1)
            saveGXL(rpg.best_from_dataset,
                    fn_best_dataset + '.gxl',
                    method='default',
                    node_labels=dataset.node_labels,
                    edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs,
                    edge_attrs=dataset.edge_attrs)
            fn_preimage = dir_save + 'preimages/g_preimage.' + 'nbg' + str(
                num_graphs) + '.y' + str(target) + '.repeat' + str(1)
            saveGXL(rpg.preimage,
                    fn_preimage + '.gxl',
                    method='default',
                    node_labels=dataset.node_labels,
                    edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs,
                    edge_attrs=dataset.edge_attrs)

        if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
            gram_matrix_unnorm_list.append(rpg.gram_matrix_unnorm)

    # write result summary for each class.
    if save_results:
        dis_k_dataset_mean = np.mean(dis_k_dataset_list)
        dis_k_preimage_mean = np.mean(dis_k_preimage_list)
        time_precompute_gm_mean = np.mean(time_precompute_gm_list)
        time_generate_mean = np.mean(time_generate_list)
        time_total_mean = np.mean(time_total_list)
        itrs_mean = np.mean(itrs_list)
        num_updates_mean = np.mean(num_updates_list)
        f_summary = open(dir_save + fn_output_summary, 'a')
        csv.writer(f_summary).writerow([
            ds_name, kernel_options['name'], num_graphs, 'all',
            dis_k_dataset_mean, dis_k_preimage_mean, time_precompute_gm_mean,
            time_generate_mean, time_total_mean, itrs_mean, num_updates_mean
        ])
        f_summary.close()

    # write Gram matrices to file.
    if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
        np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' +
                 kernel_options['name'] + '.gm',
                 gram_matrix_unnorm_list=gram_matrix_unnorm_list,
                 run_time_list=time_precompute_gm_list)

    print('\ncomplete.\n')
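
The helper _init_output_file_preimage is not shown here. Judging from how its return values are used (file names joined with dir_save and opened in append mode), a plausible sketch with an assumed naming scheme:

import os

def init_output_file_preimage_sketch(ds_name, gkernel, dir_save):
    # Hypothetical stand-in: make sure two CSV result files exist and return
    # their names relative to dir_save; the real helper may also write header rows.
    os.makedirs(dir_save, exist_ok=True)
    fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
    fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
    for fn in (fn_output_detail, fn_output_summary):
        open(dir_save + fn, 'a').close()
    return fn_output_detail, fn_output_summary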
"""
https://colab.research.google.com/drive/17Q2QCl9CAtDweGF8LiWnWoN2laeJqT0u

**This script demonstrates how to compute a graph kernel.**
---

**0.   Install `graphkit-learn`.**
"""
"""**1.   Get dataset.**"""

from gklearn.utils import Dataset

# Predefined dataset name, use dataset "MUTAG".
ds_name = 'MUTAG'

# Initialize a Dataset.
dataset = Dataset()
# Load predefined dataset "MUTAG".
dataset.load_predefined_dataset(ds_name)
len(dataset.graphs)
"""**2.  Compute graph kernel.**"""

from gklearn.kernels import PathUpToH

# Initialize parameters for graph kernel computation.
kernel_options = {'depth': 3, 'k_func': 'MinMax', 'compute_method': 'trie'}

# Initialize graph kernel.
graph_kernel = PathUpToH(
    node_labels=dataset.node_labels,  # list of node label names.
    edge_labels=dataset.edge_labels,  # list of edge label names.
    ds_infos=dataset.get_dataset_infos(keys=['directed']),  # dataset information.
    **kernel_options)  # pass the parameters set above.

# Compute the Gram matrix.
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
def xp_simple_preimage():
    import numpy as np
    """**1.   Get dataset.**"""

    from gklearn.utils import Dataset, split_dataset_by_target

    # Predefined dataset name, use dataset "MAO".
    ds_name = 'MAO'
    # The node/edge labels that will not be used in the computation.
    irrelevant_labels = {
        'node_attrs': ['x', 'y', 'z'],
        'edge_labels': ['bond_stereo']
    }

    # Initialize a Dataset.
    dataset_all = Dataset()
    # Load predefined dataset "MAO".
    dataset_all.load_predefined_dataset(ds_name)
    # Remove irrelevant labels.
    dataset_all.remove_labels(**irrelevant_labels)
    # Split the whole dataset according to the classification targets.
    datasets = split_dataset_by_target(dataset_all)
    # Get the first class of graphs, whose median preimage will be computed.
    dataset = datasets[0]
    len(dataset.graphs)
    """**2.  Set parameters.**"""

    import multiprocessing

    # Parameters for MedianPreimageGenerator (our method).
    mpg_options = {
        'fit_method': 'k-graphs',  # how to fit edit costs. "k-graphs" means use all graphs in the median set when fitting.
        'init_ecc': [4, 4, 2, 1, 1, 1],  # initial edit costs.
        'ds_name': ds_name,  # name of the dataset.
        'parallel': True,  # whether the parallel scheme is to be used.
        'time_limit_in_sec': 0,  # maximum time limit to compute the preimage. If set to 0 then no limit.
        'max_itrs': 10,  # maximum iteration limit to optimize edit costs. If set to 0 then no limit.
        'max_itrs_without_update': 3,  # if the edit costs are not updated more times than this, the optimization stops.
        'epsilon_residual': 0.01,  # in optimization, the residual is only considered changed if the change is bigger than this number.
        'epsilon_ec': 0.1,  # in optimization, the edit costs are only considered changed if the changes are bigger than this number.
        'verbose': 2  # whether to print out results.
    }
    # Parameters for graph kernel computation.
    kernel_options = {
        'name': 'PathUpToH',  # use path kernel up to length h.
        'depth': 9,
        'k_func': 'MinMax',
        'compute_method': 'trie',
        'parallel': 'imap_unordered',  # or None.
        'n_jobs': multiprocessing.cpu_count(),
        'normalize': True,  # whether to use the normalized Gram matrix to optimize edit costs.
        'verbose': 2  # whether to print out results.
    }
    # Parameters for GED computation.
    ged_options = {
        'method': 'IPFP',  # use the IPFP heuristic.
        'initialization_method': 'RANDOM',  # or 'NODE', etc.
        'initial_solutions': 10,  # when bigger than 1, the method is considered mIPFP.
        'edit_cost': 'CONSTANT',  # use CONSTANT cost.
        'attr_distance': 'euclidean',  # the distance between non-symbolic node/edge labels is computed by euclidean distance.
        'ratio_runs_from_initial_solutions': 1,
        'threads': multiprocessing.cpu_count(),  # parallel threads. Does not work if mpg_options['parallel'] = False.
        'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
    }
    # Parameters for MedianGraphEstimator (Boria's method).
    mge_options = {
        'init_type': 'MEDOID',  # how to initialize the median (compute the set-median). "MEDOID" uses the graph with the smallest SOD.
        'random_inits': 10,  # number of random initializations when 'init_type' = 'RANDOM'.
        'time_limit': 600,  # maximum time limit to compute the generalized median. If set to 0 then no limit.
        'verbose': 2,  # whether to print out results.
        'refine': False  # whether to refine the final SODs or not.
    }
    print('done.')
    """**3.   Compute the Gram matrix and distance matrix.**"""

    from gklearn.utils.utils import get_graph_kernel_by_name

    # Get a graph kernel instance.
    graph_kernel = get_graph_kernel_by_name(
        kernel_options['name'],
        node_labels=dataset.node_labels,
        edge_labels=dataset.edge_labels,
        node_attrs=dataset.node_attrs,
        edge_attrs=dataset.edge_attrs,
        ds_infos=dataset.get_dataset_infos(keys=['directed']),
        kernel_options=kernel_options)
    # Compute Gram matrix.
    gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
                                                 **kernel_options)

    # Compute distance matrix.
    from gklearn.utils import compute_distance_matrix
    dis_mat, _, _, _ = compute_distance_matrix(gram_matrix)

    print('done.')
    """**4.   Find the candidate graph.**"""

    from gklearn.preimage.utils import compute_k_dis

    # Number of the nearest neighbors.
    k_neighbors = 10

    # For each graph G in dataset, compute the distance between its image \Phi(G) and the mean of its neighbors' images.
    dis_min = np.inf  # the minimum distance between a possible \Phi(G) and the mean of its neighbors.
    for idx, G in enumerate(dataset.graphs):
        # Find the k nearest neighbors of G.
        dis_list = dis_mat[idx]  # distances between \Phi(G) and the image of each graph.
        idx_sort = np.argsort(dis_list)  # sort distances and get the sorted indices.
        idx_nearest = idx_sort[1:k_neighbors + 1]  # indices of the k nearest neighbors.
        dis_k_nearest = [dis_list[i] for i in idx_nearest]  # the k nearest distances, excluding the zero distance of G to itself.
        G_k_nearest = [dataset.graphs[i] for i in idx_nearest]  # the k nearest neighbors.

        # Compute the distance between \Phi(G) and the mean of its neighbors' images.
        dis_tmp = compute_k_dis(
            idx,  # the index of G in the Gram matrix.
            idx_nearest,  # the indices of the neighbors.
            [1 / k_neighbors] * k_neighbors,  # coefficients for the neighbors.
            gram_matrix,
            withterm3=False)
        # Check if the new distance is smaller.
        if dis_tmp < dis_min:
            dis_min = dis_tmp
            G_cand = G
            G_neighbors = G_k_nearest

    print('The minimum distance is', dis_min)
    """**5.   Run median preimage generator.**"""

    from gklearn.preimage import MedianPreimageGenerator

    # Set the dataset as the k-nearest neighbors.
    dataset.load_graphs(G_neighbors)

    # Create median preimage generator instance.
    mpg = MedianPreimageGenerator()
    # Add dataset.
    mpg.dataset = dataset
    # Set parameters.
    mpg.set_options(**mpg_options.copy())
    mpg.kernel_options = kernel_options.copy()
    mpg.ged_options = ged_options.copy()
    mpg.mge_options = mge_options.copy()
    # Run.
    mpg.run()
    """**4. Get results.**"""

    # Get results.
    import pprint
    pp = pprint.PrettyPrinter(indent=4)  # pretty print
    results = mpg.get_results()
    pp.pprint(results)

    draw_graph(mpg.set_median)
    draw_graph(mpg.gen_median)
    draw_graph(G_cand)
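
draw_graph is not imported in this snippet. A minimal stand-in, assuming it simply renders a networkx graph with matplotlib (the original helper may do more, e.g. draw label text):

import matplotlib.pyplot as plt
import networkx as nx

def draw_graph(graph):
    # Hypothetical replacement for the undefined draw_graph helper:
    # draw the graph with a spring layout and display it.
    nx.draw(graph, pos=nx.spring_layout(graph), with_labels=True)
    plt.show()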
Example #14
def chooseDataset(ds_name):
    """Choose dataset according to name.
	"""
    from gklearn.utils import Dataset

    dataset = Dataset()

    # no node labels (and no edge labels).
    if ds_name == 'Alkane':
        dataset.load_predefined_dataset(ds_name)
        dataset.trim_dataset(edge_required=False)
        irrelevant_labels = {
            'node_attrs': ['x', 'y', 'z'],
            'edge_labels': ['bond_stereo']
        }
        dataset.remove_labels(**irrelevant_labels)
    # node symbolic labels.
    elif ds_name == 'Acyclic':
        dataset.load_predefined_dataset(ds_name)
        dataset.trim_dataset(edge_required=False)
        irrelevant_labels = {
            'node_attrs': ['x', 'y', 'z'],
            'edge_labels': ['bond_stereo']
        }
        dataset.remove_labels(**irrelevant_labels)
    # node non-symbolic labels.
    elif ds_name == 'Letter-med':
        dataset.load_predefined_dataset(ds_name)
        dataset.trim_dataset(edge_required=False)
    # node symbolic and non-symbolic labels (and edge symbolic labels).
    elif ds_name == 'AIDS':
        dataset.load_predefined_dataset(ds_name)
        dataset.trim_dataset(edge_required=False)
    # edge non-symbolic labels (no node labels).
    elif ds_name == 'Fingerprint_edge':
        dataset.load_predefined_dataset('Fingerprint')
        dataset.trim_dataset(edge_required=True)
        irrelevant_labels = {'edge_attrs': ['orient', 'angle']}
        dataset.remove_labels(**irrelevant_labels)
    # edge non-symbolic labels (and node non-symbolic labels).
    elif ds_name == 'Fingerprint':
        dataset.load_predefined_dataset(ds_name)
        dataset.trim_dataset(edge_required=True)
    # edge symbolic and non-symbolic labels (and node symbolic and non-symbolic labels).
    elif ds_name == 'Cuneiform':
        dataset.load_predefined_dataset(ds_name)
        dataset.trim_dataset(edge_required=True)

    dataset.cut_graphs(range(0, 3))

    return dataset
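
For example, any of the branch names above can be passed in; with 'Acyclic', the returned dataset holds node-symbolically-labeled graphs cut down to the first three:

dataset = chooseDataset('Acyclic')
print(len(dataset.graphs))  # 3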