Example #1
0
def split_dataset_by_target(dataset):
    """Split *dataset* into one sub-dataset per distinct target value.

    Deprecated: this functionality has moved to the ``gklearn.dataset``
    module (see the emitted DeprecationWarning).

    Parameters
    ----------
    dataset : Dataset
        Source dataset; its ``graphs`` are grouped by ``targets``.

    Returns
    -------
    list of Dataset
        One sub-dataset per distinct target value, each carrying copies of
        the source's node/edge label and attribute name lists.
    """
    import warnings
    warnings.simplefilter('always', DeprecationWarning)
    warnings.warn(
        'This function has been moved to "gklearn.dataset" module. The function "gklearn.utils.dataset.split_dataset_by_target" has not been maintained since Nov 12th, 2020 (version 0.2.1) and will be removed since version 0.4.0.',
        DeprecationWarning)

    from gklearn.preimage.utils import get_same_item_indices

    def _copied(names):
        # Copy a label/attribute name list so sub-datasets share no state;
        # pass None through unchanged.
        return names.copy() if names is not None else None

    all_graphs = dataset.graphs
    sub_datasets = []
    for target, indices in get_same_item_indices(dataset.targets).items():
        subset = Dataset()
        subset.load_graphs([all_graphs[idx] for idx in indices],
                           [target] * len(indices))
        subset.set_labels(node_labels=_copied(dataset.node_labels),
                          node_attrs=_copied(dataset.node_attrs),
                          edge_labels=_copied(dataset.edge_labels),
                          edge_attrs=_copied(dataset.edge_attrs))
        sub_datasets.append(subset)
        # @todo: clean_labels?
    return sub_datasets
Example #2
0
def __get_shuffles(y_all, n_splits, test_size):
    """Build per-class (stratified) shuffle splits over indices of ``y_all``.

    For each distinct target value, the indices of its members are split
    ``n_splits`` times with a fixed-seed ShuffleSplit; the per-class train
    and test index lists are concatenated split-by-split so that every
    split is stratified by class.

    Parameters
    ----------
    y_all : sequence
        Target value per sample; grouped via ``get_same_item_indices``.
    n_splits : int
        Number of shuffle splits.
    test_size : float or int
        Test fraction/size forwarded to ShuffleSplit.

    Returns
    -------
    tuple
        ``(train_indices, test_indices, train_nums, keys)`` where the first
        two are lists (one entry per split) of global sample indices,
        ``train_nums`` is the per-class training-set size, and ``keys`` are
        the class labels in grouping order.
    """
    rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=0)
    train_indices = [[] for _ in range(n_splits)]
    test_indices = [[] for _ in range(n_splits)]
    idx_targets = get_same_item_indices(y_all)
    train_nums = []
    keys = []
    for key, item in idx_targets.items():
        # Track the train size inside the loop instead of reading the loop
        # variable after it ends (which would raise NameError on an empty
        # iterator); all splits of one class have the same train size.
        num_train = 0
        # @todo: careful when parallel.
        for i, (train_i, test_i) in enumerate(rs.split(item)):
            train_indices[i] += [item[idx] for idx in train_i]
            test_indices[i] += [item[idx] for idx in test_i]
            num_train = len(train_i)
        train_nums.append(num_train)
        keys.append(key)
    return train_indices, test_indices, train_nums, keys
Example #3
0
def split_dataset_by_target(dataset):
	"""Split *dataset* into one sub-dataset per distinct target value.

	Groups graph indices by target via ``get_same_item_indices``, builds a
	new ``Dataset`` per group with the matching graphs and a constant target
	list, and copies the source's node/edge label and attribute name lists
	onto each sub-dataset (so they share no list objects with the source).

	Parameters
	----------
	dataset : Dataset
		Source dataset providing ``graphs``, ``targets`` and the four
		label/attribute name lists.

	Returns
	-------
	list of Dataset
		One sub-dataset per distinct target value, in grouping order.
	"""
	from gklearn.preimage.utils import get_same_item_indices

	graphs = dataset.graphs
	targets = dataset.targets
	datasets = []
	idx_targets = get_same_item_indices(targets)
	for key, val in idx_targets.items():
		sub_graphs = [graphs[i] for i in val]
		# NOTE(review): Dataset is expected to be imported elsewhere in this
		# module; it is not defined in this function.
		sub_dataset = Dataset()
		sub_dataset.load_graphs(sub_graphs, [key] * len(val))
		# Copy (don't alias) the label/attr name lists; None passes through.
		node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None
		node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None
		edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None
		edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None
		sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
		datasets.append(sub_dataset)
		# @todo: clean_labels?
	return datasets
def visualize_distances_in_ged_letter_h():
    """Visualize per-class GED distances on the Letter-HIGH dataset.

    For each target class: copies that class's graphs, appends the
    precomputed set median and generalized median loaded from GXL files,
    computes the pairwise GED matrix (gedlibpy/IPFP with constant edit
    costs), saves it via ``np.savez``, and draws a t-SNE embedding with
    ``visualize_graph_dataset``.

    Relies on module-level names imported elsewhere in this file:
    ``os``, ``np``, ``loadDataset``, ``loadGXL``, ``get_same_item_indices``,
    ``visualize_graph_dataset`` and ``draw_figure``.
    """
    from fitDistance import compute_geds
    from preimage.test_k_closest_graphs import reform_attributes

    ds = {
        'dataset':
        'cpp_ext/data/collections/Letter.xml',
        'graph_dir':
        os.path.dirname(os.path.realpath(__file__)) +
        '/cpp_ext/data/datasets/Letter/HIGH/'
    }  # node/edge symb
    Gn_original, y_all = loadDataset(ds['dataset'],
                                     extra_params=ds['graph_dir'])
    #    Gn = Gn[0:50]

    # compute distance matrix
    #    median_set = [22, 29, 54, 74]
    gkernel = 'structuralspkernel'
    fit_method = 'expert'
    # NOTE(review): ds_name is assigned but never used in this function.
    ds_name = 'letter-h'
    fname_medians = fit_method + '.' + gkernel
    dir_output = 'results/xp_letter_h/'
    k = 150
    repeat = 0
    #    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
    edit_costs = [3, 3, 1, 3, 3, 1]
    #    edit_costs = [7, 3, 5, 9, 2, 6]

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)
    for i, (y, values) in enumerate(y_idx.items()):
        print('\ny =', y)

        Gn = [Gn_original[g].copy() for g in values]
        # add set median.
        # NOTE(review): the path concatenation assumes the class label y is
        # a string (Letter targets) — confirm against loadDataset's output.
        fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        set_median = loadGXL(fname_sm)
        Gn.append(set_median)
        # add generalized median (estimated pre-image.)
        fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        gen_median = loadGXL(fname_gm)
        Gn.append(gen_median)

        # compute/load ged matrix.
        # compute.
        algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
        params_ged = {
            'dataset': 'Letter',
            'lib': 'gedlibpy',
            'cost': 'CONSTANT',
            'method': 'IPFP',
            'algo_options': algo_options,
            'stabilizer': None,
            'edit_cost_constant': edit_costs
        }
        # Convert graph attributes into the format expected by the GED lib.
        for g in Gn:
            reform_attributes(g)
        _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
        np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y +
                 '.with_medians.gm',
                 ged_mat=ged_mat)
        #        # load from file.
        #        gmfile = np.load('dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm.npz')
        #        ged_mat = gmfile['ged_mat']
        #        # change medians.
        #        algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
        #        params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
        #                    'algo_options': algo_options, 'stabilizer': None,
        #                    'edit_cost_constant': edit_costs}
        #        for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
        #            dis, _, _ = GED(Gn[idx], set_median, **params_ged)
        #            ged_mat[idx, -2] = dis
        #            ged_mat[-2, idx] = dis
        #            dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
        #            ged_mat[idx, -1] = dis
        #            ged_mat[-1, idx] = dis
        #        np.savez(dir_output + 'ged_mat.' + fname_medians + '.y' + y + '.with_medians.gm',
        #                 ged_mat=ged_mat)

        # visualization.
        # The first len(values) graphs of Gn are the class members; the two
        # appended medians are highlighted separately by the visualizer.
        median_set = range(0, len(values))
        visualize_graph_dataset('ged',
                                'tsne',
                                draw_figure,
                                draw_params={'y_idx': y_idx},
                                dis_mat=ged_mat,
                                median_set=median_set)
def visualize_distances_in_kernel_letter_h():
    """Visualize per-class graph-kernel distances on the Letter-HIGH dataset.

    For each target class: copies that class's graphs, appends the
    precomputed set median and generalized median loaded from GXL files,
    computes a structural shortest-path kernel matrix and the induced
    kernel distance matrix, augments it with distances to the mean of the
    class members in feature space (via ``dis_gstar``), and draws a t-SNE
    embedding with ``visualize_graph_dataset``.

    Relies on module-level names imported elsewhere in this file:
    ``os``, ``np``, ``loadDataset``, ``loadGXL``, ``get_same_item_indices``,
    ``compute_kernel``, ``kernel_distance_matrix``, ``dis_gstar``,
    ``visualize_graph_dataset`` and ``draw_figure``.
    """

    ds = {
        'dataset':
        'cpp_ext/data/collections/Letter.xml',
        'graph_dir':
        os.path.dirname(os.path.realpath(__file__)) +
        '/cpp_ext/data/datasets/Letter/HIGH/'
    }  # node/edge symb
    Gn_original, y_all = loadDataset(ds['dataset'],
                                     extra_params=ds['graph_dir'])
    #    Gn = Gn[0:50]

    # compute distance matrix
    #    median_set = [22, 29, 54, 74]
    gkernel = 'structuralspkernel'
    fit_method = 'expert'
    node_label = None
    edge_label = None
    # NOTE(review): ds_name, k and repeat are assigned but k/repeat are only
    # used in the median file names below; ds_name is never used.
    ds_name = 'letter-h'
    fname_medians = fit_method + '.' + gkernel
    dir_output = 'results/xp_letter_h/'
    k = 150
    repeat = 0

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)
    for i, (y, values) in enumerate(y_idx.items()):
        print('\ny =', y)

        Gn = [Gn_original[g].copy() for g in values]
        # add set median.
        # NOTE(review): assumes the class label y is a string — confirm.
        fname_sm = dir_output + 'medians/' + y + '/set_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        set_median = loadGXL(fname_sm)
        Gn.append(set_median)
        # add generalized median (estimated pre-image.)
        fname_gm = dir_output + 'medians/' + y + '/gen_median.k' + str(int(k)) \
            + '.y' + y + '.repeat' + str(repeat) + '.gxl'
        gen_median = loadGXL(fname_gm)
        Gn.append(gen_median)

        # compute distance matrix
        median_set = range(0, len(values))

        # Kernel over Gn plus a second copy of the class members, so the
        # median-image distances can be read off the extra rows/columns.
        Gn_median_set = [Gn[i].copy() for i in median_set]
        Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel,
                                        node_label, edge_label, False)
        Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
        dis_mat, _, _, _ = kernel_distance_matrix(Gn,
                                                  node_label,
                                                  edge_label,
                                                  Kmatrix=Kmatrix,
                                                  gkernel=gkernel)
        # Statistics over the class members only (the last two rows/columns
        # are the appended medians).
        print('average distances: ',
              np.mean(np.mean(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
        print('min distances: ',
              np.min(np.min(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
        print('max distances: ',
              np.max(np.max(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))

        # add distances for the image of exact median \psi.
        dis_k_median_list = []
        for idx, g in enumerate(Gn):
            dis_k_median_list.append(
                dis_gstar(idx,
                          range(len(Gn),
                                len(Gn) + len(Gn_median_set)),
                          [1 / len(Gn_median_set)] * len(Gn_median_set),
                          Kmatrix_median,
                          withterm3=False))
        # Extend the distance matrix by one row/column for the median image.
        # NOTE(review): the inner loop variables i/j shadow the outer class
        # loop's i; harmless here since i is not used after this point.
        dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
        for i in range(len(Gn)):
            for j in range(i, len(Gn)):
                dis_mat_median[i, j] = dis_mat[i, j]
                dis_mat_median[j, i] = dis_mat_median[i, j]
        for i in range(len(Gn)):
            dis_mat_median[i, -1] = dis_k_median_list[i]
            dis_mat_median[-1, i] = dis_k_median_list[i]

        # visualization.


#    visualize_graph_dataset('graph-kernel', 'tsne', Gn)
#    visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
#                            draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
        visualize_graph_dataset('graph-kernel',
                                'tsne',
                                draw_figure,
                                draw_params={'y_idx': y_idx},
                                dis_mat=dis_mat_median,
                                median_set=median_set)
def visualize_distances_in_ged():
    """Visualize GED distances on the monoterpenoides dataset.

    Loads the dataset, appends the precomputed set median and generalized
    median from GXL files, loads a previously saved GED matrix from an
    ``.npz`` file (the code that recomputes it is kept commented out), and
    draws a t-SNE embedding with ``visualize_graph_dataset``, highlighting
    a hard-coded median set.

    Relies on module-level names imported elsewhere in this file:
    ``np``, ``loadDataset``, ``loadGXL``, ``get_same_item_indices``,
    ``visualize_graph_dataset`` and ``draw_figure``.
    """
    # NOTE(review): these two imports are unused while the recompute branch
    # below stays commented out.
    from gklearn.preimage.fitDistance import compute_geds
    from gklearn.preimage.ged import GED
    ds = {
        'name': 'monoterpenoides',
        'dataset': '../datasets/monoterpenoides/dataset_10+.ds'
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    #    Gn = Gn[0:50]
    # add set median.
    fname_medians = 'expert.treelet'
    fname_sm = 'preimage/results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl'
    set_median = loadGXL(fname_sm)
    Gn.append(set_median)
    # add generalized median (estimated pre-image.)
    fname_gm = 'preimage/results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl'
    gen_median = loadGXL(fname_gm)
    Gn.append(gen_median)

    # compute/load ged matrix.
    #    # compute.
    ##    k = 4
    ##    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905, 0.20723547009415202, 0.3338607220394598, 0.27054392518077297]
    #    edit_costs = [3, 3, 1, 3, 3, 1]
    ##    edit_costs = [7, 3, 5, 9, 2, 6]
    #    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    #    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
    #                'algo_options': algo_options, 'stabilizer': None,
    #                'edit_cost_constant': edit_costs}
    #    _, ged_mat, _ = compute_geds(Gn, params_ged=params_ged, parallel=True)
    #    np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm', ged_mat=ged_mat)
    # load from file.
    # NOTE(review): note the differing path prefixes — medians are loaded
    # from 'preimage/results/...' but the GED matrix from 'results/...'.
    gmfile = np.load('results/test_k_closest_graphs/ged_mat.' + fname_medians +
                     '.with_medians.gm.npz')
    ged_mat = gmfile['ged_mat']
    #    # change medians.
    #    edit_costs = [3, 3, 1, 3, 3, 1]
    #    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    #    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
    #                'algo_options': algo_options, 'stabilizer': None,
    #                'edit_cost_constant': edit_costs}
    #    for idx in tqdm(range(len(Gn) - 2), desc='computing GEDs', file=sys.stdout):
    #        dis, _, _ = GED(Gn[idx], set_median, **params_ged)
    #        ged_mat[idx, -2] = dis
    #        ged_mat[-2, idx] = dis
    #        dis, _, _ = GED(Gn[idx], gen_median, **params_ged)
    #        ged_mat[idx, -1] = dis
    #        ged_mat[-1, idx] = dis
    #    np.savez('results/test_k_closest_graphs/ged_mat.' + fname_medians + '.with_medians.gm',
    #             ged_mat=ged_mat)

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)

    # visualization.
    # Hard-coded indices of the graphs used as the median set.
    median_set = [22, 29, 54, 74]
    visualize_graph_dataset('ged',
                            'tsne',
                            draw_figure,
                            draw_params={'y_idx': y_idx},
                            dis_mat=ged_mat,
                            median_set=median_set)
def visualize_distances_in_kernel():
    """Visualize treelet-kernel distances on the monoterpenoides dataset.

    Loads the dataset, appends the precomputed set median and generalized
    median from GXL files, computes a treelet kernel matrix and the induced
    kernel distance matrix, augments it with distances to the mean of a
    hard-coded median set in feature space (via ``dis_gstar``), and draws
    a t-SNE embedding with ``visualize_graph_dataset``.

    Relies on module-level names imported elsewhere in this file:
    ``np``, ``loadDataset``, ``loadGXL``, ``get_same_item_indices``,
    ``compute_kernel``, ``kernel_distance_matrix``, ``dis_gstar``,
    ``visualize_graph_dataset`` and ``draw_figure``.
    """

    ds = {
        'name': 'monoterpenoides',
        'dataset': '../datasets/monoterpenoides/dataset_10+.ds'
    }  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    #    Gn = Gn[0:50]
    fname_medians = 'expert.treelet'
    # add set median.
    fname_sm = 'results/test_k_closest_graphs/set_median.' + fname_medians + '.gxl'
    set_median = loadGXL(fname_sm)
    Gn.append(set_median)
    # add generalized median (estimated pre-image.)
    fname_gm = 'results/test_k_closest_graphs/gen_median.' + fname_medians + '.gxl'
    gen_median = loadGXL(fname_gm)
    Gn.append(gen_median)

    # compute distance matrix
    # Hard-coded indices of the graphs used as the median set.
    median_set = [22, 29, 54, 74]
    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    # Kernel over Gn plus copies of the median-set graphs, so the
    # median-image distances can be read off the extra rows/columns.
    Gn_median_set = [Gn[i].copy() for i in median_set]
    Kmatrix_median = compute_kernel(Gn + Gn_median_set, gkernel, node_label,
                                    edge_label, True)
    Kmatrix = Kmatrix_median[0:len(Gn), 0:len(Gn)]
    dis_mat, _, _, _ = kernel_distance_matrix(Gn,
                                              node_label,
                                              edge_label,
                                              Kmatrix=Kmatrix,
                                              gkernel=gkernel)
    # Statistics over the original graphs only (the last two rows/columns
    # are the appended medians).
    print('average distances: ',
          np.mean(np.mean(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
    print('min distances: ',
          np.min(np.min(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))
    print('max distances: ',
          np.max(np.max(dis_mat[0:len(Gn) - 2, 0:len(Gn) - 2])))

    # add distances for the image of exact median \psi.
    dis_k_median_list = []
    for idx, g in enumerate(Gn):
        dis_k_median_list.append(
            dis_gstar(idx,
                      range(len(Gn),
                            len(Gn) + len(Gn_median_set)),
                      [1 / len(Gn_median_set)] * len(Gn_median_set),
                      Kmatrix_median,
                      withterm3=False))
    # Extend the distance matrix by one row/column for the median image.
    dis_mat_median = np.zeros((len(Gn) + 1, len(Gn) + 1))
    for i in range(len(Gn)):
        for j in range(i, len(Gn)):
            dis_mat_median[i, j] = dis_mat[i, j]
            dis_mat_median[j, i] = dis_mat_median[i, j]
    for i in range(len(Gn)):
        dis_mat_median[i, -1] = dis_k_median_list[i]
        dis_mat_median[-1, i] = dis_k_median_list[i]

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)

    # visualization.
    #    visualize_graph_dataset('graph-kernel', 'tsne', Gn)
    #    visualize_graph_dataset('graph-kernel', 'tsne', draw_figure,
    #                            draw_params={'y_idx': y_idx}, dis_mat=dis_mat_median)
    visualize_graph_dataset('graph-kernel',
                            'tsne',
                            draw_figure,
                            draw_params={'y_idx': y_idx},
                            dis_mat=dis_mat_median,
                            median_set=median_set)
def test_median_graph_estimator_symb():
	"""Run MedianGraphEstimator on the first class of MUTAG (symbolic labels).

	Loads MUTAG, takes the first 10 graphs of the first target class, sets
	up a gedlibpy GED environment with constant edit costs, and runs the
	median graph estimator (IPFP, MEDOID init) to obtain the set median and
	generalized median graphs.

	Returns
	-------
	tuple
		``(set_median, gen_median)`` as networkx graphs extracted from the
		GED environment.
	"""
	from gklearn.utils import load_dataset
	from gklearn.ged.median import MedianGraphEstimator, constant_node_costs
	from gklearn.gedlib import librariesImport, gedlibpy
	from gklearn.preimage.utils import get_same_item_indices
	import multiprocessing

	# estimator parameters.
	init_type = 'MEDOID'
	num_inits = 1
	threads = multiprocessing.cpu_count()
	time_limit = 60000
	
	# algorithm parameters.
	algo = 'IPFP'
	initial_solutions = 1
	algo_options_suffix = ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1 --initialization-method NODE '

	edit_cost_name = 'CONSTANT'
	edit_cost_constants = [4, 4, 2, 1, 1, 1]
	# NOTE(review): ds_name is assigned but never used in this function.
	ds_name = 'MUTAG'
	
	# Load dataset.
	dataset = '../../../datasets/MUTAG/MUTAG_A.txt'
	Gn, y_all, label_names = load_dataset(dataset)
	y_idx = get_same_item_indices(y_all)
	# Keep only the first class's graphs (loop breaks after one iteration).
	for i, (y, values) in enumerate(y_idx.items()):
		Gn_i = [Gn[val] for val in values]
		break
	Gn_i = Gn_i[0:10]
	
	# Set up the environment.
	ged_env = gedlibpy.GEDEnv()
	# gedlibpy.restart_env()
	ged_env.set_edit_cost(edit_cost_name, edit_cost_constant=edit_cost_constants)
	for G in Gn_i:
		ged_env.add_nx_graph(G, '')
	graph_ids = ged_env.get_all_graph_ids()
	# Placeholder graphs the estimator fills with the computed medians.
	set_median_id = ged_env.add_graph('set_median')
	gen_median_id = ged_env.add_graph('gen_median')
	ged_env.init(init_option='EAGER_WITHOUT_SHUFFLED_COPIES')
	
	# Set up the estimator.
	mge = MedianGraphEstimator(ged_env, constant_node_costs(edit_cost_name))
	mge.set_refine_method(algo, '--threads ' + str(threads) + ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1')
	
	mge_options = '--time-limit ' + str(time_limit) + ' --stdout 2 --init-type ' + init_type
	mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1'  + ' --update-order TRUE --refine FALSE --randomness PSEUDO --parallel TRUE '# @todo: std::to_string(rng())
	
	# Select the GED algorithm.
	algo_options = '--threads ' + str(threads) + algo_options_suffix
	mge.set_options(mge_options)
	mge.set_label_names(node_labels=label_names['node_labels'],
					  edge_labels=label_names['edge_labels'], 
					  node_attrs=label_names['node_attrs'], 
					  edge_attrs=label_names['edge_attrs'])
	mge.set_init_method(algo, algo_options)
	mge.set_descent_method(algo, algo_options)
	
	# Run the estimator.
	mge.run(graph_ids, set_median_id, gen_median_id)
	
	# Get SODs.
	# Sum of distances for the set median ('initialized') and the
	# generalized median ('converged').
	sod_sm = mge.get_sum_of_distances('initialized')
	sod_gm = mge.get_sum_of_distances('converged')
	print('sod_sm, sod_gm: ', sod_sm, sod_gm)
	
	# Get median graphs.
	set_median = ged_env.get_nx_graph(set_median_id)
	gen_median = ged_env.get_nx_graph(gen_median_id)
	
	return set_median, gen_median