Example #1
def plot_ged_time_helper(dataset, models, metric, rs):
    font = {'family': 'serif',
            'size': 22}
    matplotlib.rc('font', **font)

    plt.figure(figsize=(16, 10))

    xs = get_test_graph_sizes(dataset)
    so = np.argsort(xs)
    xs.sort()
    for model in models:
        mat = rs[model].mat(metric.name, norm=True)
        print('plotting for {}'.format(model))
        ys = np.mean(mat, 1)[so]
        plt.plot(xs, ys, **get_plotting_arg(args1, model))
        plt.scatter(xs, ys, s=200, label=model, **get_plotting_arg(args2, model))
    plt.xlabel('query graph size')
    ax = plt.gca()
    ax.set_xticks(xs)
    plt.ylabel('average {}'.format(metric.ylabel))
    plt.legend(loc='best', ncol=2)
    plt.grid(linestyle='dashed')
    plt.tight_layout()
    # plt.show()
    sp = get_result_path() + '/{}/{}/ged_{}_mat_{}_{}.png'.format(
        dataset, metric, metric, dataset, '_'.join(models))
    plt.savefig(sp)
    print('Saved to {}'.format(sp))
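The argsort-then-reindex idiom above (so = np.argsort(xs); ys = np.mean(mat, 1)[so]) is what keeps each curve paired with the sorted x values. A minimal standalone sketch, independent of the project code:

import numpy as np

xs = np.array([30, 10, 20])
ys = np.array([3.0, 1.0, 2.0])  # ys[k] belongs to xs[k]
so = np.argsort(xs)
print(xs[so])  # [10 20 30]
print(ys[so])  # [1. 2. 3.] -- still paired with the sorted xs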
Example #2
def post_real_dataset_run_convert_csv_to_np():
    """ Use in case only csv is generated,
        and numpy matrices need to be saved. """
    dataset = 'imdbmulti'
    model = 'CDKMCS'
    ds_metric = 'mcs'
    row_graphs = load_data(dataset, False).graphs
    col_graphs = load_data(dataset, True).graphs
    num_cpu = 40
    computer_name = 'scai1_all'
    ts = '2018-10-09T13:41:13.942414'
    outdir = '{}/{}'.format(get_result_path(), dataset)
    csv_fn = '{}/csv/{}_{}_{}_{}_{}_{}cpus.csv'.format(
        outdir, ds_metric, dataset, model, ts, computer_name, num_cpu)
    data = load_from_exsiting_csv(csv_fn, ds_metric)
    m = len(row_graphs)
    n = len(col_graphs)
    # -3 marks entries that the source csv did not include.
    ds_mat = np.full((m, n), -3)
    time_mat = np.full((m, n), -3)
    cnt = 0
    print('m: {}, n: {}, m*n: {}'.format(m, n, m * n))
    for (i, j), row_data in data.items():
        if cnt % 1000 == 0:
            print(cnt)
        ds_mat[i][j] = row_data[4]
        time_mat[i][j] = row_data[6] if ds_metric == 'ged' else row_data[7]
        cnt += 1
    print(cnt)
    assert (cnt == m * n)
    save_as_np(outdir, ds_metric, ds_mat, time_mat, ts,
               dataset, row_graphs, col_graphs, model, computer_name, num_cpu)
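A hedged sketch of consuming the matrices saved above: since -3 marks pairs missing from the source csv, mask it out before aggregating. The .npy filename is produced by save_as_np and is project-specific, so the path below is hypothetical.

import numpy as np

ds_mat = np.load('result/imdbmulti/mcs/example_ds_mat.npy')  # hypothetical path
valid = ds_mat != -3  # pairs the csv actually contained
print('coverage: {:.1%}'.format(valid.mean()))
print('mean over computed pairs: {}'.format(ds_mat[valid].mean()))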
Example #3
def run():
    events_tracker = TerminalEventsTracker(log_pth="../logs.txt",
                                           report_every_responses_nb=1000)

    settings_path = "./settings.json"
    proxies_save_pth, creds_save_pth = utils.get_proxy_and_creds_paths(
        settings_path)
    checkp_data, checkp_requester = utils.get_data_requester_checkpoint_paths(
        settings_path)
    result_file = utils.get_result_path(settings_path)
    backups_path = utils.get_backups_path(settings_path)

    proxy_storage = ProxyStorage(proxies_save_pth)
    creds_storage = CredsStorage(creds_save_pth)

    runner = VkCrawlRunnerWithCheckpoints(
        start_user_id=142478661,
        data_resume_checkpoint_save_pth=checkp_data,
        tracker=events_tracker,
        proxy_storage=proxy_storage,
        creds_storage=creds_storage,
        requester_checkpoints_path=checkp_requester,
        requester_max_requests_per_loop=4000,
        long_term_save_pth=result_file,
        data_backup_path=str(backups_path / "parsed_backup.jsonl"),
        loops_per_checkpoint=3,
        use_async=True,
        nb_sessions=8,
        dmp_long_term_steps=2000)
    runner.run()
Example #4
def _load_result_mat(self, dataset, metric):
    file_p = get_result_path() + '/{}/{}/{}_{}_mat_{}_{}_*.npy'.format(
        dataset, metric, self.dist_metric(), metric, dataset, self.model_)
    li = glob(file_p)
    if not li:
        raise RuntimeError('No results found {}'.format(file_p))
    file = self._choose_result_file(li)
    return np.load(file)
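The trailing * in the pattern absorbs the timestamp/host/cpu suffix that result files carry elsewhere in this collection (e.g. ..._2018-08-02T22:38:34_qilin_all_20cpus.npy). A small sketch of the lookup with a hypothetical path:

from glob import glob

file_p = 'result/imdbmulti/time/ged_time_mat_imdbmulti_astar_*.npy'  # hypothetical
li = glob(file_p)
print(li)  # one entry per saved run; _choose_result_file then picks among them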
Example #5
def exp5():
    """ Query visualization. """
    dataset = 'imdbmulti'
    model = 'astar'
    concise = True
    norms = [True, False]
    dir = get_result_path() + '/{}/query_vis/{}'.format(dataset, model)
    create_dir_if_not_exists(dir)
    info_dict = {
        # draw node config
        'draw_node_size': 150 if dataset != 'linux' else 10,
        'draw_node_label_enable': True,
        'node_label_name': None if dataset == 'linux' else 'type',
        'draw_node_label_font_size': 6,
        'draw_node_color_map': TYPE_COLOR_MAP,
        # draw edge config
        'draw_edge_label_enable': False,
        'edge_label_name': 'valence',
        'draw_edge_label_font_size': 6,
        # graph text info config
        'each_graph_text_list': [],
        'each_graph_text_font_size': 8,
        'each_graph_text_pos': [0.5, 1.05],
        # graph padding: value range: [0, 1]
        'top_space': 0.20 if concise else 0.26,  # out of whole graph
        'bottom_space': 0.05,
        'hbetween_space': 0.6 if concise else 1,  # out of the subgraph
        'wbetween_space': 0,
        # plot config
        'plot_dpi': 200,
        'plot_save_path_eps': '',
        'plot_save_path_png': ''
    }
    train_data = load_data(dataset, train=True)
    test_data = load_data(dataset, train=False)
    row_graphs = test_data.graphs
    col_graphs = train_data.graphs
    r = load_result(dataset, model, row_graphs=row_graphs, col_graphs=col_graphs)
    tr = load_result(dataset, TRUE_MODEL, row_graphs=row_graphs, col_graphs=col_graphs)
    for norm in norms:
        ids = r.get_sort_id_mat(norm)
        m, n = r.m_n()
        num_vis = 10
        for i in range(num_vis):
            q = test_data.graphs[i]
            gids = np.concatenate([ids[i][:3], [ids[i][int(n / 2)]], ids[i][-3:]])
            gs = [train_data.graphs[j] for j in gids]
            info_dict['each_graph_text_list'] = \
                [get_text_label(dataset, r, tr, i, i, q, model, norm, True, concise)] + \
                [get_text_label(dataset, r, tr, i, j,
                                train_data.graphs[j], model, norm, False, concise) \
                 for j in gids]
            # print(info_dict['each_graph_text_list'])
            info_dict['plot_save_path_png'] = '{}/query_vis_{}_{}_{}{}.{}'.format(
                dir, dataset, model, i, get_norm_str(norm), 'png')
            info_dict['plot_save_path_eps'] = '{}/query_vis_{}_{}_{}{}.{}'.format(
                dir, dataset, model, i, get_norm_str(norm), 'eps')
            vis(q, gs, info_dict)
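The gids line above selects the three best, the overall middle, and the three worst matches from one row of the sorted-id matrix; an isolated illustration with made-up ids:

import numpy as np

row = np.arange(11)  # stand-in for ids[i], already sorted by distance
n = len(row)
sel = np.concatenate([row[:3], [row[int(n / 2)]], row[-3:]])
print(sel)  # [ 0  1  2  5  8  9 10]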
Example #6
def draw_emb_hist_heat_helper(gcn_id, nel, cmap_color, dataset,
                              row_graphs, col_graphs, ids, true_r, ds_norm,
                              plot_max_num, extra_dir):
    plt_cnt = 0
    for i in range(len(row_graphs)):
        # gids = column ids of the two extremes of the ranking (best and worst match)
        gids = np.concatenate([ids[i][:1], ids[i][-1:]])
        for j in gids:
            _, d = true_r.dist_sim(i, j, ds_norm)
            # nel is [train + val ... test]
            query_nel_idx = len(col_graphs) + i
            match_nel_idx = j
            # result is dot product between the query (test) and match (train/val)
            result = np.dot(nel[query_nel_idx], nel[match_nel_idx].T)
            plt.figure()
            sns_plot = sns.heatmap(result, fmt='d', cmap=cmap_color)
            fig = sns_plot.get_figure()
            dir = '{}/{}/{}'.format(get_result_path(), dataset, 'heatmap')
            fn = '{}_{}_{}_gcn{}'.format(i, j, d, gcn_id)
            plt_cnt += save_fig(fig, dir, fn, print_path=False)
            if extra_dir:
                plt_cnt += save_fig(fig, extra_dir + '/heatmap', fn, print_path=False)
            plt.close()
            # Flatten the score matrix for the histogram.
            result_array = result.flatten()
            plt.figure()
            plt.xlim(-1, 1)
            plt.ylim(0, 100)
            sns_plot = sns.distplot(result_array, bins=16, color='r',
                                    kde=False, rug=False, hist=True)
            fig = sns_plot.get_figure()
            dir = '{}/{}/{}'.format(get_result_path(), dataset, 'histogram')
            fn = '{}_{}_{}_gcn{}'.format(i, j, d, gcn_id)
            plt_cnt += save_fig(fig, dir, fn, print_path=False)
            if extra_dir:
                plt_cnt += save_fig(fig, extra_dir + '/histogram', fn, print_path=False)
            plt.close()
        if plt_cnt > plot_max_num:
            print('Saved {} node embeddings mne plots for gcn{}'.format(plt_cnt, gcn_id))
            return
    print('Saved {} node embeddings mne plots for gcn{}'.format(plt_cnt, gcn_id))
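A shape-only sketch of the node-embedding dot product that the heatmap visualizes: an (n1 x d) matrix against an (n2 x d) matrix yields an (n1 x n2) score map, one cell per node pair. The embeddings below are made up:

import numpy as np

q_emb = np.random.rand(5, 16)  # 5 query-graph nodes, 16-dim embeddings
m_emb = np.random.rand(7, 16)  # 7 matched-graph nodes
scores = np.dot(q_emb, m_emb.T)
print(scores.shape)  # (5, 7)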
Example #7
def clean_up():
    rp = get_result_path()
    for file in sorted_nicely(glob('{}/{}'.format(rp, f))):
        bnf = basename(file)
        print_info(file, bnf)
        t = prompt('Delete? [y/n]', ['y', 'n'])
        if t == 'y':
            exec('rm -rf {}'.format(file))  # exec is presumably a project shell helper, not the builtin
        elif t == 'n':
            print('Skip')
        else:
            assert (False)
    print('Done')
Example #8
def plot_preck(dataset, dsmetric, models, rs, true_result, metric, norms,
               plot_results=True, extra_dir=None):
    """ Plot prec@k. """
    create_dir_if_not_exists('{}/{}/{}'.format(
        get_result_path(), dataset, metric))
    rtn = {}
    for norm in norms:
        _, n = true_result.m_n()
        ks = range(1, n)
        d = plot_preck_helper(
            dataset, dsmetric, models, rs, true_result, metric, norm, ks,
            False, plot_results, extra_dir)
        rtn.update(d)
    return rtn
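prec_at_ks is project code; as a reference point, the textbook prec@k is the overlap between the true and predicted top-k, divided by k (the project version also takes a norm flag and an rm parameter, so its details may differ):

def prec_at_k(true_ids, pred_ids, k):
    return len(set(true_ids[:k]) & set(pred_ids[:k])) / k

true_ids = [4, 2, 0, 1, 3]
pred_ids = [4, 0, 2, 3, 1]
print([prec_at_k(true_ids, pred_ids, k) for k in (1, 2, 3)])  # [1.0, 0.5, 1.0]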
Example #9
def _load_result_mat(self, metric, model, m, n):
    file_p = get_result_path() + '/{}/{}/{}_{}_mat_{}_{}_*.npy'.format(
        self.dataset, metric, self.dist_metric(), metric, self.dataset,
        model)
    li = glob(file_p)
    if not li:
        if 'astar' in model:
            if self.dataset != 'imdbmulti':
                raise RuntimeError('Not imdbmulti and no astar results!')
            return self._load_merged_astar_from_other_three(metric, m, n)
        else:
            raise RuntimeError('No results found {}'.format(file_p))
    file = self._choose_result_file(li, m, n)
    return np.load(file)
Example #10
def rename():
    rp = get_result_path()
    for dirpath, dirs, files in walk('{}/{}'.format(rp, f)):
        for bfn in files:
            if target in bfn:
                continue
            dest_bfn = bfn.replace(source, target)
            t = prompt('Rename {} to {}? [y/n]'.format(bfn, dest_bfn),
                       ['y', 'n'])
            if t == 'y':
                exec('mv {} {}'.format(join(dirpath, bfn),
                                       join(dirpath, dest_bfn)))
            elif t == 'n':
                print('Skip')
            else:
                assert (False)
    print('Done')
Example #11
def _load_sim_mat(self):
    fn = get_result_path() + '/{}/sim/{}_graph2vec_dim_{}_sim_{}.npy'.format(
        self.dataset, self.dataset, self.dim, self.sim)
    if isfile(fn):
        with open(fn, 'rb') as handle:
            sim_mat = load_pkl(handle)
            print('Loaded sim mat from {}'.format(fn))
            return sim_mat
    train_emb = self._load_emb(True)
    test_emb = self._load_emb(False)
    if self.sim == 'dot':
        sim_mat = test_emb.dot(train_emb.T)
    else:
        raise RuntimeError('Unknown sim {}'.format(self.sim))
    with open(fn, 'wb') as handle:
        save_pkl(sim_mat, handle)
        print('Saved sim mat {} to {}'.format(sim_mat.shape, fn))
    return sim_mat
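The method above is a load-or-compute-then-save cache; reduced to its skeleton (save_pkl/load_pkl are project helpers, plain pickle stands in here):

import os
import pickle

def cached(fn, compute):
    if os.path.isfile(fn):
        with open(fn, 'rb') as handle:
            return pickle.load(handle)  # cache hit
    value = compute()  # cache miss: compute once, then persist
    with open(fn, 'wb') as handle:
        pickle.dump(value, handle)
    return value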
Example #12
def _load_result_mat(self, metric, model, m, n):
    file_p = get_result_path() + '/{}/{}/{}_{}_mat_{}_{}_*.npy'.format(
        self.dataset, metric, self.ds_metric, metric, self.dataset, model)
    li = glob(file_p)
    if not li:
        if 'astar' in model:
            if self.dataset not in [
                    'imdbmulti', 'webeasy', 'linux_imdb', 'nci109', 'ptc',
                    'mutag'
            ]:
                raise RuntimeError(
                    'Not imdbmulti/webeasy/linux_imdb/... and no astar results in {}!'
                    .format(file_p))
            return self._load_merged_astar_from_other_three(metric, m, n)
        else:
            raise RuntimeError('No results found {}'.format(file_p))
    file = self._choose_result_file(li, m, n)
    return np.load(file)
Example #13
def plot_preck_helper(dataset, dsmetric, models, rs, true_result, metric, norm, ks,
                      logscale, plot_results, extra_dir):
    print_ids = []
    numbers = {}
    assert (metric[0:6] == 'prec@k')
    if len(metric) > 6:
        rm = float(metric.split('_')[1])
    else:
        rm = 0
    for model in models:
        precs = prec_at_ks(true_result, rs[model], norm, ks, rm, print_ids)
        numbers[model] = {'ks': ks, 'precs': precs}
    rtn = {'preck{}_{}'.format(get_norm_str(norm), rm): numbers}
    if not plot_results:
        return rtn
    plt.figure(figsize=(16, 10))
    for model in models:
        ks = numbers[model]['ks']
        inters = numbers[model]['precs']
        if logscale:
            pltfunc = plt.semilogx
        else:
            pltfunc = plt.plot
        pltfunc(ks, inters, **get_plotting_arg(args1, model))
        plt.scatter(ks, inters, s=200, label=shorten_name(model),
                    **get_plotting_arg(args2, model))
    plt.xlabel('k')
    # ax = plt.gca()
    # ax.set_xticks(ks)
    plt.ylabel(metric)
    plt.ylim([-0.06, 1.06])
    plt.legend(loc='best', ncol=2)
    plt.grid(linestyle='dashed')
    plt.tight_layout()
    # plt.show()
    kss = 'k_{}_{}'.format(min(ks), max(ks))
    bfn = '{}_{}_{}_{}_{}{}_{}'.format(
        dsmetric, metric, dataset, '_'.join(models), kss, get_norm_str(norm), rm)
    dir = '{}/{}/{}'.format(get_result_path(), dataset, metric)
    save_fig(plt, dir, bfn)
    if extra_dir:
        save_fig(plt, extra_dir, bfn)
    print(metric, 'plotted')
    return rtn
Example #14
def _load_emb(self, train):
    fn = get_result_path() + '/{}/emb/{}_graph2vec_{}_emb_dim_{}.npy'.format(
        self.dataset, self.dataset, 'train' if train else 'test', self.dim)
    if isfile(fn):
        emb = np.load(fn)
        print('Loaded emb {} from {}'.format(emb.shape, fn))
        return emb
    data = load_data(self.dataset, train=train)
    id_map = self._gid_to_matrixid(data)
    emb = np.zeros((len(data.graphs), self.dim))
    cnt = 0
    d = self._load_json_emb()
    for f in d:
        gid = get_file_base_id(f)
        if gid in id_map:
            emb[id_map[gid]] = d[f]
            cnt += 1
    if cnt != len(id_map):
        raise RuntimeError('Mismatch: {} != {}'.format(cnt, len(id_map)))
    np.save(fn, emb)
    print('Saved emb {} to {}'.format(emb.shape, fn))
    return emb
Example #15
def plot_single_number_metric(dataset, dsmetric, models, rs, true_result, metric, norms,
                              ds_kernel=None,
                              thresh_poss=None, thresh_negs=None,
                              thresh_poss_sim=None, thresh_negs_sim=None,
                              plot_results=True,
                              extra_dir=None):
    """ Plot mrr or mse. """
    create_dir_if_not_exists('{}/{}/{}'.format(
        get_result_path(), dataset, metric))
    rtn = {}
    if norms and thresh_poss and thresh_negs:
        assert (len(norms) == len(thresh_poss) == len(thresh_negs))
    for i, norm in enumerate(norms):
        thresh_pos = thresh_poss[i] if thresh_poss else None
        thresh_neg = thresh_negs[i] if thresh_negs else None
        thresh_pos_sim = thresh_poss_sim[i] if thresh_poss_sim else None
        thresh_neg_sim = thresh_negs_sim[i] if thresh_negs_sim else None
        d = plot_single_number_metric_helper(
            dataset, dsmetric, models, rs, true_result, metric, norm, ds_kernel,
            thresh_pos, thresh_neg, thresh_pos_sim, thresh_neg_sim,
            plot_results, extra_dir)
        rtn.update(d)
    return rtn
Example #16
def test_model(args):

    models = os.listdir(args.save_path)

    # load dataset
    data_paths = get_data_path(args.mode, args.encoder)
    datasets = MatchSumPipe(args.candidate_num,
                            args.encoder).process_from_file(data_paths)
    print('Information of dataset is:')
    print(datasets)
    test_set = datasets.datasets['test']

    # need 1 gpu for testing
    device = int(args.gpus)

    args.batch_size = 1

    for cur_model in models:

        print('Current model is {}'.format(cur_model))

        # load model
        model = torch.load(join(args.save_path, cur_model))

        # configure testing
        dec_path, ref_path = get_result_path(args.save_path, cur_model)
        test_metric = MatchRougeMetric(data=read_jsonl(data_paths['test']),
                                       dec_path=dec_path,
                                       ref_path=ref_path,
                                       n_total=len(test_set))
        tester = Tester(data=test_set,
                        model=model,
                        metrics=[test_metric],
                        batch_size=args.batch_size,
                        device=device,
                        use_tqdm=False)
        tester.test()
Example #17
def plot_heatmap(gs1_str, gs2_str, dist_mat, thresh_pos, thresh_neg,
                 dataset, dist_metric, norm):
    m, n = dist_mat.shape
    label_mat, num_poses, num_negs, _, _ = \
        get_classification_labels_from_dist_mat(
            dist_mat, thresh_pos, thresh_neg)
    title = '{} pos pairs ({:.2%})\n{} neg pairs ({:.2%})'.format(
        num_poses, num_poses / (m * n), num_negs, num_negs / (m * n))
    sorted_label_mat = np.sort(label_mat, axis=1)[:, ::-1]
    mat_str = '{}({})_{}({})_{}_{}'.format(
        gs1_str, m, gs2_str, n, thresh_pos, thresh_neg)
    fn = '{}_acc_{}_labels_heatmap_{}{}'.format(dist_metric, mat_str,
                                                dataset, get_norm_str(norm))
    dir = '{}/{}/classif_labels'.format(get_result_path(), dataset)
    create_dir_if_not_exists(dir)
    plot_heatmap_helper(sorted_label_mat, title, dir, fn,
                        cmap='bwr')
    sorted_dist_mat = np.sort(dist_mat, axis=1)
    mat_str = '{}({})_{}({})'.format(
        gs1_str, m, gs2_str, n)
    fn = '{}_acc_{}_dist_heatmap_{}{}'.format(dist_metric, mat_str,
                                              dataset, get_norm_str(norm))
    plot_heatmap_helper(sorted_dist_mat, '', dir, fn,
                        cmap='tab20')
Example #18
from utils import get_result_path
import pandas as pd
import os
import copy
from ast import literal_eval
from os.path import join

name = 'aids700nef'
dataset = join(
    get_result_path(), name, 'mcs',
    'mcs_aids700nef_mccreesh2017_2018-11-27T02:36:27.553945_redacted-desktop_all_4cpus'
)
df = pd.read_csv('{}.csv'.format(dataset), sep=',')
# for index, chunk in enumerate(pd.read_csv('{}.csv'.format(dataset), sep=',', chunksize=1)):
print('read csv')
hits = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
cur_hit = 0
for index, row in df.iterrows():
    perc = index / len(df)
    if cur_hit < len(hits) and abs(perc - hits[cur_hit]) <= 0.05:
        print('{}/{}={:.1%}'.format(index, len(df), perc))
        cur_hit += 1

    node_mapping = {}
    edge_mapping = row['node_mapping']

    edge_mapping = literal_eval(edge_mapping)[0]
    row['edge_mapping'] = [copy.deepcopy(edge_mapping)]

    # one node with one node mapping
    if edge_mapping == {}:
Example #19
    'movie': '#ff6666',
    'tvSeries': '#ff6666',
    'actor': 'lightskyblue',
    'actress': '#ffb3e6',
    'director': 'yellowgreen',
    'composer': '#c2c2f0',
    'producer': '#ffcc99',
    'cinematographer': 'gold'}


if __name__ == '__main__':
    plot_what = 'att_vis'
    concise = True
    dataset = 'aids700nef'
    model = 'astar'
    dir = get_result_path() + '/{}/{}/{}'.format(dataset, plot_what, model)
    weight_data = load_as_dict("/home/songbian/Documents/fork/GraphEmbedding/"
                               "model/Siamese/logs/"
                               "siamese_classification_aids700nef_2018-07-28T10:09:33/"
                               "test_info.pickle")
    weight = weight_data['atts']
    info_dict = {
        # draw node config
        'draw_node_size': 800 if dataset != 'linux' else 20,
        'draw_node_label_enable': True,
        'node_label_name': None if dataset == 'linux' else 'type',
        'draw_node_label_font_size': 16,
        'draw_node_color_map': TYPE_COLOR_MAP,
        # draw edge config
        'draw_edge_label_enable': False,
        'edge_label_name': 'valence',
Example #20
    save_path = info_dict['plot_save_path']
    if save_path is None or save_path == "":
        plt.show()
    else:
        print('Saving query vis plot to {}'.format(save_path))
        plt.savefig(save_path, dpi=info_dict['plot_dpi'])


if __name__ == '__main__':
    dataset = 'linux'
    model = 'astar'
    concise = True
    ext = 'png'
    norms = [True, False]
    dir = get_result_path() + '/{}/att_vis_ourrank/{}'.format(dataset, model)
    create_dir_if_not_exists(dir)
    info_dict = {
        # draw node config
        'draw_node_size': 10,
        'draw_node_label_enable': True,
        'node_label_name': None if dataset == 'linux' else 'type',
        'draw_node_label_font_size': 6,
        'draw_node_color_map': {
            'C': '#ff6666',
            'O': 'lightskyblue',
            'N': 'yellowgreen',
            'movie': '#ff6666',
            'tvSeries': '#ff6666',
            'actor': 'lightskyblue',
            'actress': '#ffb3e6',
Example #21
def draw_ranking(dataset, ds_metric, true_r, pred_r, model_name, node_feat_name,
                 plot_node_ids=False, plot_gids=False, ds_norm=True,
                 existing_mappings=None,
                 extra_dir=None, plot_max_num=np.inf):
    plot_what = 'query_demo'
    concise = True
    dir = get_result_path() + '/{}/{}/{}'.format(dataset, plot_what,
                                                 true_r.get_model())
    info_dict = {
        # draw node config
        'draw_node_size': 20,
        'draw_node_label_enable': True,
        'show_labels': plot_node_ids,
        'node_label_type': 'label' if plot_node_ids else 'type',
        'node_label_name': 'type',
        'draw_node_label_font_size': 6,
        'draw_node_color_map': get_color_map(true_r.get_all_gs()),
        # draw edge config
        'draw_edge_label_enable': False,
        'draw_edge_label_font_size': 6,
        # graph text info config
        'each_graph_text_list': [],
        'each_graph_text_font_size': 10,
        'each_graph_text_pos': [0.5, 1.05],
        # graph padding: value range: [0, 1]
        'top_space': 0.20 if concise else 0.26,  # out of whole graph
        'bottom_space': 0.05,
        'hbetween_space': 0.6 if concise else 1,  # out of the subgraph
        'wbetween_space': 0,
        # plot config
        'plot_dpi': 200,
        'plot_save_path_eps': '',
        'plot_save_path_png': ''
    }
    test_gs = true_r.get_row_gs()
    train_gs = None
    if true_r.has_single_col_gs():
        train_gs = true_r.get_single_col_gs()
        if plot_node_ids and existing_mappings:
            # existing_mappings: [train + val ... test]
            test_gs = reorder_gs_based_on_exsiting_mappings(
                test_gs, existing_mappings[len(train_gs):], node_feat_name)
            train_gs = reorder_gs_based_on_exsiting_mappings(
                train_gs, existing_mappings[0:len(train_gs)], node_feat_name)
    plt_cnt = 0
    ids_groundtruth = true_r.get_sort_id_mat(ds_norm)
    ids_rank = pred_r.get_sort_id_mat(ds_norm)
    for i in range(len(test_gs)):
        q = test_gs[i]
        if not true_r.has_single_col_gs():
            train_gs = true_r.get_col_gs(i)
        middle_idx = len(train_gs) // 2
        # Choose the top 6 matches, the overall middle match, and the worst match.
        selected_ids = list(range(6))
        selected_ids.extend([middle_idx, -1])
        # Get the selected graphs from the groundtruth and the model.
        gids_groundtruth = np.array(ids_groundtruth[i][selected_ids])
        gids_rank = np.array(ids_rank[i][selected_ids])
        # Top row graphs are only the groundtruth outputs.
        gs_groundtruth = [train_gs[j] for j in gids_groundtruth]
        # Bottom row graphs are the query graph + model ranking.
        gs_rank = [test_gs[i]]
        gs_rank = gs_rank + [train_gs[j] for j in gids_rank]
        gs = gs_groundtruth + gs_rank

        # Create the plot labels.
        text = []
        # First label is the name of the groundtruth algorithm, rest are scores for the graphs.
        text += [get_text_label_for_ranking(
            ds_metric, true_r, i, i, ds_norm, True, dataset, gids_groundtruth, plot_gids)]
        text += [get_text_label_for_ranking(
            ds_metric, true_r, i, j, ds_norm, False, dataset, gids_groundtruth, plot_gids)
            for j in gids_groundtruth]
        # Start bottom row labels, just ranked from 1 to N with some fancy formatting.
        text.append("Rank by\n{}".format(model_name))
        for j in range(len(gids_rank)):
            ds = format_ds(pred_r.pred_ds(i, gids_rank[j], ds_norm))
            if j == len(gids_rank) - 2:
                rtn = '\n ...   {}   ...\n{}'.format(int(len(train_gs) / 2), ds)
            elif j == len(gids_rank) - 1:
                rtn = '\n {}\n{}'.format(int(len(train_gs)), ds)
            else:
                rtn = '\n {}\n{}'.format(str(j + 1), ds)
            # rtn = '\n {}: {:.2f}'.format('sim', pred_r.sim_mat_[i][j])
            text.append(rtn)

        # Perform the visualization.
        info_dict['each_graph_text_list'] = text
        fn = '{}_{}_{}_{}{}'.format(
            plot_what, dataset, true_r.get_model(), i, get_norm_str(ds_norm))
        info_dict, plt_cnt = set_save_paths_for_vis(
            info_dict, dir, extra_dir, fn, plt_cnt)
        vis_small(q, gs, info_dict)
        if plt_cnt > plot_max_num:
            print('Saved {} query demo plots'.format(plt_cnt))
            return
    print('Saved {} query demo plots'.format(plt_cnt))
Example #22
        paths['test'] = 'data/test_CNNDM_' + encoder + '.jsonl'
    return paths

path = get_data_path("test","bert")
print(path)
# # for name in path:
# #     assert exists(path[name])
# #     print(path[name])


datasets = MatchSumPipe(20, "bert").process_from_file(path)
print('Information of dataset is:')
print(datasets)
test_set = datasets.datasets['test']
device = 0
batch_size = 1

for cur_model in models:
    print('Current model is {}'.format(cur_model))

    # load model
    model = torch.load(join(save_path, cur_model))

    # configure testing
    dec_path, ref_path = get_result_path(save_path, cur_model)
    test_metric = MatchRougeMetric(data=read_jsonl(path['test']), dec_path=dec_path,
                                   ref_path=ref_path, n_total=len(test_set))
    tester = Tester(data=test_set, model=model, metrics=[test_metric],
                    batch_size=batch_size, device=device, use_tqdm=False)
    tester.test()
Example #23
def plot_true_pairs(dataset_name, num_pairs, fix_match_pos, want_gid_tuples,
                    need_eps):
    dir = join(get_result_path(), dataset_name, 'matching_vis')
    want = ['true']
    _plot_pairs(None, dataset_name, num_pairs, fix_match_pos, dir, want,
                want_gid_tuples, need_eps)
Example #24
def plot_single_number_metric_helper(dataset, dsmetric, models, rs, true_result,
                                     metric, norm,
                                     ds_kernel, thresh_pos, thresh_neg,
                                     thresh_pos_sim, thresh_neg_sim,
                                     plot_results, extra_dir):
    # dsmetric: distance/similarity metric, e.g. ged, mcs, etc.
    # metric: eval metric.
    print_ids = []
    rtn = {}
    val_list = []
    for model in models:
        if metric == 'mrr':
            val = mean_reciprocal_rank(
                true_result, rs[model], norm, print_ids)
        elif metric == 'mse':
            val = mean_squared_error(
                true_result, rs[model], ds_kernel, norm)
        elif metric == 'dev':
            val = mean_deviation(
                true_result, rs[model], ds_kernel, norm)
        elif metric == 'time':
            val = average_time(rs[model])
        elif 'acc' in metric:
            val = accuracy(
                true_result, rs[model], thresh_pos, thresh_neg,
                thresh_pos_sim, thresh_neg_sim, norm)
            pos_acc, neg_acc, acc = val
            if metric == 'pos_acc':
                val = pos_acc
            elif metric == 'neg_acc':
                val = neg_acc
            elif metric == 'acc':
                val = acc  # only the overall acc
            else:
                assert (metric == 'accall')
        elif metric == 'kendalls_tau':
            val = kendalls_tau(true_result, rs[model], norm)
        elif metric == 'spearmans_rho':
            val = spearmans_rho(true_result, rs[model], norm)
        else:
            raise RuntimeError('Unknown {}'.format(metric))
        # print('{} {}: {}'.format(metric, model, mrr_mse_time))
        rtn[model] = val
        val_list.append(val)
    rtn = {'{}{}'.format(metric, get_norm_str(norm)): rtn}
    if not plot_results:
        return rtn
    plt = plot_multiple_bars(val_list, models, metric)
    if metric == 'time':
        ylabel = 'time (msec)'
        norm = None
    elif metric == 'pos_acc':
        ylabel = 'pos_recall'
    elif metric == 'neg_acc':
        ylabel = 'neg_recall'
    elif metric == 'kendalls_tau':
        ylabel = 'Kendall\'s $\\tau$'
    elif metric == 'spearmans_rho':
        ylabel = 'Spearman\'s $\\rho$'
    else:
        ylabel = metric
    plt.ylabel(ylabel)
    if metric == 'time':
        plt.yscale('log')
    metric_addi_info = ''
    bfn = '{}_{}{}_{}_{}{}'.format(
        dsmetric, metric, metric_addi_info,
        dataset, '_'.join(models),
        get_norm_str(norm))
    sp = get_result_path() + '/{}/{}/'.format(dataset, metric)
    save_fig(plt, sp, bfn)
    if extra_dir:
        save_fig(plt, extra_dir, bfn)
    print(metric, 'plotted')
    return rtn
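A hedged sketch of the pos/neg thresholding that accuracy presumably applies to a distance matrix (the project helper's exact semantics may differ): distances at or below thresh_pos count as positive pairs, at or above thresh_neg as negative, and the band in between stays unlabeled.

import numpy as np

dist_mat = np.array([[0.1, 0.5, 0.9],
                     [0.3, 0.7, 0.2]])
thresh_pos, thresh_neg = 0.3, 0.7
label_mat = np.zeros_like(dist_mat)
label_mat[dist_mat <= thresh_pos] = 1   # positive (similar) pairs
label_mat[dist_mat >= thresh_neg] = -1  # negative (dissimilar) pairs
print(label_mat)  # [[ 1.  0. -1.]
                  #  [ 1. -1.  1.]]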
Example #25
                dist_mat[i][j] = d
    save(sfn, dist_mat)
    print('Saved to {}'.format(sfn))
    return dist_mat


if __name__ == '__main__':
    dataset = 'imdbmulti'
    dist_metric = 'ged'
    dist_algo = 'astar'
    dist_calculator = DistCalculator(dataset, dist_metric, dist_algo)
    # The server qilin calculated all the pairwise distances between
    # the training graphs.
    # Thus, enrich the distance map (i.e. calculator) using the qilin results.
    mat1 = np.load('{}/{}/{}/{}.npy'.format(
        get_result_path(), dataset, dist_metric,
        'ged_ged_mat_imdbmulti_beam80_2018-08-02T22:38:34_qilin_all_20cpus'))
    mat2 = np.load('{}/{}/{}/{}.npy'.format(
        get_result_path(), dataset, dist_metric,
        'ged_ged_mat_imdbmulti_hungarian_2018-08-03T13:40:54_qilin_all_20cpus'))
    mat3 = np.load('{}/{}/{}/{}.npy'.format(
        get_result_path(), dataset, dist_metric,
        'ged_ged_mat_imdbmulti_vj_2018-08-04T10:21:15_qilin_all_20cpus'))
    row_gs = load_data(dataset, train=True).graphs
    col_gs = load_data(dataset, train=True).graphs
    dist_calculator.load_from_dist_mat([mat1, mat2, mat3],
                                       row_gs,
                                       col_gs,
                                       check_symmetry=False)
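load_from_dist_mat's merge policy is project code, but since beam, hungarian, and vj each return an upper bound on the true edit distance, one plausible merge is an element-wise minimum; a sketch under that assumption with toy matrices:

import numpy as np

mat1 = np.array([[3., 5.], [4., 2.]])  # beam
mat2 = np.array([[2., 6.], [5., 2.]])  # hungarian
mat3 = np.array([[4., 4.], [3., 3.]])  # vj
merged = np.minimum.reduce([mat1, mat2, mat3])
print(merged)  # [[2. 4.]
               #  [3. 2.]] -- tightest upper bound per pair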
Example #26
    #--- Get params
    parser = argparse.ArgumentParser()
    parser.add_argument('--experiment', type=str, default=None, help='name of experiment', required=True)
    args = parser.parse_args()

    #-- Load params
    experiment = args.experiment


    results = {}
    n_splits = 10

    for i_split in range(1, n_splits + 1):
        # path
        path = utils.get_result_path(experiment, str(i_split))
        with open(path, 'r') as f:
            lines = [line.strip() for line in f.readlines()]

        for line in lines:
            # parse line
            parts = line.split(': ')
            if len(parts) == 3:
                settings = parts[0]
                element = parts[1]
                value = float(parts[2])

                # if key does not exist add array
                key = settings + ':' + element
                if key not in results:
                    results[key] = []
Example #27
def visualize_embeddings(dataset,
                         orig_embs,
                         true_result,
                         thresh_pos,
                         thresh_neg,
                         thresh_pos_sim,
                         thresh_neg_sim,
                         norm,
                         pred_r,
                         eps_dir=None):
    # label_mat, _, _ = true_result.classification_mat(
    #    thresh_pos, thresh_neg, thresh_pos_sim, thresh_neg_sim, norm)
    tsne = TSNE(n_components=2)
    embs = tsne.fit_transform(orig_embs)
    dir = '{}/{}/emb_vis'.format(get_result_path(), dataset)
    create_dir_if_not_exists(dir)
    if eps_dir:
        create_dir_if_not_exists(eps_dir)
    m = np.shape(pred_r.sort_id_mat_)[0]
    n = np.shape(pred_r.sort_id_mat_)[1]
    # m = np.shape(label_mat)[0]
    # n = np.shape(label_mat)[1]
    plt_cnt = 0
    print('TSNE embeddings: {} --> {} to plot'.format(orig_embs.shape,
                                                      embs.shape))
    for i in range(m):
        axis_x_red = []
        axis_y_red = []
        axis_x_blue = []
        axis_y_blue = []
        axis_x_query = []
        axis_y_query = []
        for j in range(10):
            axis_x_blue.append(embs[pred_r.sort_id_mat_[i][j], 0])
            axis_y_blue.append(embs[pred_r.sort_id_mat_[i][j], 1])
        for j in range(n - 10):
            axis_x_red.append(embs[pred_r.sort_id_mat_[i][j + 10], 0])
            axis_y_red.append(embs[pred_r.sort_id_mat_[i][j + 10], 1])
        axis_x_query.append(embs[i + n, 0])
        axis_y_query.append(embs[i + n, 1])

        plt.figure()
        plt.scatter(axis_x_red,
                    axis_y_red,
                    s=30,
                    c=sorted(range(n - 10), reverse=False),
                    marker='o',
                    alpha=0.6,
                    cmap=plt.cm.get_cmap("Blues"))
        plt.scatter(axis_x_blue,
                    axis_y_blue,
                    s=15,
                    c=sorted(range(10), reverse=True),
                    marker='s',
                    alpha=0.6,
                    cmap=plt.cm.get_cmap("Reds"))
        plt.scatter(axis_x_query,
                    axis_y_query,
                    s=400,
                    c='limegreen',
                    marker='P',
                    alpha=0.6)
        plt.axis('off')
        cur_axes = plt.gca()
        cur_axes.axes.get_xaxis().set_visible(False)
        cur_axes.axes.get_yaxis().set_visible(False)
        plt.tight_layout()
        plt.savefig(dir + '/' + str(i) + '.png',
                    bbox_inches='tight',
                    pad_inches=0)
        if eps_dir:
            plt.savefig(eps_dir + '/' + str(i) + '.png',
                        bbox_inches='tight',
                        pad_inches=0)
            plt.savefig(eps_dir + '/' + str(i) + '.eps',
                        bbox_inches='tight',
                        pad_inches=0)
        plt_cnt += 1
        plt.close()
    print('Saved {} embedding visualization plots'.format(plt_cnt))
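A minimal sketch of the t-SNE projection step above, using scikit-learn's TSNE with default settings on made-up embeddings:

import numpy as np
from sklearn.manifold import TSNE

orig_embs = np.random.rand(100, 32)  # made-up high-dimensional embeddings
embs = TSNE(n_components=2).fit_transform(orig_embs)
print(embs.shape)  # (100, 2) -- ready for a 2-D scatter plot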
Example #28
    for g in gs:
        if g.graph['gid'] == gid:
            return g
    return None


if __name__ == '__main__':
    dataset = 'mutag'
    dist_metric = 'ged'
    dist_algo = 'astar'
    dist_sim_calculator = DistSimCalculator(dataset, dist_metric, dist_algo)
    # The server qilin calculated all the pairwise distances between
    # the training graphs.
    # Thus, enrich the distance map (i.e. calculator) using the qilin results.
    csv3 = ('{}/{}/csv/{}.csv'.format(
        get_result_path(), dataset,
        'ged_mutag_beam80_2019-01-22T13:55:19.744928_qilin_all_20cpus'))
    csv1 = ('{}/{}/csv/{}.csv'.format(
        get_result_path(), dataset,
        'ged_mutag_hungarian_2019-01-22T14:09:43.111557_feilong_all_15cpus'))
    csv2 = ('{}/{}/csv/{}.csv'.format(
        get_result_path(), dataset,
        'ged_mutag_vj_2019-01-22T16:34:47.260820_qilin_all_20cpus'))
    row_gs = load_data(dataset, train=True).graphs
    col_gs = load_data(dataset, train=True).graphs
    dist_sim_calculator.load(row_gs,
                             col_gs,
                             csv_filenames=[csv1, csv2, csv3],
                             ds_metric='ged',
                             check_symmetry=False)
    # dataset = 'webeasy'
Example #29
    parser.add_argument('--experiment',
                        type=str,
                        default=None,
                        help='name of experiment',
                        required=True)
    args = parser.parse_args()

    #-- Load params
    experiment = args.experiment

    results = {}
    n_splits = 10

    for i_split in range(1, n_splits + 1):
        # path
        path = utils.get_result_path(experiment, str(i_split))
        with open(path, 'r') as f:
            lines = [line.strip() for line in f.readlines()]

        for line in lines:
            # parse line
            parts = line.split(': ')
            if len(parts) == 3:
                settings = parts[0]
                element = parts[1]
                value = float(parts[2])

                # if key does not exist add array
                key = settings + ':' + element
                if key not in results:
                    results[key] = []
Example #30
def real_dataset_run_helper(computer_name, dataset, ds_metric, algo, row_graphs, col_graphs,
                            num_cpu, timeout):
    if ds_metric == 'ged':
        func = ged
    elif ds_metric == 'mcs':
        func = mcs
        # For MCS, the solver can handle both labeled and unlabeled graphs,
        # but we must tell it whether to use node labels or ignore them.
        # TODO: this should go in some kind of config file specific for mcs
        if node_has_type_attrib(row_graphs[0]):
            labeled = True
            label_key = 'type'
            print('Has node type')
        else:
            labeled = False
            label_key = ''
            print('Does not have node type')
    else:
        raise RuntimeError('Unknown distance similarity metric {}'.format(ds_metric))
    m = len(row_graphs)
    n = len(col_graphs)
    ds_mat = np.zeros((m, n))
    time_mat = np.zeros((m, n))
    outdir = '{}/{}'.format(get_result_path(), dataset)
    create_dir_if_not_exists(outdir + '/csv')
    create_dir_if_not_exists(outdir + '/{}'.format(ds_metric))
    create_dir_if_not_exists(outdir + '/time')
    exsiting_csv = prompt('File path to existing csv files?')
    exsiting_entries = load_from_exsiting_csv(exsiting_csv, ds_metric, skip_eval=False)
    is_symmetric = prompt('Is the ds matrix symmetric? (1/0)', options=['0', '1']) == '1'
    if is_symmetric:
        assert (m == n)
    smart_needed = prompt('Is smart pair sorting needed? (1/0)', options=['0', '1']) == '1'
    csv_fn = '{}/csv/{}_{}_{}_{}_{}_{}cpus.csv'.format(
        outdir, ds_metric, dataset, algo, get_ts(), computer_name, num_cpu)
    file = open(csv_fn, 'w')
    print('Saving to {}'.format(csv_fn))
    if ds_metric == 'ged':
        print_and_log('i,j,i_gid,j_gid,i_node,j_node,i_edge,j_edge,ged,lcnt,time(msec)',
                      file)
    else:
        print_and_log(
            'i,j,i_gid,j_gid,i_node,j_node,i_edge,j_edge,mcs,node_mapping,edge_mapping,time(msec)',
            file)
    # Multiprocessing.
    pool = mp.Pool(processes=num_cpu)
    # Submit to pool workers.
    results = {}
    pairs_to_run = get_all_pairs_to_run(row_graphs, col_graphs, smart_needed)
    for k, (i, j) in enumerate(pairs_to_run):
        g1, g2 = row_graphs[i], col_graphs[j]
        i_gid, j_gid = g1.graph['gid'], g2.graph['gid']
        if (i_gid, j_gid) in exsiting_entries:
            continue
        if is_symmetric and (j_gid, i_gid) in exsiting_entries:
            continue
        if ds_metric == 'mcs':
            results[(i, j)] = pool.apply_async(
                func, args=(g1, g2, algo, labeled, label_key, True, True, timeout,))
        else:
            results[(i, j)] = pool.apply_async(
                func, args=(g1, g2, algo, True, True, timeout,))
        print_progress(k, m, n, 'submit: {} {} {} {} cpus;'.
                       format(algo, dataset, computer_name, num_cpu))
    # Retrieve results from pool workers or a loaded csv file (previous run).
    for k, (i, j) in enumerate(pairs_to_run):
        print_progress(k, m, n, 'work: {} {} {} {} {} cpus;'.
                       format(ds_metric, algo, dataset, computer_name, num_cpu))
        g1, g2 = row_graphs[i], col_graphs[j]
        i_gid, j_gid = g1.graph['gid'], g2.graph['gid']
        if (i, j) not in results:
            lcnt, mcs_node_mapping, mcs_edge_mapping = None, None, None
            tmp = exsiting_entries.get((i_gid, j_gid))
            if tmp:
                if ds_metric == 'ged':
                    i_gid, j_gid, i_node, j_node, ds, lcnt, t = tmp
                else:
                    i_gid, j_gid, i_node, j_node, ds, mcs_node_mapping, mcs_edge_mapping, t = tmp
            else:
                assert (is_symmetric)
                get_from = exsiting_entries[(j_gid, i_gid)]
                if ds_metric == 'ged':
                    j_gid, i_gid, j_node, i_node, ds, lcnt, t = \
                        get_from
                else:
                    j_gid, i_gid, j_node, i_node, ds, mcs_node_mapping, mcs_edge_mapping, t = \
                        get_from
            if ds_metric == 'ged':
                assert (lcnt is not None)
                assert (g1.graph['gid'] == i_gid)
                assert (g2.graph['gid'] == j_gid)
                assert (g1.number_of_nodes() == i_node)
                assert (g2.number_of_nodes() == j_node)
                s = form_ged_print_string(i, j, g1, g2, ds, lcnt, t)
            else:
                assert (mcs_node_mapping is not None and
                        mcs_edge_mapping is not None)
                s = form_mcs_print_string(
                    i, j, g1, g2, ds, mcs_node_mapping, mcs_edge_mapping, t)
        else:
            if ds_metric == 'ged':
                ds, lcnt, g1_a, g2_a, t = results[(i, j)].get()
                i_gid, j_gid, i_node, j_node = \
                    g1.graph['gid'], g2.graph['gid'], \
                    g1.number_of_nodes(), g2.number_of_nodes()
                assert (g1.number_of_nodes() == g1_a.number_of_nodes())
                assert (g2.number_of_nodes() == g2_a.number_of_nodes())
                exsiting_entries[(i_gid, j_gid)] = \
                    (i_gid, j_gid, i_node, j_node, ds, lcnt, t)
                s = form_ged_print_string(i, j, g1, g2, ds, lcnt, t)
            else:  # MCS
                ds, mcs_node_mapping, mcs_edge_mapping, t = \
                    results[(i, j)].get()
                exsiting_entries[(i_gid, j_gid)] = \
                    (ds, mcs_node_mapping, mcs_edge_mapping, t)
                s = form_mcs_print_string(
                    i, j, g1, g2, ds, mcs_node_mapping, mcs_edge_mapping, t)
        print_and_log(s, file)
        if ds_metric == 'mcs' and (i_gid, j_gid) in exsiting_entries:
            # Save memory, clear the mappings since they're saved to file.
            exsiting_entries[(i_gid, j_gid)] = list(exsiting_entries[(i_gid, j_gid)])
            exsiting_entries[(i_gid, j_gid)][1] = {}
            exsiting_entries[(i_gid, j_gid)][2] = {}
        ds_mat[i][j] = ds
        time_mat[i][j] = t
    file.close()
    save_as_np(outdir, ds_metric, ds_mat, time_mat, get_ts(),
               dataset, row_graphs, col_graphs, algo, computer_name, num_cpu)
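The function above submits every pair to an mp.Pool first and collects results in a second pass; a minimal self-contained sketch of that pattern:

import multiprocessing as mp

def slow_square(x):
    return x * x

if __name__ == '__main__':
    pool = mp.Pool(processes=4)
    # First pass: submit all jobs without blocking.
    results = {i: pool.apply_async(slow_square, args=(i,)) for i in range(8)}
    # Second pass: retrieve results; get() blocks until each is ready.
    for i, r in results.items():
        print(i, r.get())
    pool.close()
    pool.join()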
Example #31
    test_model = args.test_model
    snapshot = args.snapshot
    experiment = args.experiment
    gauss_var = args.gauss_var

    #--- GPU
    caffe.set_mode_gpu()

    #--- LOAD SMOTHED POSITION MAPS
    position_maps = utils.load_position_maps(split_name, gauss_var)

    #--- LOAD TEST DATA
    test_data = utils.get_test_data_path(split_name)

    #--- GET TEST RESULTS PATH
    test_res_path = utils.get_result_path(experiment, split_name)

    ###---  TEST
    print('Testing')
    sys.stdout.flush()

    net_results, position_results = test_net(test_model, snapshot, test_data, test_iters, position_maps)

    im_acc, price_acc, name_acc = net_results
    print('NET: image accuracy:', im_acc)
    print('NET: price accuracy:', price_acc)
    print('NET: name accuracy:', name_acc)

    p_im_acc, p_price_acc, p_name_acc = position_results
    print('NET+POSITION: image accuracy:', p_im_acc)
    print('NET+POSITION: price accuracy:', p_price_acc)