Example #1
def colored_motifs():
    basedir = '/data/ssikdar/Attributed-VRG/'
    names = [
        'polbooks', 'football', 'wisconsin', 'texas', 'cornell', 'polblogs'
    ]
    # names = ['citeseer', 'cora', 'airports']
    models = ['AVRG', 'CL', 'AGM', 'DC-SBM', 'CELL', 'NetGAN', 'original']

    for name in names:
        for model in models:
            print(f'Running {name!r} {model!r}')
            if model == 'AVRG':
                model_ = 'AVRG-fancy_mu-random_leiden_5'
            else:
                model_ = model

            graphs_filename = join(basedir, 'output/graphs/', name,
                                   f'{model_}_10.pkl')
            if model == 'original':
                graphs = [nx.read_gml(join(basedir, 'input', f'{name}.gml'))]
            else:
                graphs = load_pickle(graphs_filename)
            if graphs is None:
                continue
            # batch_motif_counter(name, model, basedir, overwrite=False, graphs=None, motif_filename=None):
            batch_motif_counter(name=name,
                                model=model,
                                basedir=basedir,
                                graphs=graphs,
                                overwrite=False)

    return
Example #2
def autoencoders(outdir, name, model):
    model_path = join(outdir, 'output', 'other_models', 'autoencoders')
    # if not Path(model_path).exists():
    #     os.makedirs(model_path)
    model_path = join(model_path, f'{name}_{model}_mat.pkl')
    graphs_path = join(outdir, 'output', 'graphs', name, f'{model}_10.pkl')

    # if Path(graphs_path).exists():
    #     return
    #
    input_g, _ = get_graph(name, basedir=outdir)
    if Path(model_path).exists():
        thresh_mat = load_pickle(model_path)
        graphs = []
        ns, ms = [], []
        for _ in range(10):
            g = get_graph_from_prob_matrix(thresh_mat, thresh=0.5)
            nx.set_node_attributes(g,
                                   name='value',
                                   values=nx.get_node_attributes(
                                       input_g, 'value'))
            ns.append(g.order())
            ms.append(g.size())
            graphs.append(g)
        print('Avg n, m', np.round(np.mean(ns), 3), np.round(np.mean(ms), 3))
        dump_pickle(graphs, graphs_path)
        return

    from other_models.autoencoders.fit import fit_model

    _, thresh_mat = fit_model(g=input_g, model_name=model)

    dump_pickle(thresh_mat, model_path)
    return
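A minimal, self-contained sketch of the thresholding step that get_graph_from_prob_matrix is assumed to perform here (keep an edge wherever the predicted probability clears the threshold); the helper name and exact behavior are assumptions, not the project's actual API:

import networkx as nx
import numpy as np

def graph_from_prob_matrix_sketch(prob_mat: np.ndarray, thresh: float = 0.5) -> nx.Graph:
    adj = (prob_mat >= thresh).astype(int)  # binarize the edge probabilities
    np.fill_diagonal(adj, 0)                # drop self-loops
    return nx.from_numpy_array(adj)

rng = np.random.default_rng(0)
probs = rng.random((10, 10))
probs = (probs + probs.T) / 2               # symmetrize for an undirected graph
g = graph_from_prob_matrix_sketch(probs, thresh=0.5)
print(g.order(), g.size())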
Example #3
def main():
    machine_name, outdir = get_machine_name_and_outdir()
    names = [
        'karate', 'football', 'polbooks', 'wisconsin', 'texas', 'film',
        'cornell', 'cora', 'citeseer', 'airports', 'polblogs', 'chameleon',
        'pubmed', 'squirrel'
    ]

    clusterings = [
        'cond', 'spectral', 'leiden', 'louvain', 'infomap', 'labelprop',
        'random', 'leadingeig', 'consensus'
    ][:-1]

    for name in names:
        g, _ = get_graph(name, basedir=outdir)
        make_dirs(outdir=outdir, name=name)
        for clustering in clusterings:
            tree = load_pickle(
                join(outdir, 'output', 'trees', name,
                     f'{clustering}_list.pkl'))
            if tree is None: continue
            root = create_tree(tree)
            faulty_tnodes = tree_okay(root=root, g=g)
            if faulty_tnodes > 0:
                print(f'{name}\t{clustering}\t{faulty_tnodes:,d} errors')

    return
Example #4
File: runner.py Project: AVRGauthors/AVRG
def generate_graphs(name: str, grammar: AttributedVRG, num_graphs: int, extract_type: str, gen_type: str,
                    basedir: str, graphs_filename: str, mixing_dict: Union[None, Dict] = None, attr_name: Union[str, None] = None, fancy=None,
                    inp_deg_ast: float = None, inp_attr_ast: float = None, use_pickle: bool = False,
                    save_snapshots: bool = False, alpha: Union[None, float] = None,
                    write_pickle: bool = True) -> List[nx.Graph]:

    # make_dirs(outdir=outdir, name=name)
    # if fancy and grammar_type == 'AVRG': grammar_type += '-fancy'
    # if alpha is not None: grammar_type += f'-{int(alpha * 100)}'
    gen_filename = f'{basedir}/output/generators/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'

    if use_pickle and check_file_exists(graphs_filename):
        if not save_snapshots:
            logging.error(f'Graph pickle found! {graphs_filename!r}')
            return load_pickle(graphs_filename)
        if save_snapshots and check_file_exists(gen_filename):
            logging.error(f'Gen pickle found, skipping: {gen_filename!r}')
            return load_pickle(graphs_filename)

    logging.error(f'Graphs filename: {graphs_filename!r}')

    if isinstance(grammar, AttributedVRG):
        assert attr_name != ''
        # assert fancy is not None
        if 'greedy' in gen_type:
            assert inp_attr_ast is not None and inp_deg_ast is not None
            gen = GreedyAttributeRandomGenerator(grammar=grammar, mixing_dict=mixing_dict, attr_name=attr_name,
                                                 inp_attr_ast=inp_attr_ast, inp_deg_ast=inp_deg_ast,
                                                 save_snapshots=save_snapshots, alpha=alpha)
        else:
            gen = AttributedRandomGenerator(grammar=grammar, mixing_dict=mixing_dict, attr_name=attr_name,
                                            use_fancy_rewiring=fancy, save_snapshots=save_snapshots)
    elif isinstance(grammar, VRG):
        gen = RandomGenerator(grammar=grammar, save_snapshots=save_snapshots)
    elif isinstance(grammar, NCE):
        gen = NCEGenerator(grammar=grammar)
    else:
        raise NotImplementedError(f'Invalid grammar type {type(grammar)!r}')

    graphs = gen.generate(num_graphs=num_graphs)
    if write_pickle:
        dump_pickle(graphs, graphs_filename)
    if save_snapshots:
        dump_pickle(gen, gen_filename)

    return graphs
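A minimal usage sketch for generate_graphs, assuming load_pickle and a pickled AttributedVRG grammar under the directory layout used in these examples; every path, filename, and parameter value below is illustrative:

basedir = '/data/ssikdar/Attributed-VRG'
name, gen_type, extract_type = 'polbooks', 'AVRG-fancy', 'mu-random'

grammar = load_pickle(f'{basedir}/output/grammars/{name}/AVRG_mu-random_leiden_5.pkl')
graphs_filename = (f'{basedir}/output/graphs/{name}/'
                   f'{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_10.pkl')

graphs = generate_graphs(name=name, grammar=grammar, num_graphs=10,
                         extract_type=extract_type, gen_type=gen_type,
                         basedir=basedir, graphs_filename=graphs_filename,
                         attr_name='value', fancy=True, use_pickle=True)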
Example #5
def batch_synthetic_generator_runner():
    # frac = np.linspace(0, 1, 21, endpoint=True) * 100
    frac = np.linspace(0, 100, 11, endpoint=True,
                       dtype=int)  # change it to increments of 10 for now
    names = [f'toy-comm-{f}' for f in frac]
    # names = ['karate', 'football', 'polbooks', 'eucore', 'flights', 'chess', 'polblogs']
    num_graphs = 5
    outdir = '/data/ssikdar/attributed-vrg/dumps'
    use_pickle = True
    save_snapshots = False
    shuffle = 'edges'

    args = []
    for name in names:
        # input_graph, attr_name = get_graph(name)
        input_graph, attr_name = nx.read_gexf(
            f'./input/shuffled/{shuffle}/{name}.gexf', node_type=int), 'block'
        name = f'{name}-{shuffle}'
        if attr_name == '':
            mix_dict, inp_deg_ast, inp_attr_ast = None, None, None
        else:
            mix_dict = get_mixing_dict(input_graph, attr_name=attr_name)
            inp_deg_ast = nx.degree_assortativity_coefficient(input_graph)
            inp_attr_ast = nx.attribute_assortativity_coefficient(
                input_graph, attr_name)

        for grammar_filename in glob(f'{outdir}/grammars/{name}/*'):
            grammar = load_pickle(grammar_filename)
            if isinstance(grammar, AttributedVRG):
                grammar_type = 'AVRG'
                fancy = True
                args.append((name, grammar, num_graphs, grammar_type, outdir,
                             mix_dict, attr_name, fancy, inp_deg_ast,
                             inp_attr_ast, use_pickle, save_snapshots))

                grammar_type = 'AVRG-greedy'
                # args.append((name, grammar, num_graphs, grammar_type, outdir, mix_dict, attr_name, fancy,
                #              inp_deg_ast, inp_attr_ast, use_pickle, save_snapshots))
                for alpha in (0, 0.5, 1):
                    args.append(
                        (name, grammar, num_graphs, grammar_type, outdir,
                         mix_dict, attr_name, fancy, inp_deg_ast, inp_attr_ast,
                         use_pickle, save_snapshots, alpha))
            else:
                assert isinstance(grammar, VRG)
                grammar_type = 'VRG'
                fancy = None
                args.append((name, grammar, num_graphs, grammar_type, outdir,
                             mix_dict, attr_name, fancy, inp_deg_ast,
                             inp_attr_ast, use_pickle, save_snapshots))

    parallel_async(func=generate_graphs, args=args, num_workers=10)
    # generate_graphs(grammar: Union[VRG, NCE, AttributedVRG], num_graphs: int, grammar_type: str, outdir: str = 'dumps',
    #                 mixing_dict: Union[None, Dict] = None, attr_name: Union[str, None] = None, fancy = None,
    #                 inp_deg_ast: float = None, inp_attr_ast: float = None)

    return
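parallel_async itself is not shown in these snippets; a stand-in with the same call shape (a function, a list of positional-argument tuples, and a worker count) could look like the following sketch built on multiprocessing:

from multiprocessing import Pool

def parallel_async_sketch(func, args, num_workers=10):
    # Fan the argument tuples out over a worker pool and collect the results.
    with Pool(processes=num_workers) as pool:
        async_result = pool.starmap_async(func, args)
        return async_result.get()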
Example #6
File: runner.py Project: AVRGauthors/AVRG
def get_grammars(name: str,  grammar_type: str, extract_type: str, clustering: str, mu: int, input_graph: nx.Graph,
                 use_grammar_pickle: bool, use_cluster_pickle: bool, attr_name: str, outdir: str, count: int = 1,
                 grammar_filename: str = '', write_pickle: bool = True, list_of_list_clusters=None) -> List[Union[VRG, NCE]]:
    """
    Dump the stats
    :return:
    """
    if input_graph.name != name:
        input_graph.name = name
    # make_dirs(outdir, name)  # make the directories if needed

    # print(f'Extracting {count} grammars')
    grammars = []

    for i in range(count):
        if grammar_filename == '':
            raise Exception('filename empty')

        if use_grammar_pickle and check_file_exists(grammar_filename):
            logging.error(f'Using pickled grammar from {grammar_filename!r}')
            grammar = load_pickle(grammar_filename)
        else:
            if list_of_list_clusters is None:
                list_of_list_filename = os.path.join(outdir, 'output', 'trees', input_graph.name, f'{clustering}_list.pkl')
                if not Path(list_of_list_filename).exists():
                    logging.error(f'Skipping grammar, name {input_graph.name!r} clustering {clustering!r}')
                    continue
                list_of_list_clusters = get_clustering(g=input_graph, outdir=outdir,
                                                       clustering=clustering, use_pickle=use_cluster_pickle,
                                                       filename=list_of_list_filename)
            root = create_tree(list_of_list_clusters) if isinstance(list_of_list_clusters, list) else list_of_list_clusters
            # dc = dasgupta_cost(g=g, root=root, use_parallel=True)
            lmg: LightMultiGraph = nx_to_lmg(nx_g=input_graph)

            logging.error(f'Extracting grammar: {grammar_filename}')
            if grammar_type == 'VRG':
                extractor = VRGExtractor(g=lmg, extract_type=extract_type, mu=mu, root=root, clustering=clustering)
            elif grammar_type == 'NCE':
                extractor = NCEExtractor(g=lmg, extract_type=extract_type, mu=mu, root=root, clustering=clustering)
            elif grammar_type == 'AVRG':
                assert attr_name != ''
                extractor = AVRGExtractor(g=lmg, attr_name=attr_name, extract_type=extract_type, clustering=clustering,
                                          mu=mu, root=root)
            else:
                raise NotImplementedError(f'Invalid grammar type {grammar_type!r}')

            grammar = extractor.extract()
            logging.error(str(grammar))
            if write_pickle:
                dump_pickle(grammar, grammar_filename)
        grammars.append(grammar)
    return grammars
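A usage sketch for get_grammars, assuming get_graph and the output/ directory layout from the other examples; the graph name, clustering, mu, and grammar filename are illustrative:

basedir = '/data/ssikdar/Attributed-VRG'
input_graph, attr_name = get_graph('polbooks', basedir=basedir)
grammar_filename = f'{basedir}/output/grammars/polbooks/AVRG_mu-random_leiden_5.pkl'

grammars = get_grammars(name='polbooks', grammar_type='AVRG', extract_type='mu_random',
                        clustering='leiden', mu=5, input_graph=input_graph,
                        use_grammar_pickle=True, use_cluster_pickle=True,
                        attr_name=attr_name, outdir=basedir,
                        grammar_filename=grammar_filename)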
Example #7
def read_batched_graphs(basedir, name):
    input_graphs = load_pickle(join(basedir, 'input', f'{name}.graphs'))
    cleaned_graphs = []

    for i, g in enumerate(input_graphs):
        g.remove_edges_from(nx.selfloop_edges(g))
        if not nx.is_connected(g):
            nodes_lcc = max(nx.connected_components(g), key=len)
            g = g.subgraph(nodes_lcc).copy()
        g = nx.convert_node_labels_to_integers(g, label_attribute='orig_label')
        g.name = f'{name}_{i}'
        cleaned_graphs.append(g)

    return cleaned_graphs
Example #8
def batched_graphs_generator(basedir, clusterings, name, mus=None):
    # num_graphs = 5 if 'polblogs' in name else 10
    num_graphs = 10
    use_pickle = True
    save_snapshots = False
    attr_name = 'value'
    mus = [5]
    alpha = None
    input_graphs = read_batched_graphs(basedir=basedir, name=name)
    extract_types = ['mu_random']

    args = []
    for i, input_graph in enumerate(input_graphs):
        mix_dict = get_mixing_dict(input_graph, attr_name=attr_name)
        inp_deg_ast = nx.degree_assortativity_coefficient(input_graph)
        inp_attr_ast = nx.attribute_assortativity_coefficient(
            input_graph, attr_name)

        for grammar_filename in glob(
                f'{basedir}/output/grammars/{name}/*_{i}.pkl'):
            grammar = load_pickle(grammar_filename)
            if grammar.mu not in mus or grammar.clustering not in clusterings or grammar.extract_type not in extract_types:
                continue

            extract_type = grammar.extract_type.replace('_', '-')
            if isinstance(grammar, AttributedVRG):
                for gen_type, fancy in zip(('AVRG-regular', 'AVRG-fancy'),
                                           (False, True)):
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}_{i}.pkl'
                    args.append((name, grammar, num_graphs, extract_type,
                                 gen_type, basedir, graphs_filename, mix_dict,
                                 attr_name, fancy, inp_deg_ast, inp_attr_ast,
                                 use_pickle, save_snapshots, alpha))

                for alpha, gen_type in zip(
                    (0, 0.5, 1),
                    ('AVRG-greedy-attr', 'AVRG-greedy-50', 'AVRG-greedy-deg')):
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}_{i}.pkl'
                    args.append((name, grammar, num_graphs, extract_type,
                                 gen_type, basedir, graphs_filename, mix_dict,
                                 attr_name, fancy, inp_deg_ast, inp_attr_ast,
                                 use_pickle, save_snapshots, alpha))

    # random.shuffle(args)
    parallel_async(func=generate_graphs, args=args, num_workers=8)
    return
Example #9
def batched_graphs_grammars(basedir, name, clusterings):
    input_graphs = read_batched_graphs(basedir=basedir, name=name)
    attr_name = 'value'
    grammar_types = ['AVRG']  # ['VRG', 'AVRG']
    extract_types = ['mu_random']  #, 'mu_level', 'all_tnodes']
    mus = [5]
    use_cluster_pickle = True
    use_grammar_pickle = True
    count = 1

    args = []
    for i, input_graph in enumerate(input_graphs):
        for clustering in clusterings:
            list_of_list_clusters = load_pickle(
                join(basedir, 'output', 'trees', name,
                     f'{clustering}_{i}.pkl'))
            for grammar_type in grammar_types:
                for extract_type in extract_types:
                    extract = extract_type.replace('_', '-')
                    for mu in mus:
                        grammar_filename = f'{basedir}/output/grammars/{name}/{grammar_type}_{extract}_{clustering}_{mu}_{i}.pkl'

                        arg = (name, grammar_type, extract_type, clustering,
                               mu, input_graph, True, True, attr_name, basedir,
                               1, grammar_filename, True,
                               list_of_list_clusters)
                        args.append(arg)
                        if extract_type == 'all_tnodes':  # here mu is not important for all_tnodes
                            break

    # print(args[: 3])

    try:
        parallel_async(func=get_grammars, args=args, num_workers=5)
    except Exception as e:
        print(e)
    return
Example #10
File: runner.py Project: AVRGauthors/AVRG
def get_clustering(g: nx.Graph, outdir: str, clustering: str, use_pickle: bool, filename='',
                   write_pickle: bool = True) -> Any:
    """
    Wrapper method for getting the dendrogram; uses an existing pickle if it can.
    :param g: graph
    :param outdir: output directory where pickles are stored
    :param clustering: name of clustering method
    :param use_pickle: flag controlling whether or not to use the pickle
    :return: root node of the dendrogram
    """
    if g.name == 'sample':
        list_of_list_clusters = [
            [
                [[0], [1]],
                [[2], [[3], [4]]]
            ],
            [
                [[5], [6]],
                [[7], [8]]
            ]
        ]
        return list_of_list_clusters

    if filename == '':
        list_of_list_filename = os.path.join(outdir, 'output', 'trees', g.name, f'{clustering}_list.pkl')
    else:
        list_of_list_filename = filename

    if check_file_exists(list_of_list_filename) and use_pickle:
        logging.error(f'Using existing pickle for {clustering!r} clustering\n')
        list_of_list_clusters = load_pickle(list_of_list_filename)

    else:
        tqdm.write(f'Running {clustering!r} clustering on {g.name!r}...')
        if clustering == 'random':
            list_of_list_clusters = partitions.get_random_partition(g)
        elif clustering == 'consensus':
            # delete the matlab tree and sc files
            matlab_files_path = './src/matlab_clustering/HierarchicalConsensus/data'
            tree_path = os.path.join(matlab_files_path, f'{g.name}_tree.mat')
            sc_path = os.path.join(matlab_files_path, f'{g.name}_sc.vec')
            if check_file_exists(tree_path):
                os.remove(tree_path)
            if check_file_exists(sc_path):
                os.remove(sc_path)
            list_of_list_clusters = get_consensus_root(g=g, gname=g.name)
        elif clustering in ('leiden', 'louvain', 'infomap', 'labelprop', 'leadingeig'):
            try:
                list_of_list_clusters = partitions.louvain_leiden_infomap_label_prop(g, method=clustering)
            except Exception as e:
                list_of_list_clusters = []
        elif clustering == 'cond':
            list_of_list_clusters = partitions.approx_min_conductance_partitioning(g)
        elif clustering == 'spectral':
            list_of_list_clusters = partitions.spectral_kmeans(g, K=int(math.sqrt(g.order() // 2)))
        else:
            raise NotImplementedError(f'Invalid clustering algorithm {clustering!r}')

        if len(list_of_list_clusters) != 0 and write_pickle:
            dump_pickle(list_of_list_clusters, list_of_list_filename)

    return list_of_list_clusters
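A quick usage sketch, assuming get_clustering and create_tree are importable from runner.py and that the chosen clustering backend (here 'leiden') is installed; the graph and output directory are illustrative:

import networkx as nx

g = nx.karate_club_graph()
g.name = 'karate'
list_of_list_clusters = get_clustering(g=g, outdir='/tmp/avrg', clustering='leiden',
                                       use_pickle=False, write_pickle=False)
root = create_tree(list_of_list_clusters)  # recursive list-of-lists -> dendrogram root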
Example #11
    def _partition_graph(self,
                         test_frac=.1,
                         val_frac=.05,
                         prevent_disconnect=True,
                         verbose=False,
                         use_pickle=False):
        # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper.
        # taken from https://github.com/lucashu1/link-prediction/blob/master/gae/preprocessing.py
        if self.splits_filename is None:
            self.splits_filename = join(
                self.outdir, 'output', 'splits',
                f'{self.dataset}_{int(test_frac * 100)}_{int(val_frac * 100)}')
        if use_pickle and check_file_exists(self.splits_filename):
            logging.error(f'Using pickle at {self.splits_filename!r}')
            adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
                test_edges, test_edges_false = load_pickle(self.splits_filename)
        else:
            g = nx.Graph(self.input_graph)
            adj = nx.to_scipy_sparse_matrix(g)
            orig_num_cc = nx.number_connected_components(g)

            adj_triu = sp.triu(adj)  # upper triangular portion of adj matrix
            adj_tuple = sparse_to_tuple(
                adj_triu)  # (coords, values, shape), edges only 1 way
            edges = adj_tuple[0]  # all edges, listed only once (not 2 ways)
            # edges_all = sparse_to_tuple(adj)[0] # ALL edges (includes both ways)
            num_test = int(np.floor(
                edges.shape[0] *
                test_frac))  # controls how large the test set should be
            num_val = int(np.floor(
                edges.shape[0] *
                val_frac))  # controls how large the validation set should be

            # Store edges in list of ordered tuples (node1, node2) where node1 < node2
            edge_tuples = [(min(edge[0], edge[1]), max(edge[0], edge[1]))
                           for edge in edges]
            all_edge_tuples = set(edge_tuples)
            train_edges = set(
                edge_tuples)  # initialize train_edges to have all edges
            test_edges = set()
            val_edges = set()

            if verbose:
                print('generating test/val sets...', end=' ', flush=True)

            # Iterate over shuffled edges, add to train/val sets
            np.random.shuffle(edge_tuples)
            for edge in edge_tuples:
                node1, node2 = edge

                # If removing the edge would disconnect a connected component, backtrack
                g.remove_edge(node1, node2)
                if prevent_disconnect:
                    if nx.number_connected_components(g) > orig_num_cc:
                        g.add_edge(node1, node2)
                        continue

                # Fill test_edges first
                if len(test_edges) < num_test:
                    test_edges.add(edge)
                    train_edges.remove(edge)

                # Then, fill val_edges
                elif len(val_edges) < num_val:
                    val_edges.add(edge)
                    train_edges.remove(edge)

                # Both edge lists full --> break loop
                elif len(test_edges) == num_test and len(val_edges) == num_val:
                    break

            if (len(val_edges) < num_val) or (len(test_edges) < num_test):
                print('WARNING: not enough removable edges to perform full train-test split!')
                print(f'Num. (test, val) edges requested: {num_test, num_val}')
                print(f'Num. (test, val) edges returned: {len(test_edges), len(val_edges)}')

            if prevent_disconnect:
                assert nx.number_connected_components(g) == orig_num_cc

            if verbose:
                print('creating false test edges...', end=' ', flush=True)

            test_edges_false = set()
            while len(test_edges_false) < num_test:
                idx_i = np.random.randint(0, adj.shape[0])
                idx_j = np.random.randint(0, adj.shape[0])

                if idx_i == idx_j: continue

                false_edge = (min(idx_i, idx_j), max(idx_i, idx_j))

                # Make sure false_edge is not an actual edge, and not a repeat
                if false_edge in all_edge_tuples: continue
                if false_edge in test_edges_false: continue

                test_edges_false.add(false_edge)

            if verbose:
                print('creating false val edges...', end=' ', flush=True)

            val_edges_false = set()
            while len(val_edges_false) < num_val:
                idx_i = np.random.randint(0, adj.shape[0])
                idx_j = np.random.randint(0, adj.shape[0])

                if idx_i == idx_j: continue

                false_edge = (min(idx_i, idx_j), max(idx_i, idx_j))

                # Make sure false_edge is not an actual edge, not in test_edges_false, and not a repeat
                if false_edge in all_edge_tuples or \
                        false_edge in test_edges_false or \
                        false_edge in val_edges_false:
                    continue

                val_edges_false.add(false_edge)

            if verbose: print('creating false train edges...')

            train_edges_false = set()
            while len(train_edges_false) < len(train_edges):
                idx_i = np.random.randint(0, adj.shape[0])
                idx_j = np.random.randint(0, adj.shape[0])

                if idx_i == idx_j: continue

                false_edge = (min(idx_i, idx_j), max(idx_i, idx_j))

                # Make sure false_edge is not an actual edge, not in test_edges_false,
                # not in val_edges_false, not a repeat
                if false_edge in all_edge_tuples or \
                        false_edge in test_edges_false or \
                        false_edge in val_edges_false or \
                        false_edge in train_edges_false:
                    continue

                train_edges_false.add(false_edge)

            if verbose:
                print('final checks for disjointness...', end=' ', flush=True)

            # assert: false_edges are actually false (not in all_edge_tuples)
            assert test_edges_false.isdisjoint(all_edge_tuples)
            assert val_edges_false.isdisjoint(all_edge_tuples)
            assert train_edges_false.isdisjoint(all_edge_tuples)

            # assert: test, val, train false edges disjoint
            assert test_edges_false.isdisjoint(val_edges_false)
            assert test_edges_false.isdisjoint(train_edges_false)
            assert val_edges_false.isdisjoint(train_edges_false)

            # assert: test, val, train positive edges disjoint
            assert val_edges.isdisjoint(train_edges)
            assert test_edges.isdisjoint(train_edges)
            assert val_edges.isdisjoint(test_edges)

            if verbose: print('creating adj_train...', end=' ', flush=True)

            # Re-build adj matrix using remaining graph
            adj_train = nx.adjacency_matrix(g)

            # Convert edge-lists to numpy arrays
            train_edges = np.array(
                [list(edge_tuple) for edge_tuple in train_edges])
            train_edges_false = np.array(
                [list(edge_tuple) for edge_tuple in train_edges_false])
            val_edges = np.array(
                [list(edge_tuple) for edge_tuple in val_edges])
            val_edges_false = np.array(
                [list(edge_tuple) for edge_tuple in val_edges_false])
            test_edges = np.array(
                [list(edge_tuple) for edge_tuple in test_edges])
            test_edges_false = np.array(
                [list(edge_tuple) for edge_tuple in test_edges_false])

            if verbose: print('Done with train-test split!')

            # NOTE: these edge lists only contain single direction of edge!
            dump_pickle((adj_train, train_edges, train_edges_false, val_edges,
                         val_edges_false, test_edges, test_edges_false),
                        self.splits_filename)
        logging.error(
            f'train (T/F): {len(train_edges)} valid: {len(val_edges)} ({val_frac*100}%) test: {len(test_edges)} ({test_frac*100}%)'
        )
        return adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false
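One way the returned split might be consumed, sketched with a simple common-neighbor score; `lp` stands for a hypothetical LinkPrediction instance and scikit-learn is assumed to be available:

import networkx as nx
import numpy as np
from sklearn.metrics import roc_auc_score

(adj_train, train_edges, train_edges_false, val_edges, val_edges_false,
 test_edges, test_edges_false) = lp._partition_graph(use_pickle=False)

# Rebuild the training graph and score candidate edges by common-neighbor count.
g_train = nx.from_scipy_sparse_matrix(adj_train)  # newer networkx: from_scipy_sparse_array

def common_neighbor_scores(edges):
    return np.array([len(list(nx.common_neighbors(g_train, u, v))) for u, v in edges])

y_true = np.concatenate([np.ones(len(test_edges)), np.zeros(len(test_edges_false))])
y_score = np.concatenate([common_neighbor_scores(test_edges),
                          common_neighbor_scores(test_edges_false)])
print('test AUC:', roc_auc_score(y_true, y_score))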
Example #12
    ]  # 'netgan', 'cell', ]
    # names = ['citeseer']
    models = ['cell']

    for name in names:
        name_fname = join(basedir, 'stats/link_pred', f'{name}.csv')
        orig_g, att_name = get_graph(name, basedir=basedir)
        model_dfs = []
        trials = 10
        test_frac, val_frac = 0.1, 0.05
        for model in models:
            model_rows = []
            model_fname = join(basedir, 'stats/link_pred',
                               f'{name}_{model}.csv')
            if Path(model_fname).exists():
                model_df = load_pickle(model_fname)
                continue
            for trial in range(1, trials + 1):
                splits_filename = join(
                    basedir, 'output', 'splits',
                    f'{name}_{int(test_frac*100)}_{int(val_frac*100)}_{trial}.pkl'
                )

                link_pred = LinkPrediction(input_graph=orig_g,
                                           test_valid_split=(test_frac,
                                                             val_frac),
                                           dataset=name,
                                           use_pickle=True,
                                           outdir=basedir,
                                           splits_filename=splits_filename
                                           )  # use a diff split each time
Example #13
def make_graph_df(name,
                  fname,
                  orig_graph,
                  mu,
                  clustering,
                  attr_name,
                  grammar_type,
                  bipartite=False):
    deg_ast_fn = nx.degree_assortativity_coefficient
    attr_ast_fn = nx.attribute_assortativity_coefficient

    gen_graphs = load_pickle(fname)
    if gen_graphs is None:
        return pd.DataFrame()

    cols = [
        'name', 'orig_n', 'orig_m', 'orig_degree_ast', 'orig_sp_ast_50',
        'orig_sp_ast_100', 'orig_sp_ast_500', 'attr_name', 'orig_attr_ast',
        'model', 'mu', 'clustering', 'gen_n', 'gen_m', 'gen_degree_ast',
        'gen_sp_ast_50', 'gen_sp_ast_100', 'gen_sp_ast_500', 'gen_attr_ast',
        'total_rewired_edges', 'fancy_rewired_edges', 'degree_js',
        'pagerank_js', 'lambda_dist', 'deg_ast_diff', 'attr_ast_diff',
        'is_bipartite'
    ]

    row = {col: np.nan for col in cols}

    orig_deg_ast = deg_ast_fn(orig_graph)

    orig_attr_ast = attr_ast_fn(orig_graph,
                                attr_name) if attr_name != '' else np.nan
    orig_gstats = GraphStats(orig_graph)
    orig_sp_ast_50 = orig_gstats.shortest_path_ast(alpha=0.5)
    orig_sp_ast_100 = orig_gstats.shortest_path_ast(alpha=1)
    orig_sp_ast_500 = orig_gstats.shortest_path_ast(alpha=5)

    orig_h_dict = get_compatibility_matrix(orig_graph, attr_name)
    orig_h = orig_h_dict['homophily_ratio']
    orig_h_mat = orig_h_dict['compatibility_mat']
    orig_h_map = orig_h_dict['mapping']
    orig_is_bip = nx.algorithms.bipartite.is_bipartite(
        orig_graph) if bipartite else np.nan

    rows = []

    for g in gen_graphs:
        gen_gstats = GraphStats(g)
        gpc = GraphPairCompare(orig_gstats, gen_gstats)
        gen_deg_ast = deg_ast_fn(g)
        gen_sp_ast_50 = gen_gstats.shortest_path_ast(alpha=0.5)
        gen_sp_ast_100 = gen_gstats.shortest_path_ast(alpha=1)
        gen_sp_ast_500 = gen_gstats.shortest_path_ast(alpha=5)

        gen_attr_ast = attr_ast_fn(g, attr_name) if attr_name != '' else np.nan
        total_rewired_edges = g.graph.get('total_rewirings', 0)
        fancy_rewired_edges = g.graph.get('fancy_rewirings', 0)
        h_dict = get_compatibility_matrix(g, attr_name)
        h = h_dict['homophily_ratio']
        h_mat = h_dict['compatibility_mat']
        h_map = h_dict['mapping']

        gen_is_bip = nx.algorithms.bipartite.is_bipartite(
            g) if bipartite else np.nan

        row = dict(name=name,
                   orig_n=orig_graph.order(),
                   orig_m=orig_graph.size(),
                   orig_deg_ast=orig_deg_ast,
                   orig_sp_ast_50=orig_sp_ast_50,
                   orig_sp_ast_100=orig_sp_ast_100,
                   orig_sp_ast_500=orig_sp_ast_500,
                   orig_attr_ast=orig_attr_ast,
                   attr_name=attr_name,
                   model=grammar_type,
                   clustering=clustering,
                   mu=mu,
                   orig_homophily_ratio=orig_h,
                   orig_homophily_mat=orig_h_mat,
                   orig_homophily_map=orig_h_map,
                   orig_is_bipartite=orig_is_bip,
                   gen_n=g.order(),
                   gen_m=g.size(),
                   gen_deg_ast=gen_deg_ast,
                   gen_sp_ast_50=gen_sp_ast_50,
                   gen_sp_ast_100=gen_sp_ast_100,
                   gen_sp_ast_500=gen_sp_ast_500,
                   gen_attr_ast=gen_attr_ast,
                   total_rewired_edges=total_rewired_edges,
                   fancy_rewired_edges=fancy_rewired_edges,
                   degree_js=gpc.degree_js(),
                   pagerank_js=gpc.pagerank_js(),
                   lambda_dist=gpc.lambda_dist(),
                   gen_homophily_ratio=h,
                   gen_homophily_mat=h_mat,
                   gen_homophily_map=h_map,
                   deg_mix_dist_dict=gpc.deg_mixing_dist_dict(),
                   attr_mix_dist_dict=gpc.attr_mixing_dist_dict(),
                   gen_is_bipartite=gen_is_bip)
        rows.append(row)
    return pd.DataFrame(rows)
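The per-pickle DataFrames can then be stacked into one table with pandas; a sketch assuming get_graph and the directory layout from the other examples (the paths, name, mu, clustering, and grammar type are illustrative; in practice they would be parsed from each filename, as make_graph_df_new does below):

import pandas as pd
from glob import glob

basedir = '/data/ssikdar/Attributed-VRG'
orig_graph, attr_name = get_graph('polbooks', basedir=basedir)
dfs = [make_graph_df(name='polbooks', fname=fname, orig_graph=orig_graph, mu=5,
                     clustering='leiden', attr_name=attr_name, grammar_type='AVRG')
       for fname in glob(f'{basedir}/output/graphs/polbooks/*.pkl')]
graph_df = pd.concat(dfs, ignore_index=True)
graph_df.to_csv(f'{basedir}/stats/polbooks_graph_df.csv', index=False)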
Example #14
def make_graph_df_new(name: str,
                      fname: str,
                      basedir: str,
                      orig_gstats: GraphStats,
                      slow_stats: bool,
                      model: str = 'AVRG'):
    gen_graphs = load_pickle(fname)
    if gen_graphs is None:
        return pd.DataFrame()

    # break down the filename to figure out the different parts
    path = Path(fname)

    if model == 'AVRG':
        pattern = r'(.+)\_(.+)\_(.+)\_(.+)\_(\d+)'
        m = re.match(pattern, path.stem)
        if m is None:
            return
        gen_type, extract_type, clustering, mu, _ = m.groups()
    else:
        pattern = r'(.+)\_(\d+)'
        m = re.match(pattern, path.stem)
        if m is None:
            return
        model, _ = m.groups()
        gen_type, extract_type, clustering, mu = np.nan, np.nan, np.nan, np.nan

    if slow_stats:
        orig_sp_ast_5 = orig_gstats.shortest_path_ast(alpha=0.05,
                                                      fname=join(
                                                          basedir, 'input',
                                                          f'{name}.gml'))
        orig_sp_ast_50 = orig_gstats.shortest_path_ast(alpha=0.5,
                                                       fname=join(
                                                           basedir, 'input',
                                                           f'{name}.gml'))
        orig_sp_ast_100 = orig_gstats.shortest_path_ast(alpha=1,
                                                        fname=join(
                                                            basedir, 'input',
                                                            f'{name}.gml'))
        orig_sp_ast_500 = orig_gstats.shortest_path_ast(alpha=5,
                                                        fname=join(
                                                            basedir, 'input',
                                                            f'{name}.gml'))
        orig_sp_ast_1000 = orig_gstats.shortest_path_ast(alpha=10,
                                                         fname=join(
                                                             basedir, 'input',
                                                             f'{name}.gml'))
        orig_apl = orig_gstats.average_path_length()
        orig_avg_cc = orig_gstats.average_clustering()

    orig_stats = _get_basic_stats(
        gstats=orig_gstats,
        kind='orig')  # add basic stats of the original graph
    rows = []

    for g in gen_graphs:
        orig_graph = orig_gstats.graph
        row = dict(name=name,
                   orig_graph=np.nan,
                   model=model,
                   gen_type=gen_type,
                   extract_type=extract_type,
                   clustering=clustering,
                   mu=mu)
        row.update(orig_stats)  # add the original stats

        gen_gstats = GraphStats(g)
        row.update(_get_basic_stats(gen_gstats, kind='gen'))

        gpc = GraphPairCompare(orig_gstats, gen_gstats)
        row.update(
            dict(degree_js=gpc.degree_js(),
                 pagerank_js=gpc.pagerank_js(),
                 lambda_dist=gpc.lambda_dist()))

        if slow_stats:
            gen_sp_ast_5 = gen_gstats.shortest_path_ast(alpha=0.05)
            gen_sp_ast_50 = gen_gstats.shortest_path_ast(alpha=0.5)
            gen_sp_ast_100 = gen_gstats.shortest_path_ast(alpha=1)
            gen_sp_ast_500 = gen_gstats.shortest_path_ast(alpha=5)
            gen_sp_ast_1000 = gen_gstats.shortest_path_ast(alpha=10)

            gen_apl = gen_gstats.average_path_length()
            gen_avg_cc = gen_gstats.average_clustering()

            row.update(
                dict(orig_apl=orig_apl,
                     orig_avg_cc=orig_avg_cc,
                     gen_apl=gen_apl,
                     gen_avg_cc=gen_avg_cc))

            row.update(
                dict(orig_sp_ast_5=orig_sp_ast_5,
                     orig_sp_ast_50=orig_sp_ast_50,
                     orig_sp_ast_100=orig_sp_ast_100,
                     orig_sp_ast_500=orig_sp_ast_500,
                     orig_sp_ast_1000=orig_sp_ast_1000))

            row.update(
                dict(gen_sp_ast_5=gen_sp_ast_5,
                     gen_sp_ast_50=gen_sp_ast_50,
                     gen_sp_ast_100=gen_sp_ast_100,
                     gen_sp_ast_500=gen_sp_ast_500,
                     gen_sp_ast_1000=gen_sp_ast_1000))

        rows.append(row)
    return pd.DataFrame(rows)
Example #15
def batch_generator_runner(names,
                           basedir,
                           clusterings,
                           mus=None,
                           extract_types=None,
                           save_snapshots=False,
                           num_workers=10,
                           shuffle=False):
    num_graphs = 10  # we need 1 graph to chart the progress  # TODO: change this in the future?
    use_pickle = True
    save_snapshots = save_snapshots
    if mus is None:
        mus = list(range(3, 11)) + [-1]
    alpha = None

    args = []
    for name in names:
        input_graph, attr_name = get_graph(name, basedir=basedir)
        if input_graph.size() > 3_000:
            save_snapshots = False

        mix_dict = get_mixing_dict(input_graph, attr_name=attr_name)
        inp_deg_ast = nx.degree_assortativity_coefficient(input_graph)
        inp_attr_ast = nx.attribute_assortativity_coefficient(
            input_graph, attr_name)

        for grammar_filename in glob(f'{basedir}/output/grammars/{name}/*'):
            grammar = load_pickle(grammar_filename)
            extract_type = grammar.extract_type.replace('_', '-')
            if grammar.mu not in mus or grammar.clustering not in clusterings or extract_type not in extract_types:
                continue
            print(Path(grammar_filename).stem)

            if isinstance(grammar, AttributedVRG):
                for gen_type, fancy in zip(('AVRG-regular', 'AVRG-fancy'),
                                           (False, True)):
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                    args.append((name, grammar, num_graphs, extract_type,
                                 gen_type, basedir, graphs_filename, mix_dict,
                                 attr_name, fancy, inp_deg_ast, inp_attr_ast,
                                 use_pickle, save_snapshots, alpha))

                for alpha, gen_type in zip(
                    (0, 0.5, 1),
                    ('AVRG-greedy-attr', 'AVRG-greedy-50', 'AVRG-greedy-deg')):
                    fancy = None
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                    args.append((name, grammar, num_graphs, extract_type,
                                 gen_type, basedir, graphs_filename, mix_dict,
                                 attr_name, fancy, inp_deg_ast, inp_attr_ast,
                                 use_pickle, save_snapshots, alpha))

            else:
                continue  # skip VRGs
                # assert isinstance(grammar, VRG)
                # grammar_type = 'VRG'
                # fancy = None
                # graphs_filename = f'{basedir}/output/graphs/{name}/{grammar_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                # args.append((name, grammar, num_graphs, grammar_type, outdir, mix_dict, attr_name, fancy,
                #              inp_deg_ast, inp_attr_ast, use_pickle, save_snapshots, alpha, graphs_filename))
    if shuffle:
        random.shuffle(args)
    try:
        parallel_async(func=generate_graphs,
                       args=args,
                       num_workers=num_workers)
    except Exception as e:
        print(e)
    return
Example #16
def make_gen_df(base_path, names=None, clusterings=None, num_samples=None):
    """
    num_samples is for number of samples of generated graph
    """
    rows = []
    cols = [
        'snap_id', 'name', 'model', 'clustering', 'attr_name', 'orig_n',
        'orig_m', 'orig_deg_ast', 'orig_attr_ast', 'mu', 'n', 'm', 't',
        'term_graph', 'term_n', 'term_m', 'term_degree_js', 'term_pagerank_js',
        'term_lambda_dist', 'term_deg_ast', 'term_attr_ast'
    ]

    if names is None:
        names = [
            'karate', 'football', 'polbooks', 'us-flights', 'cora', 'citeseer',
            'polblogs', 'pubmed'
        ][:-1]

    # mus = [5, 6]
    mus = range(3, 11)
    snap_id = 0  # snap_id tracks which of the 10 generated graphs the snapshots came from

    if clusterings is None:
        clusterings = ['cond', 'leiden', 'spectral', 'consensus']

    for name in names:
        orig_graph, attr_name = get_graph(name, basedir=base_path)
        orig_deg_ast = nx.degree_assortativity_coefficient(orig_graph)
        orig_att_ast = nx.attribute_assortativity_coefficient(
            orig_graph, attr_name)

        orig_gstats = GraphStats(orig_graph)

        for gen_filename in glob.glob(
                f'{base_path}/output/generators/{name}/*'):
            path = Path(gen_filename)
            # each pickled generator holds snapshots for 10 different generations; we may need just 1
            gen: RandomGenerator = load_pickle(path)
            if gen is None: continue

            print(path.stem, end='\t', flush=True)
            pattern = r'(.*)\_(\w+)\_(\d+)\_(\d+)'
            m = re.match(pattern, path.stem)
            grammar_type, clustering, mu, _ = m.groups()
            mu = int(mu)
            if mu not in mus or clustering not in clusterings: continue

            generated_graph_snapshots = gen.all_gen_snapshots[0]
            del gen  # delete the object to save memory

            if num_samples is None:
                num_samples = len(generated_graph_snapshots)

            indices = sorted(
                set(
                    np.linspace(0,
                                len(generated_graph_snapshots) - 1,
                                num_samples,
                                dtype=int,
                                endpoint=True)))

            for t in indices:
                graph = generated_graph_snapshots[t]
                terminal_graph = filter_terminal_graph(graph)
                terminal_graph = un_nest_attr_dict(terminal_graph)
                row = {col: np.nan for col in cols}

                row.update(
                    dict(snap_id=snap_id,
                         name=name,
                         model=grammar_type,
                         clustering=clustering,
                         attr_name=attr_name,
                         orig_n=orig_graph.order(),
                         orig_m=orig_graph.size(),
                         orig_deg_ast=orig_deg_ast,
                         orig_attr_ast=orig_att_ast,
                         mu=mu,
                         t=t,
                         n=graph.order(),
                         m=graph.size(),
                         term_graph=terminal_graph,
                         term_n=terminal_graph.order(),
                         term_m=terminal_graph.size()))

                if terminal_graph.size() > 0:
                    gen_gstats = GraphStats(terminal_graph)

                    gpc = GraphPairCompare(orig_gstats, gen_gstats)
                    row.update(
                        term_degree_js=gpc.degree_js(),
                        term_pagerank_js=gpc.pagerank_js(),
                        term_lambda_dist=gpc.lambda_dist(),
                        term_deg_ast=nx.degree_assortativity_coefficient(
                            terminal_graph),
                        term_attr_ast=nx.attribute_assortativity_coefficient(
                            terminal_graph, attr_name),
                        deg_mix_dist_dict=gpc.deg_mixing_dist_dict(),
                        attr_mix_dist_dict=gpc.attr_mixing_dist_dict())
                rows.append(row)
            temp_df = pd.DataFrame(rows)
            temp_df.to_csv(f'{base_path}/stats/temp_gen_df.csv', index=False)
    return pd.DataFrame(rows)
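A minimal, self-contained demonstration of the snapshot subsampling used above: pick num_samples roughly evenly spaced indices into the snapshot list (the counts are illustrative):

import numpy as np

num_snapshots, num_samples = 137, 10
indices = sorted(set(np.linspace(0, num_snapshots - 1, num_samples,
                                 dtype=int, endpoint=True)))
print(indices)  # 10 indices spread from 0 to 136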
Example #17
def make_grammar_df(base_path, names, clusterings, overwrite):
    cost_dict_pickle_fname = join(base_path, 'input', 'cost_dict.pkl')
    root_dict_pickle_fname = join(base_path, 'input', 'root_dict.pkl')
    cost_dict = load_pickle(cost_dict_pickle_fname)
    root_dict = load_pickle(root_dict_pickle_fname)
    if root_dict is None:
        root_dict = {}
        recompute = True
    else:
        recompute = False
        for name in names:
            if name not in root_dict:
                recompute = True
                continue
            for clustering in clusterings:
                if clustering not in root_dict[name]:
                    recompute = True

    if recompute:
        for name in tqdm(names, desc='Name'):
            orig_graph, attr_name = get_graph(name, basedir=base_path)
            if name not in root_dict:
                root_dict[name] = {}
            for clustering in tqdm(clusterings, desc='Clustering',
                                   leave=False):
                if clustering in root_dict[name]:
                    continue
                root = load_pickle(
                    f'{base_path}/output/trees/{name}/{clustering}_list.pkl')
                if root is None:
                    continue
                if isinstance(root, list):
                    root = create_tree(root)
                ht, avg_branch_factor, _ = get_tree_stats(g=orig_graph,
                                                          root=root,
                                                          cost=False)
                dc = cost_dict[name][clustering]
                root_dict[name][clustering] = ht, avg_branch_factor, dc
            dump_pickle(root_dict, root_dict_pickle_fname)

    print(root_dict)
    dl_dict = {}

    for name in names:
        temp_fname = f'{base_path}/stats/temp/_grammar_df_{name}.csv'
        if Path(temp_fname).exists() and not overwrite:
            print(f'Skipping {name!r}')
            continue

        orig_graph, attr_name = get_graph(name, basedir=base_path)
        dl_dict[name] = graph_mdl(orig_graph, attributed=True)
        rows = []

        print('\n\n', name)
        files = glob.glob(f'{base_path}/output/grammars/{name}/*.pkl')
        for fname in tqdm(files, total=len(files), desc=f'{name}'):
            path = Path(fname)
            pattern = r'(\w+)_(.+)\_(\w+)_(.+).*'
            m = re.match(pattern, path.stem)
            if m is None:
                continue
            grammar_type, extract_type, clustering, mu = m.groups()
            if clustering not in clusterings:  # skip over clusterings we dont care about
                continue

            if grammar_type.startswith('VRG'):  # skip over regular VRGs
                continue

            tqdm.write(f'{grammar_type}, {extract_type}, {clustering}, {mu}')
            ht, avg_branch_factor, dc = root_dict[name][clustering]

            vrg = load_pickle(fname)
            if vrg is None:
                continue
            graph_dl = dl_dict[name]

            row = dict(name=name,
                       orig_n=orig_graph.order(),
                       orig_m=orig_graph.size(),
                       grammar_type=grammar_type,
                       extract_type=vrg.extract_type,
                       mu=int(mu),
                       clustering=clustering,
                       cost=dc,
                       branch_factor=avg_branch_factor,
                       height=ht,
                       graph_dl=graph_dl,
                       num_rules=vrg.num_rules,
                       unique_rules=len(vrg.unique_rule_list),
                       grammar_dl=vrg.cost)
            rows.append(row)
        temp_df = pd.DataFrame(rows)
        temp_df.to_csv(temp_fname, index=False)

    # for name in names:
    #     temp_fname = f'{base_path}/stats/_grammar_df_{name}.csv'
    #     if Path(temp_fname).exists():
    #         os.remove(temp_fname)
    return pd.DataFrame(rows)
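A quick check of the filename convention that make_grammar_df parses from each grammar pickle; the stem below is illustrative:

import re

stem = 'AVRG_mu-random_leiden_5'  # illustrative grammar-pickle stem
m = re.match(r'(\w+)_(.+)\_(\w+)_(.+).*', stem)
print(m.groups())  # ('AVRG', 'mu-random', 'leiden', '5')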