def colored_motifs():
    basedir = '/data/ssikdar/Attributed-VRG/'
    names = ['polbooks', 'football', 'wisconsin', 'texas', 'cornell', 'polblogs']
    # names = ['citeseer', 'cora', 'airports']
    models = ['AVRG', 'CL', 'AGM', 'DC-SBM', 'CELL', 'NetGAN', 'original']

    for name in names:
        for model in models:
            print(f'Running {name!r} {model!r}')
            if model == 'AVRG':
                model_ = 'AVRG-fancy_mu-random_leiden_5'
            else:
                model_ = model
            graphs_filename = join(basedir, 'output/graphs/', name, f'{model_}_10.pkl')

            if model == 'original':
                graphs = [nx.read_gml(join(basedir, 'input', f'{name}.gml'))]
            else:
                graphs = load_pickle(graphs_filename)
            if graphs is None:
                continue

            # batch_motif_counter(name, model, basedir, overwrite=False, graphs=None, motif_filename=None)
            batch_motif_counter(name=name, model=model, basedir=basedir, graphs=graphs, overwrite=False)
    return


def autoencoders(outdir, name, model):
    model_path = join(outdir, 'output', 'other_models', 'autoencoders')
    # if not Path(model_path).exists():
    #     os.makedirs(model_path)
    model_path = join(model_path, f'{name}_{model}_mat.pkl')
    graphs_path = join(outdir, 'output', 'graphs', name, f'{model}_10.pkl')
    # if Path(graphs_path).exists():
    #     return

    input_g, _ = get_graph(name, basedir=outdir)  # needed below for node attributes and model fitting

    if Path(model_path).exists():
        thresh_mat = load_pickle(model_path)
        graphs = []
        ns, ms = [], []
        for _ in range(10):
            g = get_graph_from_prob_matrix(thresh_mat, thresh=0.5)
            nx.set_node_attributes(g, name='value', values=nx.get_node_attributes(input_g, 'value'))
            ns.append(g.order())
            ms.append(g.size())
            graphs.append(g)
        print('Avg n, m', np.round(np.mean(ns), 3), np.round(np.mean(ms), 3))
        dump_pickle(graphs, graphs_path)
        return

    from other_models.autoencoders.fit import fit_model
    _, thresh_mat = fit_model(g=input_g, model_name=model)
    dump_pickle(thresh_mat, model_path)
    return


def main():
    machine_name, outdir = get_machine_name_and_outdir()
    names = ['karate', 'football', 'polbooks', 'wisconsin', 'texas', 'film', 'cornell', 'cora',
             'citeseer', 'airports', 'polblogs', 'chameleon', 'pubmed', 'squirrel']
    clusterings = ['cond', 'spectral', 'leiden', 'louvain', 'infomap', 'labelprop', 'random',
                   'leadingeig', 'consensus'][:-1]

    for name in names:
        g, _ = get_graph(name, basedir=outdir)
        make_dirs(outdir=outdir, name=name)
        for clustering in clusterings:
            tree = load_pickle(join(outdir, 'output', 'trees', name, f'{clustering}_list.pkl'))
            if tree is None:
                continue
            root = create_tree(tree)
            faulty_tnodes = tree_okay(root=root, g=g)
            if faulty_tnodes > 0:
                print(f'{name}\t{clustering}\t{faulty_tnodes:,d} errors')
    return


def generate_graphs(name: str, grammar: AttributedVRG, num_graphs: int, extract_type: str, gen_type: str,
                    basedir: str, graphs_filename: str, mixing_dict: Union[None, Dict] = None,
                    attr_name: Union[str, None] = None, fancy=None, inp_deg_ast: float = None,
                    inp_attr_ast: float = None, use_pickle: bool = False, save_snapshots: bool = False,
                    alpha: Union[None, float] = None, write_pickle: bool = True) -> List[nx.Graph]:
    # make_dirs(outdir=outdir, name=name)
    # if fancy and grammar_type == 'AVRG': grammar_type += '-fancy'
    # if alpha is not None: grammar_type += f'-{int(alpha * 100)}'
    gen_filename = f'{basedir}/output/generators/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'

    if use_pickle and check_file_exists(graphs_filename):
        if not save_snapshots:
            logging.error(f'Graph pickle found! {graphs_filename!r}')
            return load_pickle(graphs_filename)
        if save_snapshots and check_file_exists(gen_filename):
            logging.error(f'Gen pickle found, skipping: {gen_filename!r}')
            return load_pickle(graphs_filename)

    logging.error(f'Graphs filename: {graphs_filename!r}')

    if isinstance(grammar, AttributedVRG):
        assert attr_name != ''
        # assert fancy is not None
        if 'greedy' in gen_type:
            assert inp_attr_ast is not None and inp_deg_ast is not None
            gen = GreedyAttributeRandomGenerator(grammar=grammar, mixing_dict=mixing_dict, attr_name=attr_name,
                                                 inp_attr_ast=inp_attr_ast, inp_deg_ast=inp_deg_ast,
                                                 save_snapshots=save_snapshots, alpha=alpha)
        else:
            gen = AttributedRandomGenerator(grammar=grammar, mixing_dict=mixing_dict, attr_name=attr_name,
                                            use_fancy_rewiring=fancy, save_snapshots=save_snapshots)
    elif isinstance(grammar, VRG):
        gen = RandomGenerator(grammar=grammar, save_snapshots=save_snapshots)
    elif isinstance(grammar, NCE):
        gen = NCEGenerator(grammar=grammar)
    else:
        raise NotImplementedError(f'Invalid grammar type {type(grammar)!r}')

    graphs = gen.generate(num_graphs=num_graphs)
    if write_pickle:
        dump_pickle(graphs, graphs_filename)
    if save_snapshots:
        dump_pickle(gen, gen_filename)
    return graphs


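# Example usage of generate_graphs: a minimal sketch, not a runnable driver. The basedir,
# grammar pickle, and output filename below are hypothetical placeholders; the keyword
# arguments mirror the signature above, and the filename follows the convention used by
# batch_generator_runner further down.
#
#   basedir = '/data/ssikdar/Attributed-VRG'   # hypothetical path
#   grammar = load_pickle(f'{basedir}/output/grammars/polbooks/AVRG_mu-random_leiden_5.pkl')
#   graphs = generate_graphs(name='polbooks', grammar=grammar, num_graphs=10,
#                            extract_type='mu-random', gen_type='AVRG-fancy', basedir=basedir,
#                            graphs_filename=f'{basedir}/output/graphs/polbooks/AVRG-fancy_mu-random_leiden_5_10.pkl',
#                            mixing_dict=None, attr_name='value', fancy=True, use_pickle=True)

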
def batch_synthetic_generator_runner():
    # frac = np.linspace(0, 1, 21, endpoint=True) * 100
    frac = np.linspace(0, 100, 11, endpoint=True, dtype=int)  # change it to increments of 10 for now
    names = [f'toy-comm-{f}' for f in frac]
    # names = ['karate', 'football', 'polbooks', 'eucore', 'flights', 'chess', 'polblogs']
    num_graphs = 5
    outdir = '/data/ssikdar/attributed-vrg/dumps'
    use_pickle = True
    save_snapshots = False
    shuffle = 'edges'

    args = []
    for name in names:
        # input_graph, attr_name = get_graph(name)
        input_graph, attr_name = nx.read_gexf(f'./input/shuffled/{shuffle}/{name}.gexf', node_type=int), 'block'
        name = f'{name}-{shuffle}'

        if attr_name == '':
            mix_dict, inp_deg_ast, inp_attr_ast = None, None, None
        else:
            mix_dict = get_mixing_dict(input_graph, attr_name=attr_name)
            inp_deg_ast = nx.degree_assortativity_coefficient(input_graph)
            inp_attr_ast = nx.attribute_assortativity_coefficient(input_graph, attr_name)

        for grammar_filename in glob(f'{outdir}/grammars/{name}/*'):
            grammar = load_pickle(grammar_filename)
            if isinstance(grammar, AttributedVRG):
                grammar_type = 'AVRG'
                fancy = True
                args.append((name, grammar, num_graphs, grammar_type, outdir, mix_dict, attr_name, fancy,
                             inp_deg_ast, inp_attr_ast, use_pickle, save_snapshots))

                grammar_type = 'AVRG-greedy'
                # args.append((name, grammar, num_graphs, grammar_type, outdir, mix_dict, attr_name, fancy,
                #              inp_deg_ast, inp_attr_ast, use_pickle, save_snapshots))
                for alpha in (0, 0.5, 1):
                    args.append((name, grammar, num_graphs, grammar_type, outdir, mix_dict, attr_name, fancy,
                                 inp_deg_ast, inp_attr_ast, use_pickle, save_snapshots, alpha))
            else:
                assert isinstance(grammar, VRG)
                grammar_type = 'VRG'
                fancy = None
                args.append((name, grammar, num_graphs, grammar_type, outdir, mix_dict, attr_name, fancy,
                             inp_deg_ast, inp_attr_ast, use_pickle, save_snapshots))

    parallel_async(func=generate_graphs, args=args, num_workers=10)
    # generate_graphs(grammar: Union[VRG, NCE, AttributedVRG], num_graphs: int, grammar_type: str, outdir: str = 'dumps',
    #                 mixing_dict: Union[None, Dict] = None, attr_name: Union[str, None] = None, fancy=None,
    #                 inp_deg_ast: float = None, inp_attr_ast: float = None)
    return


def get_grammars(name: str, grammar_type: str, extract_type: str, clustering: str, mu: int,
                 input_graph: nx.Graph, use_grammar_pickle: bool, use_cluster_pickle: bool, attr_name: str,
                 outdir: str, count: int = 1, grammar_filename: str = '', write_pickle: bool = True,
                 list_of_list_clusters=None) -> List[Union[VRG, NCE]]:
    """
    Extract (or load from pickle) `count` grammars for the given graph, clustering, and extraction settings.
    :return: list of grammars
    """
    if input_graph.name != name:
        input_graph.name = name
    # make_dirs(outdir, name)  # make the directories if needed
    # print(f'Extracting {count} grammars')

    grammars = []
    for i in range(count):
        if grammar_filename == '':
            raise Exception('filename empty')
        if use_grammar_pickle and check_file_exists(grammar_filename):
            logging.error(f'Using pickled grammar from {grammar_filename!r}')
            grammar = load_pickle(grammar_filename)
        else:
            if list_of_list_clusters is None:
                list_of_list_filename = os.path.join(outdir, 'output', 'trees', input_graph.name,
                                                     f'{clustering}_list.pkl')
                if not Path(list_of_list_filename).exists():
                    logging.error(f'Skipping grammar, name {input_graph.name!r} clustering {clustering!r}')
                    continue
                list_of_list_clusters = get_clustering(g=input_graph, outdir=outdir, clustering=clustering,
                                                       use_pickle=use_cluster_pickle,
                                                       filename=list_of_list_filename)
            root = create_tree(list_of_list_clusters) if isinstance(list_of_list_clusters, list) else list_of_list_clusters
            # dc = dasgupta_cost(g=g, root=root, use_parallel=True)
            lmg: LightMultiGraph = nx_to_lmg(nx_g=input_graph)

            logging.error(f'Extracting grammar: {grammar_filename}')
            if grammar_type == 'VRG':
                extractor = VRGExtractor(g=lmg, extract_type=extract_type, mu=mu, root=root, clustering=clustering)
            elif grammar_type == 'NCE':
                extractor = NCEExtractor(g=lmg, extract_type=extract_type, mu=mu, root=root, clustering=clustering)
            elif grammar_type == 'AVRG':
                assert attr_name != ''
                extractor = AVRGExtractor(g=lmg, attr_name=attr_name, extract_type=extract_type,
                                          clustering=clustering, mu=mu, root=root)
            else:
                raise NotImplementedError(f'Invalid grammar type {grammar_type!r}')

            grammar = extractor.extract()
            logging.error(str(grammar))
            if write_pickle:
                dump_pickle(grammar, grammar_filename)
        grammars.append(grammar)
    return grammars


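# Example usage of get_grammars: a minimal sketch; 'polbooks', the clustering, and the
# grammar_filename below are illustrative, and outdir stands in for the dumps directory.
#
#   input_graph, attr_name = get_graph('polbooks', basedir=outdir)
#   grammars = get_grammars(name='polbooks', grammar_type='AVRG', extract_type='mu_random',
#                           clustering='leiden', mu=5, input_graph=input_graph,
#                           use_grammar_pickle=True, use_cluster_pickle=True,
#                           attr_name=attr_name, outdir=outdir,
#                           grammar_filename=f'{outdir}/output/grammars/polbooks/AVRG_mu-random_leiden_5.pkl')

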
def read_batched_graphs(basedir, name):
    input_graphs = load_pickle(join(basedir, 'input', f'{name}.graphs'))

    cleaned_graphs = []
    for i, g in enumerate(input_graphs):
        g.remove_edges_from(nx.selfloop_edges(g))
        if not nx.is_connected(g):
            nodes_lcc = max(nx.connected_components(g), key=len)
            g = g.subgraph(nodes_lcc).copy()
        g = nx.convert_node_labels_to_integers(g, label_attribute='orig_label')
        g.name = f'{name}_{i}'
        cleaned_graphs.append(g)
    return cleaned_graphs


def batched_graphs_generator(basedir, clusterings, name, mus=None):
    # num_graphs = 5 if 'polblogs' in name else 10
    num_graphs = 10
    use_pickle = True
    save_snapshots = False
    attr_name = 'value'
    mus = [5]
    alpha = None

    input_graphs = read_batched_graphs(basedir=basedir, name=name)
    extract_types = ['mu_random']

    args = []
    for i, input_graph in enumerate(input_graphs):
        mix_dict = get_mixing_dict(input_graph, attr_name=attr_name)
        inp_deg_ast = nx.degree_assortativity_coefficient(input_graph)
        inp_attr_ast = nx.attribute_assortativity_coefficient(input_graph, attr_name)

        for grammar_filename in glob(f'{basedir}/output/grammars/{name}/*_{i}.pkl'):
            grammar = load_pickle(grammar_filename)
            if grammar.mu not in mus or grammar.clustering not in clusterings or grammar.extract_type not in extract_types:
                continue
            extract_type = grammar.extract_type.replace('_', '-')

            if isinstance(grammar, AttributedVRG):
                for gen_type, fancy in zip(('AVRG-regular', 'AVRG-fancy'), (False, True)):
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}_{i}.pkl'
                    args.append((name, grammar, num_graphs, extract_type, gen_type, basedir, graphs_filename,
                                 mix_dict, attr_name, fancy, inp_deg_ast, inp_attr_ast, use_pickle,
                                 save_snapshots, alpha))

                for alpha, gen_type in zip((0, 0.5, 1), ('AVRG-greedy-attr', 'AVRG-greedy-50', 'AVRG-greedy-deg')):
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}_{i}.pkl'
                    args.append((name, grammar, num_graphs, extract_type, gen_type, basedir, graphs_filename,
                                 mix_dict, attr_name, fancy, inp_deg_ast, inp_attr_ast, use_pickle,
                                 save_snapshots, alpha))

    # random.shuffle(args)
    parallel_async(func=generate_graphs, args=args, num_workers=8)
    return


def batched_graphs_grammars(basedir, name, clusterings):
    input_graphs = read_batched_graphs(basedir=basedir, name=name)

    attr_name = 'value'
    grammar_types = ['AVRG']  # ['VRG', 'AVRG']
    extract_types = ['mu_random']  # , 'mu_level', 'all_tnodes']
    mus = [5]
    use_cluster_pickle = True
    use_grammar_pickle = True
    count = 1

    args = []
    for i, input_graph in enumerate(input_graphs):
        for clustering in clusterings:
            list_of_list_clusters = load_pickle(join(basedir, 'output', 'trees', name, f'{clustering}_{i}.pkl'))
            for grammar_type in grammar_types:
                for extract_type in extract_types:
                    extract = extract_type.replace('_', '-')
                    for mu in mus:
                        grammar_filename = f'{basedir}/output/grammars/{name}/{grammar_type}_{extract}_{clustering}_{mu}_{i}.pkl'
                        arg = (name, grammar_type, extract_type, clustering, mu, input_graph, True, True,
                               attr_name, basedir, 1, grammar_filename, True, list_of_list_clusters)
                        args.append(arg)
                        if extract_type == 'all_tnodes':  # here mu is not important for all_tnodes
                            break

    # print(args[:3])
    try:
        parallel_async(func=get_grammars, args=args, num_workers=5)
    except Exception as e:
        print(e)
    return


def get_clustering(g: nx.Graph, outdir: str, clustering: str, use_pickle: bool, filename='',
                   write_pickle: bool = True) -> Any:
    """
    Wrapper method for getting the dendrogram. Uses an existing pickle if it can.
    :param g: graph
    :param outdir: output directory where pickles are stored
    :param clustering: name of clustering method
    :param use_pickle: flag for whether or not to use the pickle
    :return: nested list-of-lists representation of the dendrogram
    """
    if g.name == 'sample':
        list_of_list_clusters = [
            [
                [[0], [1]],
                [[2], [[3], [4]]]
            ],
            [
                [[5], [6]],
                [[7], [8]]
            ]
        ]
        return list_of_list_clusters

    if filename == '':
        list_of_list_filename = os.path.join(outdir, 'output', 'trees', g.name, f'{clustering}_list.pkl')
    else:
        list_of_list_filename = filename

    if check_file_exists(list_of_list_filename) and use_pickle:
        logging.error(f'Using existing pickle for {clustering!r} clustering\n')
        list_of_list_clusters = load_pickle(list_of_list_filename)
    else:
        tqdm.write(f'Running {clustering!r} clustering on {g.name!r}...')
        if clustering == 'random':
            list_of_list_clusters = partitions.get_random_partition(g)
        elif clustering == 'consensus':
            # delete the matlab tree and sc files
            matlab_files_path = './src/matlab_clustering/HierarchicalConsensus/data'
            tree_path = os.path.join(matlab_files_path, f'{g.name}_tree.mat')
            sc_path = os.path.join(matlab_files_path, f'{g.name}_sc.vec')
            if check_file_exists(tree_path):
                os.remove(tree_path)
            if check_file_exists(sc_path):
                os.remove(sc_path)
            list_of_list_clusters = get_consensus_root(g=g, gname=g.name)
        elif clustering in ('leiden', 'louvain', 'infomap', 'labelprop', 'leadingeig'):
            try:
                list_of_list_clusters = partitions.louvain_leiden_infomap_label_prop(g, method=clustering)
            except Exception:
                list_of_list_clusters = []
        elif clustering == 'cond':
            list_of_list_clusters = partitions.approx_min_conductance_partitioning(g)
        elif clustering == 'spectral':
            list_of_list_clusters = partitions.spectral_kmeans(g, K=int(math.sqrt(g.order() // 2)))
        else:
            raise NotImplementedError(f'Invalid clustering algorithm {clustering!r}')

        if len(list_of_list_clusters) != 0 and write_pickle:
            dump_pickle(list_of_list_clusters, list_of_list_filename)
    return list_of_list_clusters


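# The nested list-of-lists returned by get_clustering encodes the dendrogram: singleton lists hold
# graph nodes and nested lists are internal tree nodes (see the 'sample' structure above). A minimal
# sketch of turning it into a dendrogram root, as get_grammars does (the clustering choice is illustrative):
#
#   list_of_list_clusters = get_clustering(g=input_graph, outdir=outdir, clustering='leiden',
#                                          use_pickle=True)
#   root = create_tree(list_of_list_clusters)

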
def _partition_graph(self, test_frac=.1, val_frac=.05, prevent_disconnect=True, verbose=False,
                     use_pickle=False):
    # NOTE: Splits are randomized and results might slightly deviate from reported numbers in the paper.
    # taken from https://github.com/lucashu1/link-prediction/blob/master/gae/preprocessing.py
    if self.splits_filename is None:
        self.splits_filename = join(self.outdir, 'output', 'splits',
                                    f'{self.dataset}_{int(test_frac * 100)}_{int(val_frac * 100)}')
    splits_filename = self.splits_filename

    if use_pickle and check_file_exists(splits_filename):
        logging.error(f'Using pickle at {splits_filename!r}')
        adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
            test_edges, test_edges_false = load_pickle(splits_filename)
    else:
        g = nx.Graph(self.input_graph)
        adj = nx.to_scipy_sparse_matrix(g)
        orig_num_cc = nx.number_connected_components(g)

        adj_triu = sp.triu(adj)  # upper triangular portion of adj matrix
        adj_tuple = sparse_to_tuple(adj_triu)  # (coords, values, shape), edges only 1 way
        edges = adj_tuple[0]  # all edges, listed only once (not 2 ways)
        # edges_all = sparse_to_tuple(adj)[0]  # ALL edges (includes both ways)
        num_test = int(np.floor(edges.shape[0] * test_frac))  # controls how large the test set should be
        num_val = int(np.floor(edges.shape[0] * val_frac))  # controls how large the validation set should be

        # Store edges in list of ordered tuples (node1, node2) where node1 < node2
        edge_tuples = [(min(edge[0], edge[1]), max(edge[0], edge[1])) for edge in edges]
        all_edge_tuples = set(edge_tuples)
        train_edges = set(edge_tuples)  # initialize train_edges to have all edges
        test_edges = set()
        val_edges = set()

        if verbose:
            print('generating test/val sets...', end=' ', flush=True)

        # Iterate over shuffled edges, add to train/val sets
        np.random.shuffle(edge_tuples)
        for edge in edge_tuples:
            node1, node2 = edge
            g.remove_edge(node1, node2)

            # If removing edge would disconnect a connected component, backtrack
            if prevent_disconnect:
                if nx.number_connected_components(g) > orig_num_cc:
                    g.add_edge(node1, node2)
                    continue

            # Fill test_edges first
            if len(test_edges) < num_test:
                test_edges.add(edge)
                train_edges.remove(edge)
            # Then, fill val_edges
            elif len(val_edges) < num_val:
                val_edges.add(edge)
                train_edges.remove(edge)
            # Both edge lists full --> break loop
            elif len(test_edges) == num_test and len(val_edges) == num_val:
                break

        if (len(val_edges) < num_val) or (len(test_edges) < num_test):
            print('WARNING: not enough removable edges to perform full train-test split!')
            print(f'Num. (test, val) edges requested: {num_test, num_val}')
            print(f'Num. (test, val) edges returned: {len(test_edges), len(val_edges)}')

        if prevent_disconnect:
            assert nx.number_connected_components(g) == orig_num_cc

        if verbose:
            print('creating false test edges...', end=' ', flush=True)

        test_edges_false = set()
        while len(test_edges_false) < num_test:
            idx_i = np.random.randint(0, adj.shape[0])
            idx_j = np.random.randint(0, adj.shape[0])
            if idx_i == idx_j:
                continue
            false_edge = (min(idx_i, idx_j), max(idx_i, idx_j))
            # Make sure false_edge not an actual edge, and not a repeat
            if false_edge in all_edge_tuples:
                continue
            if false_edge in test_edges_false:
                continue
            test_edges_false.add(false_edge)

        if verbose:
            print('creating false val edges...', end=' ', flush=True)

        val_edges_false = set()
        while len(val_edges_false) < num_val:
            idx_i = np.random.randint(0, adj.shape[0])
            idx_j = np.random.randint(0, adj.shape[0])
            if idx_i == idx_j:
                continue
            false_edge = (min(idx_i, idx_j), max(idx_i, idx_j))
            # Make sure false_edge is not an actual edge, not in test_edges_false, not a repeat
            if false_edge in all_edge_tuples or \
                    false_edge in test_edges_false or \
                    false_edge in val_edges_false:
                continue
            val_edges_false.add(false_edge)

        if verbose:
            print('creating false train edges...')

        train_edges_false = set()
        while len(train_edges_false) < len(train_edges):
            idx_i = np.random.randint(0, adj.shape[0])
            idx_j = np.random.randint(0, adj.shape[0])
            if idx_i == idx_j:
                continue
            false_edge = (min(idx_i, idx_j), max(idx_i, idx_j))
            # Make sure false_edge is not an actual edge, not in test_edges_false,
            # not in val_edges_false, not a repeat
            if false_edge in all_edge_tuples or \
                    false_edge in test_edges_false or \
                    false_edge in val_edges_false or \
                    false_edge in train_edges_false:
                continue
            train_edges_false.add(false_edge)

        if verbose:
            print('final checks for disjointness...', end=' ', flush=True)

        # assert: false_edges are actually false (not in all_edge_tuples)
        assert test_edges_false.isdisjoint(all_edge_tuples)
        assert val_edges_false.isdisjoint(all_edge_tuples)
        assert train_edges_false.isdisjoint(all_edge_tuples)

        # assert: test, val, train false edges disjoint
        assert test_edges_false.isdisjoint(val_edges_false)
        assert test_edges_false.isdisjoint(train_edges_false)
        assert val_edges_false.isdisjoint(train_edges_false)

        # assert: test, val, train positive edges disjoint
        assert val_edges.isdisjoint(train_edges)
        assert test_edges.isdisjoint(train_edges)
        assert val_edges.isdisjoint(test_edges)

        if verbose:
            print('creating adj_train...', end=' ', flush=True)

        # Re-build adj matrix using remaining graph
        adj_train = nx.adjacency_matrix(g)

        # Convert edge-lists to numpy arrays
        train_edges = np.array([list(edge_tuple) for edge_tuple in train_edges])
        train_edges_false = np.array([list(edge_tuple) for edge_tuple in train_edges_false])
        val_edges = np.array([list(edge_tuple) for edge_tuple in val_edges])
        val_edges_false = np.array([list(edge_tuple) for edge_tuple in val_edges_false])
        test_edges = np.array([list(edge_tuple) for edge_tuple in test_edges])
        test_edges_false = np.array([list(edge_tuple) for edge_tuple in test_edges_false])

        if verbose:
            print('Done with train-test split!')

        # NOTE: these edge lists only contain single direction of edge!
        dump_pickle((adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges,
                     test_edges_false), splits_filename)

    logging.error(f'train (T/F): {len(train_edges)} valid: {len(val_edges)} ({val_frac * 100}%) '
                  f'test: {len(test_edges)} ({test_frac * 100}%)')
    return adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false


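# Sketch of consuming the split; the unpacking order matches the return statement above, and
# link_pred stands for a LinkPrediction instance as constructed further below.
#
#   adj_train, train_edges, train_edges_false, val_edges, val_edges_false, \
#       test_edges, test_edges_false = link_pred._partition_graph(test_frac=0.1, val_frac=0.05,
#                                                                 use_pickle=True)

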
    ]  # 'netgan', 'cell', ]
    # names = ['citeseer']
    models = ['cell']

    for name in names:
        name_fname = join(basedir, 'stats/link_pred', f'{name}.csv')
        orig_g, att_name = get_graph(name, basedir=basedir)
        model_dfs = []
        trials = 10
        test_frac, val_frac = 0.1, 0.05

        for model in models:
            model_rows = []
            model_fname = join(basedir, 'stats/link_pred', f'{name}_{model}.csv')
            if Path(model_fname).exists():
                model_df = load_pickle(model_fname)
                continue
            for trial in range(1, trials + 1):
                splits_filename = join(basedir, 'output', 'splits',
                                       f'{name}_{int(test_frac * 100)}_{int(val_frac * 100)}_{trial}.pkl')
                link_pred = LinkPrediction(input_graph=orig_g, test_valid_split=(test_frac, val_frac),
                                           dataset=name, use_pickle=True, outdir=basedir,
                                           splits_filename=splits_filename)  # use a diff split each time
def make_graph_df(name, fname, orig_graph, mu, clustering, attr_name, grammar_type, bipartite=False):
    deg_ast_fn = nx.degree_assortativity_coefficient
    attr_ast_fn = nx.attribute_assortativity_coefficient

    gen_graphs = load_pickle(fname)
    if gen_graphs is None:
        return pd.DataFrame()

    cols = ['name', 'orig_n', 'orig_m', 'orig_degree_ast', 'orig_sp_ast_50', 'orig_sp_ast_100',
            'orig_sp_ast_500', 'attr_name', 'orig_attr_ast', 'model', 'mu', 'clustering', 'gen_n', 'gen_m',
            'gen_degree_ast', 'gen_sp_ast_50', 'gen_sp_ast_100', 'gen_sp_ast_500', 'gen_attr_ast',
            'total_rewired_edges', 'fancy_rewired_edges', 'degree_js', 'pagerank_js', 'lambda_dist',
            'deg_ast_diff', 'attr_ast_diff', 'is_bipartite']
    row = {col: np.nan for col in cols}

    orig_deg_ast = deg_ast_fn(orig_graph)
    orig_attr_ast = attr_ast_fn(orig_graph, attr_name) if attr_name != '' else np.nan
    orig_gstats = GraphStats(orig_graph)
    orig_sp_ast_50 = orig_gstats.shortest_path_ast(alpha=0.5)
    orig_sp_ast_100 = orig_gstats.shortest_path_ast(alpha=1)
    orig_sp_ast_500 = orig_gstats.shortest_path_ast(alpha=5)

    orig_h_dict = get_compatibility_matrix(orig_graph, attr_name)
    orig_h = orig_h_dict['homophily_ratio']
    orig_h_mat = orig_h_dict['compatibility_mat']
    orig_h_map = orig_h_dict['mapping']
    orig_is_bip = nx.algorithms.bipartite.is_bipartite(orig_graph) if bipartite else np.nan

    rows = []
    for g in gen_graphs:
        gen_gstats = GraphStats(g)
        gpc = GraphPairCompare(orig_gstats, gen_gstats)

        gen_deg_ast = deg_ast_fn(g)
        gen_sp_ast_50 = gen_gstats.shortest_path_ast(alpha=0.5)
        gen_sp_ast_100 = gen_gstats.shortest_path_ast(alpha=1)
        gen_sp_ast_500 = gen_gstats.shortest_path_ast(alpha=5)
        gen_attr_ast = attr_ast_fn(g, attr_name) if attr_name != '' else np.nan
        total_rewired_edges = g.graph.get('total_rewirings', 0)
        fancy_rewired_edges = g.graph.get('fancy_rewirings', 0)

        h_dict = get_compatibility_matrix(g, attr_name)
        h = h_dict['homophily_ratio']
        h_mat = h_dict['compatibility_mat']
        h_map = h_dict['mapping']
        gen_is_bip = nx.algorithms.bipartite.is_bipartite(g) if bipartite else np.nan

        row = dict(name=name, orig_n=orig_graph.order(), orig_m=orig_graph.size(), orig_deg_ast=orig_deg_ast,
                   orig_sp_ast_50=orig_sp_ast_50, orig_sp_ast_100=orig_sp_ast_100,
                   orig_sp_ast_500=orig_sp_ast_500, orig_attr_ast=orig_attr_ast, attr_name=attr_name,
                   model=grammar_type, clustering=clustering, mu=mu, orig_homophily_ratio=orig_h,
                   orig_homophily_mat=orig_h_mat, orig_homophily_map=orig_h_map, orig_is_bipartite=orig_is_bip,
                   gen_n=g.order(), gen_m=g.size(), gen_deg_ast=gen_deg_ast, gen_sp_ast_50=gen_sp_ast_50,
                   gen_sp_ast_100=gen_sp_ast_100, gen_sp_ast_500=gen_sp_ast_500, gen_attr_ast=gen_attr_ast,
                   total_rewired_edges=total_rewired_edges, fancy_rewired_edges=fancy_rewired_edges,
                   degree_js=gpc.degree_js(), pagerank_js=gpc.pagerank_js(), lambda_dist=gpc.lambda_dist(),
                   gen_homophily_ratio=h, gen_homophily_mat=h_mat, gen_homophily_map=h_map,
                   deg_mix_dist_dict=gpc.deg_mixing_dist_dict(), attr_mix_dist_dict=gpc.attr_mixing_dist_dict(),
                   gen_is_bipartite=gen_is_bip)
        rows.append(row)
    return pd.DataFrame(rows)


def make_graph_df_new(name: str, fname: str, basedir: str, orig_gstats: GraphStats, slow_stats: bool,
                      model: str = 'AVRG'):
    gen_graphs = load_pickle(fname)
    if gen_graphs is None:
        return pd.DataFrame()

    # break down the filename to figure out the different parts
    path = Path(fname)
    if model == 'AVRG':
        pattern = r'(.+)\_(.+)\_(.+)\_(.+)\_(\d+)'
        m = re.match(pattern, path.stem)
        if m is None:
            return
        gen_type, extract_type, clustering, mu, _ = m.groups()
    else:
        pattern = r'(.+)\_(\d+)'
        m = re.match(pattern, path.stem)
        if m is None:
            return
        model, _ = m.groups()
        gen_type, extract_type, clustering, mu = np.nan, np.nan, np.nan, np.nan

    if slow_stats:
        orig_sp_ast_5 = orig_gstats.shortest_path_ast(alpha=0.05, fname=join(basedir, 'input', f'{name}.gml'))
        orig_sp_ast_50 = orig_gstats.shortest_path_ast(alpha=0.5, fname=join(basedir, 'input', f'{name}.gml'))
        orig_sp_ast_100 = orig_gstats.shortest_path_ast(alpha=1, fname=join(basedir, 'input', f'{name}.gml'))
        orig_sp_ast_500 = orig_gstats.shortest_path_ast(alpha=5, fname=join(basedir, 'input', f'{name}.gml'))
        orig_sp_ast_1000 = orig_gstats.shortest_path_ast(alpha=10, fname=join(basedir, 'input', f'{name}.gml'))
        orig_apl = orig_gstats.average_path_length()
        orig_avg_cc = orig_gstats.average_clustering()

    orig_stats = _get_basic_stats(gstats=orig_gstats, kind='orig')  # add basic stats of the original graph

    rows = []
    for g in gen_graphs:
        orig_graph = orig_gstats.graph
        row = dict(name=name, orig_graph=np.nan, model=model, gen_type=gen_type, extract_type=extract_type,
                   clustering=clustering, mu=mu)
        row.update(orig_stats)  # add the original stats

        gen_gstats = GraphStats(g)
        row.update(_get_basic_stats(gen_gstats, kind='gen'))

        gpc = GraphPairCompare(orig_gstats, gen_gstats)
        row.update(dict(degree_js=gpc.degree_js(), pagerank_js=gpc.pagerank_js(), lambda_dist=gpc.lambda_dist()))

        if slow_stats:
            gen_sp_ast_5 = gen_gstats.shortest_path_ast(alpha=0.05)
            gen_sp_ast_50 = gen_gstats.shortest_path_ast(alpha=0.5)
            gen_sp_ast_100 = gen_gstats.shortest_path_ast(alpha=1)
            gen_sp_ast_500 = gen_gstats.shortest_path_ast(alpha=5)
            gen_sp_ast_1000 = gen_gstats.shortest_path_ast(alpha=10)
            gen_apl = gen_gstats.average_path_length()
            gen_avg_cc = gen_gstats.average_clustering()

            row.update(dict(orig_apl=orig_apl, orig_avg_cc=orig_avg_cc, gen_apl=gen_apl, gen_avg_cc=gen_avg_cc))
            row.update(dict(orig_sp_ast_5=orig_sp_ast_5, orig_sp_ast_50=orig_sp_ast_50,
                            orig_sp_ast_100=orig_sp_ast_100, orig_sp_ast_500=orig_sp_ast_500,
                            orig_sp_ast_1000=orig_sp_ast_1000))
            row.update(dict(gen_sp_ast_5=gen_sp_ast_5, gen_sp_ast_50=gen_sp_ast_50,
                            gen_sp_ast_100=gen_sp_ast_100, gen_sp_ast_500=gen_sp_ast_500,
                            gen_sp_ast_1000=gen_sp_ast_1000))
        rows.append(row)
    return pd.DataFrame(rows)


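# Example usage of make_graph_df_new: a sketch only. The fname follows the
# '{gen_type}_{extract_type}_{clustering}_{mu}_{num_graphs}.pkl' convention produced by
# batch_generator_runner below; the concrete graph name and parameter values are illustrative.
#
#   orig_graph, _ = get_graph('polbooks', basedir=basedir)
#   orig_gstats = GraphStats(orig_graph)
#   df = make_graph_df_new(name='polbooks',
#                          fname=f'{basedir}/output/graphs/polbooks/AVRG-fancy_mu-random_leiden_5_10.pkl',
#                          basedir=basedir, orig_gstats=orig_gstats, slow_stats=False)

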
def batch_generator_runner(names, basedir, clusterings, mus=None, extract_types=None, save_snapshots=False,
                           num_workers=10, shuffle=False):
    num_graphs = 10  # we need 1 graph to chart the progress  # TODO: change this in the future?
    use_pickle = True
    if mus is None:
        mus = list(range(3, 11)) + [-1]
    alpha = None

    args = []
    for name in names:
        input_graph, attr_name = get_graph(name, basedir=basedir)
        if input_graph.size() > 3_000:
            save_snapshots = False
        mix_dict = get_mixing_dict(input_graph, attr_name=attr_name)
        inp_deg_ast = nx.degree_assortativity_coefficient(input_graph)
        inp_attr_ast = nx.attribute_assortativity_coefficient(input_graph, attr_name)

        for grammar_filename in glob(f'{basedir}/output/grammars/{name}/*'):
            grammar = load_pickle(grammar_filename)
            extract_type = grammar.extract_type.replace('_', '-')
            if grammar.mu not in mus or grammar.clustering not in clusterings or extract_type not in extract_types:
                continue
            print(Path(grammar_filename).stem)

            if isinstance(grammar, AttributedVRG):
                for gen_type, fancy in zip(('AVRG-regular', 'AVRG-fancy'), (False, True)):
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                    args.append((name, grammar, num_graphs, extract_type, gen_type, basedir, graphs_filename,
                                 mix_dict, attr_name, fancy, inp_deg_ast, inp_attr_ast, use_pickle,
                                 save_snapshots, alpha))

                for alpha, gen_type in zip((0, 0.5, 1), ('AVRG-greedy-attr', 'AVRG-greedy-50', 'AVRG-greedy-deg')):
                    fancy = None
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                    args.append((name, grammar, num_graphs, extract_type, gen_type, basedir, graphs_filename,
                                 mix_dict, attr_name, fancy, inp_deg_ast, inp_attr_ast, use_pickle,
                                 save_snapshots, alpha))
            else:
                continue  # skip VRGs
                # assert isinstance(grammar, VRG)
                # grammar_type = 'VRG'
                # fancy = None
                # graphs_filename = f'{basedir}/output/graphs/{name}/{grammar_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                # args.append((name, grammar, num_graphs, grammar_type, outdir, mix_dict, attr_name, fancy,
                #              inp_deg_ast, inp_attr_ast, use_pickle, save_snapshots, alpha, graphs_filename))

    if shuffle:
        random.shuffle(args)
    try:
        parallel_async(func=generate_graphs, args=args, num_workers=num_workers)
    except Exception as e:
        print(e)
    return


def make_gen_df(base_path, names=None, clusterings=None, num_samples=None):
    """
    num_samples is the number of snapshots sampled per generated graph
    """
    rows = []
    cols = ['snap_id', 'name', 'model', 'clustering', 'attr_name', 'orig_n', 'orig_m', 'orig_deg_ast',
            'orig_attr_ast', 'mu', 'n', 'm', 't', 'term_graph', 'term_n', 'term_m', 'term_degree_js',
            'term_pagerank_js', 'term_lambda_dist', 'term_deg_ast', 'term_attr_ast']

    if names is None:
        names = ['karate', 'football', 'polbooks', 'us-flights', 'cora', 'citeseer', 'polblogs', 'pubmed'][:-1]
    # mus = [5, 6]
    mus = range(3, 11)
    snap_id = 0  # snap_id keeps track of the generated graphs - 10
    if clusterings is None:
        clusterings = ['cond', 'leiden', 'spectral', 'consensus']

    for name in names:
        orig_graph, attr_name = get_graph(name, basedir=base_path)
        orig_deg_ast = nx.degree_assortativity_coefficient(orig_graph)
        orig_att_ast = nx.attribute_assortativity_coefficient(orig_graph, attr_name)
        orig_gstats = GraphStats(orig_graph)

        for gen_filename in glob.glob(f'{base_path}/output/generators/{name}/*'):
            path = Path(gen_filename)
            gen: RandomGenerator = load_pickle(path)  # each gen snapshot has 10 different generations - we need maybe just 1
            if gen is None:
                continue
            print(path.stem, end='\t', flush=True)

            pattern = r'(.*)\_(\w+)\_(\d+)\_(\d+)'
            m = re.match(pattern, path.stem)
            grammar_type, clustering, mu, _ = m.groups()
            mu = int(mu)
            if mu not in mus or clustering not in clusterings:
                continue

            generated_graph_snapshots = gen.all_gen_snapshots[0]
            del gen  # delete the object to save memory

            if num_samples is None:
                num_samples = len(generated_graph_snapshots)
            indices = sorted(set(np.linspace(0, len(generated_graph_snapshots) - 1, num_samples,
                                             dtype=int, endpoint=True)))

            for t in indices:
                graph = generated_graph_snapshots[t]
                terminal_graph = filter_terminal_graph(graph)
                terminal_graph = un_nest_attr_dict(terminal_graph)

                row = {col: np.nan for col in cols}
                row.update(dict(snap_id=snap_id, name=name, model=grammar_type, clustering=clustering,
                                attr_name=attr_name, orig_n=orig_graph.order(), orig_m=orig_graph.size(),
                                orig_deg_ast=orig_deg_ast, orig_attr_ast=orig_att_ast, mu=mu, t=t,
                                n=graph.order(), m=graph.size(), term_graph=terminal_graph,
                                term_n=terminal_graph.order(), term_m=terminal_graph.size()))

                if terminal_graph.size() > 0:
                    gen_gstats = GraphStats(terminal_graph)
                    gpc = GraphPairCompare(orig_gstats, gen_gstats)
                    row.update(term_degree_js=gpc.degree_js(), term_pagerank_js=gpc.pagerank_js(),
                               term_lambda_dist=gpc.lambda_dist(),
                               term_deg_ast=nx.degree_assortativity_coefficient(terminal_graph),
                               term_attr_ast=nx.attribute_assortativity_coefficient(terminal_graph, attr_name),
                               deg_mix_dist_dict=gpc.deg_mixing_dist_dict(),
                               attr_mix_dist_dict=gpc.attr_mixing_dist_dict())
                rows.append(row)

            temp_df = pd.DataFrame(rows)
            temp_df.to_csv(f'{base_path}/stats/temp_gen_df.csv', index=False)
    return pd.DataFrame(rows)


def make_grammar_df(base_path, names, clusterings, overwrite):
    cost_dict_pickle_fname = join(base_path, 'input', 'cost_dict.pkl')
    root_dict_pickle_fname = join(base_path, 'input', 'root_dict.pkl')
    cost_dict = load_pickle(cost_dict_pickle_fname)
    root_dict = load_pickle(root_dict_pickle_fname)

    if root_dict is None:
        root_dict = {}
        recompute = True
    else:
        recompute = False
        for name in names:
            if name not in root_dict:
                recompute = True
                continue
            for clustering in clusterings:
                if clustering not in root_dict[name]:
                    recompute = True

    if recompute:
        for name in tqdm(names, desc='Name'):
            orig_graph, attr_name = get_graph(name, basedir=base_path)
            if name not in root_dict:
                root_dict[name] = {}
            for clustering in tqdm(clusterings, desc='Clustering', leave=False):
                if clustering in root_dict[name]:
                    continue
                root = load_pickle(f'{base_path}/output/trees/{name}/{clustering}_list.pkl')
                if root is None:
                    continue
                if isinstance(root, list):
                    root = create_tree(root)
                ht, avg_branch_factor, _ = get_tree_stats(g=orig_graph, root=root, cost=False)
                dc = cost_dict[name][clustering]
                root_dict[name][clustering] = ht, avg_branch_factor, dc
        dump_pickle(root_dict, root_dict_pickle_fname)
    print(root_dict)

    dl_dict = {}
    for name in names:
        temp_fname = f'{base_path}/stats/temp/_grammar_df_{name}.csv'
        if Path(temp_fname).exists() and not overwrite:
            print(f'Skipping {name!r}')
            continue

        orig_graph, attr_name = get_graph(name, basedir=base_path)
        dl_dict[name] = graph_mdl(orig_graph, attributed=True)

        rows = []
        print('\n\n', name)
        files = glob.glob(f'{base_path}/output/grammars/{name}/*.pkl')
        for fname in tqdm(files, total=len(files), desc=f'{name}'):
            path = Path(fname)
            pattern = r'(\w+)_(.+)\_(\w+)_(.+).*'
            m = re.match(pattern, path.stem)
            if m is None:
                continue
            grammar_type, extract_type, clustering, mu = m.groups()
            if clustering not in clusterings:  # skip over clusterings we don't care about
                continue
            if grammar_type.startswith('VRG'):  # skip over regular VRGs
                continue
            tqdm.write(f'{grammar_type}, {extract_type}, {clustering}, {mu}')

            ht, avg_branch_factor, dc = root_dict[name][clustering]
            vrg = load_pickle(fname)
            if vrg is None:
                continue
            graph_dl = dl_dict[name]
            row = dict(name=name, orig_n=orig_graph.order(), orig_m=orig_graph.size(),
                       grammar_type=grammar_type, extract_type=vrg.extract_type, mu=int(mu),
                       clustering=clustering, cost=dc, branch_factor=avg_branch_factor, height=ht,
                       graph_dl=graph_dl, num_rules=vrg.num_rules, unique_rules=len(vrg.unique_rule_list),
                       grammar_dl=vrg.cost)
            rows.append(row)
        temp_df = pd.DataFrame(rows)
        temp_df.to_csv(temp_fname, index=False)

    # for name in names:
    #     temp_fname = f'{base_path}/stats/_grammar_df_{name}.csv'
    #     if Path(temp_fname).exists():
    #         os.remove(temp_fname)
    return pd.DataFrame(rows)