def main():
    machine_name, outdir = get_machine_name_and_outdir()
    names = ['karate', 'football', 'polbooks', 'wisconsin', 'texas', 'film', 'cornell',
             'cora', 'citeseer', 'airports', 'polblogs', 'chameleon', 'pubmed', 'squirrel']
    clusterings = ['cond', 'spectral', 'leiden', 'louvain', 'infomap', 'labelprop',
                   'random', 'leadingeig', 'consensus'][:-1]

    for name in names:
        g, _ = get_graph(name, basedir=outdir)
        make_dirs(outdir=outdir, name=name)
        for clustering in clusterings:
            tree = load_pickle(join(outdir, 'output', 'trees', name, f'{clustering}_list.pkl'))
            if tree is None:
                continue
            root = create_tree(tree)
            faulty_tnodes = tree_okay(root=root, g=g)
            if faulty_tnodes > 0:
                print(f'{name}\t{clustering}\t{faulty_tnodes:,d} errors')
    return

def autoencoders(outdir, name, model):
    model_path = join(outdir, 'output', 'other_models', 'autoencoders')
    # if not Path(model_path).exists():
    #     os.makedirs(model_path)
    model_path = join(model_path, f'{name}_{model}_mat.pkl')
    graphs_path = join(outdir, 'output', 'graphs', name, f'{model}_10.pkl')
    # if Path(graphs_path).exists():
    #     return
    input_g, _ = get_graph(name, basedir=outdir)  # needed both for fitting and for copying node attributes

    if Path(model_path).exists():
        thresh_mat = load_pickle(model_path)
        graphs = []
        ns, ms = [], []
        for _ in range(10):
            g = get_graph_from_prob_matrix(thresh_mat, thresh=0.5)
            nx.set_node_attributes(g, name='value', values=nx.get_node_attributes(input_g, 'value'))
            ns.append(g.order())
            ms.append(g.size())
            graphs.append(g)
        print('Avg n, m', np.round(np.mean(ns), 3), np.round(np.mean(ms), 3))
        dump_pickle(graphs, graphs_path)
        return

    from other_models.autoencoders.fit import fit_model
    _, thresh_mat = fit_model(g=input_g, model_name=model)
    dump_pickle(thresh_mat, model_path)
    return

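# Hedged usage sketch (not part of the original pipeline): as written, autoencoders() is meant to
# be invoked twice per (name, model) pair -- the first call fits the model and pickles the
# probability matrix, the second call samples 10 graphs from it. The dataset, model name, and
# basedir below are illustrative placeholders.
def _example_autoencoder_run():
    basedir = '/data/ssikdar/Attributed-VRG'
    autoencoders(outdir=basedir, name='cora', model='gcn_ae')  # first call: fit and dump thresh_mat
    autoencoders(outdir=basedir, name='cora', model='gcn_ae')  # second call: sample graphs from thresh_mat
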
def make_combined_graph_dfs(basedir, names, clusterings, bipartite=False):
    dfs = []
    mus = range(3, 11)
    # mus = range(5, 8)
    possible_models = 'SBM', 'DC-SBM', 'CL', 'AGM', 'NetGAN', 'CELL'

    for name in names:  # add tqdm
        temp_fname = f'{basedir}/stats/temp/_graph_df_{name}.csv'
        if Path(temp_fname).exists():
            print(f'Skipping {name!r}')
            continue

        sub_df = []
        orig_graph, attr_name = get_graph(name, basedir=basedir)
        for fname in tqdm(glob.glob(f'{basedir}/output/graphs/{name}/*'), desc=f'{name}', ncols=100):
            path = Path(fname)
            if path.stem.startswith(possible_models) or 'ae' in path.stem:
                pattern = r'(.*)\_(\d+)'
                m = re.match(pattern, path.stem)
                if m is None:
                    continue
                grammar_type, _ = m.groups()
                mu = np.nan
                clustering = np.nan
            elif path.stem.startswith('AVRG'):
                pattern = r'(.*)\_(\w+)\_(\d+)\_(\d+)'
                m = re.match(pattern, path.stem)
                if m is None:
                    continue
                grammar_type, clustering, mu, _ = m.groups()
                mu = int(mu)
                if mu not in mus or clustering not in clusterings:
                    continue
                tqdm.write(path.stem)
            else:
                # unrecognized filename pattern; note it and skip
                print(f'Invalid model: {path.stem!r}')
                continue

            try:
                df = make_graph_df(name, fname, orig_graph, mu, clustering, attr_name,
                                   grammar_type, bipartite, basedir)
            except Exception as e:
                print(f'ERROR in graph df! {e}')
                continue
            dfs.append(df)
            sub_df.append(df)
            # df.to_csv(temp_fname, index=False)

        if len(sub_df) > 0:
            temp_df = pd.concat(sub_df, ignore_index=True)
            temp_df.to_csv(temp_fname)
            print(f'Writing {name!r} to {temp_fname!r}')
        # os.remove(temp_df_filename)

    graph_df = pd.concat(dfs, ignore_index=True)
    return graph_df

def old_main():
    basedir = '/data/ssikdar/Attributed-VRG'
    names = ['polbooks', 'football', 'wisconsin', 'texas', 'cornell', 'cora', 'citeseer',
             'airports', 'polblogs', 'film', 'chameleon', 'squirrel'][:-3]
    models = ['gcn_ae', 'gcn_vae']

    args = []
    for name in names:
        input_g, _ = get_graph(gname=name, basedir=basedir)
        for model in models:
            try:
                if model in ('netgan', 'cell'):
                    netgan_cell_runner(outdir=basedir, model=model, name=name, input_g=input_g)
                elif 'ae' in model:
                    autoencoders(outdir=basedir, name=name, model=model)
                elif model in ('SBM', 'DC-SBM', 'CL', 'AGM'):
                    graphs = get_graphs_from_models(input_graph=input_g, num_graphs=10, name=name,
                                                    model=model, outdir=basedir)
                    print(graphs)
            except Exception as e:
                print(name, model, e)
    exit(0)

    # for name in names:
    #     input_g, _ = get_graph(gname=name, basedir=basedir)
    #     for model in models:
    #         try:
    #             if model in ('netgan', 'cell'):
    #                 netgan_cell_runner(outdir=basedir, model=model, name=name, input_g=input_g)
    #             elif 'ae' in model:
    #                 autoencoders(outdir=basedir, name=name, model=model)
    #             elif model in ('SBM', 'DC-SBM', 'CL', 'AGM'):
    #                 get_graphs_from_models(input_graph=input_g, num_graphs=10, name=name, model=model, outdir=basedir)
    #         except Exception as e:
    #             print(name, model, e)
    exit(0)

    # for name in names[:]:
    #     input_graph, attr_name = get_graph(name, basedir=outdir)
    #     for model in 'SBM', 'DC-SBM', 'CL', 'AGM':
    #         try:
    #             get_graphs_from_models(input_graph=input_graph, num_graphs=10, name=name, model=model, outdir=outdir)
    #         except Exception as e:
    #             print(e)
    exit(0)

def batch_grammar_runner(names, clusterings, outdir, mus=None, extract_types=None, num_workers=8, shuffle=False):
    # grammar_types_1 = ['VRG', 'AVRG']
    grammar_types = ['AVRG']
    if extract_types is None:
        extract_types = ['mu_random', 'mu_level', 'all_tnodes']
    if mus is None:
        mus = range(3, 11)
        # mus = [5, 6]
    use_cluster_pickle = True
    use_grammar_pickle = True
    count = 1
    args = []
    write_pickle = True

    for name in names:
        input_graph, attr_name = get_graph(name, basedir=outdir)
        for clustering in clusterings:
            for grammar_type in grammar_types:
                for extract_type in extract_types:
                    for mu in mus:
                        extract = extract_type.replace('_', '-')
                        if extract_type == 'all_tnodes':
                            mu = -1
                        grammar_filename = join(outdir, 'output', 'grammars', name,
                                                f'{grammar_type}_{extract}_{clustering}_{mu}.pkl')
                        arg = (name, grammar_type, extract_type, clustering, mu, input_graph,
                               use_grammar_pickle, use_cluster_pickle, attr_name, outdir, count,
                               grammar_filename, write_pickle)
                        args.append(arg)
                        if extract_type == 'all_tnodes':  # here mu is not important for all_tnodes
                            break

    print(args[:3])
    if shuffle:
        random.shuffle(args)
    try:
        parallel_async(func=get_grammars, args=args, num_workers=num_workers)
    except Exception as e:
        print(e)

    ## get_grammars(name: str, grammar_type: str, extract_type: str, clustering: str, mu: int, input_graph: nx.Graph,
    #               use_grammar_pickle: bool, use_cluster_pickle: bool, attr_name: str, outdir: str, count: int = 1,
    #               grammar_filename: str = '', write_pickle: bool = True, list_of_list_clusters=None)
    ##
    return

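# Hedged usage sketch: extracting AVRG grammars for a couple of datasets once their dendrograms
# exist on disk. The mus, extract_types, clusterings, and worker count below are illustrative
# values, not the settings used for the reported experiments.
def _example_grammar_run():
    basedir = '/data/ssikdar/Attributed-VRG'
    batch_grammar_runner(names=['polbooks', 'football'], clusterings=['leiden', 'louvain'],
                         outdir=basedir, mus=range(3, 6), extract_types=['mu_random'],
                         num_workers=4, shuffle=True)
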
def avrg_link_pred():
    basedir = '/data/ssikdar/Attributed-VRG/'
    names = ['polbooks', 'football', 'wisconsin', 'texas', 'cornell', 'cora', 'citeseer']

    for name in names:
        input_graph, _ = get_graph(name, basedir=basedir)
        extract_type = 'mu_random'
        mu = 5
        clustering = 'leiden'

        grammar_filename = join(basedir, 'output/grammars', name, f'NCE_mu-random_{clustering}_{mu}.pkl')
        nce = get_grammars(name=name, grammar_type='NCE', extract_type=extract_type, clustering=clustering,
                           attr_name='value', input_graph=input_graph, mu=mu, outdir=basedir,
                           use_grammar_pickle=True, use_cluster_pickle=False,
                           grammar_filename=grammar_filename)[0]
        print(nce)

        # AVRG-regular_mu-random_louvain_8_10.pkl
        graphs_filename = join(basedir, 'output/graphs', name, f'NCE_mu-random_{clustering}_{mu}_10.pkl')
        nce_graphs = generate_graphs(basedir=basedir, extract_type=extract_type, gen_type='NCE',
                                     grammar=nce, graphs_filename=graphs_filename, name=name,
                                     num_graphs=10, use_pickle=True)
        for out_g in nce_graphs:
            print(f'n={out_g.order():,d}, m={out_g.size():,d}, {type(out_g)}')
        print()
    return

def batch_cluster_runner(names, outdir, clusterings=None):
    if clusterings is None:
        clusterings = ['cond', 'spectral', 'leiden', 'louvain', 'infomap', 'labelprop',
                       'random', 'leading_eig', 'consensus'][:-1]
    use_pickle = True
    args = []
    for name in names:
        g, _ = get_graph(name, basedir=outdir)
        g.name = name
        for clustering in clusterings:
            args.append((g, outdir, clustering, use_pickle, ''))
    random.shuffle(args)
    parallel_async(func=get_clustering, args=args)
    return

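# Hedged usage sketch: driving the clustering step for two small graphs before grammar
# extraction. The basedir mirrors the hard-coded path used elsewhere in this script; swap in
# your own data directory, and restrict clusterings to the methods you actually need.
def _example_cluster_run():
    basedir = '/data/ssikdar/Attributed-VRG'
    batch_cluster_runner(names=['karate', 'football'], outdir=basedir,
                         clusterings=['leiden', 'louvain'])
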
if __name__ == '__main__':
    basedir = '/data/ssikdar/Attributed-VRG'
    names = ['polbooks', 'football', 'wisconsin', 'texas', 'cornell', 'cora', 'citeseer',
             'airports', 'polblogs', 'film', 'chameleon', 'squirrel'][:5]
    models = ['AVRG', 'gcn_ae', 'gcn_vae', 'linear_ae', 'linear_vae', 'jaccard', 'adamic-adar']
    # 'netgan', 'cell', ]
    # names = ['citeseer']
    models = ['cell']

    for name in names:
        name_fname = join(basedir, 'stats/link_pred', f'{name}.csv')
        orig_g, att_name = get_graph(name, basedir=basedir)
        model_dfs = []
        trials = 10
        test_frac, val_frac = 0.1, 0.05

        for model in models:
            model_rows = []
            model_fname = join(basedir, 'stats/link_pred', f'{name}_{model}.csv')
            if Path(model_fname).exists():
                model_df = load_pickle(model_fname)
                continue
            for trial in range(1, trials + 1):
                splits_filename = join(basedir, 'output', 'splits',
                                       f'{name}_{int(test_frac*100)}_{int(val_frac*100)}_{trial}.pkl')

def batch_generator_runner(names, basedir, clusterings, mus=None, extract_types=None, save_snapshots=False,
                           num_workers=10, shuffle=False):
    num_graphs = 10  # we need 1 graph to chart the progress
    # TODO: change this in the future?
    use_pickle = True
    if mus is None:
        mus = list(range(3, 11)) + [-1]
    if extract_types is None:
        # assumed default, mirroring make_all_graph_dfs_new; extract_type below is dash-separated
        extract_types = ['mu-random', 'mu-level', 'all-tnodes']
    alpha = None
    args = []

    for name in names:
        input_graph, attr_name = get_graph(name, basedir=basedir)
        if input_graph.size() > 3_000:
            save_snapshots = False
        mix_dict = get_mixing_dict(input_graph, attr_name=attr_name)
        inp_deg_ast = nx.degree_assortativity_coefficient(input_graph)
        inp_attr_ast = nx.attribute_assortativity_coefficient(input_graph, attr_name)

        for grammar_filename in glob.glob(f'{basedir}/output/grammars/{name}/*'):
            grammar = load_pickle(grammar_filename)
            extract_type = grammar.extract_type.replace('_', '-')
            if grammar.mu not in mus or grammar.clustering not in clusterings or extract_type not in extract_types:
                continue
            print(Path(grammar_filename).stem)

            if isinstance(grammar, AttributedVRG):
                for gen_type, fancy in zip(('AVRG-regular', 'AVRG-fancy'), (False, True)):
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                    args.append((name, grammar, num_graphs, extract_type, gen_type, basedir, graphs_filename,
                                 mix_dict, attr_name, fancy, inp_deg_ast, inp_attr_ast, use_pickle,
                                 save_snapshots, alpha))
                for alpha, gen_type in zip((0, 0.5, 1), ('AVRG-greedy-attr', 'AVRG-greedy-50', 'AVRG-greedy-deg')):
                    fancy = None
                    graphs_filename = f'{basedir}/output/graphs/{name}/{gen_type}_{extract_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                    args.append((name, grammar, num_graphs, extract_type, gen_type, basedir, graphs_filename,
                                 mix_dict, attr_name, fancy, inp_deg_ast, inp_attr_ast, use_pickle,
                                 save_snapshots, alpha))
            else:
                continue  # skip VRGs
                # assert isinstance(grammar, VRG)
                # grammar_type = 'VRG'
                # fancy = None
                # graphs_filename = f'{basedir}/output/graphs/{name}/{grammar_type}_{grammar.clustering}_{grammar.mu}_{num_graphs}.pkl'
                # args.append((name, grammar, num_graphs, grammar_type, outdir, mix_dict, attr_name, fancy,
                #              inp_deg_ast, inp_attr_ast, use_pickle, save_snapshots, alpha, graphs_filename))

    if shuffle:
        random.shuffle(args)
    try:
        parallel_async(func=generate_graphs, args=args, num_workers=num_workers)
    except Exception as e:
        print(e)
    return

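# Hedged usage sketch: generating graphs from previously extracted grammars. Note that
# extract_types must use the dash-separated form ('mu-random', not 'mu_random') because the
# runner compares against grammar.extract_type with underscores replaced by dashes. All values
# below are illustrative.
def _example_generator_run():
    basedir = '/data/ssikdar/Attributed-VRG'
    batch_generator_runner(names=['polbooks'], basedir=basedir, clusterings=['leiden'],
                           mus=[5], extract_types=['mu-random'], save_snapshots=False,
                           num_workers=4)
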
def make_gen_df(base_path, names=None, clusterings=None, num_samples=None):
    """num_samples is the number of snapshots to sample from each generated graph's trajectory."""
    rows = []
    cols = ['snap_id', 'name', 'model', 'clustering', 'attr_name', 'orig_n', 'orig_m', 'orig_deg_ast',
            'orig_attr_ast', 'mu', 'n', 'm', 't', 'term_graph', 'term_n', 'term_m', 'term_degree_js',
            'term_pagerank_js', 'term_lambda_dist', 'term_deg_ast', 'term_attr_ast']
    if names is None:
        names = ['karate', 'football', 'polbooks', 'us-flights', 'cora', 'citeseer', 'polblogs', 'pubmed'][:-1]
    # mus = [5, 6]
    mus = range(3, 11)
    snap_id = 0  # snap_id tracks which of the 10 generated graphs the snapshots come from
    if clusterings is None:
        clusterings = ['cond', 'leiden', 'spectral', 'consensus']

    for name in names:
        orig_graph, attr_name = get_graph(name, basedir=base_path)
        orig_deg_ast = nx.degree_assortativity_coefficient(orig_graph)
        orig_att_ast = nx.attribute_assortativity_coefficient(orig_graph, attr_name)
        orig_gstats = GraphStats(orig_graph)

        for gen_filename in glob.glob(f'{base_path}/output/generators/{name}/*'):
            path = Path(gen_filename)
            # each generator pickle holds snapshots for 10 different generations; we may need just 1
            gen: RandomGenerator = load_pickle(path)
            if gen is None:
                continue
            print(path.stem, end='\t', flush=True)

            pattern = r'(.*)\_(\w+)\_(\d+)\_(\d+)'
            m = re.match(pattern, path.stem)
            if m is None:
                continue
            grammar_type, clustering, mu, _ = m.groups()
            mu = int(mu)
            if mu not in mus or clustering not in clusterings:
                continue

            generated_graph_snapshots = gen.all_gen_snapshots[0]
            del gen  # delete the object to save memory

            if num_samples is None:
                num_samples = len(generated_graph_snapshots)
            indices = sorted(set(np.linspace(0, len(generated_graph_snapshots) - 1, num_samples,
                                             dtype=int, endpoint=True)))

            for t in indices:
                graph = generated_graph_snapshots[t]
                terminal_graph = filter_terminal_graph(graph)
                terminal_graph = un_nest_attr_dict(terminal_graph)

                row = {col: np.nan for col in cols}
                row.update(dict(snap_id=snap_id, name=name, model=grammar_type, clustering=clustering,
                                attr_name=attr_name, orig_n=orig_graph.order(), orig_m=orig_graph.size(),
                                orig_deg_ast=orig_deg_ast, orig_attr_ast=orig_att_ast, mu=mu, t=t,
                                n=graph.order(), m=graph.size(), term_graph=terminal_graph,
                                term_n=terminal_graph.order(), term_m=terminal_graph.size()))

                if terminal_graph.size() > 0:
                    gen_gstats = GraphStats(terminal_graph)
                    gpc = GraphPairCompare(orig_gstats, gen_gstats)
                    row.update(term_degree_js=gpc.degree_js(),
                               term_pagerank_js=gpc.pagerank_js(),
                               term_lambda_dist=gpc.lambda_dist(),
                               term_deg_ast=nx.degree_assortativity_coefficient(terminal_graph),
                               term_attr_ast=nx.attribute_assortativity_coefficient(terminal_graph, attr_name),
                               deg_mix_dist_dict=gpc.deg_mixing_dist_dict(),
                               attr_mix_dist_dict=gpc.attr_mixing_dist_dict())
                rows.append(row)

        temp_df = pd.DataFrame(rows)
        temp_df.to_csv(f'{base_path}/stats/temp_gen_df.csv', index=False)
    return pd.DataFrame(rows)

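# Hedged usage sketch: building the generation-trajectory DataFrame while sampling roughly 20
# snapshots per trajectory instead of keeping every intermediate graph. The dataset and
# clustering names are illustrative.
def _example_gen_df():
    basedir = '/data/ssikdar/Attributed-VRG'
    gen_df = make_gen_df(base_path=basedir, names=['polbooks'], clusterings=['leiden'],
                         num_samples=20)
    print(gen_df[['name', 't', 'term_n', 'term_m', 'term_degree_js']].tail())
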
def make_all_graph_dfs_new(basedir, name, clusterings, models=None, final=False, slow_stats=False,
                           mus=None, extract_types=None):
    dfs = []
    if mus is None:
        mus = list(range(3, 11)) + [-1]
    if extract_types is None:
        extract_types = ['mu-random', 'mu-level', 'all-tnodes']
    if models is None:
        models = ['AVRG']
    write_every = 10

    for model in models:
        if final:
            temp_fname = f'{basedir}/stats/final/graphs/{name}_graph_df_{model}.csv'
        else:
            temp_fname = f'{basedir}/stats/temp/graphs/_graph_df_{name}.csv'
        existing_df = pd.read_csv(temp_fname) if Path(temp_fname).exists() else None

        orig_graph, attr_name = get_graph(name, basedir=basedir)
        orig_gstats = GraphStats(orig_graph)

        i = 0
        for fname in tqdm(glob.glob(f'{basedir}/output/graphs/{name}/{model}*'), desc=f'{name}', ncols=100):
            path = Path(fname)
            if not path.stem.startswith(tuple(models)):
                continue
            if model == 'AVRG':
                pattern = r'(.+)_(.+)_(.+)_(.+)_(\d+).*'
                m = re.match(pattern, path.stem)
                if m is None:
                    continue  # filename does not match the expected AVRG pattern
                gen_type, extract_type, clustering, mu, _ = m.groups()
                mu = int(mu)
                if clustering not in clusterings or mu not in mus or extract_type not in extract_types:
                    continue

            # check if the row exists already
            if existing_df is not None:
                if not existing_df[(existing_df.name == name) & (existing_df.model == model) &
                                   (existing_df.gen_type == gen_type) & (existing_df.extract_type == extract_type) &
                                   (existing_df.clustering == clustering) & (existing_df.mu == mu)].empty:
                    continue

            tqdm.write(path.stem)
            df = make_graph_df_new(basedir=basedir, fname=fname, name=name, orig_gstats=orig_gstats,
                                   model=model, slow_stats=slow_stats)

            # add df to the existing df
            if existing_df is None:
                existing_df = df
            else:
                existing_df = pd.concat([existing_df, df], ignore_index=True)

            if (i > 0) and (i % write_every == 0):
                tqdm.write(f'Writing partial results {name!r} {model!r}!')
                existing_df.to_csv(temp_fname, index=False)
            i += 1

        if existing_df is not None:  # write the accumulated df one final time
            existing_df.to_csv(temp_fname, index=False)
            print(f'Writing {name!r} {model!r} to {temp_fname!r}')
    return

def make_grammar_df(base_path, names, clusterings, overwrite):
    cost_dict_pickle_fname = join(base_path, 'input', 'cost_dict.pkl')
    root_dict_pickle_fname = join(base_path, 'input', 'root_dict.pkl')
    cost_dict = load_pickle(cost_dict_pickle_fname)
    root_dict = load_pickle(root_dict_pickle_fname)

    if root_dict is None:
        root_dict = {}
        recompute = True
    else:
        recompute = False
        for name in names:
            if name not in root_dict:
                recompute = True
                continue
            for clustering in clusterings:
                if clustering not in root_dict[name]:
                    recompute = True

    if recompute:
        for name in tqdm(names, desc='Name'):
            orig_graph, attr_name = get_graph(name, basedir=base_path)
            if name not in root_dict:
                root_dict[name] = {}
            for clustering in tqdm(clusterings, desc='Clustering', leave=False):
                if clustering in root_dict[name]:
                    continue
                root = load_pickle(f'{base_path}/output/trees/{name}/{clustering}_list.pkl')
                if root is None:
                    continue
                if isinstance(root, list):
                    root = create_tree(root)
                ht, avg_branch_factor, _ = get_tree_stats(g=orig_graph, root=root, cost=False)
                dc = cost_dict[name][clustering]
                root_dict[name][clustering] = ht, avg_branch_factor, dc
        dump_pickle(root_dict, root_dict_pickle_fname)
    print(root_dict)

    dl_dict = {}
    for name in names:
        temp_fname = f'{base_path}/stats/temp/_grammar_df_{name}.csv'
        if Path(temp_fname).exists() and not overwrite:
            print(f'Skipping {name!r}')
            continue

        orig_graph, attr_name = get_graph(name, basedir=base_path)
        dl_dict[name] = graph_mdl(orig_graph, attributed=True)
        rows = []
        print('\n\n', name)

        files = glob.glob(f'{base_path}/output/grammars/{name}/*.pkl')
        for fname in tqdm(files, total=len(files), desc=f'{name}'):
            path = Path(fname)
            pattern = r'(\w+)_(.+)\_(\w+)_(.+).*'
            m = re.match(pattern, path.stem)
            if m is None:
                continue
            grammar_type, extract_type, clustering, mu = m.groups()
            if clustering not in clusterings:  # skip over clusterings we don't care about
                continue
            if grammar_type.startswith('VRG'):  # skip over regular VRGs
                continue
            tqdm.write(f'{grammar_type}, {extract_type}, {clustering}, {mu}')

            ht, avg_branch_factor, dc = root_dict[name][clustering]
            vrg = load_pickle(fname)
            if vrg is None:
                continue
            graph_dl = dl_dict[name]

            row = dict(name=name, orig_n=orig_graph.order(), orig_m=orig_graph.size(),
                       grammar_type=grammar_type, extract_type=vrg.extract_type, mu=int(mu),
                       clustering=clustering, cost=dc, branch_factor=avg_branch_factor, height=ht,
                       graph_dl=graph_dl, num_rules=vrg.num_rules,
                       unique_rules=len(vrg.unique_rule_list), grammar_dl=vrg.cost)
            rows.append(row)

        temp_df = pd.DataFrame(rows)
        temp_df.to_csv(temp_fname, index=False)

    # for name in names:
    #     temp_fname = f'{base_path}/stats/_grammar_df_{name}.csv'
    #     if Path(temp_fname).exists():
    #         os.remove(temp_fname)
    return pd.DataFrame(rows)

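# Hedged usage sketch: assembling the grammar statistics DataFrame after the extraction runs have
# finished. The per-name CSVs land in {base_path}/stats/temp/; names and clusterings below are
# illustrative placeholders.
def _example_grammar_df():
    basedir = '/data/ssikdar/Attributed-VRG'
    grammar_df = make_grammar_df(base_path=basedir, names=['polbooks', 'football'],
                                 clusterings=['leiden', 'louvain'], overwrite=False)
    print(grammar_df.head())
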