def link_file(path, tmp_dir, destname=None):
    """Create (or replace) a symlink to *path* inside *tmp_dir*.

    :param path: path of the file to link to.
    :param tmp_dir: directory in which the symlink is created.
    :param destname: name for the symlink; defaults to the basename of *path*.
    :return: full path of the created symlink.
    """
    # BUG FIX: the original body read a free variable ``tmp_dir`` that is not
    # defined in the function, which raises NameError unless a module global
    # happens to exist.  Callers elsewhere in this file already invoke
    # ``utils.link_file(src, tmp_dir, "edges.txt")`` with the directory as the
    # second positional argument, so the parameter is added to match them.
    if destname is None:
        destname = path.split("/")[-1]
    destpath = os.path.join(tmp_dir, destname)
    # remove any stale file/link so os.symlink does not fail with EEXIST
    utils.remove_if_file_exit(destpath)
    os.symlink(path, destpath)
    return destpath
def get_edges(self):
    """Convert the downloaded WebGraph dataset to an edge-list DataFrame.

    Invokes the WebGraph ``ArcListASCIIGraph`` tool to expand the compressed
    graph into a tab-separated arc list, loads that file with pandas, removes
    the intermediate file, and returns a DataFrame with columns [src, dest].
    Raises ``Exception`` when the external command exits with nonzero status.
    """
    if not self.has_downloaded():
        self.download()
    graph_path = config.get_download_file_path(self.name, self.name + '.graph')
    self.logger.info("reading {}".format(graph_path))
    command = "{} -server -cp {} it.unimi.dsi.webgraph.ArcListASCIIGraph {} {}".format(
        utils.get_java_command(), config.WEBGRAPH_JAR_PATH,
        self.name + '-hc-t', self.name)
    self.logger.info("Running " + command)
    runner = lambda: utils.shell_run_and_wait(command, self.download_dir)
    timecost, status = utils.timeit(runner)
    if status != 0:
        raise Exception(
            "Run command with error status code {}".format(status))
    arc_file = os.path.join(self.download_dir, self.name)
    frame = pd.read_csv(arc_file, header=None, sep='\t')
    frame.columns = ['src', 'dest']
    # the text arc list is only an intermediate artifact; drop it
    utils.remove_if_file_exit(arc_file)
    return frame
def testRandomDataset8(self):
    """Regenerate and print the HIG representation of the unweighted and
    weighted undirected fixtures."""
    for dataset in (self.graph_unweighted_undirect,
                    self.graph_weighted_undirect):
        utils.remove_if_file_exit(dataset.file_hig)
        print(dataset.to_higformat())
def run(self, data, seed=None, lamb=1, trials=5, temp_step=0.999,
        initial_temp=1e-6):
    """Run the OSLOM ``modopt`` (simulated-annealing modularity optimization)
    program on *data* and record the clustering.

    :param data: dataset object providing ``file_edges`` / ``to_edgelist``.
    :param seed: RNG seed passed to the external program; drawn at random
        when ``None``.
    :param lamb: lambda (resolution) parameter forwarded to modopt.
    :param trials: number of annealing trials.
    :param temp_step: temperature decay factor.
    :param initial_temp: starting temperature.
    :return: ``self`` (fluent API); the result dict is also kept in
        ``self.result`` and persisted via ``save_result``.
    :raises Exception: when the external command exits with nonzero status.
    """
    # capture the call arguments for the saved result record
    params = locals()
    del params['data']
    del params['self']
    if seed is None:
        seed = np.random.randint(999999)
    params['seed'] = seed
    # NOTE(review): the trailing ``and False`` disables this guard, so
    # directed/weighted input is accepted silently — confirm intentional.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = os.path.join(tmp_dir, 'edges.txt')
        utils.remove_if_file_exit(pajek)
        # expose the dataset's edge file inside the temp dir under the
        # fixed name handed to the external program
        os.symlink(data.file_edges, pajek)
        cmd = "{} {} {} {} {} {} {}".format(
            config.get_OSLOM_prog('modopt', data.is_directed()), pajek,
            seed, lamb, trials, temp_step, initial_temp)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # modopt writes its partition next to the input as "<input>part":
        # one tab-separated line of node ids per cluster
        outputfile = pajek + "part"
        with open(outputfile) as f:
            lines = [u.strip() for u in f]
        lines = [[int(v) for v in u.split("\t")] for u in lines]
        clusters = dict(enumerate(lines))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
def run(self, data, seed=None):
    """Run the OSLOM ``Infohiermap`` program on *data* and record the
    clustering.

    :param data: dataset object providing ``file_pajek`` / ``to_pajek``.
    :param seed: RNG seed passed to the external program; drawn at random
        when ``None``.
    :return: ``self`` (fluent API); the result dict is also kept in
        ``self.result`` and persisted via ``save_result``.
    :raises Exception: when the external command exits with nonzero status.
    """
    if seed is None:
        seed = np.random.randint(999999)
    params = {'seed': seed}
    # NOTE(review): the trailing ``and False`` disables this guard, so
    # directed/weighted input is accepted silently — confirm intentional.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_pajek):
        data.to_pajek()
    with utils.TempDir() as tmp_dir:
        #tmp_dir="/tmp/abc"
        pajek = os.path.join(tmp_dir, 'pajek.txt')
        utils.remove_if_file_exit(pajek)
        os.symlink(data.file_pajek, pajek)
        cmd = "{} {} {} {}".format(
            config.get_OSLOM_prog('Infohiermap', data.is_directed()),
            seed, pajek, 1)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        outputfile = glob.glob(tmp_dir + "/pajek.tree")[0]
        import pandas as pd
        # BUG FIX: ``np.str`` and ``np.int`` were deprecated in NumPy 1.20
        # and removed in 1.24 (AttributeError on current NumPy).  They were
        # always mere aliases of the builtins, so use str/int directly.
        output = pd.read_csv(outputfile, sep=" ", skiprows=1, header=None,
                             dtype={0: str})
        # first column looks like "1:2:3 ..." — the leading field is the
        # top-level cluster id
        output['cluster'] = output.loc[:, 0].map(
            lambda u: int(u.split(':')[0]))
        output['node'] = output.loc[:, 2].astype(int)
        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
def testFromSnap(self):
    """Import random SNAP graphs (directed, then undirected) via
    ``convert.from_snap``."""
    name = "testFromSnap"
    for graph_type in (snap.PNGraph, snap.PUNGraph):
        utils.remove_if_file_exit(config.get_data_file_path(name),
                                  is_dir=True)
        G = snap.GenRndGnm(graph_type, 100, 1000)
        d = convert.from_snap(name, G, overide=True)
        print("testFromSnap", d)
def testRandomDataset7(self):
    """Write fixtures in SNAP binary format and reload them with snap."""
    dataset = self.graph_unweighted_undirect
    utils.remove_if_file_exit(dataset.file_snap)
    print(dataset.to_snapformat())
    stream = snap.TFIn(dataset.file_snap)
    Graph = snap.TUNGraph.Load(stream)

    dataset = self.graph_unweighted_direct
    self.assertTrue(dataset.is_directed())
    utils.remove_if_file_exit(dataset.file_snap)
    print(dataset.to_snapformat())
    stream = snap.TFIn(dataset.file_snap)
    Graph = snap.TNGraph.Load(stream)
def remove_local(name, rm_graph_data=True, rm_clustering_result=True):
    '''
    remove a local dataset

    :param name: name of a dataset
    :param rm_graph_data: remove local graph data
    :param rm_clustering_result: remove clustering results associated with the graph.
    '''
    # DOC FIX: the original docstring listed ``rm_clustering_result`` twice
    # and never documented ``rm_graph_data``.
    if rm_graph_data:
        path = config.get_data_file_path(name)
        utils.remove_if_file_exit(path, is_dir=True)
    if rm_clustering_result:
        path = config.get_result_file_path(name)
        utils.remove_if_file_exit(path, is_dir=True)
def save_result(result):
    """Persist a clustering result.

    ``Result`` instances save themselves; plain dicts are serialized as JSON
    into ``<result_dir>/result.txt``.  If serialization fails, the partially
    written file is removed before the error propagates.

    :param result: a ``Result`` object or a result dict containing at least
        ``dataname`` and ``runname``.
    """
    if isinstance(result, Result):
        result.save()
    else:
        filepath = config.get_result_file_path(result['dataname'],
                                               result['runname'],
                                               create=True)
        # FIX: compute the path before the try block so the cleanup handler
        # can never hit an unbound ``fpath``; the try body now contains only
        # the write itself.
        fpath = os.path.join(filepath, 'result.txt')
        try:
            with open(fpath, 'wt') as f:
                json.dump(result, f)
        # FIX: the original bare ``except:`` violates PEP 8; BaseException
        # keeps the remove-and-reraise cleanup for *every* failure
        # (including KeyboardInterrupt) without silently swallowing it.
        except BaseException:
            # don't leave a truncated/corrupt result file behind
            utils.remove_if_file_exit(fpath, is_dir=False)
            raise
def run(self, data, **kwargs):
    """Run the GANXiSw (SLPAw) community detection jar on *data*.

    Keyword arguments are forwarded verbatim to the program as ``-<k> <v>``
    flags; ``None`` values are dropped.  ``-Sym 1`` is forced for undirected
    graphs, output is directed to a temporary ``output`` directory, and
    ``-r`` defaults to 0.1 when not supplied.

    :return: ``self`` (fluent API); the result dict is also kept in
        ``self.result`` and persisted via ``save_result``.
    :raises Exception: when the external command exits with nonzero status.
    """
    # NOTE(review): ``False and ...`` disables this guard — directed graphs
    # currently pass through; confirm intentional.
    if False and (data.is_directed()):
        raise Exception("only undirected is supported")
    params = dict(kwargs)
    # drop unset options so they don't appear on the command line
    params = {k:v for k, v in params.items() if v is not None }
    if not data.is_directed():
        params['Sym'] = 1
    params['d'] = "output"
    if "r" not in params:
        params['r'] = 0.1
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    txt_params = " ".join(["-{} {}".format(k, v) for k, v in params.items()])
    cmd = "{} -jar {} -i {} {} ".format(utils.get_java_command(),
                                        config.GANXISW_PROG, "edges.txt",
                                        txt_params)
    with utils.TempDir() as tmp_dir:
        utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
        utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
        self.logger.info("Running " + cmd)
        # expose the edge file under the fixed name given via -i
        utils.link_file(data.file_edges, tmp_dir, "edges.txt")
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # SLPAw writes one community per line (space-separated node ids)
        # into an *.icpm file whose exact name varies with the parameters
        outputfile = glob.glob(os.path.join(tmp_dir, "output/SLPAw*.icpm"))[0]
        clusters = []
        with open (os.path.join(tmp_dir, outputfile), "r") as f:
            for line in f:
                clusters.append([int(u) for u in line.strip().split(" ")])
        clusters = dict(enumerate(clusters))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
def run(self, data, cutoff_r=0.01, inflation_in=2, NBDisimilarity_q=0.3,
        seed=None):
    """Run the LabelRank program on *data* and record the clustering.

    :param data: dataset object providing ``file_edges`` / ``to_edgelist``.
    :param cutoff_r: cutoff parameter r forwarded to LabelRank.
    :param inflation_in: inflation parameter forwarded to LabelRank.
    :param NBDisimilarity_q: neighborhood-dissimilarity parameter q.
    :param seed: ignored — LabelRank takes no seed; a note is logged.
    :return: ``self`` (fluent API); the result dict is also kept in
        ``self.result`` and persisted via ``save_result``.
    :raises Exception: when the external command exits with nonzero status.
    """
    # NOTE(review): ``False and ...`` disables this guard — directed graphs
    # currently pass through; confirm intentional.
    if False and (data.is_directed()):
        raise Exception("only undirected is supported")
    if seed is not None:
        self.logger.info("seed ignored")
    # capture the call arguments for the saved result record
    params = locals()
    del params['self']
    del params['data']
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    cmd = "{} {} {} {} {}".format(config.LABLE_RANK_PROG, "edges.txt",
                                  cutoff_r, inflation_in, NBDisimilarity_q)
    with utils.TempDir() as tmp_dir:
        utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
        utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
        self.logger.info("Running " + cmd)
        # expose the edge file under the fixed name used in the command
        utils.link_file(data.file_edges, tmp_dir, "edges.txt")
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # LabelRank writes one community per line (space-separated node ids)
        # into an *.icpm file whose exact name varies with the parameters
        outputfile = glob.glob(os.path.join(tmp_dir,
                                            "output/LabelRank*.icpm"))[0]
        clusters = []
        with open (os.path.join(tmp_dir, outputfile), "r") as f:
            for line in f:
                clusters.append([int(u) for u in line.strip().split(" ")])
        clusters = dict(enumerate(clusters))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
def testFromNextworkx(self):
    """Import complete graphs from networkx in all four directed/weighted
    combinations (same order as the original four stanzas)."""
    name = "testFromNextworkx"
    for directed, weighted in ((False, False), (True, False),
                               (False, True), (True, True)):
        utils.remove_if_file_exit(config.get_data_file_path(name),
                                  is_dir=True)
        G = nx.complete_graph(5)
        if directed:
            G = G.to_directed()
        if weighted:
            G.add_weighted_edges_from(
                (u, v, 1) for u, v in nx.complete_graph(5).edges())
        d = convert.from_networkx(name, G, weighted=weighted, overide=True)
        print(d)
def testFromIGraph(self):
    """Import full igraph graphs in all four directed/weighted combinations
    (same order as the original four stanzas)."""
    name = "testFromIGraph"
    for directed, weighted in ((False, False), (True, False),
                               (False, True), (True, True)):
        utils.remove_if_file_exit(config.get_data_file_path(name),
                                  is_dir=True)
        G = igraph.Graph.Full(5, directed=directed)
        if weighted:
            G.es['weight'] = [1] * G.ecount()
        d = convert.from_igraph(name, G)
        print(d)
def run(self, data, execution='async', ncpus=None, scheduler=None,
        engine_opts=None, graph_opts=None, scheduler_opts=None, seed=None):
    """Run PowerGraph's ``label_propagation`` toolkit on *data*.

    :param data: dataset object providing ``file_edges`` / ``to_edgelist``.
    :param execution: PowerGraph execution mode (forwarded as a flag).
    :param ncpus: thread count; resolved via ``utils.get_num_thread``.
    :param scheduler, engine_opts, graph_opts, scheduler_opts: optional
        PowerGraph options, forwarded verbatim when not ``None``.
    :param seed: ignored — the program takes no seed; a note is logged.
    :return: ``self`` (fluent API); the result dict is also kept in
        ``self.result`` and persisted via ``save_result``.
    :raises Exception: when the external command exits with nonzero status.
    """
    if seed is not None:
        self.logger.info("seed ignored")
    # capture the call arguments for the saved result record
    params = locals()
    del params['self']
    del params['data']
    del params['seed']
    # drop unset options so they don't appear on the command line
    params = {u: v for u, v in params.items() if v is not None}
    # NOTE(review): the trailing ``and False`` disables this guard, so
    # directed/weighted input is accepted silently — confirm intentional.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    ncpus = utils.get_num_thread(ncpus)
    params['ncpus'] = ncpus
    # booleans encoded as 0/1 for the command line
    params['weighted'] = data.is_weighted() * 1
    params['directed'] = data.is_directed() * 1
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = os.path.join(tmp_dir, 'edges.txt')
        utils.remove_if_file_exit(pajek)
        # expose the dataset's edge file inside the temp dir
        os.symlink(data.file_edges, pajek)
        args = " ".join(
            ["--{} {}".format(u, v) for u, v in params.items()])
        cmd = "{} --graph {} --saveprefix=output.cluster {}".format(
            config.get_powergraph_prog('label_propagation',
                                       data.is_directed()),
            pajek, args)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # PowerGraph shards its output across several files with the
        # requested prefix; concatenate them all
        outputfiles = glob.glob(tmp_dir + "/output.cluster.*")
        import pandas as pd
        df_from_each_file = (pd.read_csv(f, sep="\t", header=None)
                             for f in outputfiles)
        output = pd.concat(df_from_each_file, ignore_index=True)
        output.columns = ['node', 'cluster']
        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
def run(self, data, v=5, v1=None, v2=None, prop=None, repeat=None,
        mo=None, nosplit=False, extrasimplify=False, q=False, seed=None):
    """Run the COPRA overlapping community detection jar on *data*.

    :param data: dataset object providing ``file_edges`` / ``to_edgelist``.
    :param v: the ``-v`` (max communities per vertex) parameter.
    :param v1, v2: range for ``-vs``; must be given together or not at all.
    :param prop, repeat: optional COPRA parameters, forwarded when set.
    :param mo: when not ``None``, add the ``-mo`` flag.
    :param nosplit, extrasimplify, q: boolean flags, added only when True.
    :param seed: ignored — COPRA takes no seed; a note is logged.
    :return: ``self`` (fluent API); the result dict is also kept in
        ``self.result`` and persisted via ``save_result``.
    :raises Exception: when the external command exits with nonzero status.
    """
    assert (v1 is None and v2 is None) or \
           (v1 is not None and v2 is not None)
    # capture the call arguments for the saved result record
    params = locals()
    del params['self']
    del params['data']
    if seed is not None:
        self.logger.info("seed ignored")
    # NOTE(review): the trailing ``and False`` disables this guard, so
    # directed/weighted input is accepted silently — confirm intentional.
    if (data.is_directed() or data.is_weighted()) and False:
        raise Exception(
            "only undirected and unweighted graph is supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = os.path.join(tmp_dir, 'edges.txt')
        utils.remove_if_file_exit(pajek)
        os.symlink(data.file_edges, pajek)
        cmd = "{} -cp {} COPRA {} -w -v {}".format(
            utils.get_java_command(),
            config.get_OSLOM_prog('copra', data.is_directed()), pajek, v)
        if (v1 is not None and v2 is not None):
            # BUG FIX: the leading space was missing, which glued the
            # option onto the previous one (e.g. "-v 5-vs 1 2")
            cmd += " -vs {} {}".format(v1, v2)
        if prop is not None:
            cmd += ' -prop {}'.format(prop)
        if repeat is not None:
            cmd += ' -repeat {}'.format(repeat)
        if mo is not None:
            cmd += ' -mo'
        # BUG FIX: these three options default to False (not None), so the
        # original "is not None" tests were always true and the flags were
        # appended unconditionally, ignoring the caller's choice.
        if nosplit:
            cmd += ' -nosplit'
        if extrasimplify:
            cmd += ' -extrasimplify'
        if q:
            cmd += ' -q'
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception(
                "Run command with error status code {}".format(status))
        # COPRA names its output "clusters-<inputname>"; one space-separated
        # line of node ids per community
        outputfile = os.path.join(tmp_dir, 'clusters-edges.txt')
        with open(outputfile) as f:
            lines = [u.strip() for u in f]
        lines = [[int(v) for v in u.split(" ")] for u in lines]
        clusters = dict(enumerate(lines))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
def testRandomDataset6(self):
    """Regenerate and print the anyscan representation of the unweighted
    undirected fixture."""
    dataset = self.graph_unweighted_undirect
    utils.remove_if_file_exit(dataset.file_anyscan)
    print(dataset.to_anyscan())
def __init__(self, name=None, description="", groundtruthObj=None,
             edgesObj=None, directed=False, weighted=False, overide=False,
             additional_meta=None, is_edge_mirrored=False):
    """Construct a dataset from an edge source (and optional ground truth).

    :param name: dataset name; when set, file locations are derived from it
        and the dataset is persisted under the data directory.
    :param description: free-text description stored in the metadata.
    :param groundtruthObj: optional ground-truth clustering source.
    :param edgesObj: edge source; required (asserted non-None).
    :param directed: whether the graph is directed.
    :param weighted: whether the graph is weighted.
    :param overide: when True, an existing on-disk dataset with the same
        name is removed and rebuilt.
    :param additional_meta: extra metadata to store alongside the dataset.
    :param is_edge_mirrored: whether each undirected edge appears in both
        orientations in the edge data.
    :raises Exception: when the dataset already exists on disk, is not
        persistent, and ``overide`` is False.
    """
    assert edgesObj is not None
    self.name = name
    self.description = description
    self.additional_meta = additional_meta
    self.logger = utils.get_logger("{}:{}".format(
        type(self).__name__, self.name))
    self.directed = directed
    self.weighted = weighted
    self.is_edge_mirrored = is_edge_mirrored
    # lazily-populated parquet edge cache
    self.parq_edges = None
    if name:
        assert name
        # derived file locations for the various on-disk formats
        self.file_edges = config.get_data_file_path(self.name, 'edges.txt')
        self.file_pajek = config.get_data_file_path(self.name, 'pajek.txt')
        self.file_hig = config.get_data_file_path(self.name, 'pajek.hig')
        self.file_scanbin = config.get_data_file_path(self.name, 'scanbin')
        self.file_anyscan = config.get_data_file_path(
            self.name, 'anyscan.txt')
        self.file_snap = config.get_data_file_path(self.name, 'snap.bin')
        self.file_mcl_mci = config.get_data_file_path(self.name, 'mcl.mci')
        self.file_mcl_tab = config.get_data_file_path(self.name, 'mcl.tab')
        self.file_topgc = config.get_data_file_path(self.name, 'topgc.txt')
        self.file_mirror_edges = config.get_data_file_path(
            self.name, 'edges_mirror.txt')
        # a weighted dataset's edge file already doubles as the
        # "unweighted" view; otherwise a separate file is used
        if self.is_weighted():
            self.file_unweighted_edges = self.file_edges
        else:
            self.file_unweighted_edges = config.get_data_file_path(
                self.name, 'unweighted_edges.txt')
    self.set_ground_truth(groundtruthObj)
    self.set_edges(edgesObj)
    if name:
        # persistent means both edges and ground truth already live on disk
        is_persistent = self.is_edges_persistent(
        ) and self.is_ground_truth_persistent()
        self.home = config.get_data_file_path(name, create=False)
        if utils.file_exists(self.home):
            if overide:
                # caller asked to rebuild: wipe the existing dataset dir
                utils.remove_if_file_exit(self.home, is_dir=True)
            elif is_persistent:
                # already on disk and consistent: reuse as-is
                pass
            else:
                raise Exception(
                    "Dataset {} exists at {}. Use overide=True or load it locally."
                    .format(name, self.home))
        if not is_persistent:
            # stale clustering results would not match the new data
            utils.remove_if_file_exit(config.get_result_file_path(
                self.name), is_dir=True)
            utils.create_dir_if_not_exists(self.home)
            self.persistent()
            self.update_meta()