def to_snap(data):
    """
    convert the dataset to a SNAP graph.

    :param data: :py:class:`gct.Dataset`
    :rtype: SNAP graph
    """
    import snap
    if utils.file_exists(data.file_snap):
        FIn = snap.TFIn(data.file_snap)
        if data.is_directed():
            graph = snap.TNGraph.Load(FIn)
        else:
            graph = snap.TUNGraph.Load(FIn)
        return graph
    # note: weighted graphs are not supported well on snap; weights are ignored here.
    fname = data.file_edges
    if not utils.file_exists(fname):
        data.to_edgelist()
    if data.is_directed():
        return snap.LoadEdgeList(snap.PNGraph, fname, 0, 1)
    else:
        return snap.LoadEdgeList(snap.PUNGraph, fname, 0, 1)

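# Usage sketch (an illustration, not from the original source: "my_ds" is a
# hypothetical local dataset name; assumes the SNAP Python bindings are installed):
#
#   ds = load_local("my_ds")           # hypothetical dataset
#   g = to_snap(ds)
#   print(g.GetNodes(), g.GetEdges())  # SNAP graph API
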
def has_downloaded(self, fname=None):
    if fname is None:
        for fname in self.get_remote_names():
            if not utils.file_exists(config.get_download_file_path(self.name, fname)):
                return False
        return True
    else:
        return utils.file_exists(config.get_download_file_path(self.name, fname))

def to_pajek(self, filepath=None):
    if filepath is None:
        filepath = self.file_pajek
    if utils.file_exists(filepath):
        return filepath
    if not utils.file_exists(self.file_edges):
        self.to_edgelist()
    from gct.dataset import edgelist2pajek
    edgelist2pajek.edgelist_to_pajek(self.file_edges, filepath,
                                     self.is_directed(), self.is_weighted())
    return filepath

def to_higformat(self, filepath=None):
    if filepath is None:
        filepath = self.file_hig
    if utils.file_exists(filepath):
        return filepath
    if not utils.file_exists(self.file_pajek):
        self.to_pajek()
    cmd = "python {} {}".format(config.HIG_CONVERT_PROG, self.file_pajek)
    self.logger.info("running " + cmd)
    status = utils.shell_run_and_wait(cmd)
    if status != 0:
        raise Exception("run command failed: " + str(status))
    return filepath

def to_anyscan(self, filepath=None):
    # note: directed graphs are not supported; input is not checked here.
    if filepath is None:
        filepath = self.file_anyscan
    if utils.file_exists(filepath):
        return filepath
    edges1 = self.get_edges()[['src', 'dest']]
    min_node = min(edges1['src'].min(), edges1['dest'].min())
    max_node = max(edges1['src'].max(), edges1['dest'].max())
    self.logger.info("min node: {}, max node: {}".format(min_node, max_node))
    if min_node != 0:
        self.logger.warning("smallest node id is greater than 0; fake nodes will be added")
    # mirror the edges and add a self loop for every node so that each node
    # appears at least once as a source
    self_edges = pd.DataFrame([[u, u] for u in range(max_node + 1)], columns=['src', 'dest'])
    edges2 = edges1[['dest', 'src']].copy()
    edges2.columns = ['src', 'dest']
    edges = pd.concat([edges1, edges2, self_edges], axis=0)
    del edges1, edges2, self_edges

    def fun(df):
        values = sorted(set(df['dest']))
        return str(len(values)) + " " + " ".join([str(u) for u in values])

    grouped = edges.groupby('src').apply(fun).reset_index()
    with open(filepath, 'wt') as f:
        f.write(str(max_node + 1) + "\n")
        for i in grouped.index:
            f.write(grouped.loc[i, 0] + "\n")
    return filepath

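# Illustration of the anyscan format written above (an illustrative example, not
# from the original source). For an undirected path graph with edges (0,1) and
# (1,2), the file reads:
#
#   3          <- number of nodes (max_node + 1)
#   2 0 1      <- node 0: neighbour count, then sorted neighbours (self loop included)
#   3 0 1 2    <- node 1
#   2 1 2      <- node 2
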
def load_local(name):
    '''
    load a local dataset

    :param name: name of a dataset
    :rtype: :py:class:`gct.Dataset`
    '''
    path = config.get_data_file_path(name, create=False)
    if not utils.file_exists(path):
        raise Exception("path does not exist: " + path)
    with open(os.path.join(path, 'meta.info')) as f:
        meta = json.load(f)
    edges = meta['parq_edges']
    gt = None
    if meta["has_ground_truth"]:
        gt = {k: Clustering(v) for k, v in meta['parq_ground_truth'].items()}
    additional_meta = meta.get('additional', None)
    is_edge_mirrored = meta['is_edge_mirrored']
    return Dataset(name=meta['name'], description=meta['description'],
                   groundtruthObj=gt, edgesObj=edges,
                   directed=meta['directed'], weighted=meta['weighted'],
                   overide=False, additional_meta=additional_meta,
                   is_edge_mirrored=is_edge_mirrored)

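# Usage sketch (an illustration: "football" is a hypothetical dataset name that
# must already exist locally):
#
#   if local_exists("football"):
#       ds = load_local("football")
#       print(ds.is_directed(), ds.is_weighted())
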
def to_scanbin(self, filepath=None):
    if filepath is None:
        filepath = self.file_scanbin
    if utils.file_exists(os.path.join(filepath, "b_degree.bin")) \
            and utils.file_exists(os.path.join(filepath, "b_adj.bin")):
        return filepath
    utils.create_dir_if_not_exists(filepath)
    if not utils.file_exists(self.file_edges):
        self.to_edgelist()
    cmd = "{} {} {} {}".format(config.SCAN_CONVERT_PROG, self.file_edges,
                               "b_degree.bin", "b_adj.bin")
    self.logger.info("running " + cmd)
    status = utils.shell_run_and_wait(cmd, filepath)
    if status != 0:
        raise Exception("run command failed: " + str(status))
    return filepath

def to_mcl_mci(self, filepath=None):
    if filepath is None:
        filepath = self.file_mcl_mci
    if utils.file_exists(filepath):
        return filepath
    if not utils.file_exists(self.file_edges):
        self.to_edgelist()
    if not self.is_directed():
        # mirror each undirected edge so that mcl sees both directions
        cmd = "{} --stream-mirror -123 {} -o {} --write-binary".format(
            config.MCL_CONVERT_PROG, self.file_edges, filepath)
    else:
        cmd = "{} -123 {} -o {} --write-binary".format(
            config.MCL_CONVERT_PROG, self.file_edges, filepath)
    self.logger.info("running " + cmd)
    status = utils.shell_run_and_wait(cmd)
    if status != 0:
        raise Exception("run command failed: " + str(status))
    return filepath

def to_unweighted_fromat(self, filepath=None):
    if filepath is None:
        filepath = self.file_unweighted_edges
    if utils.file_exists(filepath):
        return filepath
    edges = self.get_edges()[['src', 'dest']]
    edges.to_csv(filepath, header=None, index=None, sep=" ")
    return filepath

def persistent_cnl(self):
    if self.path is not None:
        local_cnl = self.path + ".cnl"
        if not utils.file_exists(local_cnl):
            self.logger.info("persisting clustering to cnl file: " + local_cnl)
            self.save_to_cnl_file(local_cnl)
        return local_cnl
    return None

def local_exists(name):
    '''
    check if dataset 'name' exists locally.

    :param name: name of a dataset
    :rtype: bool
    '''
    path = config.get_data_file_path(name)
    return utils.file_exists(path)

def run(self, data, seed=None):
    if seed is None:
        seed = np.random.randint(999999)
    params = {'seed': seed}
    # note: only undirected, unweighted graphs are supported; input is not checked here.
    if not utils.file_exists(data.file_pajek):
        data.to_pajek()
    with utils.TempDir() as tmp_dir:

        def link_file(path, destname=None):
            if destname is None:
                destname = path.split("/")[-1]
            destpath = os.path.join(tmp_dir, destname)
            utils.remove_if_file_exit(destpath)
            os.symlink(path, destpath)
            return destpath

        pajek = link_file(data.file_pajek, 'pajek.txt')
        cmd = "{} {} {} {}".format(
            config.get_OSLOM_prog('infomap', data.is_directed()), seed, pajek, 1)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        outputfile = glob.glob(tmp_dir + "/pajek.tree")[0]
        import pandas as pd
        output = pd.read_csv(outputfile, sep=" ", skiprows=1, header=None)
        output['cluster'] = output.loc[:, 0].map(lambda u: int(u.split(':')[0]))
        output['node'] = output.loc[:, 2].astype(int)
        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self

def run(self, data, f=False, m=None, seed=None):
    # note: only undirected graphs are supported; input is not checked here.
    if seed is not None:
        self.logger.info("seed ignored")
    params = {'f': f, 'm': m}
    if not utils.file_exists(data.file_hig):
        data.to_higformat()
    cmd = ["./hirecs", "-oje"]
    if f:
        cmd.append('-f')
    if m is not None:
        cmd.append("-m{}".format(m))
    cmd.append(data.file_hig)
    cmd.append("> output")
    cmd = " ".join(cmd)
    with utils.TempDir() as tmp_dir:
        # write the command to a script file; use `cmdfile` so the flag `f` is not shadowed
        with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as cmdfile:
            cmdfile.write(cmd)
        self.logger.info("Running " + cmd)
        cmd = "bash tmpcmd"
        utils.link_file(os.path.join(config.HIRECS_PATH, 'hirecs'), tmp_dir)
        utils.link_file(os.path.join(config.HIRECS_PATH, 'libhirecs.so'), tmp_dir)
        utils.link_file(data.file_hig, tmp_dir)
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        with open(os.path.join(tmp_dir, "output"), "r") as fin:
            output = json.load(fin)
        mod = output['mod']
        communities = output['communities']
        clusters = {}
        for c in communities:
            clusters[int(c)] = [int(u) for u in communities[c].keys()]
        self.logger.info("Made %d clusters in %f seconds with modularity %f"
                         % (len(clusters), timecost, mod))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['overlap'] = True
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self

def run(self, data, seed=None, lamb=1, trials=5, temp_step=0.999, initial_temp=1e-6):
    params = locals()
    del params['data']
    del params['self']
    if seed is None:
        seed = np.random.randint(999999)
    params['seed'] = seed
    # note: only undirected, unweighted graphs are supported; input is not checked here.
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = os.path.join(tmp_dir, 'edges.txt')
        utils.remove_if_file_exit(pajek)
        os.symlink(data.file_edges, pajek)
        cmd = "{} {} {} {} {} {} {}".format(
            config.get_OSLOM_prog('modopt', data.is_directed()),
            pajek, seed, lamb, trials, temp_step, initial_temp)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        outputfile = pajek + "part"
        with open(outputfile) as f:
            lines = [u.strip() for u in f]
        lines = [[int(v) for v in u.split("\t")] for u in lines]
        clusters = dict(enumerate(lines))
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self

def to_topgc_fromat(self, filepath=None):
    if filepath is None:
        filepath = self.file_topgc
    if utils.file_exists(filepath):
        return filepath
    if not utils.file_exists(self.file_edges):
        self.to_edgelist()
    with utils.TempDir() as tmp_dir:
        # sort the edge list numerically and pipe it through mkidx;
        # remove the output file if the pipeline fails
        cmd = "cat {}|sort -k1,1 -k2,2 -n | {} > {} || rm {}".format(
            self.file_edges, config.get_cdc_prog('mkidx'), filepath, filepath)
        self.logger.info("Running " + cmd)
        with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as f:
            f.write(cmd)
        timecost, status = utils.timeit(
            lambda: utils.shell_run_and_wait("bash -x tmpcmd", tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
    assert utils.file_exists(filepath)
    return filepath

def run(self, data, seed=None):
    self.logger.warning("dct assumes node ids start at zero and are continuous")
    if seed is None:
        seed = np.random.randint(999999)
    params = {'seed': seed, 'prog': self.progname}
    # note: only undirected, unweighted graphs are supported; input is not checked here.
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = utils.link_file(data.file_edges, dest_dir=tmp_dir, destname='edges.txt')
        cmd = "{} {}".format(config.get_dct_prog(self.progname, data.is_directed()), pajek)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(
            cmd, tmp_dir, env={'SEED': str(seed)}))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        outputfiles = glob.glob(tmp_dir + "/output*.cluster")
        import pandas as pd
        df_from_each_file = (pd.read_csv(f, sep=" ", header=None) for f in outputfiles)
        output = pd.concat(df_from_each_file, ignore_index=True)
        output.columns = ['node', 'cluster']
        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self

def to_mirror_edges_format(self, filepath=None):
    if filepath is None:
        filepath = self.file_mirror_edges
    if utils.file_exists(filepath):
        return filepath
    edges1 = self.get_edges()
    edges2 = edges1.copy()
    edges2['src'] = edges1['dest']
    edges2['dest'] = edges1['src']
    edges2 = edges2[edges1.columns]
    edges = pd.concat([edges1, edges2]).drop_duplicates().sort_values(['src', 'dest'])
    edges.to_csv(filepath, header=None, index=None, sep=" ")
    return filepath

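# Illustration (not from the original source): mirroring writes every edge in
# both directions, deduplicated and sorted by (src, dest). The input edge list
#
#   0 1
#   1 2
#
# becomes
#
#   0 1
#   1 0
#   1 2
#   2 1
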
def to_networkit(data):
    """
    convert the dataset to a `networkit <https://networkit.github.io/>`_ graph.

    :param data: :py:class:`gct.Dataset`
    :rtype: networkit graph
    """
    import networkit
    fname = data.file_edges
    if not utils.file_exists(fname):
        data.to_edgelist()
    return networkit.readGraph(fname,
                               fileformat=networkit.Format.EdgeListSpaceZero,
                               directed=data.is_directed())

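# Usage sketch (an illustration: "my_ds" is a hypothetical local dataset;
# requires the networkit package):
#
#   g = to_networkit(load_local("my_ds"))
#   print(g.numberOfNodes(), g.numberOfEdges())
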
def run(self, data, seed=None):
    self.logger.warning(
        "dct::seq_louvain assumes node ids start at zero and are continuous")
    if seed is None:
        seed = np.random.randint(999999)
    params = {'seed': seed}
    # note: only undirected, unweighted graphs are supported; input is not checked here.
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    with utils.TempDir() as tmp_dir:
        pajek = utils.link_file(data.file_edges, dest_dir=tmp_dir, destname='edges.txt')
        cmd = "{} -f -s {} -o output {}".format(
            config.get_dct_prog('infomap', data.is_directed()), seed, pajek)
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        outputfile = tmp_dir + "/output"
        import pandas as pd
        output = pd.read_csv(outputfile, sep=" ", header=None)
        output.columns = ['cluster']
        output['node'] = range(len(output))
        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self

def run(self, data, algorithm=4, minpts=4, epsilon=0.5, alpha=32768,
        beta=32768, thread=-1, seed=None):
    params = locals()
    del params['self']
    del params['data']
    if seed is not None:
        self.logger.info("seed ignored")
    thread = utils.get_num_thread(thread)
    params['thread'] = thread
    # note: only undirected, unweighted graphs are supported; input is not checked here.
    if not utils.file_exists(data.file_anyscan):
        data.to_anyscan()
    cmd = "{} -c {} -i {} -m {} -e {} -o {} -a {} -b {} -t {}".format(
        config.ANYSCAN_PROG, algorithm, data.file_anyscan, minpts, epsilon,
        'output', alpha, beta, thread)
    self.logger.info("Running " + cmd)
    with utils.TempDir() as tmp_dir:
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 1:  # anyscan always returns 1
            raise Exception("Run command with error status code {}".format(status))
        with open(os.path.join(tmp_dir, "output"), "r") as output:
            lines = [u.strip() for u in output.readlines()]
        n_nodes = int(lines[0])
        clusters_list = [int(u) for u in lines[1].split(" ")]
        if n_nodes != len(clusters_list):
            raise Exception("number of nodes does not match number of cluster labels")
        from collections import defaultdict
        clusters = defaultdict(list)
        for n, c in enumerate(clusters_list):
            clusters[c].append(n)
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self

def to_snapformat(self, filepath=None):
    if filepath is None:
        filepath = self.file_snap
    if utils.file_exists(filepath):
        return filepath
    import snap
    from gct.dataset import convert
    g = convert.to_snap(self)
    self.logger.info("Writing {} to {}".format(type(g), filepath))
    FOut = snap.TFOut(filepath)
    g.Save(FOut)
    FOut.Flush()
    return filepath

def run(self, data, mu=3, epsilon=0.5, prog='pScan', seed=None):
    assert prog in ['pScan', 'ppScan', 'ppScanSSE']
    if seed is not None:
        self.logger.info("seed ignored")
    params = {'mu': mu, 'epsilon': epsilon, 'prog': prog}
    # note: only undirected, unweighted graphs are supported; input is not checked here.
    if not utils.file_exists(data.file_scanbin):
        data.to_scanbin()
    with utils.TempDir() as tmp_dir:
        scanbin = data.file_scanbin
        os.symlink(os.path.join(scanbin, 'b_degree.bin'), os.path.join(tmp_dir, 'b_degree.bin'))
        os.symlink(os.path.join(scanbin, 'b_adj.bin'), os.path.join(tmp_dir, 'b_adj.bin'))
        if prog == 'pScan':
            EXE = config.PSCAN_PROG
        elif prog == 'ppScan':
            EXE = config.PPSCAN_PROG
        elif prog == 'ppScanSSE':
            EXE = config.PPSCANSSE_PROG
        cmd = "{} {} {} {} {}".format(EXE, tmp_dir, epsilon, mu, 'output')
        self.logger.info("Running " + cmd)
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        outputfile = glob.glob(tmp_dir + "/result-*-*.txt")[0]
        import pandas as pd
        output = pd.read_csv(outputfile, sep=" ")
        clusters = output[output['c/n'] == 'c'][['vertex_id', 'cluster_id']]
        others = output[output['c/n'] != 'c'][['vertex_id', 'cluster_id']]
        clusters = clusters.groupby('cluster_id').apply(
            lambda u: list(u['vertex_id'])).to_dict()
        others = others.values.tolist()
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        result['noise_or_hub'] = others
        save_result(result)
        self.result = result
        return self

def download(self):
    for fname in self.get_remote_names():
        if not self.has_downloaded(fname):
            rfile = 'http://data.law.di.unimi.it/webdata/' + self.name + "/" + fname
            lfile = config.get_download_file_path(self.name, fname, create=True)
            if not utils.file_exists(lfile):
                self.logger.info("Downloading {} to {}".format(rfile, lfile))
                utils.urlretrieve(rfile, lfile)
    assert self.has_downloaded()
    if not self.md5check():
        self.logger.error("md5 check failed")
        shutil.rmtree(config.get_download_file_path(self.name))
        raise Exception("md5 check failed")

def run(self, data, **kwargs):
    # note: only undirected graphs are supported; input is not checked here.
    params = {k: v for k, v in kwargs.items() if v is not None}
    if not data.is_directed():
        params['Sym'] = 1
    params['d'] = "output"
    if "r" not in params:
        params['r'] = 0.1
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    txt_params = " ".join(["-{} {}".format(k, v) for k, v in params.items()])
    cmd = "{} -jar {} -i {} {}".format(utils.get_java_command(),
                                       config.GANXISW_PROG, "edges.txt", txt_params)
    with utils.TempDir() as tmp_dir:
        utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
        utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
        self.logger.info("Running " + cmd)
        utils.link_file(data.file_edges, tmp_dir, "edges.txt")
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        outputfile = glob.glob(os.path.join(tmp_dir, "output/SLPAw*.icpm"))[0]
        clusters = []
        with open(os.path.join(tmp_dir, outputfile), "r") as f:
            for line in f:
                clusters.append([int(u) for u in line.strip().split(" ")])
        clusters = dict(enumerate(clusters))
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self

def to_networkx(data):
    """
    convert the dataset to a networkx graph.

    :param data: :py:class:`gct.Dataset`
    :rtype: networkx graph
    """
    import networkx as nx
    fname = data.file_edges
    if not utils.file_exists(fname):
        data.to_edgelist()
    container = nx.DiGraph() if data.is_directed() else nx.Graph()
    if data.is_weighted():
        return nx.read_weighted_edgelist(fname, create_using=container, nodetype=int)
    else:
        return nx.read_edgelist(fname, create_using=container, nodetype=int)

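# Usage sketch (an illustration: "my_ds" is a hypothetical local dataset;
# requires networkx):
#
#   g = to_networkx(load_local("my_ds"))
#   print(g.number_of_nodes(), g.number_of_edges())
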
def to_edgelist(self, filepath=None, sep=" ", sort=False):
    if filepath is None:
        filepath = self.file_edges
    if utils.file_exists(filepath):
        return filepath
    self.logger.info("writing edges to " + filepath)
    with open(filepath, 'wt') as f:
        columns = ['src', 'dest', 'weight'] if self.is_weighted() else ['src', 'dest']
        df = self.get_edges()[columns]
        if sort:
            df = df.sort_values(by=['src', 'dest'])
        for i in range(df.shape[0]):
            r = ['%g' % u for u in df.iloc[i]]
            f.write(sep.join(r) + "\n")
    self.logger.info("finished writing " + filepath)
    return filepath

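# Example of the produced edge list (illustrative): one edge per line, "src dest"
# plus a weight column for weighted graphs, numbers rendered with '%g':
#
#   0 1 0.5
#   1 2 2
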
def to_graph_tool(data):
    """
    convert the dataset to a graph-tool graph.

    TBD: does graph-tool support edge weights here?

    :param data: :py:class:`gct.Dataset`
    :rtype: graph-tool graph
    """
    import graph_tool
    fname = data.file_edges
    if not utils.file_exists(fname):
        data.to_edgelist()
    g = graph_tool.load_graph_from_csv(fname, directed=data.is_directed(),
                                       string_vals=False, skip_first=False,
                                       csv_options={"delimiter": " "})
    return g

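# Usage sketch (an illustration: "my_ds" is a hypothetical local dataset;
# requires the graph-tool package):
#
#   g = to_graph_tool(load_local("my_ds"))
#   print(g.num_vertices(), g.num_edges())
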
def run(self, data, cutoff_r=0.01, inflation_in=2, NBDisimilarity_q=0.3, seed=None):
    # note: only undirected graphs are supported; input is not checked here.
    if seed is not None:
        self.logger.info("seed ignored")
    params = locals()
    del params['self']
    del params['data']
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    cmd = "{} {} {} {} {}".format(config.LABLE_RANK_PROG, "edges.txt",
                                  cutoff_r, inflation_in, NBDisimilarity_q)
    with utils.TempDir() as tmp_dir:
        utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
        utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
        self.logger.info("Running " + cmd)
        utils.link_file(data.file_edges, tmp_dir, "edges.txt")
        timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
        if status != 0:
            raise Exception("Run command with error status code {}".format(status))
        outputfile = glob.glob(os.path.join(tmp_dir, "output/LabelRank*.icpm"))[0]
        clusters = []
        with open(os.path.join(tmp_dir, outputfile), "r") as f:
            for line in f:
                clusters.append([int(u) for u in line.strip().split(" ")])
        clusters = dict(enumerate(clusters))
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))
        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self

def run(self, data, mu=1, epsilon=0, seed=None):
    params = {'mu': mu, 'epsilon': epsilon}
    if seed is not None:
        self.logger.info("seed ignored")
    if data.is_weighted():
        raise Exception("weighted graph is not supported")
    if not utils.file_exists(data.file_edges):
        data.to_edgelist()
    cmd = "{} -e {} -m {} -r {}".format(config.SCANPP_PROG, epsilon, mu, data.file_edges)
    self.logger.info("Running " + cmd)
    timecost, output = utils.timeit(lambda: utils.check_output(cmd.split(" ")))
    if not output.startswith('node'):
        raise Exception("Something went wrong with scanpp. output:\n" + output)
    output = [u.strip() for u in output.split("\n")][1:]
    output = [u.split("\t") for u in output if u]
    output = [[int(v) for v in u] for u in output]
    from collections import defaultdict
    clusters = defaultdict(list)
    for n, c in output:
        clusters[c].append(n)
    self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))
    result = {}
    result['runname'] = self.name
    result['params'] = params
    result['dataname'] = data.name
    result['meta'] = self.get_meta()
    result['timecost'] = timecost
    result['clusters'] = clusters
    save_result(result)
    self.result = result
    return self

def has_downloaded(self):
    for fname in self.local_files:
        if not utils.file_exists(fname):
            return False
    return True