コード例 #1
0
    def run(self, data, **kwargs):
        """Cluster ``data`` with ``sklearn.cluster.DBSCAN`` on a precomputed matrix.

        ``kwargs`` are forwarded to the DBSCAN constructor, except that
        ``seed`` is dropped and ``n_jobs``, ``metric`` and ``eps`` are set by
        this method.  The result dictionary is passed to ``save_result`` and
        kept on ``self.result``.

        Returns:
            self
        """
        # The directedness check is switched off by the leading ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")

        params = dict(kwargs)
        if 'seed' in params:
            # The seed is never forwarded; only mention it when one was given.
            if params.pop('seed') is not None:
                self.logger.info("seed ignored")
        params['n_jobs'] = utils.get_num_thread(params.get('n_jobs'))
        params['metric'] = 'precomputed'
        A = convert.to_coo_adjacency_matrix(data, simalarity=False, distance_fun='exp_minus')
        # NOTE(review): this overwrites any ``eps`` passed in kwargs -- confirm intended.
        if data.is_weighted():
            params['eps'] = float(np.mean(data.get_edges()['weight']))
        else:
            params['eps'] = 0.5

        timecost, labels = utils.timeit(
            lambda: sklearn.cluster.DBSCAN(**params).fit_predict(A))

        # Group row positions by label; keys are the label rendered as a string.
        clusters = DefaultDict(list)
        for position, label in enumerate(labels):
            clusters[str(int(label))].append(position)
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #2
0
    def run(self, data, seed=None):
        """Detect communities with igraph's ``community_infomap``.

        The graph is converted with ``convert.to_igraph``; the ``weight`` edge
        attribute is used when ``data.is_weighted()`` is true.  ``seed`` is
        not used (a log line is emitted when one is given).  The result,
        including the clustering's modularity, is passed to ``save_result``
        and kept on ``self.result``.

        Returns:
            self
        """
        # The trailing ``False`` keeps this check from ever raising.
        if (data.is_directed()) and False:
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")

        g = convert.to_igraph(data)

        def detect():
            weights = 'weight' if data.is_weighted() else None
            return g.community_infomap(edge_weights=weights)

        timecost, vc = utils.timeit(detect)
        # Iterating the clustering yields one member collection per community.
        clusters = dict(enumerate(vc))

        modularity = vc.modularity
        self.logger.info("Made %d clusters in %f seconds. modularity=%f " % (len(clusters), timecost, modularity))

        result = {
            'timecost': timecost,
            'modularity': modularity,
            'runname': self.name,
            'dataname': data.name,
            'meta': self.get_meta(),
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #3
0
    def run(self, data, iterations=25, eps=1e-06, seed=None):
        """Run ``run_SCD`` on ``data`` and store the grouped result.

        ``iterations`` and ``eps`` are passed to ``run_SCD`` in a params dict.
        ``seed`` is not used (a log line is emitted when one is given).  The
        mapping returned by ``run_SCD`` is inverted into
        ``{value: [keys...]}`` and saved via ``save_result``.

        Returns:
            self
        """
        if seed is not None:
            self.logger.info("seed ignored")

        params = {'iterations': iterations, 'eps': eps}

        timecost, membership = utils.timeit(lambda: run_SCD(data, params))

        # Invert key -> label into label -> list of keys.
        clusters = {}
        for node, label in membership.items():
            clusters.setdefault(label, []).append(node)

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #4
0
    def run(self, data, component_count=2, cutoff=50, seed=None):
        """Run ``run_EdMot`` on ``data`` and store the grouped result.

        ``component_count`` and ``cutoff`` are passed to ``run_EdMot`` in a
        params dict.  ``seed`` is not used (a log line is emitted when one is
        given).  The mapping returned by ``run_EdMot`` is inverted into
        ``{value: [keys...]}`` and saved via ``save_result``.

        Returns:
            self
        """
        if seed is not None:
            self.logger.info("seed ignored")

        params = {'component_count': component_count, 'cutoff': cutoff}

        timecost, membership = utils.timeit(lambda: run_EdMot(data, params))

        # Invert key -> label into label -> list of keys.
        clusters = {}
        for node, label in membership.items():
            clusters.setdefault(label, []).append(node)

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #5
0
    def run(self, data, alpha=0.1, seed=None):
        """Detect communities with networkit's ``CutClustering``.

        ``alpha`` is forwarded to ``networkit.community.CutClustering``.
        ``seed`` is not used (a log line is emitted when one is given).  The
        result is passed to ``save_result`` and kept on ``self.result``.

        Returns:
            self
        """
        # The directedness check is switched off by the leading ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")

        params = {'alpha': alpha}
        g = convert.to_networkit(data)

        def detect():
            algo = networkit.community.CutClustering(g, alpha=alpha)
            return networkit.community.detectCommunities(g, algo=algo)

        timecost, partition = utils.timeit(detect)
        clusters = {
            subset: list(partition.getMembers(subset))
            for subset in partition.getSubsetIds()
        }

        self.logger.info("Made %d clusters in %f seconds." %
                         (len(clusters), timecost))

        result = {
            'params': params,
            'timecost': timecost,
            'runname': self.name,
            'dataname': data.name,
            'meta': self.get_meta(),
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #6
0
    def run(self, data, damping=None, max_iter=None, convergence=None, verbose=None, seed=None):
        """Cluster ``data`` with ``sklearn.cluster.AffinityPropagation``.

        The graph is converted to a similarity matrix and densified before
        being passed to ``fit_predict`` with ``affinity='precomputed'``.

        Args:
            data: graph dataset object.
            damping, max_iter, verbose: forwarded to the estimator when not None.
            convergence: forwarded as the estimator's ``convergence_iter``
                argument when not None.
            seed: not used; a log line is emitted when one is given.

        Returns:
            self.  The result dictionary (with the keyword arguments actually
            passed to the estimator under ``'params'``) is passed to
            ``save_result`` and kept on ``self.result``.
        """
        # The directedness check is switched off by the leading ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")

        # Built explicitly instead of from locals(): the estimator has no
        # ``convergence`` keyword (it is ``convergence_iter``), so forwarding
        # the argument under its own name raised TypeError.
        candidates = {
            'damping': damping,
            'max_iter': max_iter,
            'convergence_iter': convergence,
            'verbose': verbose,
        }
        params = {k: v for k, v in candidates.items() if v is not None}
        params['affinity'] = 'precomputed'
        params['copy'] = False

        A = convert.to_coo_adjacency_matrix(data, simalarity=True)

        def fun():
            obj = sklearn.cluster.AffinityPropagation(**params)
            return obj.fit_predict(A.toarray())

        timecost, res = utils.timeit(fun)

        # Group row positions by label; keys are the label rendered as a string.
        clusters = DefaultDict(list)
        for i, c in enumerate(res):
            clusters[str(c)].append(i)
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters

        save_result(result)
        self.result = result
        return self
コード例 #7
0
    def run(self, data, dimensions=32, iterations=10, seed=42):
        """Run ``run_NNSED`` on ``data`` and store the grouped result.

        ``dimensions``, ``iterations`` and ``seed`` are passed to
        ``run_NNSED`` in a params dict.  Each value of the returned mapping
        may be a single label or a list of labels; keys are grouped under
        every label they carry.  The result is saved via ``save_result``.

        Returns:
            self
        """
        params = {
            'dimensions': dimensions,
            'iterations': iterations,
            'seed': seed,
        }

        timecost, membership = utils.timeit(lambda: run_NNSED(data, params))

        clusters = {}
        for node, labels in membership.items():
            # Treat a single label as a one-element list.
            if not isinstance(labels, list):
                labels = [labels]
            for label in labels:
                clusters.setdefault(label, []).append(node)

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #8
0
    def run(self, data, updateThreshold=None, maxIterations=None, seed=None):
        """Detect communities with networkit's ``PLP``.

        ``updateThreshold`` and ``maxIterations`` are forwarded to
        ``networkit.community.PLP`` only when they are not None.  ``seed`` is
        not used (a log line is emitted when one is given).  The result is
        passed to ``save_result`` and kept on ``self.result``.

        Returns:
            self
        """
        # The directedness check is switched off by the leading ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")

        # Only explicitly supplied options are forwarded to PLP.
        params = {}
        if updateThreshold is not None:
            params["updateThreshold"] = updateThreshold
        if maxIterations is not None:
            params['maxIterations'] = maxIterations

        g = convert.to_networkit(data)

        def detect():
            algo = networkit.community.PLP(g, **params)
            return networkit.community.detectCommunities(g, algo=algo)

        timecost, partition = utils.timeit(detect)
        clusters = {
            subset: list(partition.getMembers(subset))
            for subset in partition.getSubsetIds()
        }

        self.logger.info("Made %d clusters in %f seconds." %
                         (len(clusters), timecost))

        result = {
            'params': params,
            'timecost': timecost,
            'runname': self.name,
            'dataname': data.name,
            'meta': self.get_meta(),
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #9
0
    def run(self, data, seed=None):
        """Detect communities with ``snap.CommunityGirvanNewman``.

        ``seed`` is not used (a log line is emitted when one is given).  The
        value returned by ``snap.CommunityGirvanNewman`` is recorded as the
        modularity; the communities are read from the ``TCnComV`` vector the
        call fills in.  The result is saved via ``save_result``.

        Raises:
            UnsupportedException: if ``data.is_directed()`` is true.

        Returns:
            self
        """
        if data.is_directed():
            raise UnsupportedException("only undirected graph is supported")
        if seed is not None:
            self.logger.info("seed ignored")

        UGraph = convert.to_snap(data)
        CmtyV = snap.TCnComV()
        timecost, modularity = utils.timeit(
            lambda: snap.CommunityGirvanNewman(UGraph, CmtyV))

        # Number the communities consecutively in iteration order.
        clusters = {}
        for index, Cmty in enumerate(CmtyV):
            clusters[index] = [NI for NI in Cmty]

        self.logger.info(
            "Made %d clusters in %f seconds. modularity of the graph is %f" %
            (len(clusters), timecost, modularity))

        result = {
            'timecost': timecost,
            'runname': self.name,
            'dataname': data.name,
            'meta': self.get_meta(),
            'modularity': modularity,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #10
0
    def run(self, data, resolution=1.0, seed=None):
        """Run ``run_EgoNetSplitter`` on ``data`` and store the grouped result.

        ``resolution`` is passed to ``run_EgoNetSplitter`` in a params dict.
        ``seed`` is not used (a log line is emitted when one is given).  Each
        value of the returned mapping is iterated as a collection of labels,
        and the key is added to every one of them.  The result is saved via
        ``save_result``.

        Returns:
            self
        """
        if seed is not None:
            self.logger.info("seed ignored")

        params = {'resolution': resolution}

        timecost, membership = utils.timeit(lambda: run_EgoNetSplitter(data, params))

        clusters = {}
        for node, labels in membership.items():
            for label in labels:
                clusters.setdefault(label, []).append(node)

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #11
0
    def run(self, data, seed=None):
        """Run the ``infomap`` program obtained from ``config.get_OSLOM_prog``.

        The graph's Pajek file (created on demand) is symlinked into a
        temporary directory as ``pajek.txt``, the program is run there, and
        ``pajek.tree`` is parsed into ``{cluster: [nodes...]}``.

        Args:
            data: graph dataset object.
            seed: integer seed passed on the command line; a random one is
                drawn with ``np.random.randint`` when None.

        Raises:
            Exception: if the external command exits with a non-zero status.

        Returns:
            self.  The result dictionary is passed to ``save_result`` and
            kept on ``self.result``.
        """
        if seed is None:
            seed = np.random.randint(999999)
        params = {'seed': seed}
        # The trailing ``False`` keeps this check from ever raising.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_pajek):
            data.to_pajek()

        with utils.TempDir() as tmp_dir:
            # Expose the Pajek file under a fixed name so the output file
            # name (pajek.tree) is predictable.
            pajek = os.path.join(tmp_dir, 'pajek.txt')
            utils.remove_if_file_exit(pajek)
            os.symlink(data.file_pajek, pajek)

            cmd = "{} {} {} {}".format(
                config.get_OSLOM_prog('infomap', data.is_directed()), seed,
                pajek, 1)

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            outputfile = glob.glob(tmp_dir + "/pajek.tree")[0]
            import pandas as pd
            output = pd.read_csv(outputfile, sep=" ", skiprows=1, header=None)
            # Column 0 is a ':'-separated path; its first component is used
            # as the cluster id.
            output['cluster'] = output.loc[:, 0].map(
                lambda u: int(u.split(':')[0]))
            # ``np.int`` was removed in NumPy 1.24; the builtin ``int`` is
            # what that alias referred to.
            output['node'] = output.loc[:, 2].astype(int)
        clusters = output[['cluster', 'node']]
        clusters = clusters.groupby('cluster').apply(
            lambda u: list(u['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
コード例 #12
0
    def run(self, data, f=False, m=None,seed=None):
        """Run the ``hirecs`` executable on the graph's ``hig`` file.

        The command line is written to a ``tmpcmd`` script in a temporary
        directory (next to symlinks of the executable, its shared library and
        the input file), executed with ``bash``, and its JSON ``output`` file
        is parsed.  ``-f`` is added when ``f`` is truthy and ``-m<m>`` when
        ``m`` is not None.  ``seed`` is not used (a log line is emitted when
        one is given).

        Raises:
            Exception: if the external command exits with a non-zero status.

        Returns:
            self
        """
        # The directedness check is switched off by the leading ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")

        params = {'f': f, 'm': m}

        if not utils.file_exists(data.file_hig):
            data.to_higformat()

        cmd_parts = ["./hirecs", "-oje"]
        if f:
            cmd_parts.append('-f')
        if m is not None:
            cmd_parts.append("-m{}".format(m))
        cmd_parts.append(data.file_hig)
        cmd_parts.append("> output")
        script = " ".join(cmd_parts)

        with utils.TempDir() as tmp_dir:
            # The redirect in the command needs a shell, so it is run from a script.
            with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as script_file:
                script_file.write(script)
            self.logger.info("Running " + script)
            launcher = "bash tmpcmd"

            utils.link_file(os.path.join(config.HIRECS_PATH, 'hirecs'), tmp_dir)
            utils.link_file(os.path.join(config.HIRECS_PATH, 'libhirecs.so'), tmp_dir)
            utils.link_file(data.file_hig, tmp_dir)
            timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(launcher, tmp_dir))
            if status != 0:
                raise Exception("Run command with error status code {}".format(status))

            with open(os.path.join(tmp_dir, "output"), "r") as output_file:
                output = json.load(output_file)

            mod = output['mod']
            communities = output['communities']

            clusters = {
                int(c): [int(u) for u in communities[c].keys()]
                for c in communities
            }

        self.logger.info("Made %d clusters in %f seconds with modularity %f" % (len(clusters), timecost, mod))

        result = {
            'runname': self.name,
            'params': params,
            'overlap': True,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #13
0
    def run(self,
            data,
            seed=None,
            lamb=1,
            trials=5,
            temp_step=0.999,
            initial_temp=1e-6):
        """Run the ``modopt`` program obtained from ``config.get_OSLOM_prog``.

        The graph's edge-list file (created on demand) is symlinked into a
        temporary directory as ``edges.txt``; the program is run there and
        the ``edges.txtpart`` file is parsed, one tab-separated cluster per line.

        Args:
            data: graph dataset object.
            seed: integer seed passed on the command line; a random one is
                drawn with ``np.random.randint`` when None.
            lamb, trials, temp_step, initial_temp: passed positionally on the
                command line after the seed.

        Raises:
            Exception: if the external command exits with a non-zero status.

        Returns:
            self.  The result dictionary is passed to ``save_result`` and
            kept on ``self.result``.
        """
        if seed is None:
            seed = np.random.randint(999999)
        # Built explicitly rather than from locals(): on CPython < 3.13 the
        # dict returned by locals() is shared with the frame and can be
        # repopulated (e.g. by a debugger), leaking ``self``/``data`` into it.
        params = {
            'seed': seed,
            'lamb': lamb,
            'trials': trials,
            'temp_step': temp_step,
            'initial_temp': initial_temp,
        }
        # The trailing ``False`` keeps this check from ever raising.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            pajek = os.path.join(tmp_dir, 'edges.txt')
            utils.remove_if_file_exit(pajek)
            os.symlink(data.file_edges, pajek)
            cmd = "{} {} {} {} {} {} {}".format(
                config.get_OSLOM_prog('modopt', data.is_directed()), pajek,
                seed, lamb, trials, temp_step, initial_temp)

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            # The partition is read from the input path with "part" appended.
            outputfile = pajek + "part"
            with open(outputfile) as f:
                lines = [u.strip() for u in f]
            lines = [[int(v) for v in u.split("\t")] for u in lines]

        clusters = dict(enumerate(lines))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
コード例 #14
0
    def run(self, data, seed=None):
        """Run the dct program named by ``self.progname`` on the edge list.

        The edge-list file (created on demand) is linked into a temporary
        directory as ``edges.txt``.  The seed is handed to the program through
        the ``SEED`` environment variable; a random one is drawn when ``seed``
        is None.  All ``output*.cluster`` files are read as space-separated
        ``node cluster`` rows and grouped by cluster.

        Raises:
            Exception: if the external command exits with a non-zero status.

        Returns:
            self
        """
        self.logger.warning(
            "dct assumes node starts with zero and is continuous")
        if seed is None:
            seed = np.random.randint(999999)
        params = {'seed': seed, 'prog': self.progname}
        # The trailing ``False`` keeps this check from ever raising.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            edge_path = utils.link_file(data.file_edges,
                                        dest_dir=tmp_dir,
                                        destname='edges.txt')
            prog = config.get_dct_prog(self.progname, data.is_directed())
            cmd = "{} {}".format(prog, edge_path)

            self.logger.info("Running " + cmd)

            def launch():
                return utils.shell_run_and_wait(
                    cmd, tmp_dir, env={'SEED': str(seed)})

            timecost, status = utils.timeit(launch)
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            import pandas as pd
            outputfiles = glob.glob(tmp_dir + "/output*.cluster")
            frames = [pd.read_csv(path, sep=" ", header=None)
                      for path in outputfiles]
            output = pd.concat(frames, ignore_index=True)
            output.columns = ['node', 'cluster']

        pairs = output[['cluster', 'node']]
        clusters = pairs.groupby('cluster').apply(
            lambda grp: list(grp['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }
        save_result(result)
        self.result = result
        return self
コード例 #15
0
    def run(self, data, seed=None):
        """Run the dct ``infomap`` program on the edge list.

        The edge-list file (created on demand) is linked into a temporary
        directory as ``edges.txt``.  The seed is passed with ``-s``; a random
        one is drawn when ``seed`` is None.  The ``output`` file is read as
        one cluster id per row, with the row index taken as the node id.

        Raises:
            Exception: if the external command exits with a non-zero status.

        Returns:
            self
        """
        # NOTE(review): the warning names seq_louvain but the program requested
        # below is 'infomap' -- confirm which one is intended.
        self.logger.warning(
            "dct::seq_louvain assumes node starts with zero and is continuous")
        if seed is None:
            seed = np.random.randint(999999)
        params = {'seed': seed}
        # The trailing ``False`` keeps this check from ever raising.
        if (data.is_directed() or data.is_weighted()) and False:
            raise Exception(
                "only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            edge_path = utils.link_file(data.file_edges,
                                        dest_dir=tmp_dir,
                                        destname='edges.txt')
            prog = config.get_dct_prog('infomap', data.is_directed())
            cmd = "{} -f -s {} -o output {}".format(prog, seed, edge_path)

            self.logger.info("Running " + cmd)

            def launch():
                return utils.shell_run_and_wait(cmd, tmp_dir)

            timecost, status = utils.timeit(launch)
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            import pandas as pd
            outputfile = tmp_dir + "/output"
            output = pd.read_csv(outputfile, sep=" ", header=None)
            output.columns = ['cluster']
            # Row position doubles as the node id.
            output['node'] = range(len(output))

        pairs = output[['cluster', 'node']]
        clusters = pairs.groupby('cluster').apply(
            lambda grp: list(grp['node'])).to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }
        save_result(result)
        self.result = result
        return self
コード例 #16
0
    def run(self, data, algorithm=4, minpts=4, epsilon=0.5, alpha=32768 , beta=32768 , thread=-1, seed=None):
        """Run the program at ``config.ANYSCAN_PROG`` on the graph's anyscan file.

        The input file is created on demand; the program writes ``output``
        into a temporary directory.  Its first line is read as the node count
        and its second line as one space-separated cluster id per node.

        Args:
            data: graph dataset object.
            algorithm, minpts, epsilon, alpha, beta: passed on the command
                line as ``-c``, ``-m``, ``-e``, ``-a`` and ``-b``.
            thread: resolved through ``utils.get_num_thread`` and passed as ``-t``.
            seed: not used; a log line is emitted when one is given.

        Raises:
            Exception: if the command's exit status is not 1, or if the node
                count does not match the number of cluster ids.

        Returns:
            self.  The result dictionary is passed to ``save_result`` and
            kept on ``self.result``.
        """
        if seed is not None:
            self.logger.info("seed ignored")
        thread = utils.get_num_thread(thread)
        # Built explicitly rather than from locals(): on CPython < 3.13 the
        # dict returned by locals() is shared with the frame and can be
        # repopulated (e.g. by a debugger), leaking ``self``/``data`` into it.
        params = {
            'algorithm': algorithm,
            'minpts': minpts,
            'epsilon': epsilon,
            'alpha': alpha,
            'beta': beta,
            'thread': thread,
            'seed': seed,
        }

        # The check is switched off by the leading ``False``.
        if False and (data.is_directed() or data.is_weighted()):
            raise Exception("only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_anyscan):
            data.to_anyscan()

        cmd = "{} -c {} -i {} -m {} -e {} -o {} -a {} -b {} -t {}".format(
            config.ANYSCAN_PROG, algorithm, data.file_anyscan, minpts, epsilon, 'output', alpha, beta, thread)
        self.logger.info("Running " + cmd)

        with utils.TempDir() as tmp_dir:
            timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 1:  # anyscan always return 1
                raise Exception("Run command with error status code {}".format(status))

            with open(os.path.join(tmp_dir, "output"), "r") as output:
                lines = [u.strip() for u in output]

            n_nodes = int(lines[0])
            clusters_list = [int(u) for u in lines[1].split(" ")]

        if n_nodes != len(clusters_list):
            raise Exception("#node is not equals #cluster")

        # Position in the list is the node id; group positions by cluster id.
        from collections import defaultdict
        clusters = defaultdict(list)
        for n, c in enumerate(clusters_list):
            clusters[c].append(n)

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters

        save_result(result)
        self.result = result
        return self
コード例 #17
0
    def run(self, data, mu=3, epsilon=0.5, prog='pScan', seed=None):
        """Run one of the pScan-family executables on the graph's binary files.

        ``prog`` selects the executable from ``config`` (``pScan``,
        ``ppScan`` or ``ppScanSSE``).  ``b_degree.bin`` and ``b_adj.bin`` are
        symlinked into a temporary directory, which is passed to the program
        together with ``epsilon`` and ``mu``.  Rows of the result file whose
        ``c/n`` column equals ``'c'`` are grouped by ``cluster_id``; the
        remaining rows are stored under ``'noise_or_hub'``.  ``seed`` is not
        used (a log line is emitted when one is given).

        Raises:
            Exception: if the external command exits with a non-zero status.

        Returns:
            self
        """
        assert prog in ['pScan', 'ppScan', 'ppScanSSE']
        if seed is not None:
            self.logger.info("seed ignored")

        params = {'mu':mu, 'epsilon':epsilon, 'prog':prog}
        # The check is switched off by the leading ``False``.
        if False and (data.is_directed() or data.is_weighted()):
            raise Exception("only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_scanbin):
            data.to_scanbin()

        with utils.TempDir() as tmp_dir:
            scanbin = data.file_scanbin
            for binary_name in ('b_degree.bin', 'b_adj.bin'):
                os.symlink(os.path.join(scanbin, binary_name),
                           os.path.join(tmp_dir, binary_name))

            if prog == 'pScan':
                EXE = config.PSCAN_PROG
            elif prog == 'ppScan':
                EXE = config.PPSCAN_PROG
            elif prog == 'ppScanSSE':
                EXE = config.PPSCANSSE_PROG

            cmd = "{} {} {} {} {}".format(EXE, tmp_dir, epsilon, mu, 'output')
            self.logger.info("Running " + cmd)

            def launch():
                return utils.shell_run_and_wait(cmd, tmp_dir)

            timecost, status = utils.timeit(launch)
            if status != 0:
                raise Exception("Run command with error status code {}".format(status))

            import pandas as pd
            outputfile = glob.glob(tmp_dir + "/result-*-*.txt")[0]
            output = pd.read_csv(outputfile, sep=" ")

        columns = ['vertex_id', 'cluster_id']
        members = output[output['c/n'] == 'c'][columns]
        others = output[output['c/n'] != 'c'][columns]
        clusters = members.groupby('cluster_id').apply(
            lambda grp: list(grp['vertex_id'])).to_dict()
        others = others.values.tolist()

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
            'noise_or_hub': others,
        }
        save_result(result)
        self.result = result
        return self
コード例 #18
0
    def run(self, data, **kwargs):
        """Run the jar at ``config.GANXISW_PROG`` on the graph's edge list.

        Non-None ``kwargs`` become ``-key value`` command-line options.
        ``Sym`` is set to 1 for undirected data, ``d`` is always set to
        ``output`` and ``r`` defaults to 0.1.  The first
        ``output/SLPAw*.icpm`` file is read, one space-separated cluster per line.

        Raises:
            Exception: if the external command exits with a non-zero status.

        Returns:
            self
        """
        # The directedness check is switched off by the leading ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")

        params = {key: value for key, value in kwargs.items() if value is not None}
        if not data.is_directed():
            params['Sym'] = 1
        params['d'] = "output"
        params.setdefault('r', 0.1)

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        options = ["-{} {}".format(key, value) for key, value in params.items()]
        txt_params = " ".join(options)
        cmd = "{} -jar {} -i {} {} ".format(utils.get_java_command(), config.GANXISW_PROG, "edges.txt", txt_params)

        with utils.TempDir() as tmp_dir:
            out_dir = os.path.join(tmp_dir, "output")
            utils.remove_if_file_exit(out_dir, True)
            utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
            self.logger.info("Running " + cmd)
            utils.link_file(data.file_edges, tmp_dir, "edges.txt")

            def launch():
                return utils.shell_run_and_wait(cmd, tmp_dir)

            timecost, status = utils.timeit(launch)
            if status != 0:
                raise Exception("Run command with error status code {}".format(status))

            outputfile = glob.glob(os.path.join(tmp_dir, "output/SLPAw*.icpm"))[0]
            with open(os.path.join(tmp_dir, outputfile), "r") as handle:
                rows = [[int(u) for u in line.strip().split(" ")] for line in handle]
            clusters = dict(enumerate(rows))

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #19
0
    def run(self,
            data,
            refine=False,
            gamma=1.0,
            par="balanced",
            maxIter=32,
            turbo=True,
            recurse=True,
            seed=None):
        """Detect communities with networkit's ``PLM``.

        Args:
            data: graph dataset object, converted with ``convert.to_networkit``.
            refine, gamma, par, maxIter, turbo, recurse: forwarded as keyword
                arguments to ``networkit.community.PLM``.
            seed: not used; a log line is emitted when one is given.

        Returns:
            self.  The result dictionary is passed to ``save_result`` and
            kept on ``self.result``.
        """
        # The directedness check is switched off by the leading ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")

        # Built explicitly rather than from locals(): on CPython < 3.13 the
        # dict returned by locals() is shared with the frame and can be
        # repopulated (e.g. by a debugger) with ``self``, ``data`` and other
        # locals, which would then be splatted into PLM(...) below.
        params = {
            'refine': refine,
            'gamma': gamma,
            'par': par,
            'maxIter': maxIter,
            'turbo': turbo,
            'recurse': recurse,
        }

        g = convert.to_networkit(data)
        fun = lambda: networkit.community.detectCommunities(
            g, algo=networkit.community.PLM(g, **params))
        timecost, ret = utils.timeit(fun)
        clusters = {}
        for c in ret.getSubsetIds():
            clusters[c] = list(ret.getMembers(c))

        self.logger.info("Made %d clusters in %f seconds." %
                         (len(clusters), timecost))

        result = {}
        result['params'] = params
        result['timecost'] = timecost
        result['runname'] = self.name
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['clusters'] = clusters

        save_result(result)
        self.result = result
        return self
コード例 #20
0
    def run(self,
            data,
            vmax_start=None,
            vmax_end=None,
            c=None,
            niter=None,
            seed=None):
        """Cluster ``data`` with the ``paris`` module loaded from ``config.MODULE_PARIS_PATH``.

        The graph is converted with ``convert.to_networkx``, passed to
        ``paris.paris`` and the result to ``best_clustering`` from the
        ``utils`` module found on the same path.  ``vmax_start``,
        ``vmax_end``, ``c``, ``niter`` and ``seed`` are accepted but not
        used; the recorded params dict is empty.

        Returns:
            self
        """
        # The check is switched off by the leading ``False``.
        if False and (data.is_directed() or data.is_weighted()):
            raise Exception(
                "only undirected and unweighted graph is supported")

        def cluster_with_paris():
            paris_mod = utils.try_import("paris", config.MODULE_PARIS_PATH)
            graph = convert.to_networkx(data)
            dendrogram = paris_mod.paris(graph, copy_graph=False)
            paris_utils = utils.try_import("utils", config.MODULE_PARIS_PATH)
            return paris_utils.best_clustering(dendrogram)

        timecost, best = utils.timeit(cluster_with_paris)

        params = {}

        clusters = dict(enumerate(best))

        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #21
0
    def run(self, data, **kwargs):
        """Cluster ``data`` with ``sklearn.cluster.SpectralClustering``.

        ``kwargs`` are forwarded to the estimator, except that ``seed`` is
        dropped and ``n_jobs`` and ``affinity`` are set by this method.  The
        graph is converted to a similarity matrix and passed to
        ``fit_predict``.  The result is saved via ``save_result``.

        Returns:
            self
        """
        # The directedness check is switched off by the leading ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")

        params = dict(kwargs)
        if "seed" in params:
            # The seed is never forwarded; only mention it when one was given.
            if params.pop('seed') is not None:
                self.logger.info("seed ignored")
        params['n_jobs'] = utils.get_num_thread(params.get('n_jobs'))
        params['affinity'] = 'precomputed'
        # Automatic selection of the 'amg' eigen solver is switched off by
        # the leading ``False``.
        if False and ('eigen_solver' not in params or params['eigen_solver'] is None):
            if utils.check_module_available('pyamg'):
                params['eigen_solver'] = 'amg'
        A = convert.to_coo_adjacency_matrix(data, simalarity=True)

        timecost, labels = utils.timeit(
            lambda: sklearn.cluster.SpectralClustering(**params).fit_predict(A))

        # Group row positions by label; keys are the label rendered as a string.
        clusters = DefaultDict(list)
        for position, label in enumerate(labels):
            clusters[str(label)].append(position)
        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #22
0
    def run(self, data, cutoff_r=0.01, inflation_in=2, NBDisimilarity_q=0.3, seed=None):
        """Run the program at ``config.LABLE_RANK_PROG`` on the graph's edge list.

        The edge-list file (created on demand) is linked into a temporary
        directory as ``edges.txt``.  The first ``output/LabelRank*.icpm``
        file is read, one space-separated cluster per line.

        Args:
            data: graph dataset object.
            cutoff_r, inflation_in, NBDisimilarity_q: passed positionally on
                the command line after the input file name.
            seed: not used by the program; a log line is emitted when one is
                given.  It is still recorded in the result's params.

        Raises:
            Exception: if the external command exits with a non-zero status.

        Returns:
            self.  The result dictionary is passed to ``save_result`` and
            kept on ``self.result``.
        """
        # The directedness check is switched off by the leading ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")
        if seed is not None:
            self.logger.info("seed ignored")
        # Built explicitly rather than from locals(): on CPython < 3.13 the
        # dict returned by locals() is shared with the frame and can be
        # repopulated (e.g. by a debugger), leaking ``self``/``data`` into it.
        params = {
            'cutoff_r': cutoff_r,
            'inflation_in': inflation_in,
            'NBDisimilarity_q': NBDisimilarity_q,
            'seed': seed,
        }

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        cmd = "{} {} {} {} {}".format(config.LABLE_RANK_PROG, "edges.txt", cutoff_r, inflation_in, NBDisimilarity_q)
        with utils.TempDir() as tmp_dir:
            utils.remove_if_file_exit(os.path.join(tmp_dir, "output"), True)
            utils.create_dir_if_not_exists(os.path.join(tmp_dir, "output"))
            self.logger.info("Running " + cmd)
            utils.link_file(data.file_edges, tmp_dir, "edges.txt")
            timecost, status = utils.timeit(lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception("Run command with error status code {}".format(status))
            outputfile = glob.glob(os.path.join(tmp_dir, "output/LabelRank*.icpm"))[0]
            clusters = []
            with open(os.path.join(tmp_dir, outputfile), "r") as f:
                for line in f:
                    clusters.append([int(u) for u in line.strip().split(" ")])
            clusters = dict(enumerate(clusters))

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters

        save_result(result)
        self.result = result
        return self
コード例 #23
0
    def run(self, data, dimensions=32, iterations=200, rho=100.0, seed=None):
        """Cluster ``data`` through ``run_SymmNMF`` and record the outcome.

        ``dimensions``, ``iterations`` and ``rho`` are packed into the
        parameter dictionary passed to ``run_SymmNMF``.  Its return value is
        read as a mapping from a key to either a single cluster label or a
        list of labels; it is inverted here into ``{int(label): [keys]}``.

        ``seed`` is not used; a log line is written when it is not ``None``.

        The result dictionary is handed to ``save_result`` and stored on
        ``self.result``.  Returns ``self``.
        """
        if seed is not None:
            self.logger.info("seed ignored")

        params = {
            'dimensions': dimensions,
            'iterations': iterations,
            'rho': rho,
        }

        timecost, membership = utils.timeit(lambda: run_SymmNMF(data, params))

        # Invert key -> label(s) into label -> [keys].
        clusters = {}
        for key, labels in membership.items():
            if not isinstance(labels, list):
                labels = [labels]
            for label in labels:
                clusters.setdefault(int(label), []).append(key)

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #24
0
    def run(self, data, mu=1, epsilon=0, seed=None):
        """Run the program configured as ``config.SCANPP_PROG`` on ``data``.

        The program is called with ``-e epsilon -m mu -r <edge list file>``
        and its captured output is parsed: the first line must start with
        ``node`` and is discarded, every following non-empty line holds
        tab-separated integers that are unpacked as ``(node, cluster)``.

        ``seed`` is not used; a log line is written when it is not ``None``.

        Raises ``Exception`` when ``data`` is weighted or when the output does
        not start with ``node``.  The result dictionary is handed to
        ``save_result`` and stored on ``self.result``.  Returns ``self``.
        """
        params = {'mu': mu, 'epsilon': epsilon}
        if seed is not None:
            self.logger.info("seed ignored")
        if data.is_weighted():
            raise Exception("only undirected and unweighted graph is supported")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        cmd = "{} -e {} -m {} -r {}".format(
            config.SCANPP_PROG, epsilon, mu, data.file_edges)
        self.logger.info("Running " + cmd)
        timecost, stdout = utils.timeit(lambda: utils.check_output(cmd.split(" ")))
        if not stdout.startswith('node'):
            raise Exception("Something wrong with scapp. output:\n" + stdout)

        # Drop the header line, ignore empty lines, convert fields to int.
        body = [row.strip() for row in stdout.split("\n")][1:]
        pairs = [[int(field) for field in row.split("\t")]
                 for row in body if row]

        from collections import defaultdict
        clusters = defaultdict(list)
        for node, cluster in pairs:
            clusters[cluster].append(node)

        self.logger.info("Made %d clusters in %f seconds" % (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #25
0
    def run(self,
            data,
            v=5,
            v1=None,
            v2=None,
            prop=None,
            repeat=None,
            mo=None,
            nosplit=False,
            extrasimplify=False,
            q=False,
            seed=None):
        """Run the external COPRA program on ``data`` and record its clusters.

        The edge list file of ``data`` (created with ``data.to_edgelist()``
        when missing) is symlinked into a temporary directory as
        ``edges.txt`` and COPRA is started there through the Java command
        returned by ``utils.get_java_command()``.  The file
        ``clusters-edges.txt`` written by the program is then parsed: each
        non-blank line is one cluster given as whitespace-separated integers.

        Args:
            data: dataset object providing ``file_edges``, ``to_edgelist()``,
                ``is_directed()`` and ``name``.
            v: value passed as ``-v``.
            v1, v2: passed together as ``-vs v1 v2``; both or neither must be
                given.
            prop: value passed as ``-prop`` when not ``None``.
            repeat: value passed as ``-repeat`` when not ``None``.
            mo: when truthy, ``-mo`` is added.
            nosplit: when truthy, ``-nosplit`` is added.
            extrasimplify: when truthy, ``-extrasimplify`` is added.
            q: when truthy, ``-q`` is added.
            seed: not used; a log line is written when it is not ``None``.

        Returns:
            ``self``; the result dictionary is handed to ``save_result`` and
            stored on ``self.result``.

        Raises:
            AssertionError: when only one of ``v1`` / ``v2`` is given.
            Exception: when the command exits with a non-zero status.
        """
        assert (v1 is None and v2 is None) or (v1 is not None
                                               and v2 is not None)
        # Take a copy so the recorded parameters do not alias the frame's
        # locals mapping.
        params = dict(locals())
        del params['self']
        del params['data']
        if seed is not None:
            self.logger.info("seed ignored")
        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            pajek = os.path.join(tmp_dir, 'edges.txt')
            utils.remove_if_file_exit(pajek)
            os.symlink(data.file_edges, pajek)
            cmd = "{} -cp {} COPRA {} -w -v {}".format(
                utils.get_java_command(),
                config.get_OSLOM_prog('copra', data.is_directed()), pajek, v)
            if v1 is not None and v2 is not None:
                # Leading space is required, otherwise the option is glued to
                # the value of ``-v``.
                cmd += " -vs {} {}".format(v1, v2)
            if prop is not None:
                cmd += ' -prop {}'.format(prop)
            if repeat is not None:
                cmd += ' -repeat {}'.format(repeat)
            # Boolean switches: only emitted when actually requested.
            if mo:
                cmd += ' -mo'
            if nosplit:
                cmd += ' -nosplit'
            if extrasimplify:
                cmd += ' -extrasimplify'
            if q:
                cmd += ' -q'

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            outputfile = os.path.join(tmp_dir, 'clusters-edges.txt')
            with open(outputfile) as f:
                # Skip blank lines instead of failing on int('').
                members = [[int(node) for node in line.split()]
                           for line in f if line.strip()]

        clusters = dict(enumerate(members))
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {}
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
コード例 #26
0
    def run(self,
            data,
            seed=None,
            r=10,
            hr=50,
            t=0.1,
            cp=0.5,
            fast=False,
            singlet=False,
            infomap=False,
            copra=False,
            louvain=False,
            runs=1):
        """Run the external OSLOM program on ``data`` and record all levels.

        The edge list file of ``data`` (created with ``data.to_edgelist()``
        when missing) and, depending on ``infomap`` / ``copra`` / ``louvain``,
        the matching helper programs are symlinked into a temporary directory.
        OSLOM is started there and every ``tp*`` file in its output folder is
        parsed: ``tp`` is level 0, ``tp<N>`` is level ``N``; lines starting
        with ``#`` are skipped and every other non-blank line is one cluster
        given as whitespace-separated integers.

        Args:
            data: dataset object providing ``file_edges``, ``to_edgelist()``,
                ``is_directed()``, ``is_weighted()`` and ``name``.
            seed: value passed as ``-seed``; a random integer below 999999 is
                drawn when ``None``.
            r, hr, t, cp: values passed as ``-r``, ``-hr``, ``-t``, ``-cp``.
            fast, singlet: when truthy, ``-fast`` / ``-singlet`` is added.
            infomap, copra, louvain: at most one may be truthy; adds the
                matching option followed by ``runs``.
            runs: value passed with the infomap/copra/louvain option.

        Returns:
            ``self``; the result dictionary (with ``multilevel``,
            ``num_level`` and ``max_level`` entries) is handed to
            ``save_result`` and stored on ``self.result``.

        Raises:
            Exception: when more than one of ``infomap`` / ``copra`` /
                ``louvain`` is set, when the command exits with a non-zero
                status, or when no level file could be read.
        """
        # Take a copy so the recorded parameters do not alias the frame's
        # locals mapping.
        params = dict(locals())
        del params['data']
        del params['self']
        if seed is None:
            seed = np.random.randint(999999)
        params['seed'] = seed
        if int(infomap) + int(copra) + int(louvain) > 1:
            raise Exception("only one of infomap, copra, louvain can be true")

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:

            def link_file(path, destname=None):
                # Symlink ``path`` into the working directory and return the
                # path of the link.
                if destname is None:
                    destname = os.path.basename(path)
                destpath = os.path.join(tmp_dir, destname)
                utils.remove_if_file_exit(destpath)
                os.symlink(path, destpath)
                return destpath

            pajek = link_file(data.file_edges)
            if copra:
                link_file(config.get_OSLOM_prog('copra', data.is_directed()))
            if infomap:
                link_file(config.get_OSLOM_prog('infomap_script', True))
                link_file(config.get_OSLOM_prog('infomap', True))
                link_file(config.get_OSLOM_prog('infomap_script', False))
                link_file(config.get_OSLOM_prog('infomap', False))
            if louvain:
                for prog in ('louvain_script', 'convert', 'community',
                             'hierarchy'):
                    link_file(config.get_OSLOM_prog(prog, data.is_directed()))

            cmd = "{} -f {} -{} -r {} -hr {} -seed {} -t {} -cp {}".format(
                config.get_OSLOM_prog('oslom', data.is_directed()), pajek,
                'w' if data.is_weighted() else 'uw', r, hr, seed, t, cp)
            if fast:
                cmd += " -fast"
            if singlet:
                cmd += " -singlet"
            if infomap:
                cmd += " -infomap {}".format(runs)
            if copra:
                cmd += " -copra {}".format(runs)
            if louvain:
                cmd += " -louvain {}".format(runs)

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            # The output folder is named after the network file passed with
            # ``-f``; derive it from the link instead of assuming the edge
            # list is called "edges.txt".
            output_dir = pajek + "_oslo_files"
            clusters = {}
            for tp in glob.glob(os.path.join(output_dir, "tp*")):
                suffix = os.path.basename(tp)[2:]
                if suffix == '':
                    level = 0
                elif suffix.isdigit():
                    level = int(suffix)
                else:
                    # NOTE(review): a "tp*" file whose suffix is not a level
                    # number is not a level file; skipped rather than crashing
                    # in int(). Confirm against the OSLOM output layout.
                    continue
                with open(tp) as f:
                    rows = [line.split() for line in f
                            if not line.startswith('#')]
                # Blank lines yield empty rows and are dropped.
                clusters[level] = dict(
                    enumerate([int(node) for node in row]
                              for row in rows if row))

            if not clusters:
                raise Exception(
                    "no OSLOM level files found in {}".format(output_dir))
            max_level = max(clusters)

        self.logger.info(
            "Made %d levels of clusters with #clusters %s in %f seconds" %
            (len(clusters), str([len(u)
                                 for u in clusters.values()]), timecost))

        result = {}
        result['multilevel'] = True
        result['num_level'] = len(clusters)
        result['max_level'] = max_level
        result['runname'] = self.name
        result['params'] = params
        result['dataname'] = data.name
        result['meta'] = self.get_meta()
        result['timecost'] = timecost
        result['clusters'] = clusters
        save_result(result)
        self.result = result
        return self
コード例 #27
0
    def run(self,
            data,
            startk=None,
            finalk=None,
            runs=None,
            ensemblesize=None,
            algorithm=None,
            seed=None):
        """Run the program configured as ``config.CGGC_PROG`` on ``data``.

        Every argument that is not ``None`` is passed as ``--name=value``,
        together with the fixed options ``inpfmt=e``, ``outfile=output`` and
        ``outfmt=l``; the edge list file of ``data`` is the last argument.
        When ``seed`` is ``None`` a random integer below 999999 is drawn.

        The file ``output`` written in the temporary working directory is
        parsed line by line: lines starting with ``#`` are skipped, every
        other line is a cluster of space-separated integers keyed by its line
        index.

        Raises ``Exception`` when the command exits with a non-zero status.
        The result dictionary is handed to ``save_result`` and stored on
        ``self.result``.  Returns ``self``.
        """
        # This guard is switched off by the constant ``False``.
        if False and (data.is_directed() or data.is_weighted()):
            raise Exception(
                "only undirected and unweighted graph is supported")

        # Record the call arguments (everything except ``self`` and ``data``).
        params = locals()
        params.pop('self')
        params.pop('data')
        if seed is None:
            seed = np.random.randint(999999)
        params.update(seed=seed, inpfmt='e', outfile='output', outfmt='l')
        params = {
            name: value
            for name, value in params.items() if value is not None
        }

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        program = config.CGGC_PROG
        options = " ".join(
            ['--{}={}'.format(name, value) for name, value in params.items()])
        cmd = "{} {} {}".format(program, options, data.file_edges)
        self.logger.info("Running " + cmd)

        with utils.TempDir() as tmp_dir:
            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            with open(os.path.join(tmp_dir, "output"), "r") as handle:
                lines = [raw.strip() for raw in handle]

        from collections import defaultdict
        clusters = defaultdict(list)
        for index, line in enumerate(lines):
            if not line.startswith('#'):
                clusters[index].extend(
                    int(token) for token in line.split(" "))

        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #28
0
    def run(self, data, **kwargs):
        """Run the programs ``config.MCL_PROG`` / ``config.MCLDUMP_PROG``.

        Keyword arguments that are not ``None`` are turned into command line
        options for the first program (``-x value`` for one-letter names,
        ``--name value`` otherwise); ``seed`` is dropped (a log line is
        written when it was not ``None``) and ``o`` is always set to
        ``output``.  Both commands are written to a small script that is run
        with ``bash`` in a temporary directory.

        The file ``cluster.output`` is then parsed: the first two
        tab-separated fields of each line are read as ``(cluster, node)``.

        Raises ``Exception`` when the mci file cannot be created or when the
        script exits with a non-zero status.  The result dictionary is handed
        to ``save_result`` and stored on ``self.result``.  Returns ``self``.
        """
        # This guard is switched off by the constant ``False``.
        if False and (data.is_directed()):
            raise Exception("only undirected is supported")

        params = dict(kwargs)
        if "seed" in params:
            dropped_seed = params.pop('seed')
            if dropped_seed is not None:
                self.logger.info("seed ignored")

        params['o'] = 'output'
        params = {
            name: value
            for name, value in params.items() if value is not None
        }

        if not utils.file_exists(data.file_mcl_mci):
            data.to_mcl_mci()
            if not utils.file_exists(data.file_mcl_mci):
                raise Exception("failed to crate mcl mci format file")

        with utils.TempDir() as tmp_dir:
            option_parts = []
            for name, value in params.items():
                dashes = '-' if len(name) == 1 else '--'
                option_parts.append(
                    '{}{} {}'.format(dashes, name, value).strip())
            cmd1 = "{} {} {}".format(config.MCL_PROG, data.file_mcl_mci,
                                     " ".join(option_parts))
            cmd2 = "{} -imx {} -o cluster.output".format(
                config.MCLDUMP_PROG, 'output')

            # Both commands go into one script so they run back to back.
            cmdfile = os.path.join(tmp_dir, "tmpcmd.sh")
            with open(cmdfile, 'wt') as script:
                script.write(cmd1 + "\n" + cmd2 + "\n")
            self.logger.info("Running " + cmd1)
            self.logger.info("Running " + cmd2)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait("bash " + cmdfile, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            with open(os.path.join(tmp_dir, "cluster.output"), "r") as handle:
                lines = [raw.strip() for raw in handle]

        from collections import defaultdict
        clusters = defaultdict(list)
        for line in lines:
            cluster_id, node_id = line.split("\t")[:2]
            clusters[int(cluster_id)].append(int(node_id))

        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self
コード例 #29
0
    def run(self, data, **kwargs):
        """Run the program configured as ``config.LSO_CLUSTER_PROG``.

        Keyword arguments that are not ``None`` become ``--name value``
        options.  ``loss`` defaults to ``'modularity'``; for the losses
        ``'pmod'`` and ``'mom'`` a keyword argument of the same name must be
        given and the option is rendered as ``--loss {name,value}``.

        The edge list file of ``data`` is linked into a temporary directory
        as ``edges.txt``, the command is written to the file ``tmpcmd`` and
        run with ``bash -x``.  The tab-separated file ``lsooutput`` is read
        with pandas as ``(node, cluster)`` rows and grouped by cluster.

        Raises ``Exception`` when the value for a ``pmod`` / ``mom`` loss is
        missing or when the command exits with a non-zero status.  The result
        dictionary is handed to ``save_result`` and stored on ``self.result``.
        Returns ``self``.
        """
        # This guard is switched off by the constant ``False``.
        if False and (data.is_directed() or data.is_weighted()):
            raise Exception(
                "only undirected and unweighted graph is supported")

        params = {
            name: value
            for name, value in kwargs.items() if value is not None
        }
        params.setdefault('loss', 'modularity')

        # Work on a copy: the loss-related entries are consumed below while
        # ``params`` is recorded unchanged in the result.
        argparams = dict(params)
        loss = argparams.pop('loss')
        if loss in ('pmod', 'mom'):
            if loss not in argparams:
                raise Exception(
                    "You have to specify pmod=<val> or mom=<val> for the loss function"
                )
            loss_args = "--loss {{{},{}}}".format(loss, argparams.pop(loss))
        else:
            loss_args = "--loss {}".format(loss)

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        with utils.TempDir() as tmp_dir:
            pajek = utils.link_file(data.file_edges,
                                    dest_dir=tmp_dir,
                                    destname='edges.txt')
            option_parts = [
                "--{} {}".format(name, value)
                for name, value in argparams.items()
            ]
            option_parts += [loss_args, "-o lsooutput"]
            cmd = "{} {} {}".format(config.LSO_CLUSTER_PROG, pajek,
                                    " ".join(option_parts))
            with open(os.path.join(tmp_dir, "tmpcmd"), 'wt') as script:
                script.write(cmd)

            self.logger.info("Running " + cmd)

            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait("bash -x ./tmpcmd", tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            outputfiles = tmp_dir + "/lsooutput"
            import pandas as pd

            table = pd.read_csv(outputfiles, sep="\t", header=None)
            table.columns = ['node', 'cluster']

        grouped = table.groupby('cluster').apply(
            lambda group: list(group['node']))
        clusters = grouped.to_dict()
        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }
        save_result(result)
        self.result = result
        return self
コード例 #30
0
    def run(self,
            data,
            vmax_start=None,
            vmax_end=None,
            c=None,
            niter=None,
            seed=None):
        """Run the program configured as ``config.STREAMCOM_PROG``.

        Every argument that is not ``None`` is passed as an option, with
        underscores in the name replaced by dashes (``-x value`` for
        one-letter names, ``--name value`` otherwise).  ``f`` is set to the
        edge list file of ``data`` and ``o`` to ``output``; the edge list
        file is also appended as the last argument.

        The first file matching ``output*`` in the temporary working
        directory is parsed line by line: lines starting with ``#`` are
        skipped, every other line is a cluster of space-separated integers
        keyed by its line index.

        Raises ``Exception`` when the command exits with a non-zero status.
        The result dictionary is handed to ``save_result`` and stored on
        ``self.result``.  Returns ``self``.
        """
        # This guard is switched off by the constant ``False``.
        if False and (data.is_directed() or data.is_weighted()):
            raise Exception(
                "only undirected and unweighted graph is supported")

        # Record the call arguments (everything except ``self`` and ``data``).
        params = locals()
        params.pop('self')
        params.pop('data')
        params.update(f=data.file_edges, o='output')
        params = {
            name.replace("_", "-"): value
            for name, value in params.items() if value is not None
        }

        if not utils.file_exists(data.file_edges):
            data.to_edgelist()

        program = config.STREAMCOM_PROG
        option_parts = []
        for name, value in params.items():
            dashes = '-' if len(name) == 1 else '--'
            option_parts.append('{}{} {}'.format(dashes, name, value))
        cmd = "{} {} {}".format(program, " ".join(option_parts),
                                data.file_edges)
        self.logger.info("Running " + cmd)

        with utils.TempDir() as tmp_dir:
            timecost, status = utils.timeit(
                lambda: utils.shell_run_and_wait(cmd, tmp_dir))
            if status != 0:
                raise Exception(
                    "Run command with error status code {}".format(status))

            output_path = glob.glob(os.path.join(tmp_dir, "output*"))[0]
            with open(os.path.join(tmp_dir, output_path), "r") as handle:
                lines = [raw.strip() for raw in handle]

        from collections import defaultdict
        clusters = defaultdict(list)
        for index, line in enumerate(lines):
            if not line.startswith('#'):
                clusters[index].extend(
                    int(token) for token in line.split(" "))

        self.logger.info("Made %d clusters in %f seconds" %
                         (len(clusters), timecost))

        result = {
            'runname': self.name,
            'params': params,
            'dataname': data.name,
            'meta': self.get_meta(),
            'timecost': timecost,
            'clusters': clusters,
        }

        save_result(result)
        self.result = result
        return self