def test_small(self):
    d = Set()
    for x in range(15):
        d.add(str(x))
    self.assertEqual(hash27("".join(d)), 6636034109572507556)
def test_large(self):
    d = Set()
    for x in range(60000):
        d.add(str(x))
    self.assertEqual(hash27("".join(d)), -35326655653467556)
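# These tests rely on one property of py27hash: a Set iterates in the same
# order CPython 2.7 would, so joining its elements and hashing the result with
# hash27 gives a reproducible fingerprint.  A minimal standalone sketch of
# that pattern, assuming the usual py27hash imports (hash27 from py27hash.hash,
# Set from py27hash.set); the printed value is illustrative, not taken from
# the test suite:
#
#     from py27hash.hash import hash27
#     from py27hash.set import Set
#
#     s = Set()
#     for x in range(15):
#         s.add(str(x))
#     print(hash27("".join(s)))  # stable across runs because ordering is fixed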
def core_removal(threshold, graph):
    if len(graph) == 1:  # need at least two nodes in the graph...
        return [graph]

    avg_deg, density = graph_stats(graph)
    if density >= threshold:
        return [graph]
    else:
        # find and remove core nodes; create connected subcomponents
        core_nodes = get_core_nodes(graph, avg_deg)
        result = []
        subgraphs = []
        for v, n in graph.items():
            if v in core_nodes:
                continue
            n = n - core_nodes  # note that we're reassigning n
            for s in subgraphs:
                if not n.isdisjoint(s):
                    s |= n
                    break
            else:
                subgraphs.append(n | Set([v]))

        # connected subcomponent joining
        i = 0
        while i < len(subgraphs) - 1:
            j = i + 1
            while j < len(subgraphs):
                if not subgraphs[i].isdisjoint(subgraphs[j]):
                    subgraphs[i] |= subgraphs[j]
                    subgraphs.pop(j)
                else:
                    j += 1
            i += 1

        # recursive core removal
        for s in subgraphs:
            tresults = core_removal(threshold, Dict((v, graph[v] & s) for v in s))
            for tc in tresults:
                nodes = Set()
                for v, n in tc.items():
                    nodes.add(v)
                    n |= graph[v] & core_nodes
                for c in core_nodes:
                    tc[c] = graph[c] & (nodes | core_nodes)
            result += tresults
        return result
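# A quick sanity check for core_removal, assuming the module-level helpers
# graph_stats and get_core_nodes plus py27hash's Dict/Set are in scope, and
# that the graph is given as {node: Set of neighbors}.  A triangle is already
# dense (density 1.0), so it should come back unchanged as a single subgraph:
#
#     triangle = Dict((v, Set("abc") - Set([v])) for v in "abc")
#     parts = core_removal(0.7, triangle)
#     assert len(parts) == 1 and Set(parts[0]) == Set("abc")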
def test_copy(self):
    d = Set()
    for x in range(500):
        d.add(str(x))
    d = d.copy()
    self.assertEqual(hash27("".join(d)), 1141231293364439680)
def test_values(self):
    d = Set()
    d.add(("abc", 1))
    d.add(3.3)
    d.add(30)
    d.add("test1234")
    self.assertEqual(hash27("".join([str(k) for k in d])), 7766555225202364718)
def test_pop(self):
    d = Set()
    for x in range(500):
        d.add(str(x))
    d.pop()
    self.assertEqual(hash27("".join(d)), -434207861779954688)
def test_delete(self):
    d = Set()
    for x in range(500):
        d.add(str(x))
    d.remove("53")
    d.discard("155")
    self.assertEqual(hash27("".join(d)), -8652364590473687932)
def test_clear(self):
    d = Set()
    for x in range(500):
        d.add(str(x))
    d.clear()
    for x in range(1000, 1500):
        d.add(str(x))
    self.assertEqual(hash27("".join(d)), -1473514505880218088)
def test_pickle(self):
    d = Set()
    for x in range(500):
        d.add(str(x))
    d.remove("300")

    # Pickle and reload object
    data = pickle.dumps(d)
    d = pickle.loads(data)

    self.assertEqual(hash27("".join(d)), 6818550152093286356)
def test_merge(self):
    # Build lists of values to preserve insertion ordering
    d = []
    e = []
    for x in range(200):
        d.append(str(x))
    for x in range(200):
        e.append(str(x))

    m = Set(d)
    m.update(e)

    self.assertEqual(hash27("".join(m)), -5846033856052761336)
def cluster(self, verbose=False):
    data = Dict()
    with open(self.filename, 'r') as f:
        for line in f:
            a, b = line.split()[:2]
            if a in data:
                data[a].add(b)
            else:
                data[a] = Set()
                data[a].add(b)
            if b in data:
                data[b].add(a)
            else:
                data[b] = Set()
                data[b].add(a)

    # step 1: find preliminary cores
    SC = []  # currently-detected preliminary cores
    count = 0
    for vertex, neighbors in tqdm(data.items()):
        # build neighborhood graph
        vertices = Set([vertex]) | neighbors
        size1_neighbors = Set()
        graph = {}
        for v in vertices:
            n = data[v] & vertices
            if len(n) > 1:  # ignore size-1 vertices
                graph[v] = n
            else:
                size1_neighbors.add(v)
        if len(graph) < 2:  # not enough connections in this graph
            continue
        graph[vertex] -= size1_neighbors

        # get core graph
        avg_deg, density = graph_stats(graph)
        core_nodes = get_core_nodes(graph, avg_deg)
        vertices = Set(graph.keys())
        for v in vertices - core_nodes:
            del graph[v]
        for n in graph.values():
            n &= core_nodes
        if len(graph) < 2:  # not enough connections in this graph
            continue
        graph_nodes = Set(graph)

        # inner loop
        for sg in core_removal(self.density_threshold, graph):
            while True:
                _, density = graph_stats(sg)
                # if density threshold met, stop; else, remove min degree node
                if density >= self.density_threshold:
                    break
                w = min(sg.items(), key=lambda k: len(k[1]))[0]
                del sg[w]
                for n in sg.values():
                    n.discard(w)

            sg_nodes = Set(sg)
            while graph_nodes - sg_nodes:
                w = max(graph_nodes - sg_nodes,
                        key=lambda v: len(graph[v] & sg_nodes))
                new_sg = sg.copy()
                for v, n in new_sg.items():
                    if w in graph[v]:
                        n.add(w)
                new_sg[w] = graph[w] & sg_nodes
                _, density = graph_stats(new_sg)
                if density < self.density_threshold:
                    break
                sg = new_sg
                sg_nodes.add(w)

            # redundancy filtering
            max_sim = -1
            for i in range(len(SC)):
                sim = NA_score(Set(SC[i]), sg_nodes)
                if sim > max_sim:
                    max_sim = sim
                    index = i
            if max_sim < self.affinity_threshold:
                SC.append(sg)
            else:
                _, density_i = graph_stats(SC[index])
                if density * len(sg) > density_i * len(SC[index]):
                    SC[index] = sg

    # step 2: adding peripheral proteins
    clusters = Set()
    for core in SC:
        nodes = frozenset(core)
        neighbors = reduce(lambda x, y: x | y, (data[v] for v in nodes)) - nodes
        neighbors -= Set(v for v in neighbors
                         if float(len(data[v] & nodes)) / len(nodes) <= self.closeness_threshold)
        clusters.add(tuple(nodes | neighbors))

    self.clusters = clusters
    print("Found %d clusters/protein complexes" % (len(clusters)))
    return clusters


# if __name__ == '__main__':
#     filename = "../data/unweighted_example_network.txt"
#     c = COACH(filename)
#     c.cluster()
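# Example of consuming the result, assuming a COACH instance is constructed as
# in the commented-out block above.  cluster() both returns the clusters and
# stores them on self.clusters; each cluster is a tuple of node ids:
#
#     c = COACH("../data/unweighted_example_network.txt")
#     for complex_nodes in c.cluster():
#         print(len(complex_nodes), " ".join(sorted(complex_nodes)))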
from functools import reduce

from py27hash.dict import Dict
from py27hash.set import Set
from tqdm import tqdm

from .cluster_alg import ClusterAlg


# return average degree and density for a graph
def graph_stats(graph):
    avg_deg = sum(len(n) for n in graph.values()) / float(len(graph))
    density = avg_deg / (len(graph) - 1)
    return avg_deg, density


# return core nodes, given a graph and its average degree
get_core_nodes = lambda g, avg: Set(v for v, n in g.items() if len(n) >= avg)

# return NA score
NA_score = lambda a, b: float(len(a & b)**2) / (len(a) * len(b))
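# Small worked example of the helpers above, on a path graph a-b-c with the
# adjacency given as {node: Set of neighbors}:
#
#     path = {"a": Set("b"), "b": Set("ac"), "c": Set("b")}
#     avg_deg, density = graph_stats(path)   # avg_deg = 4/3, density = 2/3
#     get_core_nodes(path, avg_deg)          # Set(["b"]) -- only b meets the average degree
#     NA_score(Set("ab"), Set("bc"))         # 1**2 / (2 * 2) = 0.25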
def cluster(self, verbose=False):
    # data = defaultdict(Set)  # node id => neighboring node ids
    data = Dict()

    # read in graph
    with open(self.filename, 'r') as f:
        counter = 0
        for line in f:
            a, b = line.split()[:2]
            counter += 1
            if a in data:
                data[a].add(b)
            else:
                data[a] = Set()
                data[a].add(b)
            if b in data:
                data[b].add(a)
            else:
                data[b] = Set()
                data[b].add(a)

    # weights = defaultdict(int)
    weights = Dict()
    for a, b in combinations(data, 2):
        if b not in data[a]:
            continue
        shared = len(data[a] & data[b])
        if a in weights:
            weights[a] += shared
        else:
            weights[a] = 0
            weights[a] += shared
        if b in weights:
            weights[b] += shared
        else:
            weights[b] = 0
            weights[b] += shared

    unvisited = Set(data)
    num_clusters = 0
    clusters = []

    # print(unvisited)
    # return 0

    # Potential culprit
    seed_nodes = sorted(data,
                        key=lambda k: (weights[k], len(data[k])),
                        reverse=True)

    for seed in seed_nodes:  # get highest degree node
        if seed not in unvisited:
            continue

        cluster = Set((seed, next(iter(data[seed]))))  # seed and random neighbor

        while True:
            # rank neighbors by the number of edges between the node and cluster nodes
            frontier = sorted((len(data[p] & cluster), p)
                              for p in Set.union(*((data[n] - cluster) for n in cluster)))

            # do this until IN_vk < T_IN, SP <= 2 is met, or no frontier nodes left
            found = False
            while frontier and not found:
                m_vk, p = frontier.pop()
                if m_vk < self.t_in * len(cluster):
                    break
                c_2neighbors = data[p] & cluster
                c_2neighbors.update(*(data[c] & cluster for c in c_2neighbors))
                if cluster == c_2neighbors:
                    found = True
                    break

            if not found:
                break

            # otherwise, add the node to the cluster
            cluster.add(p)

        unvisited -= cluster
        if verbose:
            print(' '.join(cluster))

        num_clusters += 1
        if verbose:
            print(num_clusters, len(cluster), len(unvisited))
        clusters.append(cluster)

        if not unvisited:
            break

    self.clusters = clusters


# if __name__ == '__main__':
#     filename = "../data/unweighted_example_network.txt"
#     c = IPCA(filename)
#     c.cluster()
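# Example run, assuming IPCA is constructed like the commented-out block above
# and exposes the T_IN parameter as self.t_in; clusters end up on
# self.clusters as a list of Sets of node ids:
#
#     c = IPCA("../data/unweighted_example_network.txt")
#     c.cluster(verbose=True)
#     biggest = max(c.clusters, key=len)
#     print(len(c.clusters), "clusters; largest has", len(biggest), "nodes")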
def ipca(filename):
    data = defaultdict(Set)  # node id => neighboring node ids

    # read in graph
    with open(filename, 'r') as f:
        counter = 0
        for line in f:
            a, b = line.split()[:2]
            print(a, b)
            print(counter)
            counter += 1
            data[a].add(b)
            data[b].add(a)

    weights = defaultdict(int)
    for a, b in combinations(data, 2):
        if b not in data[a]:
            continue
        shared = len(data[a] & data[b])
        weights[a] += shared
        weights[b] += shared

    unvisited = Set(data)
    num_clusters = 0
    clusters = []

    # Potential culprit
    seed_nodes = sorted(data, key=lambda k: (weights[k], len(data[k])), reverse=True)

    for seed in seed_nodes:  # get highest degree node
        if seed not in unvisited:
            continue

        cluster = Set((seed, next(iter(data[seed]))))  # seed and random neighbor

        while True:
            # rank neighbors by the number of edges between the node and cluster nodes
            frontier = sorted((len(data[p] & cluster), p)
                              for p in Set.union(*((data[n] - cluster) for n in cluster)))

            # do this until IN_vk < T_IN, SP <= 2 is met, or no frontier nodes left
            found = False
            while frontier and not found:
                m_vk, p = frontier.pop()
                if m_vk < T_IN * len(cluster):
                    break
                c_2neighbors = data[p] & cluster
                c_2neighbors.update(*(data[c] & cluster for c in c_2neighbors))
                if cluster == c_2neighbors:
                    found = True
                    break

            if not found:
                break

            # otherwise, add the node to the cluster
            cluster.add(p)

        unvisited -= cluster
        print(' '.join(cluster))

        num_clusters += 1
        print(num_clusters, len(cluster), len(unvisited))
        clusters.append(cluster)

        if not unvisited:
            break
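# The script version above expects an edge list: one "node_a node_b" pair per
# line, with any extra columns ignored.  A minimal sketch that writes such a
# file and runs ipca on it; the filename and graph here are made up for
# illustration, and T_IN plus the imports (defaultdict, combinations, Set) are
# assumed to be defined at module level as in the class-based port:
#
#     edges = [("a", "b"), ("b", "c"), ("a", "c"), ("c", "d")]
#     with open("toy_network.txt", "w") as out:
#         for u, v in edges:
#             out.write("%s %s\n" % (u, v))
#     ipca("toy_network.txt")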
def coach(filename):
    # read protein-protein pairs
    # data = defaultdict(Set)
    data = Dict()
    with open(filename, 'r') as f:
        for line in f:
            a, b = line.split()[:2]
            if a in data:
                data[a].add(b)
            else:
                data[a] = Set()
                data[a].add(b)
            if b in data:
                data[b].add(a)
            else:
                data[b] = Set()
                data[b].add(a)

    # step 1: find preliminary cores
    SC = []  # currently-detected preliminary cores
    count = 0
    for vertex, neighbors in tqdm(data.items()):
        # build neighborhood graph
        vertices = Set([vertex]) | neighbors
        size1_neighbors = Set()
        graph = {}
        for v in vertices:
            n = data[v] & vertices
            if len(n) > 1:  # ignore size-1 vertices
                graph[v] = n
            else:
                size1_neighbors.add(v)
        if len(graph) < 2:  # not enough connections in this graph
            continue
        graph[vertex] -= size1_neighbors

        # get core graph
        avg_deg, density = graph_stats(graph)
        core_nodes = get_core_nodes(graph, avg_deg)
        vertices = Set(graph.keys())
        for v in vertices - core_nodes:
            del graph[v]
        for n in graph.values():
            n &= core_nodes
        if len(graph) < 2:  # not enough connections in this graph
            continue
        graph_nodes = Set(graph)

        # inner loop
        for sg in core_removal(graph):
            while True:
                _, density = graph_stats(sg)
                # if density threshold met, stop; else, remove min degree node
                if density >= DENSITY_THRESHOLD:
                    break
                w = min(sg.items(), key=lambda k: len(k[1]))[0]
                del sg[w]
                for n in sg.values():
                    n.discard(w)

            sg_nodes = Set(sg)
            while graph_nodes - sg_nodes:
                w = max(graph_nodes - sg_nodes,
                        key=lambda v: len(graph[v] & sg_nodes))
                new_sg = sg.copy()
                for v, n in new_sg.items():
                    if w in graph[v]:
                        n.add(w)
                new_sg[w] = graph[w] & sg_nodes
                _, density = graph_stats(new_sg)
                if density < DENSITY_THRESHOLD:
                    break
                sg = new_sg
                sg_nodes.add(w)

            # redundancy filtering
            max_sim = -1
            for i in range(len(SC)):
                sim = NA_score(Set(SC[i]), sg_nodes)
                if sim > max_sim:
                    max_sim = sim
                    index = i
            if max_sim < AFFINITY_THRESHOLD:
                SC.append(sg)
            else:
                _, density_i = graph_stats(SC[index])
                if density * len(sg) > density_i * len(SC[index]):
                    SC[index] = sg

    # step 2: adding peripheral proteins
    clusters = Set()
    for core in SC:
        nodes = frozenset(core)
        neighbors = reduce(lambda x, y: x | y, (data[v] for v in nodes)) - nodes
        neighbors -= Set(v for v in neighbors
                         if float(len(data[v] & nodes)) / len(nodes) <= CLOSENESS_THRESHOLD)
        print(nodes)
        print(neighbors)
        print(nodes | neighbors)
        clusters.add(tuple(nodes | neighbors))

    return clusters
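# Example invocation of the script-level coach(), assuming the module-level
# constants DENSITY_THRESHOLD, AFFINITY_THRESHOLD and CLOSENESS_THRESHOLD it
# references are defined, along with the one-argument core_removal this
# version calls.  The return value is a Set of tuples of node ids:
#
#     complexes = coach("../data/unweighted_example_network.txt")
#     for complex_nodes in sorted(complexes, key=len, reverse=True):
#         print(len(complex_nodes), " ".join(complex_nodes))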