Example #1
    def test_small(self):
        d = Set()

        for x in range(15):
            d.add(str(x))

        self.assertEqual(hash27("".join(d)), 6636034109572507556)
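These tests exercise py27hash's Set together with hash27, which reproduce Python 2.7's hashing and iteration order under Python 3; hashing the joined elements fingerprints that order. A minimal harness sketch for running them (the TestSet class name is hypothetical):

import unittest

from py27hash.hash import hash27
from py27hash.set import Set

class TestSet(unittest.TestCase):  # hypothetical container for the tests shown here
    def test_small(self):
        d = Set()
        for x in range(15):
            d.add(str(x))
        # the digest below pins down Python 2.7 iteration order
        self.assertEqual(hash27("".join(d)), 6636034109572507556)

if __name__ == "__main__":
    unittest.main()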
Example #2
    def test_large(self):
        d = Set()

        for x in range(60000):
            d.add(str(x))

        self.assertEqual(hash27("".join(d)), -35326655653467556)
Example #3
def core_removal(threshold, graph):
    if len(graph) == 1:  # need at least two nodes in the graph...
        return [graph]

    avg_deg, density = graph_stats(graph)
    if density >= threshold:
        return [graph]
    else:
        # find and remove core nodes; create connected subcomponents
        core_nodes = get_core_nodes(graph, avg_deg)
        result = []
        subgraphs = []
        for v, n in graph.items():
            if v in core_nodes: continue
            n = n - core_nodes  # note that we're reassigning n
            for s in subgraphs:
                if not n.isdisjoint(s):
                    s |= n
                    break
            else:
                subgraphs.append(n | Set([v]))
        # connected subcomponent joining
        i = 0
        while i < len(subgraphs) - 1:
            j = i + 1
            while j < len(subgraphs):
                if not subgraphs[i].isdisjoint(subgraphs[j]):
                    subgraphs[i] |= subgraphs[j]
                    subgraphs.pop(j)
                else:
                    j += 1
            i += 1
        # recursive core removal
        for s in subgraphs:
            tresults = core_removal(threshold,
                                    Dict((v, graph[v] & s) for v in s))
            for tc in tresults:
                nodes = Set()
                for v, n in tc.items():
                    nodes.add(v)
                    n |= graph[v] & core_nodes
                for c in core_nodes:
                    tc[c] = graph[c] & (nodes | core_nodes)
            result += tresults
        return result
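A minimal usage sketch, assuming core_removal above plus the graph_stats and get_core_nodes helpers shown in Example #12 are in scope; a triangle already exceeds the density threshold, so it comes back as a single component:

from py27hash.set import Set

# triangle graph: every vertex adjacent to the other two
triangle = {"a": Set("bc"), "b": Set("ac"), "c": Set("ab")}

# avg_deg = 2.0 and density = 2.0 / (3 - 1) = 1.0 >= 0.7, so no cores are removed
print(core_removal(0.7, triangle))  # [triangle]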
Example #4
    def test_copy(self):
        d = Set()

        for x in range(500):
            d.add(str(x))

        d = d.copy()

        self.assertEqual(hash27("".join(d)), 1141231293364439680)
Example #5
    def test_values(self):
        d = Set()
        d.add(("abc", 1))
        d.add(3.3)
        d.add(30)
        d.add("test1234")

        self.assertEqual(hash27("".join([str(k) for k in d])),
                         7766555225202364718)
Example #6
    def test_pop(self):
        d = Set()

        for x in range(500):
            d.add(str(x))

        d.pop()

        self.assertEqual(hash27("".join(d)), -434207861779954688)
Example #7
    def test_delete(self):
        d = Set()

        for x in range(500):
            d.add(str(x))

        d.remove("53")
        d.discard("155")

        self.assertEqual(hash27("".join(d)), -8652364590473687932)
Example #8
    def test_clear(self):
        d = Set()

        for x in range(500):
            d.add(str(x))

        d.clear()

        for x in range(1000, 1500):
            d.add(str(x))

        self.assertEqual(hash27("".join(d)), -1473514505880218088)
Example #9
    def test_pickle(self):
        d = Set()

        for x in range(500):
            d.add(str(x))

        d.remove("300")

        # Pickle and reload object
        data = pickle.dumps(d)
        d = pickle.loads(data)

        self.assertEqual(hash27("".join(d)), 6818550152093286356)
Example #10
    def test_merge(self):
        # Build lists of elements to preserve insertion ordering
        d = []
        e = []

        for x in range(200):
            d.append(str(x))

        for x in range(200):
            e.append(str(x))

        m = Set(d)
        m.update(e)

        self.assertEqual(hash27("".join(m)), -5846033856052761336)
Example #11
    def cluster(self, verbose=False):

        data = Dict()  # node id => neighboring node ids

        with open(self.filename, 'r') as f:
            for line in f:
                a, b = line.split()[:2]

                if a not in data:
                    data[a] = Set()
                data[a].add(b)
                if b not in data:
                    data[b] = Set()
                data[b].add(a)

        # step 1: find preliminary cores
        SC = []  # currently-detected preliminary cores
        for vertex, neighbors in tqdm(data.items()):
            # build neighborhood graph
            vertices = Set([vertex]) | neighbors
            size1_neighbors = Set()
            graph = {}
            for v in vertices:
                n = data[v] & vertices
                if len(n) > 1:  # ignore size-1 vertices
                    graph[v] = n
                else:
                    size1_neighbors.add(v)
            if len(graph) < 2:  # not enough connections in this graph
                continue
            graph[vertex] -= size1_neighbors

            # get core graph
            avg_deg, density = graph_stats(graph)
            core_nodes = get_core_nodes(graph, avg_deg)
            vertices = Set(graph.keys())
            for v in vertices - core_nodes:
                del graph[v]
            for n in graph.values():
                n &= core_nodes
            if len(graph) < 2:  # not enough connections in this graph
                continue
            graph_nodes = Set(graph)

            # inner loop
            for sg in core_removal(self.density_threshold, graph):
                while True:
                    _, density = graph_stats(sg)
                    # if density threshold met, stop; else, remove min degree node
                    if density >= self.density_threshold: break
                    w = min(sg.items(), key=lambda k: len(k[1]))[0]
                    del sg[w]
                    for n in sg.values():
                        n.discard(w)

                sg_nodes = Set(sg)
                while graph_nodes - sg_nodes:
                    w = max(graph_nodes - sg_nodes,
                            key=lambda v: len(graph[v] & sg_nodes))
                    new_sg = sg.copy()
                    for v, n in new_sg.items():
                        if w in graph[v]:
                            n.add(w)
                    new_sg[w] = graph[w] & sg_nodes
                    _, density = graph_stats(new_sg)
                    if density < self.density_threshold: break
                    sg = new_sg
                    sg_nodes.add(w)

                # redundancy filtering
                max_sim = -1
                for i in range(len(SC)):
                    sim = NA_score(Set(SC[i]), sg_nodes)
                    if sim > max_sim:
                        max_sim = sim
                        index = i
                if max_sim < self.affinity_threshold:
                    SC.append(sg)
                else:
                    _, density_i = graph_stats(SC[index])
                    if density * len(sg) > density_i * len(SC[index]):
                        SC[index] = sg

        # step 2: adding peripheral proteins
        clusters = Set()
        for core in SC:
            nodes = frozenset(core)
            neighbors = reduce(lambda x, y: x | y,
                               (data[v] for v in nodes)) - nodes
            neighbors -= Set(v for v in neighbors
                             if float(len(data[v] & nodes)) /
                             len(nodes) <= self.closeness_threshold)
            clusters.add(tuple(nodes | neighbors))

        self.clusters = clusters

        print("Found %d clusters/protein complexes" % (len(clusters)))
        return clusters


# if __name__ == '__main__':
#     filename = "../data/unweighted_example_network.txt"
#     c = COACH(filename)
#     c.cluster()
Example #12
from py27hash.dict import Dict
from py27hash.set import Set
from tqdm import tqdm

from .cluster_alg import ClusterAlg


# return average degree and density (fraction of possible edges) for a graph
def graph_stats(graph):
    avg_deg = sum(len(n) for n in graph.values()) / float(len(graph))
    density = avg_deg / (len(graph) - 1)
    return avg_deg, density


# return core nodes, given a graph and its average degree
get_core_nodes = lambda g, avg: Set(v for v, n in g.items() if len(n) >= avg)

# return the neighborhood affinity (NA) score between two node sets
NA_score = lambda a, b: float(len(a & b)**2) / (len(a) * len(b))


def core_removal(threshold, graph):
    if len(graph) == 1:  # need at least two nodes in the graph...
        return [graph]

    avg_deg, density = graph_stats(graph)
    if density >= threshold:
        return [graph]
    else:
        # find and remove core nodes; create connected subcomponents
        core_nodes = get_core_nodes(graph, avg_deg)
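A quick check of these helpers, assuming the definitions above are in scope, on a four-node path graph a-b-c-d (degrees 1, 2, 2, 1):

from py27hash.set import Set

path = {"a": Set("b"), "b": Set("ac"), "c": Set("bd"), "d": Set("c")}

avg_deg, density = graph_stats(path)  # avg_deg = 1.5, density = 1.5 / 3 = 0.5
core = get_core_nodes(path, avg_deg)  # {"b", "c"}: the vertices of degree >= 1.5
print(NA_score(Set("abc"), Set("bcd")))  # |{b, c}|**2 / (3 * 3) = 0.444...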
Example #13
    def cluster(self, verbose=False):
        # data: node id => neighboring node ids
        data = Dict()
        # read in graph
        with open(self.filename, 'r') as f:
            for line in f:
                a, b = line.split()[:2]
                if a not in data:
                    data[a] = Set()
                data[a].add(b)
                if b not in data:
                    data[b] = Set()
                data[b].add(a)

        weights = Dict()  # node id => total shared-neighbor count with adjacent nodes
        for a, b in combinations(data, 2):
            if b not in data[a]: continue
            shared = len(data[a] & data[b])
            if a not in weights:
                weights[a] = 0
            weights[a] += shared
            if b not in weights:
                weights[b] = 0
            weights[b] += shared

        unvisited = Set(data)
        num_clusters = 0
        clusters = []

        # Potential culprit
        seed_nodes = sorted(data,
                            key=lambda k: (weights[k], len(data[k])),
                            reverse=True)

        for seed in seed_nodes:  # get highest degree node
            if seed not in unvisited: continue

            cluster = Set(
                (seed, next(iter(data[seed]))))  # seed and random neighbor

            while True:
                # rank neighbors by the number of edges between the node and cluster nodes
                frontier = sorted((len(data[p] & cluster), p)
                                  for p in Set.union(*((data[n] - cluster)
                                                       for n in cluster)))

                # pop candidates until IN_vk < T_IN, SP <= 2 is met, or no frontier nodes are left
                found = False
                while frontier and not found:
                    m_vk, p = frontier.pop()
                    if m_vk < self.t_in * len(cluster): break
                    c_2neighbors = data[p] & cluster
                    c_2neighbors.update(*(data[c] & cluster
                                          for c in c_2neighbors))
                    if cluster == c_2neighbors:
                        found = True
                        break

                if not found: break

                # otherwise, add the node to the cluster
                cluster.add(p)

            unvisited -= cluster

            if verbose:
                print(' '.join(cluster))

            num_clusters += 1

            if verbose:
                print(num_clusters, len(cluster), len(unvisited))

            clusters.append(cluster)

            if not unvisited: break

        self.clusters = clusters


# if __name__ == '__main__':
#     filename = "../data/unweighted_example_network.txt"
#     c = IPCA(filename)
#     c.cluster()
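The acceptance test in the inner loop (cluster == c_2neighbors) checks that every node already in the cluster lies within two hops of the candidate p, walking only through cluster nodes. A minimal sketch with hypothetical data:

from py27hash.set import Set

data = {"p": Set("a"), "a": Set("pb"), "b": Set("a")}
cluster = Set("ab")

c_2neighbors = data["p"] & cluster  # {"a"}: cluster nodes one hop from p
c_2neighbors.update(*(data[c] & cluster for c in c_2neighbors))  # reaches "b"
print(cluster == c_2neighbors)  # True, so SP <= 2 holds and p would be added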
Example #14
def ipca(filename):
    data = defaultdict(Set) # node id => neighboring node ids

    # read in graph
    with open(filename, 'r') as f:
        for line in f:
            a, b = line.split()[:2]
            data[a].add(b)
            data[b].add(a)

    weights = defaultdict(int)
    for a, b in combinations(data, 2):
        if b not in data[a]: continue
        shared = len(data[a] & data[b])
        weights[a] += shared
        weights[b] += shared

    unvisited = Set(data)
    num_clusters = 0
    clusters = []
    
    # Potential culprit
    seed_nodes = sorted(data, key=lambda k: (weights[k], len(data[k])), reverse=True)

    for seed in seed_nodes: # get highest degree node
        if seed not in unvisited: continue

        cluster = Set((seed, next(iter(data[seed]))))  # seed and random neighbor

        while True:
            # rank neighbors by the number of edges between the node and cluster nodes
            frontier = sorted((len(data[p] & cluster), p) for p in
                              Set.union(*((data[n] - cluster) for n in cluster)))

            # pop candidates until IN_vk < T_IN, SP <= 2 is met, or no frontier nodes are left
            found = False
            while frontier and not found:
                m_vk, p = frontier.pop()
                if m_vk < T_IN * len(cluster): break
                c_2neighbors = data[p] & cluster
                c_2neighbors.update(*(data[c] & cluster for c in c_2neighbors))
                if cluster == c_2neighbors:
                    found = True
                    break

            if not found: break
                
            # otherwise, add the node to the cluster
            cluster.add(p)

        unvisited -= cluster
        print(' '.join(cluster))

        num_clusters += 1
        print(num_clusters, len(cluster), len(unvisited))

        clusters.append(cluster)

        if not unvisited: break
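ipca() relies on names defined elsewhere in its module; a hypothetical header that would make the excerpt runnable, assuming the same py27hash Set used in the other examples (the T_IN value is a placeholder, not from the source):

from collections import defaultdict
from itertools import combinations

from py27hash.set import Set

T_IN = 0.5  # IN_vk threshold; placeholder value, not from the source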
Example #15
def coach(filename):
    # read protein-protein pairs: node id => neighboring node ids
    data = Dict()

    with open(filename, 'r') as f:
        for line in f:
            a, b = line.split()[:2]

            if a not in data:
                data[a] = Set()
            data[a].add(b)
            if b not in data:
                data[b] = Set()
            data[b].add(a)

    # step 1: find preliminary cores
    SC = []  # currently-detected preliminary cores
    for vertex, neighbors in tqdm(data.items()):
        # build neighborhood graph
        vertices = Set([vertex]) | neighbors
        size1_neighbors = Set()
        graph = {}
        for v in vertices:
            n = data[v] & vertices
            if len(n) > 1:  # ignore size-1 vertices
                graph[v] = n
            else:
                size1_neighbors.add(v)
        if len(graph) < 2:  # not enough connections in this graph
            continue
        graph[vertex] -= size1_neighbors

        # get core graph
        avg_deg, density = graph_stats(graph)
        core_nodes = get_core_nodes(graph, avg_deg)
        vertices = Set(graph.keys())
        for v in vertices - core_nodes:
            del graph[v]
        for n in graph.values():
            n &= core_nodes
        if len(graph) < 2:  # not enough connections in this graph
            continue
        graph_nodes = Set(graph)

        # inner loop
        for sg in core_removal(DENSITY_THRESHOLD, graph):
            while True:
                _, density = graph_stats(sg)
                # if density threshold met, stop; else, remove min degree node
                if density >= DENSITY_THRESHOLD: break
                w = min(sg.items(), key=lambda k: len(k[1]))[0]
                del sg[w]
                for n in sg.values():
                    n.discard(w)

            sg_nodes = Set(sg)
            while graph_nodes - sg_nodes:
                w = max(graph_nodes - sg_nodes,
                        key=lambda v: len(graph[v] & sg_nodes))
                new_sg = sg.copy()
                for v, n in new_sg.items():
                    if w in graph[v]:
                        n.add(w)
                new_sg[w] = graph[w] & sg_nodes
                _, density = graph_stats(new_sg)
                if density < DENSITY_THRESHOLD: break
                sg = new_sg
                sg_nodes.add(w)

            # redundancy filtering
            max_sim = -1
            for i in range(len(SC)):
                sim = NA_score(Set(SC[i]), sg_nodes)
                if sim > max_sim:
                    max_sim = sim
                    index = i
            if max_sim < AFFINITY_THRESHOLD:
                SC.append(sg)
            else:
                _, density_i = graph_stats(SC[index])
                if density * len(sg) > density_i * len(SC[index]):
                    SC[index] = sg

    # step 2: adding peripheral proteins
    clusters = Set()
    for core in SC:
        nodes = frozenset(core)
        neighbors = reduce(lambda x, y: x | y,
                           (data[v] for v in nodes)) - nodes
        neighbors -= Set(
            v for v in neighbors
            if float(len(data[v] & nodes)) / len(nodes) <= CLOSENESS_THRESHOLD)
        clusters.add(tuple(nodes | neighbors))

    return clusters
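Likewise, coach() assumes module-level imports, the helpers from Example #12 (with core_removal taking the threshold as its first argument), and three tunable thresholds; a hypothetical header sketch whose values are placeholders, not from the source:

from functools import reduce

from py27hash.dict import Dict
from py27hash.set import Set
from tqdm import tqdm

DENSITY_THRESHOLD = 0.7     # placeholder value
AFFINITY_THRESHOLD = 0.225  # placeholder value
CLOSENESS_THRESHOLD = 0.5   # placeholder value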