def getDDData(G, maxk, p):
    data = dict()
    for i in range(1, maxk + 1):
        S = degreeDiscountIC(G, i, p)
        size = avgSize(G, S, p, 200)
        data[i] = size
    return data
def getGreedyData(G, maxk, p):
    data = dict()
    for i in range(1, maxk + 1):
        S = generalGreedy(G, i, p)
        size = avgSize(G, S, p, 200)
        data[i] = size
    return data
def getHarvesterData(G, maxk, Ep):
    data = dict()
    for i in range(1, maxk + 1):
        S = Harvester(G, i, Ep, 100)
        size = avgSize(G, S, .03, 200)
        data[i] = size
    return data
def getDegreeHeu(G, maxk, p):
    data = dict()
    for i in range(1, maxk + 1):
        S = degreeHeuristic2(G, i, p)
        size = avgSize(G, S, p, 200)
        data[i] = size
    return data
def getNEIMData(G, maxk, p, result):
    data = dict()
    for i in range(1, maxk + 1):
        S = getSeedSet(i, result)
        size = avgSize(G, S, p, 200)
        data[i] = size
    return data
def getNewGreedyIC(G, maxk, p):
    data = dict()
    for i in range(1, maxk + 1):
        S = newGreedyIC(G, i, p)
        size = avgSize(G, S, p, 1)
        data[i] = size
    return data
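# All of the helpers above rely on avgSize to estimate influence spread, but its
# definition is not part of this section. The sketch below is an assumed minimal
# implementation: it averages the number of nodes activated by an independent
# cascade simulation over `iterations` Monte Carlo runs, where runIC(G, S, p) is
# a hypothetical helper returning the list of nodes activated by one cascade
# started from seed set S.
def avgSize(G, S, p, iterations):
    avg = 0
    for _ in range(iterations):
        avg += float(len(runIC(G, S, p))) / iterations
    return avg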
def getDataTvsR(G, maxR, stepR, algo, k=50, p=.01):
    data = dict()
    for R in range(1, maxR + 2, stepR):
        S = algo(G, k, p, R)
        size = avgSize(G, S, p, R)
        data[R] = size
        print R
    return data
def stopDegreeDiscount(G, tsize, ic_step=1, p=.01, iterations=200):
    ''' Finds initial set of nodes to propagate in Independent Cascade model (with priority queue)
    Input:
    G -- networkx graph object
    tsize -- number of nodes necessary to reach
    ic_step -- step of change in k between 2 iterations of IC
    p -- propagation probability
    iterations -- number of IC simulations used to estimate spread
    Output:
    S -- seed set
    Tspread -- spread values for different sizes of seed set
    '''
    S = []
    dd = PQ()   # degree discount
    t = dict()  # number of adjacent vertices that are in S
    d = dict()  # degree of each vertex

    # initialize degree discount
    for u in G.nodes():
        d[u] = sum([G[u][v]['weight'] for v in G[u]])  # each edge adds degree 1
        # d[u] = len(G[u])  # each neighbor adds degree 1
        dd.add_task(u, -d[u])  # add degree of each node
        t[u] = 0

    # add vertices to S greedily
    # until necessary number of nodes can be reached
    Tspread = dict()  # spread for different k
    k = 0
    Tspread[k] = 0
    stepk = 1
    while Tspread[k] < tsize:
        u, priority = dd.pop_item()  # extract node with maximal degree discount
        S.append(u)
        for v in G[u]:
            if v not in S:
                t[v] += G[u][v]['weight']  # increase number of selected neighbors
                priority = d[v] - 2*t[v] - (d[v] - t[v])*t[v]*p  # discount of degree
                dd.add_task(v, -priority)
        # calculate IC spread with ic_step
        if stepk == ic_step:
            k = len(S)
            Tspread[k] = avgSize(G, S, p, iterations)
            print k, Tspread[k]
            stepk = 0
        stepk += 1

    # search precise boundary
    if abs(int(math.ceil(float(ic_step)/2))) == 1:
        return S, Tspread
    else:
        return binarySearchBoundary(G, k, Tspread, tsize, ic_step, p, iterations)
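# stopDegreeDiscount relies on a PQ class with add_task/pop_item methods that is
# defined elsewhere in the repository. The sketch below is an assumed minimal
# version based on the standard heapq "priority queue with updatable priorities"
# recipe; it supports updating a node's priority and ignores stale entries.
import heapq
import itertools

class PQ(object):
    def __init__(self):
        self.pq = []                      # heap of [priority, count, task] entries
        self.entry_finder = {}            # task -> entry
        self.REMOVED = '<removed-task>'   # placeholder for a removed task
        self.counter = itertools.count()  # tie-breaker for equal priorities

    def add_task(self, task, priority=0):
        # add a new task or update the priority of an existing one
        if task in self.entry_finder:
            self.remove_item(task)
        entry = [priority, next(self.counter), task]
        self.entry_finder[task] = entry
        heapq.heappush(self.pq, entry)

    def remove_item(self, task):
        # mark an existing task as removed
        entry = self.entry_finder.pop(task)
        entry[-1] = self.REMOVED

    def pop_item(self):
        # remove and return the lowest-priority task, skipping removed entries
        while self.pq:
            priority, count, task = heapq.heappop(self.pq)
            if task is not self.REMOVED:
                del self.entry_finder[task]
                return task, priority
        raise KeyError('pop from an empty priority queue')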
def getCCData(G, maxk, p):
    data = dict()
    for i in range(1, maxk + 1):
        S = CC_heuristic(G, i, p)
        print(S)
        M = [j[0] for j in S]  # keep only the node ids from (node, score) pairs
        size = avgSize(G, M, p, 200)
        data[i] = size
    return data
def getData(G, maxk, algo, p, axis):
    data = dict()
    for k in range(1, maxk + 1):
        if axis == "size":
            S = algo(G, k, p)
            size = avgSize(G, S, p, 200)
            data[k] = size
        elif axis == "time":
            start = time.time()
            S = algo(G, k, p)
            finish = time.time()
            data[k] = finish - start
    return data
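# Illustrative use of getData on a small built-in graph (an assumed example, not
# part of the original scripts): networkx is assumed to be imported as nx, and
# degreeDiscountIC is one of the heuristics referenced above. Parameter values
# are arbitrary.
def _demo_getData():
    G = nx.karate_club_graph()
    for u, v in G.edges():
        G[u][v]['weight'] = 1  # the heuristics expect weighted edges
    spread_curve = getData(G, 10, degreeDiscountIC, .01, "size")  # spread vs. k
    time_curve = getData(G, 10, degreeDiscountIC, .01, "time")    # runtime vs. k
    return spread_curve, time_curve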
def reverseCCWP(G, tsize, p, R, iterations):
    '''
    Input:
    G -- undirected graph (nx.Graph)
    tsize -- coverage size (int)
    p -- propagation probability among all edges (float)
    R -- number of iterations to discover CCs (int)
    iterations -- number of iterations to run IC to calculate influence spread
    Output:
    S -- seed set
    '''
    scores = dict(zip(G.nodes(), [0]*len(G)))  # initialize scores
    start = time.time()
    minL = float("Inf")  # number of nodes we start with (tbd later)
    maxL = 1
    avgL = -1
    for it in range(R):
        # remove blocked edges from graph G
        E = deepcopy(G)
        edge_rem = [e for e in E.edges() if random.random() < (1-p)**(E[e[0]][e[1]]['weight'])]
        E.remove_edges_from(edge_rem)

        # initialize CC
        CC = dict()  # maps the number of a component to its members
        explored = dict(zip(E.nodes(), [False]*len(E)))
        c = 0
        # perform BFS to discover CC
        for node in E:
            if not explored[node]:
                c += 1
                explored[node] = True
                CC[c] = [node]
                component = E[node].keys()
                for neighbor in component:
                    if not explored[neighbor]:
                        explored[neighbor] = True
                        CC[c].append(neighbor)
                        component.extend(E[neighbor].keys())

        # find top components that can reach tsize activated nodes
        sortedCC = sorted([(len(dv), dk) for (dk, dv) in CC.iteritems()], reverse=True)
        cumsum = 0  # sum of top components
        curL = 0  # current number of CC that achieve tsize
        # find curL first
        for length, numberCC in sortedCC:
            curL += 1
            cumsum += length
            if cumsum >= tsize:
                break
        # assign scores to L components
        for length, numberCC in sortedCC[:int(2.3*curL)]:
            weighted_score = 1.0/(length*curL)
            for node in CC[numberCC]:
                scores[node] += weighted_score
        if curL < minL:
            minL = curL
        if curL > maxL:
            maxL = curL
        print 'curL', curL
        avgL += curL
        print it + 1, R, time.time() - start
    print 'maxL', maxL
    print 'minL', minL
    print 'avgL', avgL/R

    # find nodes that achieve tsize coverage starting from top-maxL scores nodes
    orderedScores = sorted(scores.iteritems(), key=operator.itemgetter(1), reverse=True)
    topScores = orderedScores[:1]
    S = [node for (node, _) in topScores]
    coverage = avgSize(G, S, p, iterations)
    print '|S| = %s --> %s' % (len(S), coverage)

    # Penalization phase
    scores_copied = deepcopy(scores)
    # remove all nodes that are already in S
    for node in S:
        scores_copied.pop(node)
    # add new nodes one by one, penalizing scores at the same time
    while coverage < tsize:
        maxk, maxv = max(scores_copied.iteritems(), key=operator.itemgetter(1))
        print maxv
        S.append(maxk)
        scores_copied.pop(maxk)
        coverage = avgSize(G, S, p, iterations)
        print '|S| = %s --> %s' % (len(S), coverage)
        for v in G[maxk]:
            if v not in S:
                penalty = (1-p)**G[maxk][v]['weight']
                scores_copied[v] *= penalty
    return S
# (script fragment: the first statements below belong to the edge-list loop
# that builds the weighted graph G; the rest compares a brute-force optimal
# seed set with the degree discount heuristic)
    u, v = map(int, line.split())
    try:
        G[u][v]['weight'] += 1
    except:
        G.add_edge(u, v, weight=1)
    # G.add_edge(u, v, weight=1)

print 'Built graph G'
print time.time() - start

seed_size = 5
p = .01

nodes = G.nodes()
C = combinations(nodes, seed_size)

spread = dict()
for candidate in C:
    print candidate,
    time2spread = time.time()
    spread[candidate] = avgSize(G, list(candidate), p, 1000)
    print spread[candidate], time.time() - time2spread

S, val = max(spread.iteritems(), key=lambda (dk, dv): dv)
print 'S (by brute-force):', S, ' -->', val

S2 = degreeDiscountIC(G, seed_size, p)
print 'S (by degree discount):', tuple(S2), ' -->', avgSize(G, S2, p, 1000)
print 'S (by degree discount) spreads to %s nodes (according to brute-force)' % (spread[tuple(sorted(S2))])

print 'Total time:', time.time() - start

console = []
if __name__ == '__main__':
    import time
    start = time.time()

    # read in graph
    G = nx.Graph()
    with open('../graphdata/hep.txt') as f:
        n, m = f.readline().split()
        for line in f:
            u, v = map(int, line.split())
            try:
                G[u][v]['weight'] += 1
            except:
                G.add_edge(u, v, weight=1)
    print 'Built graph G'
    print time.time() - start

    tsize = 150
    p = .01
    R = 400
    iterations = 300

    S = reverseCCWP(G, tsize, p, R, iterations)
    print S
    print 'Necessary %s initial nodes to target %s nodes in graph G' % (len(S), avgSize(G, S, p, iterations))
    print time.time() - start

    console = []
# (fragment of the CC_parallel map-reduce script; G and start are defined
# earlier in the original file)
seed_size = 50

print 'Start mapping...'
time2map = time.time()
R = 200

# define map function
def map_CC(it):
    print it
    return CC_parallel(G, seed_size, .01)

pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()*2)
Scores = pool.map(map_CC, range(R))
print 'Finished mapping in', time.time() - time2map

time2reduce = time.time()
print 'Reducing scores...'
scores = {v: sum([s[v] for s in Scores]) for v in G}
topScores = nlargest(seed_size, scores.iteritems(), key=lambda (dk, dv): dv)
S = [v for (v, _) in topScores]
print 'Time to reduce', time.time() - time2reduce

print 'Average size is', avgSize(G, S, .01, 200)
print 'Average size of 10 nodes is', avgSize(G, S[:10], .01, 200)

print 'Total time:', time.time() - start

# # write results S to file
# with open('visualisation.txt', 'w') as f:
#     for node in S:
#         f.write(str(node) + os.linesep)

console = []
def mapAvgSize(S):
    return avgSize(G, S, p, I)
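# mapAvgSize takes only the seed set as an argument, which suggests it is meant
# as a map function for parallel spread estimation (in the style of map_CC
# above). The sketch below is an assumed usage, not part of the original code:
# G, p and I must already be defined at module level, and candidate_seed_sets
# is a hypothetical list of seed sets to evaluate.
import multiprocessing

def parallelAvgSizes(candidate_seed_sets, processes=4):
    pool = multiprocessing.Pool(processes=processes)
    spreads = pool.map(mapAvgSize, candidate_seed_sets)  # one spread estimate per seed set
    pool.close()
    pool.join()
    return spreads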