def main():
    """Compare an edge-list graph with Erdos-Renyi and grid reference graphs.

    Expects exactly one command-line argument: the path of a tab-separated
    edge list. Writes SNAP info reports for the loaded graph and for the two
    generated graphs, prints generic information for each, then changes into
    the ``plots`` directory next to the script to save degree distributions,
    run the robustness tests and invoke gnuplot on the three plot scripts.

    Exits with status -1 when the file name argument is missing.
    """
    if len(sys.argv) != 2:
        print('File name must be provided')
        sys.exit(-1)

    filename = sys.argv[1]
    # Drop the directory part before cutting at the first dot; otherwise a
    # dot inside a directory name (e.g. "./data/ny.txt") gives an empty stem.
    filenameWOExt = os.path.basename(filename).split('.')[0]

    graph = snap.LoadEdgeList(snap.PNEANet, filename, 0, 1, '\t')
    snap.PrintInfo(graph, "New York", filenameWOExt + '_info.txt', False)

    # Random graph with the same number of nodes and edges as the input.
    Rnd = snap.TRnd(123124)
    erdosRenyi = snap.GenRndGnm(snap.PNEANet, graph.GetNodes(),
                                graph.GetEdges(), True, Rnd)
    snap.PrintInfo(erdosRenyi, "Erdos-Renyi", 'erdos_renyi_info.txt', False)

    grid = snap.GenGrid(snap.PNEANet, 220, 250, False)
    snap.PrintInfo(grid, "Grid", 'grid_info.txt', False)

    printGenericInformation(graph, 'New York street network')
    printGenericInformation(erdosRenyi, 'Erdos-Renyi random graph')
    printGenericInformation(grid, 'Grid random graph')

    # Plot everything in the plots directory
    os.chdir(os.path.join(os.path.abspath(sys.path[0]), 'plots'))
    saveDegreeDistribution(graph, 'deg_dist_ny.tab')
    saveDegreeDistribution(erdosRenyi, 'deg_dist_er.tab')
    saveDegreeDistribution(grid, 'deg_dist_gr.tab')

    testRobustnessAll([graph, erdosRenyi, grid])

    for plot_script in ('deg_dist.plt', 'robustness_rand.plt',
                        'robustness_max.plt'):
        call(['gnuplot', plot_script])
def main():
    """Load the rumor graph, attach node attributes and write its SNAP info.

    All locations are hard-coded below for one rumor number. The report is
    written to ``S18_5_Output.txt`` in the rumor's output directory.
    """
    import json
    import snap
    import graphviz
    import matplotlib.pyplot as plt
    import numpy as np
    import xlrd

    #-----------------
    #The common area
    rumor_number = "21"
    rumor_dir = ('D:\\Papers\\Social Network Mining\\Analysis_of_Rumor_Dataset\\Step 18\\Rumor_'
                 + rumor_number)
    path_input = rumor_dir + '\\Input\\'
    workbook_input1_D = xlrd.open_workbook(path_input + 'DATASET.xlsx', on_demand = True)
    path_jsonl = path_input + 'Rumor_' + rumor_number + '.jsonl'
    path_graph = path_input + 'Rumor_' + rumor_number + '.graph'
    path_output = rumor_dir + '\\Output\\'

    G_Directed = snap.TNGraph.Load(snap.TFIn(path_graph))
    # Convert the directed graph to a directed graph with attributes, so that
    # attributes can be assigned to the graph nodes.
    G_Directed_with_Attributes = Get_Graph_with_Attributes_New(
        path_jsonl,
        snap.ConvertGraph(snap.PNEANet, G_Directed),
        workbook_input1_D,
    )

    #-----------------
    #The specific area
    snap.PrintInfo(G_Directed_with_Attributes, "Python type PNEANet",
                   path_output + "S18_5_Output.txt", False)
def transform_directed_to_undirected():
    """Convert the module-level graph ``G`` to an undirected graph.

    Writes the SNAP info report of the converted graph to
    ``Tweets_UN_info.txt`` and reads it back (printing is disabled).

    Returns:
        The undirected graph produced by ``snap.ConvertGraph``.
    """
    GUn = snap.ConvertGraph(snap.PUNGraph, G)
    snap.PrintInfo(GUn, "Tweets UN stats", "Tweets_UN_info.txt", False)
    # The context manager closes the report even if reading fails.
    with open('Tweets_UN_info.txt', 'r') as f:
        file_contents = f.read()
    #print(file_contents)
    return GUn
def preferential_attachment():
    """Generate a preferential-attachment graph and print its SNAP info.

    The generated graph has as many nodes as the undirected version of the
    network, and each node gets ``int(average_degree())`` out-links. The
    report is written to ``Tweets_PA-info.txt`` and echoed to stdout.
    """
    GUn = transform_directed_to_undirected()
    AverageDegree = average_degree()
    Rnd = snap.TRnd()
    GPA = snap.GenPrefAttach(GUn.GetNodes(), int(AverageDegree), Rnd)
    snap.PrintInfo(GPA, "Tweets PA Stats", "Tweets_PA-info.txt", False)
    # The context manager closes the report even if reading or printing fails.
    with open('Tweets_PA-info.txt', 'r') as f:
        print(f.read())
def erdos_renyi():
    """Generate an Erdos-Renyi random graph and print its SNAP info.

    The random graph has the same node and edge counts as the module-level
    graph ``G``. The report is written to ``Tweets_Random-info.txt`` and
    echoed to stdout.
    """
    # Still called although its result is unused here: the original code
    # invoked it, and dropping the call could change what callers observe.
    transform_directed_to_undirected()

    # Erdos-Renyi random graph
    # NOTE(review): this generates a directed PNGraph sized from G, whereas
    # the sibling generators work from the undirected graph; confirm that
    # this is intended.
    GER = snap.GenRndGnm(snap.PNGraph, G.GetNodes(), G.GetEdges())
    snap.PrintInfo(GER, "Tweets Random Stats", "Tweets_Random-info.txt", False)
    # The context manager closes the report even if reading or printing fails.
    with open('Tweets_Random-info.txt', 'r') as f:
        print(f.read())
def biggest_connected_component_on_the_network():
    """Print weakly-connected-component statistics of the module-level ``G``.

    Prints the total number of weakly connected components and the size of
    the largest one, then writes the SNAP info report of ``G`` to
    ``tweet_stats_extended.txt``.
    """
    WccV = snap.TIntPrV()
    snap.GetWccSzCnt(G, WccV)

    # GetWccSzCnt fills the vector with (component size, number of components
    # of that size) pairs, so sizes are the keys and counts are the values.
    con_comp = {}
    for comp in WccV:
        con_comp[comp.GetVal1()] = comp.GetVal2()

    print("Connected components info.\n")
    # The total number of components is the sum of the per-size counts, not
    # the number of distinct sizes.
    print("# of connected component", sum(con_comp.values()))
    # The biggest component is the largest size (key), not the largest count.
    biggest = max(con_comp) if con_comp else 0
    print("Biggest connected component has size of:", biggest)
    snap.PrintInfo(G, "tweet Information", "tweet_stats_extended.txt", False)
def main():
    """
    See usage message in module header block
    """
    # NOTE(review): -d is parsed but get_subgraph is never read in this
    # function.
    get_subgraph = False  # if True discard nodes without attribute data
    try:
        opts, args = getopt.getopt(sys.argv[1:], "d")
    except getopt.GetoptError:
        # Only option-parsing errors lead to the usage message; anything else
        # propagates. usage() is presumably terminating (confirm).
        usage(sys.argv[0])
    for opt, arg in opts:
        if opt == "-d":
            get_subgraph = True
        else:
            usage(sys.argv[0])

    if len(args) != 1:
        usage(sys.argv[0])

    data_dir = args[0]
    outputdir = '.'

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    datazipfile = data_dir + os.path.sep + 'physician-shared-patient-patterns-2014-days30.zip'
    G = load_physician_referral_data(datazipfile)
    # Single-argument print gives the same text under Python 2 and Python 3.
    print('%s s' % (time.time() - start))
    snap.PrintInfo(G)

    # Remove loops (self-edges).
    # G is a PNGraph so multiple edges not allowed in this type anyway.
    snap.DelSelfEdges(G)
    snap.PrintInfo(G)

    # specify ordered nodelist to map sequential ids to original ids consistent
    nodelist = [node.GetId() for node in G.Nodes()]

    graph_filename = outputdir + os.path.sep + "physician_referall_arclist" + os.path.extsep + "txt"
    nodeid_filename = outputdir + os.path.sep + "nodeid" + os.path.extsep + "txt"
    write_graph_file(graph_filename, G, nodelist)
    write_subgraph_nodeids(nodeid_filename, nodelist)
def node_rewiring():
    """Rewire the undirected network and print the SNAP info of the result.

    Calls ``snap.GenRewire`` on the undirected graph with 1000 as its second
    argument. The report is written to ``Tweets_Rewire-info.txt`` and echoed
    to stdout.
    """
    GUn = transform_directed_to_undirected()

    # Node Rewiring
    Rnd = snap.TRnd()
    GRW = snap.GenRewire(GUn, 1000, Rnd)
    snap.PrintInfo(GRW, "Tweets Rewire Stats", "Tweets_Rewire-info.txt", False)
    # The context manager closes the report even if reading or printing fails.
    with open('Tweets_Rewire-info.txt', 'r') as f:
        print(f.read())
def configuration_model():
    """Generate a configuration-model graph and print its SNAP info.

    The degree sequence is taken from the undirected version of the network.
    The report is written to ``Tweets_ConfModel-info.txt`` and echoed to
    stdout.
    """
    GUn = transform_directed_to_undirected()

    # Degree sequence of the undirected graph drives the generator.
    GUnDegSeqV = snap.TIntV()
    snap.GetDegSeqV(GUn, GUnDegSeqV)

    Rnd = snap.TRnd()
    GConfModel = snap.GenConfModel(GUnDegSeqV, Rnd)
    snap.PrintInfo(GConfModel, "Tweets ConfModel Stats", "Tweets_ConfModel-info.txt", False)
    # The context manager closes the report even if reading or printing fails.
    with open('Tweets_ConfModel-info.txt', 'r') as f:
        print(f.read())
def get_props(g, gname, out_path="", fast=False, to_file=True):
    """Write the SNAP description of a graph and return its degree data.

    Gets properties of a graph, e.g. density, connected components,
    diameter, etc.

    Args:
        g: graph passed to ``snap.PrintInfo`` and ``get_deg_dist``.
        gname: name used as the prefix of the output file names.
        out_path: directory for the output files; an empty string means the
            current working directory.
        fast: forwarded to ``snap.PrintInfo`` as its last argument.
        to_file: when true, write ``<gname>_desc.txt`` and
            ``<gname>_deg_dist.csv`` under ``out_path``; otherwise the
            description goes to ``/dev/stdout`` and no CSV is written.

    Returns:
        The object returned by ``get_deg_dist(g)`` (it must provide
        ``to_csv`` when ``to_file`` is true).
    """
    import os

    if to_file:
        # os.path.join keeps the path relative when out_path is empty; plain
        # "%s/%s" formatting would point at the filesystem root instead.
        desc_f = os.path.join(out_path, "%s_desc.txt" % gname)
    else:
        desc_f = "/dev/stdout"
    snap.PrintInfo(g, "description", desc_f, fast)

    all_deg = get_deg_dist(g)
    if to_file:
        deg_f = os.path.join(out_path, "%s_deg_dist.csv" % gname)
        all_deg.to_csv(deg_f, index=False)
    return all_deg
def getGraphicalGraphInfo(graph_name, conn, cur):
    """Build a QueryResult describing the materialised graph ``graph_name``.

    Looks up the graph type in ``my_matgraphs``. If the graph is unknown the
    result is a string message. Otherwise the graph is loaded, its SNAP info
    report is written to a temporary file and parsed into an
    Attribute/Value table, to which the view definition from
    ``pg_matviews`` is appended when one exists.

    Args:
        graph_name: name of the materialised graph; surrounding whitespace
            is stripped.
        conn: database connection; ``commit()`` is called after each query.
        cur: database cursor used to run the queries.

    Returns:
        A ``QueryResult`` of type ``"string"`` or ``"table"``.
    """
    queryResult = QueryResult()
    graph_name = graph_name.strip()
    graph_path = helper.getGraph(graph_name)

    # Bind graph_name as a query parameter instead of interpolating it into
    # the SQL text, so quotes in the name cannot alter the statement.
    # NOTE(review): assumes the driver uses the "%s" placeholder style (as
    # PostgreSQL drivers such as psycopg2 do) -- confirm.
    cur.execute("select graphType from my_matgraphs where matgraphname = %s;", (graph_name,))
    conn.commit()
    one_row = cur.fetchone()
    if one_row is None:
        queryResult.setType("string")
        queryResult.setContent("Can't find this graph in my_matgraphs")
        return queryResult

    graph_type = one_row[0].strip()
    is_directed = graph_type == "digraph"
    snap_graph_type = snap.PNGraph if is_directed else snap.PUNGraph
    graph = snap.LoadEdgeList(snap_graph_type, graph_path, 0, 1)

    tmpGraphDir = "/dev/shm/RG_Tmp_Graph/"
    tmpGraphInfoPath = tmpGraphDir + graph_name + '_info'

    #snap print info
    snap.PrintInfo(graph, "Graph Type", tmpGraphInfoPath)

    tableHeaderLst = ['Attribute', 'Value']
    rowsContent = []
    with open(tmpGraphInfoPath) as f:
        for line in f:
            fields = line.split(':')
            if len(fields) < 2:
                # Blank or colon-less lines carry no key/value pair.
                continue
            key = fields[0].strip()
            value = fields[1].strip()
            if key == "Graph Type":
                # Replace SNAP's own description with a readable label.
                value = 'Directed' if is_directed else "Undirected"
            rowsContent.append([key, value])

    #definition
    cur.execute("select definition from pg_matviews where matviewname = %s;", (graph_name,))
    conn.commit()
    one_row = cur.fetchone()
    if one_row is not None:
        rowsContent.append(['Definition', one_row[0].strip()])

    queryResult.setType("table")
    queryResult.setContent(TableResult(tableHeaderLst, rowsContent))
    return queryResult
def print_statistics(self, outfile_name):
    """Write graph statistics to ``outfile_name`` and plot degree distributions.

    Writes the SNAP info report of ``self.Graph`` to the file, appends a
    section header, prints every key of ``self.ids`` that maps to the
    maximum-degree node id, and generates the out- and in-degree
    distribution plots.

    Args:
        outfile_name: path of the report file.
    """
    # Single-argument print gives the same text under Python 2 and Python 3.
    print('Writing to file: %s' % (outfile_name,))
    snap.PrintInfo(self.Graph, 'Python type TUNGraph', outfile_name, False)
    with open(outfile_name, 'a') as f:
        f.write('\n####More information')

    max_degree_node = snap.GetMxDegNId(self.Graph)
    for artist_id in self.ids:
        if self.ids[artist_id] == max_degree_node:
            print(artist_id)

    # These may throw gnuplot errors; if so, edit the generated .plt files to correct the errors and run
    # gnuplot from terminal. (May need to set terminal to svg instead of png depending on your gnuplot
    # installation.)
    snap.PlotOutDegDistr(self.Graph, 'out_degree_distr', 'Out-degree distribution')
    snap.PlotInDegDistr(self.Graph, 'in_degree_distr', 'In-degree distribution')
def visualize_k_random_users(k, fanout, fanout_samples, graph):
    """
    OUTDATED

    @params: [k (int), fanout_samples (int), graph (snap.TUNGraph)]
    @returns: None

    Draws a random edge sample of `graph` with networkx. k edges are sampled
    with snap, the sample is expanded through get_k_graph_egonet (fanout and
    fanout_samples are forwarded to it, to keep the sample from growing
    intractably large), its info is printed to stdout and the result is
    shown with a spring layout.
    """
    sampled = get_k_graph_egonet(
        fanout, fanout_samples, snap.GetRndESubGraph(graph, k), graph
    )
    snap.PrintInfo(sampled, 'Sampled Graph Information', '/dev/stdout', False)

    # Mirror the snap sample into a networkx graph: nodes first, then edges.
    nx_graph = nx.Graph()
    for snap_node in sampled.Nodes():
        nx_graph.add_node(snap_node.GetId())
    for snap_edge in sampled.Edges():
        nx_graph.add_edge(snap_edge.GetSrcNId(), snap_edge.GetDstNId())

    drawn_edges = list(nx_graph.edges())
    layout = nx.spring_layout(nx_graph)
    nx.draw_networkx_nodes(
        nx_graph, layout, node_color='b', node_size=10, alpha=0.6
    )
    nx.draw_networkx_edges(nx_graph, layout, edgelist=drawn_edges, arrows=False)
    plt.show()
def analyze_network(
    k=1000, fanout=1, fanout_samples=1, graph_in_path='bad_actors.graph'
):
    """
    @params: [k (int), graph_in_path (str)]
    @returns: None

    Reads the network stored at 'graph_in_path', prints its basic info and
    the node count of its largest strongly connected component, then passes
    k, fanout and fanout_samples on to visualize_k_random_users for
    drawing.
    """
    graph_stream = snap.TFIn(graph_in_path)
    graph = snap.TNEANet.Load(graph_stream)
    snap.PrintInfo(graph, 'Basic Graph Information', '/dev/stdout', False)

    largest_scc = snap.GetMxScc(graph)
    scc_node_count = largest_scc.GetNodes()
    print('Nodes in largest strongly-connected subcomponent: %d' % scc_node_count)

    visualize_k_random_users(k, fanout, fanout_samples, graph)
def main(args):
    """Run the review-graph analyses and write them to the graph-info file.

    Reads the edge-list path from ``args.ub_review_edges`` and the report
    path from ``args.graph_info``.
    """
    edges_path = args.ub_review_edges
    report_path = args.graph_info

    # load graph
    G = snap.LoadEdgeList(snap.PUNGraph, edges_path, 0, 1)

    # graph info
    snap.PrintInfo(G, "yelp-review-stats", report_path, False)

    # plfit, clustering coefficient, wcc -- in that order
    for analysis in (plfitDegreeDistr, clustCf, topWCC):
        analysis(G, report_path)

    # number of users, businesses, reviews
    load_graph(edges_path, report_path)
# Exercises snap.PrintInfo with different argument combinations on the
# wiki-Vote edge list. Single-argument print(...) is used so the script runs
# unchanged under Python 2 and Python 3.
import snap

G = snap.LoadEdgeList(snap.PNGraph, 'wiki-Vote.txt', 0, 1)

print(" 1 ----------")
snap.PrintInfo(G, "a")

print(" 2 ----------")
snap.PrintInfo(G, "a", "file.txt")

print(" 3 ----------")
snap.PrintInfo(G, "a", "file.txt", False)

print(" 4 ----------")
snap.PrintInfo(G, "a", "")

#print " 5 ----------"
#snap.PrintInfo(G, "a", "", False)
def main(argv):
    """Combine review files, parse items and reviews, and save the graphs.

    ``argv`` is consumed in place (values are popped from the front):
    program name, output directory, reviews directory, items directory,
    item name, good-rating threshold, then any number of year strings.
    Output paths are built by plain concatenation, so the directories are
    expected to end with a path separator.

    Review files whose name contains any of the year strings are
    concatenated into ``reviews_<item>_combined.json`` and gzipped. The
    module-level graphs and dictionaries (``GItems``, ``GUsers``,
    ``GCombined``, ``asinItems``, ``nodeItems``, ``reviewerIdUsers``) are
    then reported and saved; they are presumably populated by
    ``parseItems`` / ``parseReviews`` (confirm).
    """
    argv.pop(0)
    directory = argv.pop(0)
    directoryReviews = argv.pop(0)
    directoryItems = argv.pop(0)
    item = argv.pop(0)
    goodRating = int(argv.pop(0))
    yearList = list(argv)

    inFiles = [
        f for f in listdir(directoryReviews)
        if isfile(join(directoryReviews, f))
    ]
    # any() selects each file at most once, even when its name matches
    # several years; join() builds the path the same way the isfile() check
    # above does.
    fileList = [
        join(directoryReviews, f) for f in inFiles
        if any(y in f for y in yearList)
    ]

    combined_json = directoryReviews + 'reviews_' + item + '_combined.json'
    combined_gz = directoryReviews + 'reviews_' + item + '_combined.json.gz'

    with open(combined_json, 'w') as outfile:
        for fname in fileList:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

    with open(combined_json, 'rb') as f_in, gzip.open(combined_gz, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    # Parsing Items
    parseItems(directoryItems + 'meta_' + item + '.json.gz', directory)
    snap.PrintInfo(GItems, 'GItems Information')

    # Saving GItems
    snap.SaveEdgeList(GItems, directory + 'Edge_List_Items_' + item + '.txt')
    with open(directory + 'Dictionary_Items_' + item + '.txt', 'w') as f1:
        json.dump(asinItems, f1)
    with open(directory + 'Dictionary_Items_Nodes_' + item + '.txt', 'w') as f3:
        json.dump(nodeItems, f3)

    userItemsFileName = directory + '_User_Item_' + item + '.txt'

    # Parsing Reviews
    parseReviews(combined_gz, goodRating, userItemsFileName, directory)
    snap.PrintInfo(GUsers, 'GUsers Information')

    # Saving GUsers
    snap.SaveEdgeList(GUsers, directory + 'Edge_List_Users_' + item + '.txt')
    with open(directory + 'Dictionary_Users_' + item + '.txt', 'w') as f2:
        json.dump(reviewerIdUsers, f2)

    snap.PrintInfo(GCombined, 'GCombined Information')
    snap.SaveEdgeList(GCombined, directory + 'Edge_List_Combined_' + item + '.txt')
def _log_step(message):
    """Print ``message`` prefixed with the current timestamp."""
    print("{0} {1}".format(datetime.datetime.now(), message))


def _save_hashtable(table, path):
    """Serialise a snap hash table to ``path`` and flush the stream."""
    fout = snap.TFOut(path)
    table.Save(fout)
    # Flush explicitly so the data is on disk before anything reads it back.
    fout.Flush()


def _load_hashtable(table, path):
    """Fill a snap hash table from the file at ``path``."""
    fin = snap.TFIn(path)
    table.Load(fin)


def _label_descending(top_n, id_pkg_dict):
    """Return (package, score) pairs for ``top_n`` in reversed order."""
    return [(id_pkg_dict[pair[0]], pair[1]) for pair in reversed(top_n)]


def _plot_files_missing(prefix, name):
    """Return True unless the .plt, .tab and .png files of a plot all exist."""
    base = prefix + "." + name
    return not all(os.path.isfile(base + ext) for ext in (".plt", ".tab", ".png"))


def _plot_top_subgraph(graph, id_pkg_dict, table, top_n, *plot_args):
    """Plot the subgraph induced on the ``top_n`` nodes, coloured by ``table``."""
    subgraph = get_subgraph(graph, [x[0] for x in top_n])
    labels_dict = get_labels_subset(id_pkg_dict, subgraph)
    values = snap_hashtable_to_dict(table, [x[0] for x in top_n])
    plot_subgraph_colored(subgraph, labels_dict, values, *plot_args)


def compute_graph_statistics(graph_path, overwrite, compute_betweenness=False):
    """Compute statistics and plots for a saved TNEANet graph, with caching.

    Results are cached next to the graph file: numeric results in
    ``<name>_statistics.json``, intermediate hash tables and plots in
    sibling files. A step is recomputed only when its output is missing or
    ``overwrite`` is true. The working directory is changed to the graph's
    directory as a side effect.

    Args:
        graph_path: path of the ``.graph`` file to load.
        overwrite: when true, recompute every step even if cached.
        compute_betweenness: when true, also compute betweenness centrality.
    """
    graph_abs_path = os.path.abspath(graph_path)
    graph_name = os.path.basename(graph_abs_path).replace(".graph", "")
    fin = snap.TFIn(graph_abs_path)
    graph = snap.TNEANet.Load(fin)

    # rebuild the id => pkg dictionary
    id_pkg_dict = {}
    for node in graph.Nodes():
        id_pkg_dict[node.GetId()] = graph.GetStrAttrDatN(node.GetId(), "pkg")

    directory = os.path.dirname(os.path.abspath(graph_path))
    json_path = os.path.join(directory, graph_name + "_statistics.json")
    if os.path.isfile(json_path):
        with open(json_path, "r") as f:
            statistics = json.load(f, object_pairs_hook=OrderedDict)
    else:
        statistics = OrderedDict()

    # snap.py doesn't support absolute paths for some operations. Let's cd to the directory
    os.chdir(directory)

    # general statistics
    output = os.path.join(directory, graph_name + "_main_statistics.txt")
    if not os.path.isfile(output) or overwrite:
        _log_step("Computing general statistics")
        snap.PrintInfo(graph, "Play Store Graph -- main statistics", output, False)

    # info about the nodes with the max in degree
    if "max_in_degree" not in statistics or overwrite:
        _log_step("Computing max indegree")
        max_in_deg_id = snap.GetMxInDegNId(graph)
        iterator = graph.GetNI(max_in_deg_id)
        max_in_deg = iterator.GetInDeg()
        max_in_deg_pkg = graph.GetStrAttrDatN(max_in_deg_id, "pkg")
        statistics["max_in_degree"] = max_in_deg
        statistics["max_in_degree_id"] = max_in_deg_id
        statistics["max_in_degree_pkg"] = max_in_deg_pkg

    # info about the nodes with the max out degree
    if "max_out_degree" not in statistics or overwrite:
        _log_step("Computing max outdegree")
        max_out_deg_id = snap.GetMxOutDegNId(graph)
        iterator = graph.GetNI(max_out_deg_id)
        max_out_deg = iterator.GetOutDeg()
        max_out_deg_pkg = graph.GetStrAttrDatN(max_out_deg_id, "pkg")
        statistics["max_out_degree"] = max_out_deg
        statistics["max_out_degree_id"] = max_out_deg_id
        statistics["max_out_degree_pkg"] = max_out_deg_pkg

    # pagerank statistics
    output = graph_name + "_topNpagerank.eps"
    if not os.path.isfile(output) or "top_n_pagerank" not in statistics or overwrite:
        _log_step("Computing top 20 nodes with highest pagerank")
        data_file = graph_name + "_pageranks"
        prank_hashtable = snap.TIntFltH()
        if not os.path.isfile(data_file) or overwrite:
            # Damping Factor: 0.85, Convergence difference: 1e-4, MaxIter: 100
            snap.GetPageRank(graph, prank_hashtable, 0.85)
            _save_hashtable(prank_hashtable, data_file)
        else:
            _load_hashtable(prank_hashtable, data_file)

        top_n = get_top_nodes_from_hashtable(prank_hashtable)
        top_n.sort(key=itemgetter(1))
        if "top_n_pagerank" not in statistics or overwrite:
            statistics["top_n_pagerank"] = _label_descending(top_n, id_pkg_dict)

        if not os.path.isfile(output) or overwrite:
            # let's build a subgraph induced on the top 20 pagerank nodes
            _plot_top_subgraph(graph, id_pkg_dict, prank_hashtable, top_n,
                               "PageRank", "Play Store Graph - top 20 PageRank nodes",
                               output, "autumn_r")

    # betweenness statistics
    output = graph_name + "_topNbetweenness.eps"
    # The cache check must use the key that is actually stored below
    # ("top_n_betweenness"); checking "betweenness" made this section run on
    # every call.
    if compute_betweenness and (not os.path.isfile(output)
                                or "top_n_betweenness" not in statistics
                                or overwrite):
        _log_step("Computing top 20 nodes with highest betweenness")
        data_file1 = graph_name + "_node_betweenness"
        data_file2 = graph_name + "_edge_betweenness"
        node_betwenness_hashtable = snap.TIntFltH()
        edge_betwenness_hashtable = snap.TIntPrFltH()
        if not os.path.isfile(data_file1) or not os.path.isfile(data_file2) or overwrite:
            snap.GetBetweennessCentr(graph, node_betwenness_hashtable,
                                     edge_betwenness_hashtable, 0.85, True)
            _save_hashtable(node_betwenness_hashtable, data_file1)
            _save_hashtable(edge_betwenness_hashtable, data_file2)
        else:
            _load_hashtable(node_betwenness_hashtable, data_file1)
            _load_hashtable(edge_betwenness_hashtable, data_file2)  # unused, as now

        top_n = get_top_nodes_from_hashtable(node_betwenness_hashtable)
        top_n.sort(key=itemgetter(1))
        if "top_n_betweenness" not in statistics or overwrite:
            statistics["top_n_betweenness"] = _label_descending(top_n, id_pkg_dict)

        if not os.path.isfile(output) or overwrite:
            # let's build a subgraph induced on the top 20 betweenness nodes
            _plot_top_subgraph(graph, id_pkg_dict, node_betwenness_hashtable, top_n,
                               "Betweenness",
                               "Play Store Graph - top 20 Betweenness nodes", output)

    # HITS statistics
    output_hub = graph_name + "_topNhitshubs.eps"
    output_auth = graph_name + "_topNhitsauth.eps"
    hits_plots_missing = not os.path.isfile(output_hub) or not os.path.isfile(output_auth)
    if hits_plots_missing or "top_n_hits_hubs" not in statistics \
            or "top_n_hits_authorities" not in statistics or overwrite:
        _log_step("Computing top 20 HITS hubs and auths")
        data_file1 = graph_name + "_hits_hubs"
        data_file2 = graph_name + "_hits_auth"
        hubs_hashtable = snap.TIntFltH()
        auth_hashtable = snap.TIntFltH()
        if not os.path.isfile(data_file1) or not os.path.isfile(data_file2) or overwrite:
            # MaxIter = 20
            snap.GetHits(graph, hubs_hashtable, auth_hashtable, 20)
            _save_hashtable(hubs_hashtable, data_file1)
            _save_hashtable(auth_hashtable, data_file2)
        else:
            _load_hashtable(hubs_hashtable, data_file1)
            _load_hashtable(auth_hashtable, data_file2)

        top_n_hubs = get_top_nodes_from_hashtable(hubs_hashtable)
        top_n_hubs.sort(key=itemgetter(1))
        if "top_n_hits_hubs" not in statistics or overwrite:
            statistics["top_n_hits_hubs"] = _label_descending(top_n_hubs, id_pkg_dict)

        top_n_auth = get_top_nodes_from_hashtable(auth_hashtable)
        top_n_auth.sort(key=itemgetter(1))
        if "top_n_hits_authorities" not in statistics or overwrite:
            statistics["top_n_hits_authorities"] = _label_descending(top_n_auth, id_pkg_dict)

        if hits_plots_missing or overwrite:
            nodes_subset = set()
            for pair in top_n_hubs:
                nodes_subset.add(pair[0])
            for pair in top_n_auth:
                nodes_subset.add(pair[0])

            # let's build a subgraph induced on the top N HITS auths and hubs nodes
            subgraph = get_subgraph(graph, nodes_subset)
            labels_dict = get_labels_subset(id_pkg_dict, subgraph)
            values = snap_hashtable_to_dict(hubs_hashtable, nodes_subset)
            values2 = snap_hashtable_to_dict(auth_hashtable, nodes_subset)
            plot_subgraph_colored(subgraph, labels_dict, values, "HITS - Hub Index",
                                  "Play Store Graph - top 20 HITS hubs + top 20 HITS authorities",
                                  output_hub, "bwr")
            plot_subgraph_colored(subgraph, labels_dict, values2, "HITS - Authority Index",
                                  "Play Store Graph - top 20 HITS hubs + top 20 HITS authorities",
                                  output_auth, "bwr_r")

    # Distribution plots. Each entry is: output-name suffix, prefix of the
    # generated file names, log message, snap plot function, plot title, and
    # extra positional arguments for the plot function.
    distribution_plots = [
        ("_indegree", "inDeg", "Computing indegree distribution",
         snap.PlotInDegDistr, "Play Store Graph - in-degree Distribution", ()),
        ("_outdegree", "outDeg", "Computing outdegree distribution",
         snap.PlotOutDegDistr, "Play Store Graph - out-degree Distribution", ()),
        ("_scc", "scc", "Computing scc distribution",
         snap.PlotSccDistr,
         "Play Store Graph - strongly connected components distribution", ()),
        ("_wcc", "wcc", "Computing wcc distribution",
         snap.PlotWccDistr,
         "Play Store Graph - weakly connected components distribution", ()),
        ("_cf", "ccf", "Computing cf distribution",
         snap.PlotClustCf,
         "Play Store Graph - clustering coefficient distribution", ()),
        ("_hops", "hop", "Computing shortest path distribution",
         snap.PlotHops,
         "Play Store Graph - Cumulative Shortest Paths (hops) distribution", (True,)),
        ("_kcore_edges", "coreEdges", "Computing k-core edges distribution",
         snap.PlotKCoreEdges, "Play Store Graph - K-Core edges distribution", ()),
        ("_kcore_nodes", "coreNodes", "Computing k-core nodes distribution",
         snap.PlotKCoreNodes, "Play Store Graph - K-Core nodes distribution", ()),
    ]
    for suffix, prefix, message, plot, title, extra_args in distribution_plots:
        output = graph_name + suffix
        if _plot_files_missing(prefix, output) or overwrite:
            _log_step(message)
            plot(graph, output, title, *extra_args)

    with open(json_path, 'w') as outfile:
        json.dump(statistics, outfile, indent=2)
from __future__ import division """Run some exploratory analysis on Twitter replies network. Some of the code is adapted from the snap.py tutorial.""" import snap from twython import Twython import sys if __name__ == '__main__': CONSUMER_KEY, CONSUMER_SECRET = open('twitapikeys.txt').read().split()[:2] twitterapi = Twython(CONSUMER_KEY, CONSUMER_SECRET) filename = sys.argv[1] repliesgraph = snap.LoadEdgeList(snap.PNGraph, filename, 0, 1) snap.PrintInfo(repliesgraph, "Twitter replies network") print #reciprocity num_dir_edges = snap.CntUniqDirEdges(repliesgraph) print "{0:.2f}% of directed edges are reciprocal".format( snap.CntUniqBiDirEdges(repliesgraph) * 2 * 100 / num_dir_edges) #clustering coefficient print "The clustering coefficient is {0:.2f}%".format( snap.GetClustCf(repliesgraph) * 100) #strongly and weakly connected components CntV = snap.TIntPrV() snap.GetSccSzCnt(repliesgraph, CntV) num_cc = 0 for p in CntV:
# Summary statistics for the Wiki-Vote network: info report, maximum-degree
# node, strongly/weakly connected component size distributions and the
# out-degree distribution plot.
import snap

SIZE_COUNT_FORMAT = "Size: %d - Number of Components: %d"

G = snap.LoadEdgeList(snap.PNGraph, "Wiki-Vote.txt", 0, 1)
snap.PrintInfo(G, "votes Stats", "votes-info.txt", False)

# Node ID with maximum degree
NId1 = snap.GetMxDegNId(G)
print("Node ID with Maximum-Degree: %d" % NId1)

# Number of Strongly connected components
ComponentDist = snap.TIntPrV()
snap.GetSccSzCnt(G, ComponentDist)
for comp in ComponentDist:
    size, count = comp.GetVal1(), comp.GetVal2()
    print(SIZE_COUNT_FORMAT % (size, count))

# Size of largest strongly connected component
largest_scc_size = snap.GetMxSccSz(G)
print("Strongly Connected Component - Maximum size:", largest_scc_size)

# Number of Weakly Connected Components
CompDist = snap.TIntPrV()
snap.GetWccSzCnt(G, CompDist)
for comp in CompDist:
    size, count = comp.GetVal1(), comp.GetVal2()
    print(SIZE_COUNT_FORMAT % (size, count))

# Size of largest weakly connected component
largest_wcc_size = snap.GetMxWccSz(G)
print("Weakly Connected Component - Maximum size:", largest_wcc_size)

# Plot of Outdegree Distribution
snap.PlotOutDegDistr(G, "Wiki Votes", "Wiki-Votes Out Degree")
def get_graph_info(file_path, output_path):
    """Load the graph at ``file_path`` and write its SNAP info report.

    ``load_graph`` returns a pair; only its first element (the graph) is
    used here. The report is written to ``output_path``.
    """
    loaded_graph, _ = load_graph(file_path)
    snap.PrintInfo(loaded_graph, 'Python type PNGraph', output_path, False)
def main():
    """
    See usage message in module header block
    """
    directed = True
    try:
        opts, args = getopt.getopt(sys.argv[1:], "")
    except getopt.GetoptError:
        # Only option-parsing errors lead to the usage message; anything else
        # propagates. usage() is presumably terminating (confirm).
        usage(sys.argv[0])
    for opt, arg in opts:
        # No options are accepted, so any option is an error.
        usage(sys.argv[0])

    if len(args) != 5:
        usage(sys.argv[0])

    data_dir = args[0]
    num_samples = int(args[1])
    num_seeds = int(args[2])
    num_waves = int(args[3]) - 1  # -1 for consistency with SPNet
    outputdir = args[4]

    # Single-argument print gives the same text under Python 2 and Python 3.
    print("directed: %s" % (directed,))
    print("number of samples: %s" % (num_samples,))
    print("number of seeds: %s" % (num_seeds,))
    print("number of waves: %s" % (num_waves,))
    print("output directory: %s" % (outputdir,))

    if not os.path.exists(outputdir):
        os.mkdir(outputdir)

    sys.stdout.write('loading data from ' + data_dir + '...')
    start = time.time()
    datazipfile = data_dir + os.path.sep + 'physician-shared-patient-patterns-2014-days30.zip'
    G = load_physician_referral_data(datazipfile)
    print('%s s' % (time.time() - start))

    snap.PrintInfo(G)

    # get num_samples * num_seeds distinct random seed nodes (sample without replacement)
    # and convert to list of lists where each list is seed set for one sample
    allseeds = random.sample([node.GetId() for node in G.Nodes()],
                             num_samples * num_seeds)
    seedsets = [
        allseeds[i:i + num_seeds] for i in range(0, len(allseeds), num_seeds)
    ]

    sampledesc_filename = outputdir + os.path.sep + "sampledesc" + os.path.extsep + "txt"
    # The context manager closes the description file even if a sample fails.
    with open(sampledesc_filename, 'w') as sampledesc_f:
        for i in range(num_samples):
            sys.stdout.write('generating snowball sample ' + str(i + 1) + '... ')
            start = time.time()
            # have to convert seedset to TIntV for SNAP
            seedsVec = snap.TIntV()
            for nodeid in seedsets[i]:
                seedsVec.Add(nodeid)
            Gsample0 = snowball_sample(G, num_waves, seedsVec)
            #print 'XXX',Gsample0.GetIntAttrDatN(Gsample0.GetRndNId(), "zone")#XXX
            # renumber nodes so they are numbered 0..N-1
            # Actually can't do this as it loses the node attributes (zone)
            # so instead build a dictionary mapping nodeid:zone
            # so that can be written to zone file in correct order.
            # Note that then the index in nodelist of a nodeid can be used
            # as sequential node number of each node.
            ##Gsample = snap.ConvertGraph(snap.PNEANet, Gsample0, True)
            #print 'YYY',Gsample.GetIntAttrDatN(Gsample.GetRndNId(), "zone")#XXX
            Gsample = Gsample0
            nodelist = list()  # keep this iteration in list so we always use same order in future
            zonedict = dict()  # map nodeid : zone
            for node in Gsample.Nodes():
                nodelist.append(node.GetId())
                zonedict[node.GetId()] = Gsample.GetIntAttrDatN(node.GetId(), "zone")
            print('%s s' % (time.time() - start))

            snap.PrintInfo(Gsample)
            subgraph_filename = outputdir + os.path.sep + "subgraph" + str(i) + os.path.extsep + "txt"
            write_graph_file(subgraph_filename, Gsample, nodelist)
            subzone_filename = outputdir + os.path.sep + "subzone" + str(i) + os.path.extsep + "txt"
            write_zone_file(subzone_filename, Gsample, nodelist, zonedict)
            subactor_filename = outputdir + os.path.sep + "subactor" + str(i) + os.path.extsep + "txt"
            # TODO get actor attributes
            #write_subactors_file(subactor_filename, Gsample, nodelist)

            # format of sampledesc file is:
            # N subzone_filename subgraph_filename subactor_filename
            sampledesc_f.write("%d %s %s %s\n" %
                               (Gsample.GetNodes(), subzone_filename,
                                subgraph_filename, subactor_filename))
# Loads a multimodal network, converts all of its cross-nets into a single
# TNEANet, times the conversion and writes the SNAP info report of the result
# to output.txt.
import snap
import time
#from utils.network_utils import get_num_elem_per_mode

filename = "Graphs/oldMinerNewSNAP.graph"
FIn = snap.TFIn(filename)
Graph = snap.TMMNet.Load(FIn)
print('Modes: %d' % Graph.GetModeNets())
print('Link types: %d' % Graph.GetCrossNets())

# Collect the ids of every cross-net so the conversion includes all of them.
crossnetids = snap.TInt64V()
crossneti = Graph.BegCrossNetI()
while crossneti < Graph.EndCrossNetI():
    crossnetids.Add(crossneti.GetCrossId())
    crossneti.Next()

# Empty attribute mappings are passed to the conversion.
nodeattrmapping = snap.TIntStrStrTr64V()
edgeattrmapping = snap.TIntStrStrTr64V()

start_time = time.time()
DirectedNetwork = Graph.ToNetwork(crossnetids, nodeattrmapping, edgeattrmapping)
end_time = time.time()
print("Converting to TNEANet takes %s seconds" % (end_time - start_time))

snap.PrintInfo(DirectedNetwork, "Python type PNEANet", "output.txt", False)

# Read the report back with newlines removed. The original used a bare map()
# whose result was discarded (and never evaluated under Python 3) and left
# the file open; the lines are kept in a variable and the file is closed.
with open("output.txt") as info_file:
    output_lines = [line.replace("\n", "") for line in info_file]
# print 'Loading Boards'
# boardfile = open('../data/boards.tsv')
# board2user = {}
# user2boards = defaultdict(list)
# for line in boardfile:
#     board_id, board_name, board_description, user_id, board_create_time = line.split('\t')
#     board2user[board_id] = (user_id,board_name)
#     user2boards[int(user_id)].append((board_id,board_name))

# Single-argument print(...) is used throughout so the script produces the
# same text under Python 2 and Python 3.
graph = snap.LoadEdgeList(snap.PNGraph, '../graphs/firstMillionGraph.txt', 0, 1)
print("This Graph has  %s  nodes" % (graph.GetNodes(),))
print("This Graph has  %s  edges" % (graph.GetEdges(),))
# NOTE(review): the description says PNEANet although the graph is loaded as
# a PNGraph; the string is left unchanged.
snap.PrintInfo(graph, "Python type PNEANet")

n = graph.GetNI(snap.GetMxInDegNId(graph))
print('Max in degree node: %s' % (n.GetId(),))
print('In degree:  %s' % (n.GetInDeg(),))
print('Out degree: %s' % (n.GetOutDeg(),))
# print user2boards[int(n.GetId())

print('Calculating Page Rank')
PRankH = snap.TIntFltH()
snap.GetPageRank(graph, PRankH)

# (node id, page rank) pairs; the list keeps its original name "betw".
betw = []
for n in PRankH:
    betw.append((n, PRankH[n]))
# Loads the Facebook edge list, prints its SNAP info and dumps a random
# 10-node subgraph.
import snap

EDGE_LIST_PATH = "facebook_combined.txt"
SAMPLE_NODE_COUNT = 10

Graph = snap.LoadEdgeList(snap.PUNGraph, EDGE_LIST_PATH, 0, 1, ' ')
snap.PrintInfo(Graph, "Facebook Data Set")

SubGraph = snap.GetRndSubGraph(Graph, SAMPLE_NODE_COUNT)
SubGraph.Dump()
# Prints summary statistics for the graph file named on the command line:
# SNAP info, unique undirected edge count, number of nodes with degree above
# 10 and the clustering coefficient.
import snap
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import sys

gfile = sys.argv[1]
print('Printing summary stats for file at:', gfile)

# Files ending in ".graph" are loaded as saved snap graphs; anything else is
# read as an edge list.
if not gfile.endswith('.graph'):
    Network = snap.LoadEdgeList(snap.PUNGraph, gfile, 0, 1)
else:
    FIn = snap.TFIn(gfile)
    Network = snap.TUNGraph.Load(FIn)

snap.PrintInfo(Network)
print('Edges:', snap.CntUniqUndirEdges(Network))

# for directed graphs, should be same for undir
DegToCntV = snap.TIntPrV()
snap.GetInDegCnt(Network, DegToCntV)
high_degree_nodes = sum(
    item.GetVal2() for item in DegToCntV if item.GetVal1() > 10
)
print('Nodes with deg > 10', high_degree_nodes)

ClustCoeff = snap.GetClustCf(Network, 10000)
print('Clustering coeff', ClustCoeff)
def main(): """ See usage message in module header block """ get_subgraph = False # if True discard nodes without attribute data try: opts, args = getopt.getopt(sys.argv[1:], "d") except: usage(sys.argv[0]) for opt, arg in opts: if opt == "-d": get_subgraph = True else: usage(sys.argv[0]) if len(args) != 1: usage(sys.argv[0]) data_dir = args[0] outputdir = '.' sys.stdout.write('loading data from ' + data_dir + '...') start = time.time() (G, patdata, colnames) = load_nber_patent_data(data_dir) print time.time() - start, 's' snap.PrintInfo(G) # Remove loops (self-edges). # There is actually for some reason one loop (patent id 5489070). # G is a PNGraph so multiple edges not allowed in this type anyway. snap.DelSelfEdges(G) snap.PrintInfo(G) # We do not add attributes to nodes as SNAP node attribute as # these seem to get lost by varoius operations including subgraph # that we need to use, so instead maintain them just in the # dictionary mapping the original node ids to the attributes - # fortunately the original node ids are maintained by # GetSubGraph() so we can used these to index the patdata # dictoinary in the subgraphs # Cannot do this: #patdata[:][colnames['COUNTRY']] = convert_to_int_cat(patdata[:][colnames['COUNTRY']]) # like factor in R # as get "TypeError: unhashable type" so have to do this instead: id_countries = [(k, p[colnames['COUNTRY']]) for (k, p) in patdata.iteritems()] id_countries_int = convert_to_int_cat([x[1] for x in id_countries]) for i in xrange(len(id_countries)): patdata[id_countries[i][0]][colnames['COUNTRY']] = id_countries_int[i] for attr in ['COUNTRY']: sys.stdout.write('There are %d NA for %s\n' % ([p[colnames[attr]] for p in patdata.itervalues()].count('NA'), attr)) id_states = [(k, p[colnames['POSTATE']]) for (k, p) in patdata.iteritems()] id_states_int = convert_to_int_cat([x[1] for x in id_states]) for i in xrange(len(id_states)): patdata[id_states[i][0]][colnames['POSTATE']] = id_states_int[i] for attr in ['POSTATE']: 
sys.stdout.write('There are %d NA for %s\n' % ([p[colnames[attr]] for p in patdata.itervalues()].count('NA'), attr)) # There are 3774768 unique patent identifiers in the citation data but # only 2923922 unique patent identifiers in the patent data (patdata). # The size of the set intersection of these patent ids is 2755865 # i.e. there is patent data for 73% of the patents in the citation network. # Presumably this is because the patdata (pat63_99.txt) contains all # utilit patents in the period 1963 to 1999 but the citation data # cit75_99.txt contains all US patent citations for utility patents # granted in the period 1975 to 1999, so there are patent ids in here # from earlier periods that are cited by patents in that period, # for which therefore we don't have the patent data (prior to 1963). # So we have to set the data for all patents in network that we have it # for, and the rest (27%) to NA. nodelist = list( ) # keep the iteration below in list so we always use same order in future if get_subgraph: # get subgraph induced by nodes that have patent data in the # pat63_99.txt file nodeVec = snap.TIntV() # nodelist in TIntV format for use in SNAP for node in G.Nodes(): patid = node.GetId() if patdata.has_key(patid): nodelist.append(patid) nodeVec.Add(patid) G = snap.GetSubGraph(G, nodeVec) print 'Subgraph with only nodes with patent attribute data:' snap.PrintInfo(G) else: # keep all the graph and just put NA for all data attributes citepatent_count = 0 patentdata_count = 0 for node in G.Nodes(): citepatent_count += 1 patid = node.GetId() nodelist.append(patid) #print citepatent_count, patentdata_count, patid #XXX if not patdata.has_key(patid): #print 'NA for ', patid #XXX patdata[patid] = len(colnames) * ["NA"] patdata[patid][ colnames['HASDATA']] = 0 # no data on this patent else: patentdata_count += 1 sys.stdout.write( "There are %d unique cited/citing patents of which %d (%f%%) have patent data\n" % (citepatent_count, patentdata_count, 100 * 
float(patentdata_count) / citepatent_count)) graph_filename = outputdir + os.path.sep + "patent_citations" + os.path.extsep + "txt" write_graph_file(graph_filename, G, nodelist) attributes_binary_filename = outputdir + os.path.sep + "patent_binattr" + os.path.extsep + "txt" attributes_categorical_filename = outputdir + os.path.sep + "patent_catattr" + os.path.extsep + "txt" attributes_continuous_filename = outputdir + os.path.sep + "patent_contattr" + os.path.extsep + "txt" write_attributes_file_binary(attributes_binary_filename, G, nodelist, patdata, colnames) write_attributes_file_categorical(attributes_categorical_filename, G, nodelist, patdata, colnames) write_attributes_file_continuous(attributes_continuous_filename, G, nodelist, patdata, colnames) nodeid_filename = outputdir + os.path.sep + "nodeid" + os.path.extsep + "txt" write_subgraph_nodeids(nodeid_filename, nodelist)
metadata = {} metadata['number_of_patents'] = len(patents) # Number of primary patents Graph = snap.PUNGraph.New() # Merge list of patents from this company and external patents they cite citation_map = {} patent_set = set() for patent in patents: patent_set.add(patent) # This patent patent_set.update(citation_cache[patent]) # This patent's citations citation_map[patent] = citation_cache[patent] patent_nid_map = {} # Add all nodes to graph for i, patent in enumerate(patent_set): patent_nid_map[patent] = i Graph.AddNode(i) # Add all backward citation edges for patent, citations in citation_map.iteritems(): for cite in citations: Graph.AddEdge(patent_nid_map[patent], patent_nid_map[cite]) snap.PrintInfo(Graph) with open(out_folder + '%s.json' % company_name, 'w') as fp: json.dump(metadata, fp, sort_keys=True, indent=4) snap.SaveEdgeList(Graph, out_folder + '{}.txt'.format(company_name), \ "Backward citation network for company, drawn from patent data") print "Saved data for {}".format(company_name)
'\n') f.close() print 'finished writing pagerank components values' def component_distribution(g): print 'executing component distribution --- getting components' ComponentDist = snap.TIntPrV() snap.GetWccSzCnt(g, ComponentDist) f = open('component_distribution.txt', 'w') f.write("Size - Number of Components:\n") for comp in ComponentDist: f.write("% d \t % d\n" % (comp.GetVal1(), comp.GetVal2())) f.close() print 'finshed componet distribution' snap.PrintInfo(g, "Python type PNGraph", "info-pngraph.txt", False) pr = Process(target=pagerank, args=(g, )) cd = Process(target=component_distribution, args=(g, )) prc = Process(target=pageRank_components, args=(g, )) pr.start() cd.start() prc.start() pr.join() cd.join() prc.join()
def printGStats(self):
    """Write SNAP summary information for this object's graph to a file.

    The graph `self.G` is described under the title `self.graphName`, and
    the report goes to `statDir + self.graphName + "-info.txt"`.
    """
    graph = self.G
    title = self.graphName
    info_path = statDir + self.graphName + "-info.txt"
    snap.PrintInfo(graph, title, info_path, False)