def get_monthly_trans_graphs():
    files = os.listdir(TRANS_PATH)
    file_month_list = []
    for fn in files:
        if not fn.endswith('DS_Store'):
            file_month_list.append(fn)
    return [GH.load_graph(TRANS_PATH, fn) for fn in file_month_list]
def get_monthly_venue_graphs():
    files = os.listdir(DIR_PATH)
    file_month_list = []
    for fn in files:
        if fn.startswith(FILE_PREFIX):
            file_month_list.append(fn)
    return [GH.load_graph(DIR_PATH, fn) for fn in file_month_list]
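# Note: os.listdir does not guarantee any ordering, so the two loaders above only return the
# monthly graphs in month order if the filenames happen to sort that way. A minimal sorted
# variant (a sketch; the helper name below is not part of the original code):
def get_monthly_venue_graphs_sorted():
    files = sorted(fn for fn in os.listdir(DIR_PATH) if fn.startswith(FILE_PREFIX))
    return [GH.load_graph(DIR_PATH, fn) for fn in files]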
def main():
    data_path = '../CS224W_Dataset/GraphData'
    filename = 'sf_venue_center_small'
    venue_g = GH.load_graph(data_path, filename)
    weight_hash = create_weight_hash(venue_g)
    degree_hash = create_degree_hash(venue_g)
    un_venue_g = create_undirected(venue_g)
    print "start community detection"
    communities = detect_community(weight_hash, un_venue_g, degree_hash)
    print communities
def generate_node_change():
    # plot the node count of each monthly snapshot (13 snapshots, index 0..12)
    x, y = [], []
    for i in range(0, 13):
        filename = snapshot_list[i]
        g = GH.load_graph(graph_path, filename)
        print g.GetNodes()
        x.append(i)
        y.append(g.GetNodes())
    plt.plot(x, y, '-')
    plt.show()
def generate_edge_ratio():
    x, y = [], []
    g = GH.load_graph(graph_path, snapshot_list[0])
    prev_nom, prev_denom = g.GetEdges(), 0.0
    for E in g.Edges():
        prev_denom += g.GetIntAttrDatE(E.GetId(), 'trsn_cnt')
    for i in range(1, 13):
        filename = snapshot_list[i]
        g = GH.load_graph(graph_path, filename)
        nom, denom = g.GetEdges(), 0.0
        for E in g.Edges():
            denom += g.GetIntAttrDatE(E.GetId(), 'trsn_cnt')
        x.append(i)
        y.append((nom - prev_nom) / (denom - prev_denom))
        prev_denom = denom
        prev_nom = nom
    plt.plot(x, y, '-')
    plt.show()
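# Reading of the ratio above: for month i, y holds (E_i - E_{i-1}) / (T_i - T_{i-1}), where
# E_i is the edge count of snapshot i and T_i its summed 'trsn_cnt'. For example, if a month
# adds 50 transitions and 5 of them connect previously unlinked venue pairs, the ratio is
# 5 / 50 = 0.1, i.e. the share of new transitions that create brand-new edges.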
def generate_monthly_trans_graphs():
    graphs = get_monthly_venue_graphs()
    for idx in range(len(graphs) - 1, 0, -1):
        cur_graph = graphs[idx]
        prev_graph = graphs[idx - 1]
        for edge in prev_graph.Edges():
            src_nid = edge.GetSrcNId()
            dst_nid = edge.GetDstNId()
            cur_eid = cur_graph.GetEId(src_nid, dst_nid)
            cur_weight = cur_graph.GetIntAttrDatE(cur_eid, 'trsn_cnt')
            prev_eid = prev_graph.GetEId(src_nid, dst_nid)
            prev_weight = prev_graph.GetIntAttrDatE(prev_eid, 'trsn_cnt')
            diff = cur_weight - prev_weight
            # cur_graph.AddIntAttrDatE(cur_eid, cur_weight - prev_weight, 'trsn_cnt')
            # month_coeff = float(idx)
            month_coeff = 1
            cur_graph.AddIntAttrDatE(cur_eid, ORI_FACTOR*cur_weight + month_coeff*MUL_FACTOR*diff, 'trsn_cnt')
        print "updated trans graph for month ", idx
    for idx, G in enumerate(graphs):
        # prefix double-digit indices with '9' so the filenames still sort after the single-digit months
        idx_str = str(idx) if idx < 10 else '9' + str(idx)
        GH.save_graph(G, TRANS_PATH, TRANS_FILE_PREFIX + idx_str)
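# Worked example of the blended weight above (the factor values are hypothetical, purely
# for illustration): with ORI_FACTOR = 1, MUL_FACTOR = 2 and month_coeff = 1, an edge whose
# cumulative count grew from 6 to 10 has diff = 4 and gets a new 'trsn_cnt' of
# 1*10 + 1*2*4 = 18, so recent growth is emphasized on top of the cumulative total.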
'''
Currently, the graph has node attributes:
- vid
- ckn (check-in count so far)
- sts (start timestamp)
- ets (end timestamp)
- lat
- lng
- category
- pcategory
And edge attributes:
- trsn_cnt
- duration
'''
venue_g = GH.load_graph(graph_path, graph_name)
category_list = VH.get_category_list(venue_path, category_name)
pcategory_list = VH.get_category_list(venue_path, pcategory_name)
# GH.print_node_attr_names(venue_g)
# GH.print_edge_attr_names(venue_g)
# print category_list
GH.print_nids(venue_g)

# create a snapshot of the graph - nodes are accurate, but edges aren't
center = (37.76010, -122.44779)
radius = 0.095
print venue_g.GetNodes()
print venue_g.GetEdges()
i = 0
for edge in venue_g.Edges():
import snap
import os
import numpy as np
import Helper.GraphHelper as GH
import Helper.AnalysisHelper as AH
import pylab as plt

'''
Import Graph:
graph is stored in binary form to save space, available in the Dropbox folder
sf_venue_graph_small: A small test graph with only a few venues in sf -- you can use this to test your script first
sf_venue_graph: up-to-date venue graph of sf
'''
data_path = '../DataSet/GraphData/'
result_path = '../DataSet/Analysis/'
filename = 'sf_venue_graph'
venue_g = GH.load_graph(data_path, filename)

'''
Analysis 1: graph structure
- graph size
- SCC, bowtie structure
'''
g_size = venue_g.GetNodes()
edge_size = venue_g.GetEdges()
max_scc = snap.GetMxScc(venue_g)
num_max_scc_n = max_scc.GetNodes()
rand_node = max_scc.GetRndNId()
out_combined = snap.GetBfsTree(venue_g, rand_node, True, False)
in_combined = snap.GetBfsTree(venue_g, rand_node, False, True)
max_wcc = snap.GetMxWcc(venue_g)
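# A minimal sketch of how the bowtie component sizes follow from the objects above
# (rand_node comes from max_scc, so the two BFS trees span SCC-plus-OUT and SCC-plus-IN):
num_out = out_combined.GetNodes() - num_max_scc_n   # OUT: reachable from the SCC, not in it
num_in = in_combined.GetNodes() - num_max_scc_n     # IN: reaches the SCC, not in it
num_disconnected = g_size - max_wcc.GetNodes()      # nodes outside the largest WCC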
'''
Currently, the graph has node attributes:
- vid
- ckn (check-in count so far)
- sts (start timestamp)
- ets (end timestamp)
- lat
- lng
- category
- pcategory
And edge attributes:
- trsn_cnt
- duration
'''
g = GH.load_graph(graph_path, graph_name)
n = g.GetNodes()
print g.GetNodes(), g.GetEdges()
edge_weight = get_edge_weight(g)
node_weight = get_node_weight(g)
undirected_g = GH.convert_undirected_graph(g)
m = get_m(node_weight)
# B is presumably the standard Newman modularity matrix,
# B[i][j] = A_ij - k_i * k_j / (2m), built from the undirected graph and node weights
B = get_B(undirected_g, node_weight, m)
save_list(B, community_path, 'B_matrix')

num_community = 0
mod_list = []
while num_community < 10:
""" Currently, the graph has node attribute: - vid - ckn (insofar, checkin number) - sts (start timestamp) - ets (end timestamp) - lat - lng - category - pcategor And edge attribute: - trsn_cnt - duration """ venue_g = GH.load_graph(graph_path, graph_name) category_list = VH.get_category_list(venue_path, category_name) pcategory_list = VH.get_category_list(venue_path, pcategory_name) # GH.print_node_attr_names(venue_g) # GH.print_edge_attr_names(venue_g) # print category_list GH.print_nids(venue_g) # create snapshop of the graph - node accurate, but edge aren't ts_list = TH.gen_ts_list("201201010000", "201301010000", 30) ts_list.reverse() for ts in ts_list: GH.filter_node_sts(venue_g, ts) GH.save_graph(venue_g, graph_path, "sf_venue_" + ts)
sf_trsn_graph_small: A small test graph with only a few venues in sf -- you can use this to test your script first
sf_trsn_graph: up-to-date venue graph of sf
'''
def counter_to_arrays(c):
    values = []
    frequencies = []
    for n in c:
        values.append(n)
        frequencies.append(c[n])
    return [values, frequencies]

data_path = '../Dataset/GraphData'
result_path = '../Dataset/Analysis/'
graph = GH.load_graph(data_path, 'sf_venue_graph')

occurrences = cl.Counter()
dataset = []
for node in graph.Nodes():
    ckn = graph.GetIntAttrDatN(node.GetId(), 'ckn')
    occurrences[ckn] += 1
    dataset.append(ckn)
x, y = counter_to_arrays(occurrences)

alpha = AH.get_mle_alpha(dataset, min(dataset))
powerlaw_y = AH.get_powerlaw_y(dataset, alpha, min(dataset), np.sum(y))
print "check-in distribution: the estimated alpha is", alpha
plt.figure()
plt.xscale('log')
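# Note on the fit: AH.get_mle_alpha presumably implements the standard discrete power-law
# MLE (an assumption, following Clauset/Shalizi/Newman):
#     alpha ~= 1 + n * (sum_i ln(x_i / (x_min - 0.5)))**(-1)
# with x_min = min(dataset), so the estimate depends only on the log-ratios of the check-in
# counts to the cutoff.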
    return False

data_path = "../DataSet/Transition/"
graph_path = "../DataSet/GraphData/"
venue_path = "../DataSet/VenueData/"

trsn_list = VH.load_pickle_file(data_path, "sf_trsn_small_new")
time_list = VH.load_pickle_file(data_path, "sf_time_small_new")
full_venue_dict = VH.GetFullVenueDict(venue_path, "venues-CA-new.json")
category_dict = VH.load_json(venue_path, "category_map.json")
pcategory_dict = VH.load_json(venue_path, "pcategory_map.json")

vid_map = create_vid_map(trsn_list)
ts_list = TH.gen_ts_list("201201010000", "201301010000", 30)
venue_g = snap.TNEANet.New()
for ts_idx, ts in enumerate(ts_list):
    for trsn_idx, trsn in enumerate(trsn_list):
        src_ts = time_list[trsn_idx][0]  # only need to check one ts
        dst_ts = time_list[trsn_idx][1]
        if within_ts_range(ts, src_ts):
            src_nid = vid_map[trsn[0]]
            dst_nid = vid_map[trsn[1]]
            GH.add_node(venue_g, src_nid, trsn[0], src_ts)
            GH.add_node(venue_g, dst_nid, trsn[1], dst_ts)
            GH.add_edge(venue_g, src_nid, dst_nid, time_list[trsn_idx])
    GH.add_category(venue_g, full_venue_dict, category_dict, pcategory_dict)
    print venue_g.GetNodes()
    GH.save_graph(venue_g, graph_path, "sf_venue_small_" + str(ts))
for nid, vid in enumerate(node_set):
    node_hash[vid] = nid

trsn_g = snap.TNEANet.New()
# node_id: 0 to n-1
for vid, nid in node_hash.iteritems():
    trsn_g.AddNode(nid)
    trsn_g.AddStrAttrDatN(nid, vid, 'vid')
# freq: frequency (count) of the edge
print trsn_g.GetNodes()
for idx, trsn in enumerate(trsn_list):
    src_nid = node_hash[trsn[0]]
    dst_nid = node_hash[trsn[1]]
    print src_nid, dst_nid
    # TODO: add timestamp filter
    if not trsn_g.IsEdge(src_nid, dst_nid):
        GH.add_edge_attrs(trsn_g, src_nid, dst_nid, time_list[idx])
        GH.add_node_attrs(trsn_g, src_nid, dst_nid, time_list[idx])
        print "add a new edge, hoho~"
    else:
        GH.update_edge_attrs(trsn_g, src_nid, dst_nid, time_list[idx])
        GH.update_node_attrs(trsn_g, src_nid, dst_nid, time_list[idx])
        print "update node info, haha~"
    print idx, trsn
print len(trsn_list)
GH.save_graph(trsn_g, graph_path, 'sf_trsn_graph')
print "successfully built the graph!"
    else:
        print "catch a false"
        return False

data_path = '../DataSet/'
graph_path = '../DataSet/GraphData/'
venue_path = '../DataSet/VenueData/'

trsn_list = VH.load_pickle_file(data_path, 'sf_trsn')
time_list = VH.load_pickle_file(data_path, 'sf_time')
full_venue_dict = VH.GetFullVenueDict(venue_path, 'venues-CA-new.json')
category_dict = VH.load_json(venue_path, 'category_map.json')
pcategory_dict = VH.load_json(venue_path, 'pcategory_map.json')

lng_list = [GH.get_lat_lng(full_venue_dict, trsn[1]) for trsn in trsn_list]
lng_list.sort(key=lambda t: t[1], reverse=True)
print lng_list[0]

'''
vid_map = create_vid_map(trsn_list)
venue_g = snap.TNEANet.New()
lngs = []
for trsn_idx, trsn in enumerate(trsn_list):
    # only need to check one vid
    lat, lng = GH.get_lat_lng(full_venue_dict, trsn[0])
    if within_geo_range(center, radius, lat, lng):
        lngs.append(lng)
        src_nid = vid_map[trsn[0]]
        dst_nid = vid_map[trsn[1]]
        src_ts = time_list[trsn_idx][0]
    node_count = 0
    edge_count = 0
    for node in g.Nodes():
        node_count += 1
        un_g.AddNode(node.GetId())
    for edge in g.Edges():
        edge_count += 1
        un_g.AddEdge(edge.GetSrcNId(), edge.GetDstNId())
    print "node: %d edge: %d" % (node_count, edge_count)
    return un_g

data_path = '../CS224W_Dataset/GraphData'
filename = 'sf_venue_center'
trsn_g = GH.load_graph(data_path, filename)
un_trsn_g = to_PUNGraph(trsn_g)

# try to use the SNAP library function to get the community structure
communities = snap.TCnComV()
modularity = snap.CommunityCNM(un_trsn_g, communities)
print "Community detection complete, modularity score is", modularity
# communities = [[3280, 2414, 2662, 2878, 3551], [848, 1106, 1474, 1915, 2089, 3139, 3400, 5759, 6280, 7848]]

# fetch venue info and produce a csv for visualization
data_path = '../CS224W_Dataset'
out_csv = '../CS224W_Dataset/transition-SF-community.csv'
venue_hash = VH.GetFullVenueDict(data_path, 'venues-CA-new.json')
with open(out_csv, 'w') as fout:
    a = csv.writer(fout, delimiter=',', quoting=csv.QUOTE_ALL)
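    # A minimal sketch of the CSV rows (the column layout is an assumption, chosen for map
    # visualization): one row per venue, tagged with its CNM community index.
    a.writerow(['community', 'vid', 'lat', 'lng'])
    for cmty_idx, cmty in enumerate(communities):
        for nid in cmty:
            vid = trsn_g.GetStrAttrDatN(nid, 'vid')
            if vid in venue_hash:
                a.writerow([cmty_idx, vid, venue_hash[vid]['lat'], venue_hash[vid]['lng']])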
        venue_dict[data['id']] = data
    fin.close()
    return venue_dict

# add attributes for graph nodes
def AddNodeAttr(graph, full_venue_dict):
    '''
    for each node in the graph, add three attributes:
    1. two float values: latitude, longitude
    2. category
    3. parent-category
    '''
    for NI in graph.Nodes():
        vid = graph.GetStrAttrDatN(NI.GetId(), 'vid')
        if vid in full_venue_dict:
            graph.AddFltAttrDatN(NI.GetId(), float(full_venue_dict[vid]['lat']), 'lat')
            graph.AddFltAttrDatN(NI.GetId(), float(full_venue_dict[vid]['lng']), 'lng')
            graph.AddStrAttrDatN(NI.GetId(), full_venue_dict[vid]['category'], 'category')
            graph.AddStrAttrDatN(NI.GetId(), full_venue_dict[vid]['parentcategory'], 'pcategory')
    # GH.save_graph(graph, result_path, result_filename)
    # return None

trsn_g = GH.load_graph(graph_data_path, graph_filename)
full_venue_dict = VH.GetFullVenueDict(venue_graph_data_path, venue_filename)
category_dict = VH.load_json(venue_graph_data_path, 'category_map.json')
pcategory_dict = VH.load_json(venue_graph_data_path, 'pcategory_map.json')
GH.add_category(trsn_g, full_venue_dict, category_dict, pcategory_dict)
GH.save_graph(trsn_g, graph_data_path, 'sf_venue_graph')
print 'successfully built the venue graph!'