def setUp(self):
    """Build PageRank and connected-components models on empty graphs.

    Afterwards ``self.__remove_file`` is called on the three temporary
    model paths derived from ``temp_number``.
    """
    self.pr_model = gl.pagerank.create(gl.SGraph())
    self.cc_model = gl.connected_components.create(gl.SGraph())

    temp_path_patterns = (
        './tmp_model-%d',
        '/tmp/tmp_model-%d',
        '/tmp/tmp_model2-%d',
    )
    for pattern in temp_path_patterns:
        self.__remove_file(pattern % temp_number)
def pagerank_triple_apply(input_graph, reset_prob=0.15, threshold=1e-3,
                          max_iterations=20):
    """Run an iterative PageRank on a copy of ``input_graph`` via triple_apply.

    Args:
        input_graph: graph exposing ``vertices`` and ``edges``.
        reset_prob: constant mixed into every vertex score each iteration.
        threshold: iteration stops once the summed absolute change of the
            scores is no longer above this value.
        max_iterations: upper bound on the number of iterations.

    Returns:
        The new SGraph. After at least one iteration its vertices carry the
        ``pagerank`` and ``l1_delta`` fields.
    """
    g = gl.SGraph(input_graph.vertices, input_graph.edges)

    # Normalise edge weights using a temporary per-vertex total.
    g.vertices['total_weight'] = 0.0
    g = g.triple_apply(sum_weight, ['total_weight'])
    g = g.triple_apply(normalize_weight, ['weight'])
    del g.vertices['total_weight']

    # Every vertex starts from a previous score of 1.0.
    g.vertices['prev_pagerank'] = 1.0
    iteration = 0
    total_l1_delta = len(g.vertices)
    start = time.time()
    while iteration < max_iterations and total_l1_delta > threshold:
        g.vertices['pagerank'] = 0.0
        g = g.triple_apply(pagerank_update_fn, ['pagerank'])
        damped = g.vertices['pagerank'] * (1 - reset_prob) + reset_prob
        g.vertices['pagerank'] = damped

        change = g.vertices['pagerank'] - g.vertices['prev_pagerank']
        g.vertices['l1_delta'] = change.apply(lambda x: abs(x))
        total_l1_delta = g.vertices['l1_delta'].sum()
        g.vertices['prev_pagerank'] = g.vertices['pagerank']

        print('Iteration %d: total pagerank changed in L1 = %f' % (
            iteration, total_l1_delta))
        iteration += 1

    elapsed = time.time() - start
    print('Triple apply pagerank finished in: %f secs' % elapsed)
    del g.vertices['prev_pagerank']
    return g
def select_subgraph(movie1='Point Break', movie2='Shrek'):
    """Return a new SGraph holding only the edges that point at two movies.

    The full graph comes from ``construct_graph()``; the edges whose
    destination id is ``movie1`` or ``movie2`` are copied into an empty graph.
    """
    empty_graph = graphlab.SGraph()
    full_graph = construct_graph()
    incoming = full_graph.get_edges(dst_ids=[movie1, movie2])
    return empty_graph.add_edges(incoming,
                                 src_field='__src_id',
                                 dst_field='__dst_id')
def make_graph(target_id, followers, apis, update=False):
    """Create the (currently empty) SGraph for ``target_id``.

    Args:
        target_id: id handed to ``make_verts`` / ``make_edges``.
        followers: follower collection handed to ``make_edges``.
        apis: not used by the current body.
        update: only referenced by the disabled caching code below.

    Returns:
        A fresh ``graphlab.SGraph``. The vertex and edge lists are built but
        not attached, because the line that attached them is commented out.
    """
    # Disabled on-disk cache lookup:
    #cache_path = '../cache/_graphs/'
    #file_name = str(target_id) + '.graph'
    #if os.path.isfile(cache_path + file_name) and not update:
    #    return gl.load_graph(cache_path + file_name)

    # Fixed: the original read `g - graphlab.SGraph()`, a subtraction on an
    # unbound name that raised NameError instead of creating the graph.
    g = graphlab.SGraph()

    verts = [make_verts(target_id)]
    edges = make_edges(followers, target_id)

    # NOTE(review): `id` here resolves to the builtin function; this looks
    # like the remains of a removed `for id in followers:` loop -- confirm.
    verts.append(make_verts(id))
    #if fol_followers: edges.extend(make_edges(fol_followers, id))
    #if fol_following: edges.extend(make_edges(id, fol_following))

    # Disabled graph assembly and cache write:
    #g = gl.SGraph().add_vertices(verts).add_edges(edges)
    #g.save(cache_path + file_name)
    return g
def triple_apply_knn(features):
    """Collect, for every row of ``features``, neighbours ranked by edge distance.

    A graph with one vertex per row and one edge per unordered pair of rows is
    built; ``graph_JSD`` writes a ``distance`` onto each edge through
    ``triple_apply``; then, per vertex, ``topk`` is taken over its incident
    edges and the id at the other end of each edge is kept.

    Returns:
        ``gl.SArray`` built from the per-vertex neighbour id arrays.
    """
    def graph_JSD(src, edge, dst):
        # Symmetric divergence of the two L1-normalised 'X1' vectors.
        # presumably `norm` / `entropy` are numpy/scipy functions -- confirm.
        p_raw = src['X1']
        q_raw = dst['X1']
        p_unit = p_raw / norm(p_raw, ord=1)
        q_unit = q_raw / norm(q_raw, ord=1)
        mixture = 0.5 * (p_unit + q_unit)
        edge['distance'] = 0.5 * (entropy(p_unit, mixture) +
                                  entropy(q_unit, mixture))
        return (src, edge, dst)

    n = len(features)
    rows = gl.SFrame(features).add_row_number('row_id')
    sg = gl.SGraph().add_vertices(rows, vid_field='row_id')

    # One edge per unordered pair of rows; the distance is filled in below.
    pair_edges = [gl.Edge(u, v, attr={'distance': None})
                  for (u, v) in itertools.combinations(range(n), 2)]
    sg = sg.add_edges(pair_edges)
    sg_dist = sg.triple_apply(graph_JSD, mutated_fields=['distance'])

    #knn = sg_dist.edges.groupby("__src_id", {"knn" : gl.aggregate.CONCAT("__dst_id","distance")}).sort("__src_id")
    #top_neighbors = knn.apply(lambda row: sorted(row['knn'],key=row['knn'].get)[:N])

    # NOTE(review): `N` is not defined in this function; it must come from
    # the enclosing module.
    top_neighbors = []
    for idx in xrange(n):
        incident = sg_dist.get_edges(src_ids=[idx, None],
                                     dst_ids=[None, idx])
        topN_sf = incident.topk('distance', k=N, reverse=True)
        # Keep whichever endpoint of the edge is not `idx`.
        # NOTE(review): the lambda reads the loop variable `idx` -- confirm
        # it is evaluated before `idx` advances.
        topN = topN_sf.apply(
            lambda row: row['__src_id'] if row['__dst_id'] == idx
            else row['__dst_id'])
        top_neighbors.append(topN)
    return gl.SArray(top_neighbors)
def _wrap_function_return(val):
    """
    Recursively walks each thing in val, opening lists and dictionaries,
    converting all occurrences of UnityGraphProxy to an SGraph,
    UnitySFrameProxy to SFrame, and UnitySArrayProxy to SArray.

    A _UnityModel whose '__uid__' field is registered in class_uid_to_class
    is wrapped in the registered class; any other value is returned unchanged.
    """
    val_type = type(val)
    if val_type is _UnityGraphProxy:
        return _gl.SGraph(_proxy=val)
    if val_type is _UnitySFrameProxy:
        return _gl.SFrame(_proxy=val)
    if val_type is _UnitySArrayProxy:
        return _gl.SArray(_proxy=val)
    if val_type is _UnityModel:
        # we need to cast it up to the appropriate type
        try:
            if '__uid__' in val.list_fields():
                uid = val.get('__uid__')
                if uid in class_uid_to_class:
                    return class_uid_to_class[uid](_proxy=val)
        except Exception:
            # Best effort: fall back to the raw model below. Narrowed from a
            # bare `except:` so KeyboardInterrupt/SystemExit still propagate.
            pass
        return val
    # Exact type checks are kept: only plain lists and dicts are opened.
    if val_type is list:
        return [_wrap_function_return(item) for item in val]
    if val_type is dict:
        return {key: _wrap_function_return(item)
                for key, item in val.items()}
    return val
def get_sgraph_from_neo4j_json(json_filename):
    '''
    Builds an SGraph from a JSON file exported by Neo4j.

    Args:
        json_filename: The name of the JSON file created by Neo4j.

    Returns:
        SGraph whose edges come from the 'relationships' entities
        (startNode -> endNode) and whose vertices come from the 'nodes'
        entities (keyed by 'id').
    '''
    # Each line of the file is parsed as a dict into the single column 'X1'.
    raw = gl.SFrame.read_csv(json_filename,
                             header=False,
                             column_type_hints=dict,
                             verbose=False)

    # Unpack the dict into columns, then stack the 'data' column into rows.
    unpacked = raw.unpack('X1', column_name_prefix='')
    stacked = unpacked[['data']].stack('data', new_column_name='data')

    nodes_sf = extract_entities(stacked, 'nodes')
    edges_sf = extract_entities(stacked, 'relationships')

    # Edges are added first, then the vertex attributes.
    with_edges = gl.SGraph().add_edges(edges_sf,
                                       src_field='startNode',
                                       dst_field='endNode')
    return with_edges.add_vertices(nodes_sf, vid_field='id')
def load_graph_task(task):
    """Read the edge list at ``task.params['csv']`` into ``task.outputs['graph']``.

    The file is parsed as a headerless, space-delimited CSV with ``long``
    column type hints; columns X1 and X2 become source and destination ids.
    """
    csv_path = task.params['csv']
    edge_frame = gl.SFrame.read_csv(csv_path,
                                    header=False,
                                    delimiter=' ',
                                    column_type_hints=long)
    graph = gl.SGraph().add_edges(edge_frame, src_field='X1', dst_field='X2')
    task.outputs['graph'] = graph
def test_shortest_path(self):
    """Exercise shortest_path creation, save/load, get_path and options.

    The whole body only runs when the "sssp" toolkit function is listed.
    """
    if "sssp" in get_unity().list_toolkit_functions():
        first_model = gl.shortest_path.create(self.graph, source_vid=0)
        print(first_model)
        first_model.summary()
        self.__test_model_save_load_helper__(first_model)

        second_model = gl.shortest_path.create(self.graph, source_vid=0)
        print(second_model)
        self.__test_model_save_load_helper__(second_model)

        # get_path on a chain 0 -> 1 -> ... -> 10
        chain_edges = [gl.Edge(i, i + 1) for i in range(10)]
        chain_graph = gl.SGraph().add_edges(chain_edges)
        chain_model = gl.shortest_path.create(chain_graph, source_vid=0)
        for target in range(10):
            expected_path = [(hop, float(hop)) for hop in range(target + 1)]
            self.assertSequenceEqual(chain_model.get_path(target),
                                     expected_path)

        # get_path on a star with centre 0 and leaves 1..10
        star_edges = [gl.Edge(0, i + 1) for i in range(10)]
        star_graph = gl.SGraph().add_edges(star_edges)
        star_model = gl.shortest_path.create(star_graph, source_vid=0)
        for leaf in range(1, 11):
            self.assertSequenceEqual(star_model.get_path(leaf),
                                     [(0, 0.0), (leaf, 1.0)])

        # Test sssp ignoring the existing distance field
        star_graph.vertices['distance'] = 0
        preset_model = gl.shortest_path.create(star_graph, source_vid=0)
        for leaf in range(1, 11):
            self.assertSequenceEqual(preset_model.get_path(leaf),
                                     [(0, 0.0), (leaf, 1.0)])

        default_options = gl.shortest_path.get_default_options()
        print(default_options)
        self.assertTrue(len(default_options.keys()) == 2)
        self.assertTrue(default_options['weight_field'] == "")
        self.assertTrue(default_options['max_distance'] == 1e30)

        capped_model = gl.shortest_path.create(chain_graph, source_vid=0,
                                               max_distance=3)
        current_options = capped_model.get_current_options()
        print(current_options)
        self.assertTrue(len(current_options.keys()) == 2)
        self.assertTrue(current_options['weight_field'] == "")
        self.assertTrue(current_options['max_distance'] == 3)
def loadData():
    """Load the edge SFrame from ``sframeDataFolder`` and build an SGraph.

    Returns:
        SGraph whose edges use the 'src' and 'dst' columns of the loaded
        SFrame as endpoints.
    """
    edgesData = gl.load_sframe(sframeDataFolder)
    print('num_rows:%d ' % edgesData.num_rows())

    # create graph
    G = gl.SGraph()
    G = G.add_edges(edges=edgesData, src_field='src', dst_field='dst')
    # Fixed: the original statement was spelled `pritn`, a syntax error.
    print('create graph done!')
    return G
def __init__(self, vertices=None, edges=None, child=None):
    """Store the child link and, when both inputs are truthy, build the graph.

    Args:
        vertices: vertex data keyed by '__id'.
        edges: edge data keyed by '__src_id' / '__dst_id'.
        child: stored as ``self.child``.

    ``self.g`` is only assigned when both ``vertices`` and ``edges`` are truthy.
    """
    self.parent = None
    self.child = child
    if not (vertices and edges):
        return
    self.g = gl.SGraph(vertices=vertices,
                       edges=edges,
                       vid_field='__id',
                       src_field='__src_id',
                       dst_field='__dst_id')
def networkx_to_graphlab(g):
    """Convert a networkx graph into an SGraph, keeping node and edge attributes.

    For a graph that is not directed, every edge is also added in the reverse
    direction with the same attributes.
    """
    # Add nodes
    vertex_list = [gl.Vertex(node, attr=attrs)
                   for node, attrs in g.nodes(data=True)]
    p = gl.SGraph().add_vertices(vertex_list)

    # Add edges
    forward = [gl.Edge(u, v, attr=attrs)
               for u, v, attrs in g.edges(data=True)]
    p = p.add_edges(forward)

    if g.is_directed():
        return p

    backward = [gl.Edge(v, u, attr=attrs)
                for u, v, attrs in g.edges(data=True)]
    return p.add_edges(backward)
def load_data(dpath, maxn=None):
    """Load the citation graph from ``cites.csv`` and ``papers.csv``.

    Args:
        dpath: path prefix string; the file names are appended to it directly.
        maxn: when not None, keep only papers with id < maxn and citations
            whose two endpoints are both < maxn.

    Returns:
        SGraph with papers as vertices ('id') and citations as edges
        ('p1' -> 'p2'); rows where p1 == p2 are dropped.
    """
    cites = gl.SFrame.read_csv(dpath + 'cites.csv',
                               column_type_hints=[int, int])
    paper = gl.SFrame.read_csv(dpath + 'papers.csv',
                               column_type_hints=[int, int])

    # Flag and drop citations that point at the citing paper itself.
    is_self_cite = cites.apply(lambda row: row['p1'] == row['p2'])
    print('%d self-citation detected, delete' % is_self_cite.sum())
    cites = cites[is_self_cite == 0]

    if maxn is not None:
        paper = paper[paper['id'] < maxn]
        in_range = cites.apply(
            lambda row: row['p1'] < maxn and row['p2'] < maxn)
        cites = cites[in_range]

    return gl.SGraph(vertices=paper,
                     edges=cites,
                     vid_field='id',
                     src_field='p1',
                     dst_field='p2')
def create_test_objects():
    """Return a small ``(graph, sframe, model)`` fixture.

    The graph has three described vertices and three weighted edges, the
    SFrame is built from the edge table, and the model is a PageRank model
    created on the graph.
    """
    vertex_columns = {
        'vid': ['1', '2', '3'],
        'color': ['g', 'r', 'b'],
        'vec': [[.1, .1, .1], [.1, .1, .1], [.1, .1, .1]],
    }
    edge_columns = {
        'src_id': ['1', '2', '3'],
        'dst_id': ['2', '3', '4'],
        'weight': [0., 0.1, 1.],
    }
    vertices = pandas.DataFrame(vertex_columns)
    edges = pandas.DataFrame(edge_columns)

    graph = graphlab.SGraph()
    graph = graph.add_vertices(vertices, 'vid')
    graph = graph.add_edges(edges, 'src_id', 'dst_id')

    sframe = graphlab.SFrame(edges)
    model = graphlab.pagerank.create(graph)
    return (graph, sframe, model)
def loadSubData():
    """Concatenate every SFrame stored under /home/lt/sframe into one graph.

    Returns:
        SGraph whose edges are the union of all loaded SFrames, with columns
        X1/X2 renamed to src/dst and used as the edge endpoints.
    """
    sframe_dir = '/home/lt/sframe'
    sframeFiles = os.listdir(sframe_dir)

    # Fixed: the class is `gl.SFrame`; `gl.sFrame` does not exist.
    edgesData = gl.SFrame()
    for sf in sframeFiles:
        # Fixed: load_sframe takes one path, so the directory and entry name
        # are joined; append returns a new SFrame, so it must be re-bound.
        part = gl.load_sframe(os.path.join(sframe_dir, sf))
        edgesData = edgesData.append(part)

    # Re-binding works whether rename mutates in place or returns a copy.
    edgesData = edgesData.rename({'X1': 'src', 'X2': 'dst'})

    # create graph
    G = gl.SGraph()
    G = G.add_edges(edges=edgesData, src_field='src', dst_field='dst')
    # Fixed: the original statement was spelled `pritn`, a syntax error.
    print('create graph done!')
    return G
def load_data(dpath, maxn=None):
    """Load the citation graph from ``cites.csv`` and ``papers.csv``.

    Args:
        dpath: path prefix string; the file names are appended to it directly.
        maxn: when not None, keep only papers with id < maxn and citations
            whose two endpoints are both < maxn.

    Returns:
        SGraph with papers as vertices ('id') and citations as edges
        ('p1' -> 'p2').
    """
    cites = gl.SFrame.read_csv(dpath + 'cites.csv',
                               column_type_hints=[int, int])
    paper = gl.SFrame.read_csv(dpath + 'papers.csv',
                               column_type_hints=[int, int])

    if maxn is not None:
        paper = paper[paper['id'] < maxn]
        in_range = cites.apply(
            lambda row: row['p1'] < maxn and row['p2'] < maxn)
        cites = cites[in_range]

    return gl.SGraph(vertices=paper,
                     edges=cites,
                     vid_field='id',
                     src_field='p1',
                     dst_field='p2')
def test_compute_shortest_path(self):
    """Check _compute_shortest_path on a string-id graph and a weighted one."""
    # Unweighted graph with string ids.
    string_edges = gl.SFrame({
        '__src_id': ['src1', 'src2', 'a', 'b', 'c'],
        '__dst_id': ['a', 'b', 'dst', 'c', 'dst'],
    })
    g = gl.SGraph().add_edges(string_edges)

    multi_source = gl.shortest_path._compute_shortest_path(
        g, ["src1", "src2"], "dst")
    res = list(multi_source)
    self.assertEquals(res, [["src1", "a", "dst"]])

    single_source = gl.shortest_path._compute_shortest_path(g, "src2", "dst")
    res = list(single_source)
    self.assertEquals(res, [["src2", "b", "c", "dst"]])

    # Weighted graph with integer ids.
    weighted_edges = gl.SFrame({
        '__src_id': [0, 1, 2, 3, 4],
        '__dst_id': [2, 3, 5, 4, 5],
        'weights': [1, 0.1, 1, 0.1, 0.1],
    })
    g = gl.SGraph(edges=weighted_edges)
    t = gl.shortest_path._compute_shortest_path(g, [0, 1], [5], "weights")
    self.assertEquals(t, [1, 3, 4, 5])
def build_sptgraph(sg, create_self_edges, baseid_name, layer_name):
    """Expand the layered graph ``sg`` into a graph with one vertex per (id, layer).

    Args:
        sg: SGraph whose vertices carry a 'layers' field and '__id' ids.
        create_self_edges: when true, edges derived from each vertex's own
            'layers' value are added as well.
        baseid_name: vertex field of the result receiving the original id.
        layer_name: vertex field of the result receiving the layer index.

    Returns:
        SGraph built from the expanded (source, dest) pairs, with
        ``layer_name`` and ``baseid_name`` computed from each '__id'.
    """
    # It is used to generate node ids in the causal multilayer graph
    # IMPORTANT baseID starts at 1 and not 0
    max_id = sg.vertices['__id'].max()

    def expand_edge_layers(x):
        """Closure over max_id: expand one edge row from its bitfield."""
        src_base = x['__src_id']
        dst_base = x['__dst_id']
        bits = int(x['sp_edges'])
        return expand_causal_edges_from_bitfield(bits, src_base, dst_base,
                                                 max_id)

    def expand_vertex_layers(x):
        """Closure over max_id: expand one vertex row against itself."""
        vertex_base = x['__id']
        bits = shift_and_bitstrings(x['layers'], x['layers'])
        return expand_causal_edges_from_bitfield(bits, vertex_base,
                                                 vertex_base, max_id)

    def generate_sp_edge_sframe(x):
        return make_list(x['sp_edges'])

    # Empty field that will hold the bitstring for the edge creation
    sg.edges['sp_edges'] = ''
    # Fill in the edge bitstring
    sg = sg.triple_apply(create_causal_edges_bitstring,
                         mutated_fields=['sp_edges'])
    # Expand to actual source and destination ids, encoded as a string
    sg.edges['sp_edges'] = sg.edges.apply(expand_edge_layers, dtype=str)
    # One row per (source, dest) pair with the actual ids
    edge_pairs = sg.edges.flat_map(['source', 'dest'],
                                   generate_sp_edge_sframe,
                                   column_types=[int, int])

    # Create the graph from edges
    h = gl.SGraph().add_edges(edge_pairs, src_field='source',
                              dst_field='dest')
    del sg.edges['sp_edges']

    if create_self_edges:
        sg.vertices['sp_edges'] = ''
        # Expand to actual source and destination ids, encoded as a string
        sg.vertices['sp_edges'] = sg.vertices.apply(expand_vertex_layers,
                                                    dtype=str)
        # One row per (source, dest) pair with the actual ids
        self_pairs = sg.vertices.flat_map(['source', 'dest'],
                                          generate_sp_edge_sframe,
                                          column_types=[int, int])
        h = h.add_edges(self_pairs, src_field='source', dst_field='dest')
        del sg.vertices['sp_edges']

    # Add baseid and layer to spt graph
    h.vertices[layer_name] = h.vertices.apply(
        lambda x: (x['__id'] - 1) // max_id, dtype=int)
    # base_src = src - (l * max_id)
    h.vertices[baseid_name] = h.vertices.apply(
        lambda x: x['__id'] - (x[layer_name] * max_id), dtype=int)
    return h
def create_spatio_temporal_graph(g, data, create_self_edges=True,
                                 baseid_name='baseID', layer_name='layer',
                                 verbose=True, force_python=False,
                                 excluded_ids=None):
    """Build the spatio-temporal graph of ``g`` for the signal in ``data``.

    Args:
        g: input graph, forwarded to ``merge_signal_on_graph``.
        data: signal data; wrapped in a ``gl.SFrame``.
        create_self_edges: forwarded to ``build_sptgraph``.
        baseid_name: name of the base-id column.
        layer_name: name of the layer column.
        verbose: log start and elapsed time through ``LOGGER``.
        force_python: use the pure-Python code path even when the fast
            module is available.
        excluded_ids: forwarded to ``merge_signal_on_graph``.

    Returns:
        SGraph whose vertices are the built graph's vertices joined with the
        signal on ``[baseid_name, layer_name]``, and whose edges are the
        built graph's edges.
    """
    start = time.time()
    if verbose:
        LOGGER.info('Start spatio-temporal graph creation')

    signal = gl.SFrame(data)
    use_fast = HAS_FAST_MODULE and not force_python

    if use_fast:
        node_signal = create_node_signal_fast(signal, baseid_name,
                                              layer_name, verbose=verbose)
    else:
        node_signal = create_node_signal(signal, baseid_name, layer_name,
                                         verbose=verbose)

    sg = merge_signal_on_graph(g, node_signal, baseid_name, layer_name,
                               excluded_ids=excluded_ids, verbose=verbose)

    # Create graph
    if use_fast:
        h = fast.build_sptgraph(sg, baseid_name, layer_name,
                                create_self_edges, verbose=False)
    else:
        h = sptgraph_impl.build_sptgraph(sg, create_self_edges, baseid_name,
                                         layer_name)

    # Fixed: the join keys were hard-coded as ['page_id', 'layer'], which
    # ignored the column names the caller passed in. Callers that pass
    # baseid_name='page_id' and layer_name='layer' see no change.
    join_keys = [baseid_name, layer_name]
    k = gl.SGraph(h.vertices.join(signal, join_keys), h.edges)

    if verbose:
        LOGGER.info('Spatio-temporal graph created in: %s seconds',
                    time.time() - start)
    return k
def calc_pagerank():
    """Compute PageRank over the patent citation CSV and save it as CSV.

    Reads ``../data/full_citation.csv``, builds a graph with edges
    Citation -> Patent, and writes the ``Patent`` / ``pagerank`` columns to
    ``../data/full_pagerank.csv``.
    """
    citations = gl.SFrame.read_csv('../data/full_citation.csv',
                                   delimiter=',',
                                   error_bad_lines=True)

    # The same frame is supplied as vertex data and then as edge data.
    graph = gl.SGraph(citations,
                      vid_field='Patent',
                      src_field='Citation',
                      dst_field='Patent')
    graph = graph.add_edges(citations,
                            src_field='Citation',
                            dst_field='Patent')

    model = gl.pagerank.create(graph)
    scores = model['pagerank'].rename({'__id': 'Patent'})
    scores = scores[['Patent', 'pagerank']]
    scores.save('../data/full_pagerank.csv', format='csv')
def run_shortest_path(self, analytic):
    """Run shortest_path on the analytic's dump file and return 'distance'.

    Raises:
        Exception: with ``LOAD_FILE``, ``RUN_ALGOS`` or ``PROC_FINA`` as the
            first argument, depending on which stage failed.
    """
    # Stage 1: load the CSV and build the graph from its src/dest columns.
    try:
        data_path = analytic.dump.get_data_file_path()
        sf = graphlab.SFrame.read_csv(data_path)
        g = graphlab.SGraph().add_edges(sf, 'src', 'dest')
    except Exception:
        raise Exception(LOAD_FILE, "Error loading the file")

    # Stage 2: run the algorithm.
    # NOTE(review): no source vertex is passed to create() -- confirm this
    # call can succeed with the installed graphlab version.
    try:
        sp = graphlab.shortest_path.create(g)
    except Exception:
        raise Exception(RUN_ALGOS, "Error executing the task")

    # Stage 3: extract the result field.
    try:
        return sp.get('distance')
    except Exception as e:
        raise Exception(PROC_FINA, "Error finishing the task: " + str(e))
def run_connected_components(self, analytic):
    """Run connected_components on the analytic's dump file.

    Returns:
        The model's 'componentid' field.

    Raises:
        Exception: with ``LOAD_FILE``, ``RUN_ALGOS`` or ``PROC_FINA`` as the
            first argument, depending on which stage failed.
    """
    # Stage 1: load the CSV and build the graph from its src/dest columns.
    try:
        data_path = analytic.dump.get_data_file_path()
        sf = graphlab.SFrame.read_csv(data_path)
        g = graphlab.SGraph().add_edges(sf, 'src', 'dest')
    except Exception:
        raise Exception(LOAD_FILE, "Error loading the file")

    # Stage 2: run the algorithm.
    try:
        cc = graphlab.connected_components.create(g)
    except Exception:
        raise Exception(RUN_ALGOS, "Error executing the task")

    # Stage 3: extract the result field.
    try:
        return cc.get('componentid')
    except Exception as e:
        raise Exception(PROC_FINA, "Error finishing the task: " + str(e))
def sssp_triple_apply(input_graph, src_vid, max_distance=1e30):
    """Iterate ``sssp_update_fn`` over a copy of ``input_graph`` until stable.

    Args:
        input_graph: graph exposing ``vertices`` and ``edges``.
        src_vid: id of the vertex whose distance starts at 0.0.
        max_distance: initial distance of every other vertex.

    Returns:
        The new SGraph with 'distance' and 'changed' vertex fields.
    """
    g = gl.SGraph(input_graph.vertices, input_graph.edges)
    g.vertices['distance'] = g.vertices['__id'].apply(
        lambda vid: max_distance if vid != src_vid else 0.0)

    iteration = 0
    num_changed = len(g.vertices)
    start = time.time()
    # Repeat until a full pass marks no vertex as changed.
    while num_changed > 0:
        g.vertices['changed'] = 0
        g = g.triple_apply(sssp_update_fn, ['distance', 'changed'])
        num_changed = g.vertices['changed'].sum()
        print('Iteration %d: num_vertices changed = %d' % (iteration,
                                                           num_changed))
        iteration += 1

    elapsed = time.time() - start
    print('Triple apply sssp finished in: %f secs' % elapsed)
    return g
def construct_graph():
    """Build the actor/film graph from ``get_sframe()`` and print a summary.

    Returns:
        SGraph containing an edge actor -> film and an edge film -> actor for
        every row of the source SFrame.
    """
    sf = get_sframe()
    actors = sf['actor_name'].unique()

    g = graphlab.SGraph()
    # Each relation is added in both directions as a workaround for SGraph
    # being directed; the result behaves like an undirected graph.
    g = g.add_edges(sf, src_field='actor_name', dst_field='film_name')
    g = g.add_edges(sf, src_field='film_name', dst_field='actor_name')

    print("Actor vertex sample:")
    # Fixed: the sample was computed but never printed, so the header above
    # was followed by nothing.
    print(g.get_vertices(ids=actors).tail(5))
    print("Movie graph summary:\n%s \n" % (g.summary(),))
    return g
def label_communities(self):
    """Attach per-community word scores and top labels to ``self.g``.

    The child graph's vertices are joined with ``self.verticy_descriptions``
    and grouped by 'community_id'. The descriptions are turned into tf-idf
    word scores divided by the member count, and the five best-scoring words
    are stored as 'top_labels'. ``self.g`` is then rebuilt with its vertices
    joined to that per-community frame.
    """
    described = self.child.g.vertices.join(self.verticy_descriptions,
                                           '__id', how='left')
    aggregations = {
        "labels": gl.aggregate.CONCAT("description"),
        "member_count": gl.aggregate.COUNT("__id"),
    }
    frame = described.groupby('community_id', aggregations)

    def remove_dups(_str):
        # Drop repeated words, keeping the first-occurrence order.
        words = _str.split()
        return " ".join(sorted(set(words), key=words.index))

    def merge_descriptions(descriptions):
        return ' '.join([remove_dups(x) for x in descriptions])

    frame['labels'] = frame['labels'].apply(
        lambda descriptions: merge_descriptions(descriptions))
    frame['labels'] = gl.text_analytics.count_words(frame['labels'])
    frame['labels'] = frame['labels'].dict_trim_by_values(3)

    stopwords = gl.text_analytics.stopwords()
    stopwords.update(['http', 'https'])
    frame['labels'] = frame['labels'].dict_trim_by_keys(stopwords,
                                                        exclude=True)
    frame['labels'] = gl.text_analytics.tf_idf(frame['labels'])

    def label_score(row):
        # Scale each word score by the community's member count.
        return {label: value / row['member_count']
                for label, value in row['labels'].items()}

    frame['labels'] = frame.apply(label_score)

    def top_labels(labels_dict):
        ranked = sorted(labels_dict.items(), key=lambda x: x[1],
                        reverse=True)
        return [label for label, _ in ranked[:5]]

    frame['top_labels'] = frame['labels'].apply(top_labels)

    labelled_vertices = self.g.vertices.join(frame,
                                             {'__id': 'community_id'})
    self.g = gl.SGraph(vertices=labelled_vertices,
                       edges=self.g.get_edges(),
                       vid_field='__id',
                       src_field='__src_id',
                       dst_field='__dst_id')
def merge_signal_on_graph(g, node_signal, baseid_name, layer_name,
                          excluded_ids=None, use_fast=True, verbose=True,
                          remove_self=True):
    """Reduce ``g`` to the nodes present in ``node_signal`` and their edges.

    Args:
        g: an SGraph, or a networkx Graph/DiGraph (converted first).
        node_signal: SFrame of signal nodes keyed by ``baseid_name``.
        baseid_name: id column of ``node_signal``.
        layer_name: not used by the current body.
        excluded_ids: ids removed from the kept nodes, when truthy.
        use_fast: when true, ``node_signal`` itself becomes the node table
            (``rename`` is called on it directly); otherwise the graph's
            vertices are inner-joined with it.
        verbose: log progress through ``LOGGER``.
        remove_self: drop edges whose endpoints are equal.

    Returns:
        SGraph built from the kept nodes and the edges between them.
    """
    start = time.time()
    if verbose:
        LOGGER.info('Start reducing graph to minimum')

    p = g
    if isinstance(g, (nx.Graph, nx.DiGraph)):  # convert if needed
        conversion_start = time.time()
        if verbose:
            LOGGER.info('Start networkx to graphlab conversion')
        p = utils.networkx_to_graphlab(g)
        if verbose:
            LOGGER.info('Conversion done in: %s',
                        time.time() - conversion_start)

    if not use_fast:
        good_nodes = p.vertices.join(node_signal,
                                     on={'__id': baseid_name},
                                     how='inner')
    else:
        good_nodes = node_signal
        good_nodes.rename({baseid_name: '__id'})

    if excluded_ids:
        good_nodes = good_nodes.filter_by(excluded_ids, '__id', exclude=True)

    # Keep edges whose destination and source are both kept nodes.
    kept_ids = good_nodes['__id']
    into_kept = p.get_edges(dst_ids=kept_ids)
    good_edges = into_kept.filter_by(good_nodes['__id'], '__src_id')

    # Remove self-edges
    if remove_self:
        endpoint_gap = good_edges['__src_id'] - good_edges['__dst_id']
        good_edges = good_edges[endpoint_gap != 0]

    if verbose:
        LOGGER.info('Graph reduction done in: %s seconds',
                    time.time() - start)
    return gl.SGraph(good_nodes, good_edges)
def createGraph():
    """Build the zhima subgraph, attach user labels to its vertices, save it.

    Edges are loaded from ``resultDataFolder/subgraph_zhima_2``; the SFrame
    returned by ``loadZhima()`` is left-joined onto the vertices by id; the
    resulting graph is saved to ``resultDataFolder/subgraph_zhima``.
    """
    zhima_usr = loadZhima()
    # Fixed: SGraph vertex ids live in the '__id' column, so the label frame
    # is keyed the same way. The original used '_id', which the vertex frame
    # does not have.
    # NOTE(review): assumes 'snwb' holds the same ids as the edge endpoints.
    zhima_usr = zhima_usr.rename({'snwb': '__id'})

    subgraph_edges = gl.load_sframe(
        os.path.join(resultDataFolder, 'subgraph_zhima_2'))

    # create graph
    sub_G = gl.SGraph()
    sub_G = sub_G.add_edges(edges=subgraph_edges,
                            src_field='src',
                            dst_field='dst')

    # join label to vertices
    # Fixed: the join result used to be discarded, so the saved graph never
    # carried the labels. The graph is rebuilt from the joined vertex frame.
    labelled_vertices = sub_G.vertices.join(zhima_usr, on='__id', how='left')
    sub_G = gl.SGraph(vertices=labelled_vertices,
                      edges=sub_G.edges,
                      vid_field='__id',
                      src_field='__src_id',
                      dst_field='__dst_id')

    print('save graph')
    sub_G.save(os.path.join(resultDataFolder, 'subgraph_zhima'))
def show_recomnedations(self, recoms, existing):
    """Display the recommendation pairs as a graph of shortened names.

    Both ``recoms`` and ``existing`` are sorted in place. Each pair in
    ``recoms`` becomes an edge between the shortened forms of its two names.
    """
    def shorten_name(name):
        # "First Last" -> "F. Last"
        parts = name.split(" ")
        return "{0}. {1}".format(parts[0][0], parts[1])

    g = gl.SGraph()
    existing.sort()
    recoms.sort()

    # 1/0 flag per recommendation; not read again in this method.
    list_existing = []
    for recom in recoms:
        edge = gl.Edge(shorten_name(recom[0]), shorten_name(recom[1]))
        g = g.add_edges(edge)
        list_existing.append(1 if recom in existing else 0)

    g.show(vlabel="id")
    print
def run_graph_analytics(data, data_name):
    """Print basic statistics for a transaction SFrame and export graph metrics.

    Args:
        data: SFrame with 'input_address', 'output_address', 'year' and
            'month' columns.
        data_name: sub-directory name under ``analytics/`` for the CSV output.

    Writes ``in_degree.csv``, ``out_degree.csv`` and ``pr_out.csv``.
    """
    # --- transaction graph: input_address -> output_address ---------------
    g = gl.SGraph().add_edges(data,
                              src_field='input_address',
                              dst_field='output_address')
    print(g)

    # --- transactions per (year, month) -----------------------------------
    grouped = data.groupby(['year', 'month'], agg.COUNT)
    transaction_count = grouped.sort(['year', 'month'], ascending=True)
    n_month = transaction_count.num_rows()
    month_text = transaction_count['month'].astype(str)
    year_text = transaction_count['year'].astype(str)
    transaction_count['label'] = month_text + "/" + year_text
    print(transaction_count)

    # --- degree counts and PageRank, exported as CSV ----------------------
    deg = degree_counting.create(g)
    # a new SGraph with degree data attached to each vertex
    deg_graph = deg['graph']
    in_degree = deg_graph.vertices[['__id', 'in_degree']]
    out_degree = deg_graph.vertices[['__id', 'out_degree']]
    in_degree.export_csv('analytics/' + data_name + '/in_degree.csv',
                         delimiter=',')
    out_degree.export_csv('analytics/' + data_name + '/out_degree.csv',
                          delimiter=',')

    pr = gl.pagerank.create(g)
    pr_out = pr['pagerank']
    pr_out.export_csv('analytics/' + data_name + '/pr_out.csv',
                      delimiter=',')
def test_directed_no_self_edge(self):
    """One component at first, two after dropping the edge (7, 12)."""
    # Directed no self-edge
    g = sptgraph.create_spatio_temporal_graph(gen_graph(True), gen_signal(),
                                              False, verbose=False)

    # The graph has only 1 connected component
    h = components.find_connected_components(g)
    cc = components.create_component_sframe(h)
    comps = components.extract_components(h, cc)
    self.assertEqual(1, len(comps))

    # We remove the edge (7, 12) to create 2 weakly connected components
    nodes = g.vertices
    numbered_edges = g.edges.add_row_number('eid')
    to_remove = g.get_edges(7, 12)
    removed_eid = to_remove['eid'][0]
    remaining_edges = numbered_edges[numbered_edges['eid'] != removed_eid]

    g = gl.SGraph(nodes, remaining_edges)
    h = components.find_connected_components(g)
    cc = components.create_component_sframe(h)
    comps = components.extract_components(h, cc)
    self.assertEqual(2, len(comps))