def fillSGraph(matchedBills):
    if not graphlab_is_installed:
        print "GraphLab is not installed!"
    Gmatches = None
    try:
        Gmatches = SGraph()
        # Read the dataset
        weights = list()
        for mb in matchedBills:
            # load all nodes/edges into graph
            if mb['modelBill'] != mb['stateBill']:
                label1 = mb['modelBill'].split("/")[-1]
                label2 = mb['stateBill'].split("/")[-1]
                # use inverse similarity as weight
                #Gmatches.add_edge(label1, label2, weight=1./float(mb['matchPrecent']))
                vertices = list()
                vertices.append(Vertex(label1))
                vertices.append(Vertex(label2))
                Gmatches = Gmatches.add_vertices(vertices)
                Gmatches = Gmatches.add_edges(Edge(label1, label2))
                weights.append(1. / float(mb['matchPrecent']))
        Gmatches.edges['weight'] = weights
    except Exception:
        # swallow failures and return whatever was built so far
        pass
    return Gmatches

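# A minimal usage sketch for fillSGraph above (assumes the module-level
# graphlab_is_installed flag and the Vertex/Edge imports it relies on).
# The records and the 'matchPrecent' key mirror how the function reads its
# input; the values are made up.
sample_matches = [
    {'modelBill': 'model/hb1001', 'stateBill': 'state/sb2002', 'matchPrecent': 80.0},
    {'modelBill': 'model/hb1001', 'stateBill': 'state/sb2003', 'matchPrecent': 40.0},
]
g = fillSGraph(sample_matches)
if g is not None:
    print g.summary()
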
def create_initial_bayesian_network():
    '''
    Start from a randomly generated Bayesian network where there is no edge
    between the variables of the same type. First, create a blacklist.
    '''
    g = load_sgraph('data_graph')
    edges = g.get_edges()
    features = edges[['__dst_id', 'relation']].unique()
    features.rename({'__dst_id': 'feature_id', 'relation': 'feature_type'})
    bn = SGraph()
    bn = bn.add_vertices(features, vid_field='feature_id')
    n_features = features.num_rows()

    edges_data_graph = g.get_edges()
    n_patients = edges_data_graph['__src_id'].unique().size()

    random.seed(1234)
    for i in range(20):
        src = features['feature_id'][random.randint(0, n_features - 1)]
        dst = 'E8498'
        #dst = features['feature_id'][random.randint(0, n_features - 1)]
        bn = bn.add_edges(Edge(src, dst))
        print "Added edge between " + src + " and " + dst

    bic = get_bic_score(g, bn, n_patients)
    # return the randomly initialized network
    return bn

def showPath(self, highlight=None):
    # with open(self.verticesFn, 'a') as Vwr:
    #     with open(self.edgesFn, 'a') as Ewr:
    #         for i in range(8):
    #             Vwr.write('\nc0_' + `i` + ', ')
    #             Ewr.write('\np8_0_t,' + 'c0_' + `i` + ',c')
    #             highlight['c0_' + `i`] = [0.69, 0.0, 0.498]
    # start = datetime.datetime.now()
    edge_data = SFrame.read_csv(self.edgesFn)
    vertex_data = SFrame.read_csv(self.verticesFn)
    g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
               src_field='src', dst_field='dst')
    # end = datetime.datetime.now()
    # print (end - start)
    # g.show(vlabel='attributes', elabel='relation', h_offset=0.3,
    #        v_offset=-0.025, highlight=highlight, arrows=True)
    g.show(vlabel='attributes', vlabel_hover=False, elabel='relation',
           highlight=highlight, arrows=True)  # highLight
    sleep(20)

def get_subgraph(self, ids, radius=1, full_subgraph=True):
    verts = ids

    # find the vertices within radius (and the path edges)
    for i in range(radius):
        edges_out = self._graph.get_edges(src_ids=verts)
        # edges_in = self._graph.get_edges(dst_ids=verts)
        verts = list(edges_out['__src_id']) + list(edges_out['__dst_id'])
        verts = list(set(verts))

    # make a new graph to return and add the vertices
    g = SGraph()
    g = g.add_vertices(self._graph.get_vertices(verts), vid_field='__id')

    # add the requested edge set
    if full_subgraph is True:
        df_induced = self._graph.get_edges(src_ids=verts)
        # induced_edge_in = self._graph.get_edges(dst_ids=verts)
        # df_induced = induced_edge_out.append(induced_edge_in)
        df_induced = df_induced.groupby(df_induced.column_names(), {})
        verts_sa = SArray(list(ids))
        edges = df_induced.filter_by(verts_sa, "__src_id")
        # SFrame.append returns a new frame, so capture the result
        edges = edges.append(df_induced.filter_by(verts_sa, "__dst_id"))
        g = g.add_edges(edges, src_field='__src_id', dst_field='__dst_id')

    return GlGraph(is_directed=self.is_directed, graph_obj=g)

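# Hypothetical exercise of get_subgraph above: wrap a small SGraph in the
# surrounding GlGraph class (its constructor is assumed to accept graph_obj
# and is_directed, as the return statement suggests) and extract a 1-hop
# subgraph around a single vertex.
base = SGraph().add_edges([Edge('a', 'b'), Edge('b', 'c'), Edge('c', 'd')])
gg = GlGraph(is_directed=True, graph_obj=base)
sub = gg.get_subgraph(['b'], radius=1, full_subgraph=True)
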
def extract_backbone(flavor_network, vertices, edges, alpha):
    """
    Builds a new graph with only the edges with weights that exceed the
    threshold for statistical significance
    :param flavor_network: flavor-ingredient network to prune
    :param vertices: separate list of vertices (to speed extraction)
    :param edges: separate list of edges (to speed extraction)
    :param alpha: threshold p-value for keeping an edge in the network
    :return: the pruned SGraph
    """
    def degree_count_fn(src, connecting_edge, dst):
        """
        increments the degree of the nodes on this edge
        :param src: source node
        :param connecting_edge: connecting edge
        :param dst: destination node
        :return: source and destination with degree attribute incremented
        """
        src['deg'] += 1
        dst['deg'] += 1
        return src, connecting_edge, dst

    def compute_node_moments(node_k):
        """
        computes mean and standard deviation for this node
        :param node_k: node to compute
        :return: mean and sigma
        """
        mean = 2 * node_k / (node_k + 1)
        sigma = sqrt(node_k**2 * ((20 + 4 * node_k) / ((node_k + 1) * (node_k + 2) * (node_k + 3))
                                  - 4 / (node_k + 1)**2))
        return mean, sigma

    def test_for_significance(edge, weights_lookup, alpha):
        """
        tests this edge for statistical significance based on its source and
        destination nodes
        :param edge: edge to test
        :param weights_lookup: quick (hash table) lookup for the edge weights
        :param alpha: significance threshold
        :return: significance boolean check
        """
        y_obs = edge.attr['weight']
        node1_k = weights_lookup[edge.dst_vid]
        node2_k = weights_lookup[edge.src_vid]
        m1, sig1 = compute_node_moments(float(node1_k))
        m2, sig2 = compute_node_moments(float(node2_k))
        return y_obs >= abs(m1 + alpha * sig1) or y_obs >= abs(m2 + alpha * sig2)

    flavor_network_w_degree = SGraph()
    new_node_list = flavor_network.vertices.fillna('deg', 0)
    flavor_network_w_degree = flavor_network_w_degree.add_vertices(new_node_list).add_edges(edges)
    flavor_network_w_degree = flavor_network_w_degree.triple_apply(degree_count_fn,
                                                                   mutated_fields=['deg'])
    weights_dict = flavor_network_w_degree.vertices.to_dataframe().set_index('__id').to_dict()['deg']

    significant_edges = []
    for edge in edges:
        if test_for_significance(edge, weights_dict, alpha):
            significant_edges.append(edge)

    pruned_network = SGraph().add_vertices(new_node_list)
    pruned_network = pruned_network.add_edges(significant_edges)
    return significant_edges, pruned_network

def extract_backbone(flavor_network, alpha):
    """
    makes a new graph with only the edges with weights that exceed the
    threshold for statistical significance
    :param flavor_network: full flavor ingredient network
    :param alpha: threshold p-value for keeping an edge in the network
    :return: the pruned SGraph
    """
    def degree_count_fn(src, edge, dst):
        """
        increments the degree of the nodes on this edge
        :param src: source node
        :param edge: connecting edge
        :param dst: destination node
        :return: the updated (src, edge, dst) triple
        """
        src['deg'] += 1
        dst['deg'] += 1
        return src, edge, dst

    def compute_node_moments(node_k):
        mean = 2 * node_k / (node_k + 1)
        sigma = sqrt(node_k**2 * ((20 + 4 * node_k) / ((node_k + 1) * (node_k + 2) * (node_k + 3))
                                  - 4 / (node_k + 1)**2))
        return mean, sigma

    def test_for_significance(edge, weights_lookup, alpha):
        y_obs = edge['weight']
        node1_k = weights_lookup[edge['__dst_id']]
        node2_k = weights_lookup[edge['__src_id']]
        m1, sig1 = compute_node_moments(float(node1_k))
        m2, sig2 = compute_node_moments(float(node2_k))
        return y_obs >= abs(m1 + alpha * sig1) or y_obs >= abs(m2 + alpha * sig2)

    flav_net_w_deg = SGraph()
    edge_list = flavor_network.get_edges()
    new_node_list = flavor_network.vertices.fillna('deg', 0)
    flav_net_w_deg = flav_net_w_deg.add_vertices(new_node_list).add_edges(edge_list)
    flav_net_w_deg = flav_net_w_deg.triple_apply(degree_count_fn, mutated_fields=['deg'])
    weights_dict = flav_net_w_deg.vertices.to_dataframe().set_index('__id').to_dict()['deg']

    significant_edges = []
    for edge in flav_net_w_deg.get_edges():
        if test_for_significance(edge, weights_dict, alpha):
            significant_edges.append(
                flav_net_w_deg.get_edges(src_ids=[edge['__src_id']],
                                         dst_ids=[edge['__dst_id']],
                                         format='list')[0])

    pruned_network = SGraph().add_vertices(new_node_list)
    pruned_network = pruned_network.add_edges(significant_edges)
    return pruned_network

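# End-to-end sketch for the two-argument extract_backbone above, using the
# single-return build_weighted_graph variant defined elsewhere in this file.
# The ingredient/compound data are made up; alpha=0.05 is an arbitrary choice.
tiny = {'garlic': ['c1', 'c2'], 'onion': ['c1', 'c3'], 'basil': ['c2']}
backbone = extract_backbone(build_weighted_graph(tiny), alpha=0.05)
print backbone.summary()
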
def showPath(self, highlight=None):
    edge_data = SFrame.read_csv(self.edgesFn)
    vertex_data = SFrame.read_csv(self.verticesFn)
    g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
               src_field='src', dst_field='dst')
    g.show(vlabel='id', elabel='relation', highlight=highlight, arrows=True)  # highLight
    sleep(10)

def SSSP():
    url = '/home/gengl/Datasets/SSSP/BerkStan/edge.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()

def CC():
    url = '/home/gengl/Datasets/CC/BerkStan/edge.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    cc_model = connected_components.create(graph, verbose=True)
    cc_model.summary()

def showPath(self, highlight=None):
    # start = datetime.datetime.now()
    edge_data = SFrame.read_csv(self.edgesFn)
    vertex_data = SFrame.read_csv(self.verticesFn)
    g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
               src_field='src', dst_field='dst')
    # end = datetime.datetime.now()
    # print (end - start)
    # g.show(vlabel='attributes', vlabel_hover=True, elabel='relation', h_offset=0.3,
    #        v_offset=-0.025, highlight=highlight, arrows=True)
    g.show(vlabel='id', elabel='relation', vlabel_hover=True,
           highlight=highlight, arrows=True)  # highLight
    sleep(30)

def get_graph(X_train, k):
    start = datetime.now()
    factor0 = (X_train['rating'].mean() / k / 0.25)**0.5
    vertices = get_vertices(k, factor0)
    X_train['uid'] = X_train['userId'].apply(prefix('u'))
    X_train['mid'] = X_train['movieId'].apply(prefix('m'))
    sg = SGraph().add_vertices(vertices, vid_field='__id')\
                 .add_edges(X_train, src_field='uid', dst_field='mid')
    print 'get_graph %s' % (datetime.now() - start)
    return sg

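# get_graph above depends on two helpers that are not shown in this file.
# prefix can be reconstructed from how it is called (tagging numeric ids with
# a 'u'/'m' type prefix); this is a hypothetical minimal version. get_vertices
# is assumed to return an SFrame of latent-factor vertices keyed by an '__id'
# column and is not sketched here.
def prefix(p):
    # e.g. prefix('u')(42) -> 'u42'
    return lambda v: '%s%s' % (p, v)
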
def create_network_features(Returns, Network, name='Sales', Start=9, End=12):
    for quarter in xrange(Start, End):
        if quarter == 12:
            continue
        ReturnsX = Returns[Returns['TaxQuarter'] == quarter]
        NetworkX = Network[Network['TaxQuarter'] == quarter]
        g = SGraph(vertices=ReturnsX, edges=NetworkX, vid_field='Mtin',
                   src_field='Mtin', dst_field='SellerBuyerTin')

        # cc = graphlab.connected_components.create(g)
        # g.vertices['component_id'] = cc['graph'].vertices['component_id']

        pr = graphlab.pagerank.create(g)
        g.vertices['pagerank'] = pr['graph'].vertices['pagerank']

        tc = graphlab.triangle_counting.create(g)
        g.vertices['triangle_count'] = tc['graph'].vertices['triangle_count']

        deg = degree_counting.create(g)
        deg_graph = deg['graph']
        g.vertices['in_degree'] = deg_graph.vertices['in_degree']
        g.vertices['out_degree'] = deg_graph.vertices['out_degree']

        # kc = kcore.create(g)
        # g.vertices['core_id'] = kc['graph'].vertices['core_id']

        # g.vertices.export_csv('H:\\Ashwin\\dta\\sample_bogusdealersNetworkFeaturesSales17.csv')
        g.vertices.export_csv(
            'H:\\Ashwin\\dta\\bogusdealers\\NetworkFeatures{}{}.csv'.format(name, quarter))

def MP_graph(D, x):
    N, M = D.shape
    z = np.zeros((M, 1))
    num_iter = 30

    # Create bipartite graph. SGraph is immutable, so vertex and edge
    # attributes must be attached at construction time; mutating Vertex/Edge
    # objects after add_vertices/add_edges has no effect on the graph.
    x_vertices = [Vertex(i, attr={"value": float(x[i])}) for i in xrange(N)]
    z_vertices = [Vertex(j + N, attr={"value": 0.0, "dummy": 0.0, "max": 0.0})
                  for j in xrange(M)]
    D_edges = [Edge(i, j, attr={"value": float(D[i][j - N])})
               for i in xrange(N) for j in xrange(N, N + M)]
    G = SGraph().add_vertices(x_vertices + z_vertices).add_edges(D_edges)

    def inner_prod(s, e, t):
        t["dummy"] += e["value"] * s["value"]
        return s, e, t

    def update_z(s, e, t):
        if not t["max"] == 0.0:
            t["value"] += e["value"] * s["value"]
        return s, e, t

    def compute_residual(s, e, t):
        # defined in the original sketch but never applied in the loop below
        if not t["max"] == 0.0:
            s["value"] -= t["value"] * e["value"]
        return s, e, t

    for itr in xrange(num_iter):
        # Compute inner products with the current signal values
        print "NUM ITR = ", itr
        G.vertices["dummy"] = G.vertices["dummy"] * 0.0  # reset accumulators
        G = G.triple_apply(inner_prod, mutated_fields=["dummy"])
        # pick the z vertex with the largest inner product and flag it
        zv = G.vertices[G.vertices["__id"] >= N].sort("__id")
        z_temp = np.array(list(zv["dummy"]))
        max_pos = int(np.argmax(z_temp))
        best = float(z_temp[max_pos])
        G.vertices["max"] = G.vertices["__id"].apply(
            lambda vid: best if vid == N + max_pos else 0.0)
        G = G.triple_apply(update_z, mutated_fields=["value"])

    zv = G.vertices[G.vertices["__id"] >= N].sort("__id")
    z[:, 0] = np.array(list(zv["value"]))
    return z

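# Smoke test for MP_graph with a small random dictionary; the sizes are
# purely illustrative and numpy is assumed to be imported as np above.
np.random.seed(0)
D_test = np.random.randn(8, 12)
x_test = np.random.randn(8)
z_hat = MP_graph(D_test, x_test)
print z_hat.ravel()
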
def PageRank():
    url = '/clueweb/PageRank/clueweb_20M/edge_pair.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    pr_model = pagerank.create(graph, reset_probability=0.2, threshold=1e-12,
                               max_iterations=42, _distributed=True)
    pr_model.summary()

def parallel(A, B, prior, observationSequence):
    # parallel HMM training with graphlab
    g = SGraph()
    vertices = map(lambda i: Vertex(str(i) + "a",
                                    attr={'i': i,
                                          'ait': [prior[i]] + ([0] * OBSERVATION_LENGTH),
                                          'bit': ([0] * OBSERVATION_LENGTH) + [1],
                                          'b': B[i, :],
                                          'git': [0] * (OBSERVATION_LENGTH + 1),
                                          'self': A[i, i],
                                          'git_sum': 0.0}),
                   xrange(NUM_STATES))
    g = g.add_vertices(vertices)

    edges = []
    for i in xrange(NUM_STATES):
        for j in xrange(NUM_STATES):
            if i != j:
                edges.append(Edge(str(i) + "a", str(j) + "a",
                                  attr={'aij': A[i, j], 'xi': 0.0}))
    g = g.add_edges(edges)

    g = hmm.train(g, observationSequence, NITERS, NUM_STATES, NUM_OBSERVATIONS)
    print g.vertices
    print g.edges

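# Hypothetical driver for parallel() above. The module-level constants and
# the external `hmm` module it relies on are not defined in this file, so
# the toy values below are illustrative only and the call stays commented.
import numpy as np

NUM_STATES = 2
NUM_OBSERVATIONS = 3
OBSERVATION_LENGTH = 5
NITERS = 10
A = np.full((NUM_STATES, NUM_STATES), 1.0 / NUM_STATES)
B = np.full((NUM_STATES, NUM_OBSERVATIONS), 1.0 / NUM_OBSERVATIONS)
prior = np.full(NUM_STATES, 1.0 / NUM_STATES)
# parallel(A, B, prior, [0, 1, 2, 1, 0])  # requires the external hmm module
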
def PageRank():
    url = '/home/gengl/Datasets/PageRank/BerkStan/edge.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    pr_model = pagerank.create(graph, reset_probability=0.2, threshold=0.0001,
                               max_iterations=1000, _distributed=True)
    pr_model.summary()

def build_weighted_graph(ing_comp_dict):
    """
    builds the weighted undirected graph that is the flavor network
    :param ing_comp_dict: ingredient:compound dictionary
    :return: SGraph that represents the flavor network
    """
    flav_network = SGraph()
    vert_list = []
    edge_list = []
    ingrds_not_seen = ing_comp_dict.keys()
    for node_1_ingr, compounds in ing_comp_dict.iteritems():
        ingrds_not_seen.remove(node_1_ingr)
        vert_list.append(Vertex(node_1_ingr, attr={'deg': 0}))
        for node_2_ingr in ingrds_not_seen:
            # edge weight = number of flavor compounds the two ingredients share
            weight = len(set(ing_comp_dict[node_2_ingr]).intersection(set(compounds)))
            if weight > 0:
                edge_list.append(Edge(node_1_ingr, node_2_ingr, attr={'weight': weight}))
                vert_list.append(Vertex(node_1_ingr))
    flav_network = flav_network.add_vertices(vert_list)
    flav_network = flav_network.add_edges(edge_list)
    return flav_network

def build_weighted_graph(ing_comp_dict):
    """
    Builds the weighted, undirected graph that is the flavor network
    :param ing_comp_dict: ingredient:compound dictionary
    :return: SGraph that represents the flavor network, plus its vertex and
        edge lists (returned separately to speed later extraction)
    """
    flavor_network = SGraph()
    vertices = []
    edge_list = []
    ingredients = ing_comp_dict.keys()
    for ingredient_node_1, compounds in ing_comp_dict.iteritems():
        ingredients.remove(ingredient_node_1)
        vertices.append(Vertex(ingredient_node_1, attr={'deg': 0}))
        for ingredient_node_2 in ingredients:
            # edge weight = number of flavor compounds the two ingredients share
            weight = len(set(ing_comp_dict[ingredient_node_2]).intersection(set(compounds)))
            if weight > 0:
                edge_list.append(Edge(ingredient_node_1, ingredient_node_2,
                                      attr={'weight': weight}))
                vertices.append(Vertex(ingredient_node_1))
    flavor_network = flavor_network.add_vertices(vertices)
    flavor_network = flavor_network.add_edges(edge_list)
    return flavor_network, vertices, edge_list

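# Sketch pairing this build_weighted_graph variant with the four-argument
# extract_backbone that reuses its vertex and edge lists; the data are made
# up and alpha=0.05 is an arbitrary choice.
tiny = {'garlic': ['c1', 'c2'], 'onion': ['c1', 'c3'], 'basil': ['c2']}
net, verts, edge_list = build_weighted_graph(tiny)
sig_edges, pruned = extract_backbone(net, verts, edge_list, alpha=0.05)
print pruned.summary()
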
def showPath(self, highlight=None):
    start = datetime.datetime.now()
    edge_data = SFrame.read_csv(self.edgesFn)
    vertex_data = SFrame.read_csv(self.verticesFn)
    g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
               src_field='src', dst_field='dst')
    end = datetime.datetime.now()
    print(end - start)
    # g.show(vlabel='attributes', elabel='relation', highlight=highlight, arrows=True)  # highLight
    # sleep(40)

def SSSP():
    url = '/home/gengl/Datasets/SSSP/Google/edge.txt'
    data = SFrame.read_csv(url, delimiter='\t', header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()
    with open('/home/gengl/sssp_graphlab', 'w') as fo:
        for vid in range(0, 875713):
            try:
                result_pair = sp_model.get_path(vid)
                fo.write(str(result_pair[-1]) + '\n')
            except Exception:
                # vertex missing or unreachable from the source
                pass

def build_data_graph():
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
    beneficiaries = SFrame.read_csv(file_path + "beneficiary_summary_2008_2009.csv")
    bene_packed = beneficiaries.pack_columns(column_prefix='chron_', dtype=dict,
                                             new_column_name='chronic_conditions',
                                             remove_prefix=False)
    # x is a row of bene_packed in the following lambda. We insert the desynpuf_id
    # into the (key, value) tuple, convert the tuple to a list by calling list(),
    # and the outer [] makes sure we emit a list of lists.
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        lambda x: [list(k + (x['desynpuf_id'],)) for k in x['chronic_conditions'].iteritems()])

    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

    g = SGraph()
    bene_chrons['relation'] = 'had_chronic'
    g = g.add_edges(bene_chrons, src_field='desynpuf_id', dst_field='chronic_condition')
    print g.summary()

    # Take out the distinct IDs of patients with chronic conditions to avoid
    # repetition in query
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(), 'desynpuf_id')

    # Add edges to the graph indicating which patient had which diagnosed condition
    tcdc = SFrame.read_csv(file_path + "transformed_claim_diagnosis_codes.csv")
    cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']
    for column in cols_to_drop:
        del tcdc[column]
    # Same patient can be diagnosed with same condition multiple times a year,
    # so take distinct
    tcdc = tcdc.unique()
    # Take diagnosed conditions for only those patients who had some chronic
    # condition in 2008 or 2009. It is possible that such a patient had no
    # diagnosed condition, however.
    bene_chrons_tcdc = bene_with_chrons.join(tcdc)
    bene_chrons_tcdc['relation'] = 'diagnosed_with'
    g = g.add_edges(bene_chrons_tcdc, src_field='desynpuf_id', dst_field='dgns_cd')
    print g.summary()

    # Add edges to the graph indicating which patient had which procedure
    tcpc = SFrame.read_csv(file_path + "transformed_claim_prcdr_codes.csv",
                           column_type_hints={'prcdr_cd': str})
    cols_to_drop = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']
    for column in cols_to_drop:
        del tcpc[column]
    tcpc = tcpc.unique()
    # Take procedures for only those patients who had some chronic condition in
    # 2008 or 2009. It is possible that such a patient had no procedure, however.
    bene_chrons_tcpc = bene_with_chrons.join(tcpc)
    bene_chrons_tcpc['relation'] = 'underwent'
    g = g.add_edges(bene_chrons_tcpc, src_field='desynpuf_id', dst_field='prcdr_cd')
    print g.summary()

    # Add edges to the graph indicating which patient had which medicine
    pde = SFrame.read_csv(file_path + "prescribed_drugs.csv")
    pde = pde.unique()
    # Take medicines for only those patients who had some chronic condition in
    # 2008 or 2009. It is possible that such a patient had no medicine, however.
    bene_chrons_pde = bene_with_chrons.join(pde)
    bene_chrons_pde['relation'] = 'had_drug'
    g = g.add_edges(bene_chrons_pde, src_field='desynpuf_id', dst_field='substancename')
    print g.summary()
    return g

import graphlab as gl
from graphlab import SFrame, SGraph, Vertex, Edge

edge_data = SFrame.read_csv('http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')

g = SGraph()
g = g.add_edges(edge_data, src_field='src', dst_field='dst')
print g

g.save('james_bond')
new_graph = gl.load_sgraph('james_bond')

g.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

# In[33]:

edges = gl.SFrame.read_csv(os.path.join(csvDataFolder, '001008_0.csv'),
                           header=False, delimiter=',', column_type_hints=int)
edges  # display the loaded edge SFrame


# In[42]:

from graphlab import SGraph, Vertex

g = SGraph().add_vertices([Vertex('cat', {'fluffy': 1}),
                           Vertex('dog', {'fluffy': 1, 'woof': 1}),
                           Vertex('hippo', {})])
g.vertices.save(os.path.join(resultFolder, 'test_graph_vertices'), format='csv')


# In[59]:

# sframe reading
start_r = time.time()
# csvFiles = os.listdir(csvDataFolder)
csvFiles = [
    '000000_0.csv',
    '000001_0.csv',
    '000002_0.csv',

#g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
#           src_field='src', dst_field='dst')

#targets = ['James Bond', 'Moneypenny']
#subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
#subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

#from graphlab import SGraph, Vertex, Edge
#g = SGraph()
#verts = [Vertex(0, attr={'breed': 'labrador'}),
#         Vertex(1, attr={'breed': 'labrador'}),
#         Vertex(2, attr={'breed': 'vizsla'})]
#g = g.add_vertices(verts)
#g = g.add_edges(Edge(1, 2))
#print g

from graphlab import SFrame, SGraph

edge_data = SFrame.read_csv('http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')
vertex_data = SFrame.read_csv('http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv')

g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
           src_field='src', dst_field='dst')
#print g
g.show()

if len(sys.argv) > 1:
    path = sys.argv[1]
else:
    path = "./data/"

verbose = False
vertexFiles = ["City", "Country", "Region", "Advisor", "Category", "Founder",
               "FundingRound", "HQ", "keywords", "Member", "Office",
               "organizations", "PrimaryImage", "TeamMember", "Website",
               "companies_acquired_by_sap"]
edgesFiles = ["GeoInformation", "acquisitions", "categories_keywords_edges",
              "investments", "keywords_descriptions_edges",
              "keywords_webpages_edges", "relationships",
              "companies_acquired_by_sap_edges"]

g = SGraph()
for f in vertexFiles:
    content = SFrame.read_csv(path + f + '.csv', na_values='null', verbose=verbose)
    if 'path' in content.column_names():
        g = g.add_vertices(content, vid_field='path')
    elif 'url' in content.column_names():
        g = g.add_vertices(content, vid_field='url')
    else:
        print "Unknown vid field: ", content.column_names()
        sys.exit()

for f in edgesFiles:
    content = SFrame.read_csv(path + f + '.csv',

outputPath = os.environ.get("OUTPUT_PATH")
startScale = int(os.environ.get("START_SCALE"))

tagFile = './tmp'
with open(tagFile, 'r') as f:
    infor = f.readline().strip().split(",")
    maxScale = int(infor[1])
    realEndScale = int(infor[2])

scaleRange = range(startScale, realEndScale + 1)
for scale in scaleRange:
    inputPath = os.path.join(outputPath, 'tmp', 'AdjacentRelationships', str(scale))
    url = inputPath
    data = SFrame.read_csv(url, header=False)
    if data.num_rows() == 0:
        cc_ids = SFrame({"__id": [], "component_id": []})
    else:
        g = SGraph().add_edges(data, src_field=data.column_names()[0],
                               dst_field=data.column_names()[1])
        cc = connected_components.create(g)
        cc_ids = cc.get('component_id')
    path = os.path.join(outputPath, 'tmp', 'ConnectedComponents', str(scale))
    if not os.path.exists(path):
        os.makedirs(path)
    cc_ids.export_csv(os.path.join(path))

import graphlab as gl
from graphlab import SGraph, Vertex, Edge

f = open('web-Google.txt', 'r')

vertices = set()
edges = []
for line in f:
    v1, v2 = [int(x) for x in line.split()]
    vertices.add(v1)
    vertices.add(v2)
    edges.append(Edge(v1, v2))

print 'In total {0} vertices and {1} edges'.format(len(vertices), len(edges))

g = SGraph().add_vertices([Vertex(x) for x in vertices]).add_edges(edges)
g.save('page_graph')

def main():
    g = SGraph()
    verts = []

    # initialize the Karate graph with unique label fields for each node
    for i in range(0, 34):
        verts.append(Vertex(i, attr={'label': str(i)}))
    g = g.add_vertices(verts)

    # prepare the path for the Karate network data
    fname = "./karate.txt"

    # read the edges from karate.txt and add them to the SGraph object
    with open(fname) as f:
        for l in f:
            # parse the src and dst ids for the next edge
            ids = l.split()
            src = int(ids[0])
            dst = int(ids[1])
            # add the edge as a graphlab.Edge object to the graph
            g = g.add_edges(Edge(src, dst))

    # visualize the graph
    #print(g.summary())
    #randId = rn.sample(range(0, 34), 1)[0]
    #test = g.get_vertices(fields={'label': '1'})[randId]
    #test.show()

    ids = range(0, 34)

    # label propagation loop
    flag = False
    iteration = 0
    #rounds = 5
    # initialize neigh dict for performance
    gns = {}
    cur_max = 0
    start = time.time()
    while flag == False:
        # pick vertex iteration order randomly
        rn.shuffle(ids)
        flag = True
        # start = time.time()  # per-pass timing
        for index in ids:
            cur_max = LPA(g, index)
            if str(cur_max) != g.get_vertices(ids=[index])['label'][0]:
                flag = False
                g.vertices['label'] = g.vertices.apply(
                    lambda x: str(cur_max) if x['__id'] == index else x['label'])
        iteration += 1
        print(iteration)
    end = time.time()
    print(end - start)
    print iteration
    g.show(vlabel='label')

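# main() above calls an LPA helper that is not defined in this file. This is
# a hypothetical minimal version: return the most frequent label among the
# neighbors of vertex `index`, falling back to the vertex's own id when it
# has no neighbors.
from collections import Counter

def LPA(g, index):
    nbr_edges = g.get_edges(src_ids=[index]).append(g.get_edges(dst_ids=[index]))
    nbr_ids = [v for v in (list(nbr_edges['__src_id']) + list(nbr_edges['__dst_id']))
               if v != index]
    if not nbr_ids:
        return index
    labels = g.get_vertices(ids=nbr_ids)['label']
    return Counter(labels).most_common(1)[0][0]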