def fillSGraph(matchedBills):
    """Build a weighted SGraph linking model bills to the state bills they match.

    Every record whose ``modelBill`` and ``stateBill`` differ contributes one
    edge between the two bills.  Each bill is labelled by the last
    ``/``-separated component of its path, and the edge carries a ``weight``
    attribute equal to the inverse of the record's ``matchPrecent`` value.

    :param matchedBills: iterable of dict-like records providing the keys
        ``modelBill``, ``stateBill`` and ``matchPrecent`` (sic).
    :return: the populated SGraph (empty when no record qualifies), or None
        when GraphLab is unavailable or the graph could not be built.
    """
    if not graphlab_is_installed:
        print("GraphLab is not installed!")
        return None

    try:
        labels = set()
        edges = []
        for mb in matchedBills:
            # A bill matched against itself carries no information.
            if mb['modelBill'] == mb['stateBill']:
                continue
            label1 = mb['modelBill'].split("/")[-1]
            label2 = mb['stateBill'].split("/")[-1]
            # Use inverse similarity as weight.  The weight travels with the
            # edge itself so it cannot be misaligned with the order in which
            # the graph happens to store its edges.
            weight = 1. / float(mb['matchPrecent'])
            labels.update((label1, label2))
            edges.append(Edge(label1, label2, attr={'weight': weight}))

        Gmatches = SGraph()
        if edges:
            # Add everything in one go instead of rebuilding the graph for
            # every single record.
            Gmatches = Gmatches.add_vertices(
                [Vertex(label) for label in sorted(labels)])
            Gmatches = Gmatches.add_edges(edges)
        return Gmatches
    except Exception as err:
        # Stay best-effort for callers (they already had to cope with None),
        # but report the failure instead of hiding it.
        print("Could not build the bill graph: %s" % err)
        return None
Beispiel #2
0
def create_initial_bayesian_network():
    '''
    Seed a Bayesian network with randomly chosen edges.

    Loads the saved 'data_graph', registers every distinct
    (destination id, relation) pair of its edges as a vertex of a new SGraph,
    then adds 20 edges from randomly drawn features to the fixed vertex
    'E8498' and scores the result with get_bic_score.

    The return value of features.rename is not used, so this relies on the
    call renaming the columns in place.

    NOTE: the score is discarded and the function returns the loaded data
    graph, not the network that was built.
    '''
    data_graph = load_sgraph('data_graph')
    feature_table = data_graph.get_edges()[['__dst_id', 'relation']].unique()
    feature_table.rename({'__dst_id': 'feature_id',
                          'relation': 'feature_type'})

    network = SGraph().add_vertices(feature_table, vid_field='feature_id')
    feature_count = feature_table.num_rows()
    patient_count = data_graph.get_edges()['__src_id'].unique().size()

    target = 'E8498'
    random.seed(1234)  # fixed seed keeps the drawn edges reproducible
    for _ in range(20):
        source = feature_table['feature_id'][random.randint(
            0, feature_count - 1)]
        network = network.add_edges(Edge(source, target))
        print("Added edge between " + source + " and " + target)

    get_bic_score(data_graph, network, patient_count)
    return data_graph
def create_initial_bayesian_network():
  '''
  Build a starting Bayesian network out of randomly selected edges.

  The distinct (destination id, relation) pairs found on the edges of the
  stored 'data_graph' become the vertices of a fresh SGraph.  Twenty edges are
  then drawn, each going from a randomly picked feature to the fixed vertex
  'E8498', and the network is scored through get_bic_score.

  The result of features.rename is ignored, i.e. the rename is expected to
  happen in place.

  NOTE: the computed score is thrown away and the loaded data graph (not the
  new network) is what gets returned.
  '''
  source_graph = load_sgraph('data_graph')
  all_edges = source_graph.get_edges()
  feats = all_edges[['__dst_id', 'relation']].unique()
  feats.rename({'__dst_id': 'feature_id', 'relation': 'feature_type'})

  bayes_net = SGraph()
  bayes_net = bayes_net.add_vertices(feats, vid_field = 'feature_id')
  total_features = feats.num_rows()
  total_patients = source_graph.get_edges()['__src_id'].unique().size()

  fixed_destination = 'E8498'
  random.seed(1234)
  drawn = 0
  while drawn < 20:
    origin = feats['feature_id'][random.randint(0, total_features - 1)]
    bayes_net = bayes_net.add_edges(Edge(origin, fixed_destination))
    print("Added edge between " + origin + " and " + fixed_destination)
    drawn += 1

  get_bic_score(source_graph, bayes_net, total_patients)
  return source_graph
    def showPath(self, highlight=None):
        """Display the graph described by the instance's CSV files.

        Edges are read from ``self.edgesFn`` and vertices from
        ``self.verticesFn``; vertices are keyed by their ``name`` column and
        edges by their ``src``/``dst`` columns.  Vertices are labelled with
        the ``attributes`` column and edges with ``relation``.

        :param highlight: passed through unchanged to ``SGraph.show``.
        """
        edges_frame = SFrame.read_csv(self.edgesFn)
        vertices_frame = SFrame.read_csv(self.verticesFn)
        graph = SGraph(vertices=vertices_frame,
                       edges=edges_frame,
                       vid_field='name',
                       src_field='src',
                       dst_field='dst')

        display_options = {
            'vlabel': 'attributes',
            'vlabel_hover': False,
            'elabel': 'relation',
            'highlight': highlight,
            'arrows': True,
        }
        graph.show(**display_options)
        # Block for 20 seconds after the view has been requested.
        sleep(20)
Beispiel #5
0
    def get_subgraph(self, ids, radius=1, full_subgraph=True):
        """Return the neighbourhood of ``ids`` as a new GlGraph.

        The vertex set is grown ``radius`` times by following the outgoing
        edges of the current vertex set.  When ``full_subgraph`` is True the
        result also receives every (de-duplicated) edge that starts at or
        ends in one of ``ids``, taken from the edges leaving the grown vertex
        set.

        :param ids: vertex ids the neighbourhood is centred on.
        :param radius: number of outgoing-edge expansion steps.
        :param full_subgraph: when exactly True, add the edges described
            above; otherwise only vertices are added.
        :return: GlGraph wrapping the new SGraph, with this graph's
            ``is_directed`` flag.
        """
        verts = ids

        # find the vertices within radius (and the path edges)
        for _ in range(radius):
            edges_out = self._graph.get_edges(src_ids=verts)
            verts = list(
                set(edges_out['__src_id']) | set(edges_out['__dst_id']))

        # make a new graph to return and add the vertices
        g = SGraph()
        g = g.add_vertices(self._graph.get_vertices(verts), vid_field='__id')

        # add the requested edge set
        if full_subgraph is True:
            df_induced = self._graph.get_edges(src_ids=verts)
            # grouping on every column drops duplicated rows
            df_induced = df_induced.groupby(df_induced.column_names(), {})

            verts_sa = SArray(list(ids))
            edges = df_induced.filter_by(verts_sa, "__src_id")
            # SFrame.append returns a new frame instead of growing the
            # receiver; the result used to be thrown away, which silently
            # dropped every edge pointing into ``ids``.
            edges = edges.append(df_induced.filter_by(verts_sa, "__dst_id"))
            # An edge with both endpoints in ``ids`` matches both filters,
            # so remove the duplicates the append introduces.
            edges = edges.groupby(edges.column_names(), {})

            g = g.add_edges(edges, src_field='__src_id', dst_field='__dst_id')
        return GlGraph(is_directed=self.is_directed, graph_obj=g)
def extract_backbone(flavor_network, vertices, edges, alpha):
    """
    Prunes the flavor network down to the edges whose weight passes the significance test
    :param flavor_network: flavor-ingredient network to prune
    :param vertices: separate list of vertices (not read by this implementation)
    :param edges: iterable of edges exposing ``attr['weight']``, ``src_vid`` and ``dst_vid``
    :param alpha: multiplier applied to a node's sigma when building its weight threshold
    :return: tuple of (list of significant edges, pruned SGraph)
    """
    def bump_degrees(src, connecting_edge, dst):
        """
        triple_apply callback: adds one to the 'deg' field of both end points
        :param src: source node
        :param connecting_edge: connecting edge (returned untouched)
        :param dst: destination node
        :return: the (src, edge, dst) triple with updated degrees
        """
        src['deg'] += 1
        dst['deg'] += 1
        return src, connecting_edge, dst

    def moments_for(degree):
        """
        mean and standard deviation used for a node of the given degree
        :param degree: node degree (as a float)
        :return: (mean, sigma)
        """
        mean = 2*degree/(degree+1)
        first_term = (20 + 4*degree)/((degree + 1)*(degree + 2)*(degree + 3))
        second_term = 4/(degree + 1)**2
        sigma = sqrt(degree**2*(first_term - second_term))
        return mean, sigma

    def passes_threshold(edge, degree_lookup, alpha):
        """
        an edge is kept when its weight reaches the threshold of at least one of its end points
        :param edge: edge to test
        :param degree_lookup: dict mapping vertex id to degree
        :param alpha: significance multiplier
        :return: True when the edge is significant
        """
        observed = edge.attr['weight']
        dst_mean, dst_sigma = moments_for(float(degree_lookup[edge.dst_vid]))
        src_mean, src_sigma = moments_for(float(degree_lookup[edge.src_vid]))
        if observed >= abs(dst_mean + alpha*dst_sigma):
            return True
        return observed >= abs(src_mean + alpha*src_sigma)

    # vertices that carry no degree yet start counting from zero
    node_frame = flavor_network.vertices.fillna('deg', 0)
    degree_graph = SGraph().add_vertices(node_frame).add_edges(edges)
    degree_graph = degree_graph.triple_apply(bump_degrees, mutated_fields=['deg'])
    degree_table = degree_graph.vertices.to_dataframe().set_index('__id')
    degree_lookup = degree_table.to_dict()['deg']

    significant_edges = [edge for edge in edges
                         if passes_threshold(edge, degree_lookup, alpha)]
    pruned_network = SGraph().add_vertices(node_frame).add_edges(significant_edges)
    return significant_edges, pruned_network
def extract_backbone(flavor_network, alpha):
    """
    Creates a copy of the network that keeps only the edges passing the significance test
    :param flavor_network: full flavor ingredient network
    :param alpha: multiplier applied to a node's sigma when building its weight threshold
    :return: the pruned SGraph
    """
    def count_degrees(src, edge, dst):
        """
        triple_apply callback: adds one to the 'deg' field of both end points
        :param src: source vertex
        :param edge: edge joining them (returned untouched)
        :param dst: destination vertex
        :return: the updated (src, edge, dst) triple
        """
        src['deg'] += 1
        dst['deg'] += 1
        return src, edge, dst

    def node_moments(degree):
        # mean and standard deviation used for a node of the given degree
        mean = 2 * degree / (degree + 1)
        first_term = (20 + 4 * degree) / ((degree + 1) * (degree + 2) *
                                          (degree + 3))
        second_term = 4 / (degree + 1)**2
        sigma = sqrt(degree**2 * (first_term - second_term))
        return mean, sigma

    def is_significant(row, degree_of, alpha):
        # keep the edge when its weight reaches the threshold of either end
        observed = row['weight']
        dst_mean, dst_sigma = node_moments(float(degree_of[row['__dst_id']]))
        src_mean, src_sigma = node_moments(float(degree_of[row['__src_id']]))
        if observed >= abs(dst_mean + alpha * dst_sigma):
            return True
        return observed >= abs(src_mean + alpha * src_sigma)

    edge_frame = flavor_network.get_edges()
    # vertices without a degree yet start counting from zero
    vertex_frame = flavor_network.vertices.fillna('deg', 0)
    degree_graph = SGraph().add_vertices(vertex_frame).add_edges(edge_frame)
    degree_graph = degree_graph.triple_apply(count_degrees,
                                             mutated_fields=['deg'])
    degree_table = degree_graph.vertices.to_dataframe().set_index('__id')
    degree_of = degree_table.to_dict()['deg']

    # each surviving row is looked up again in list format before it is kept
    kept_edges = [
        degree_graph.get_edges(src_ids=row['__src_id'],
                               dst_ids=row['__dst_id'],
                               format='list')[0]
        for row in degree_graph.get_edges()
        if is_significant(row, degree_of, alpha)
    ]
    return SGraph().add_vertices(vertex_frame).add_edges(kept_edges)
 def showPath(self, highlight=None):
     """Display the graph described by the instance's CSV files.

     Edges come from ``self.edgesFn`` and vertices from ``self.verticesFn``;
     vertices are labelled with ``id`` and edges with ``relation``.

     :param highlight: passed through unchanged to ``SGraph.show``.
     """
     edges_frame = SFrame.read_csv(self.edgesFn)
     vertices_frame = SFrame.read_csv(self.verticesFn)
     graph = SGraph(vertices=vertices_frame,
                    edges=edges_frame,
                    vid_field='name',
                    src_field='src',
                    dst_field='dst')
     display_options = {
         'vlabel': 'id',
         'elabel': 'relation',
         'highlight': highlight,
         'arrows': True,
     }
     graph.show(**display_options)
     # Block for 10 seconds after the view has been requested.
     sleep(10)
Beispiel #9
0
def SSSP():
    """Run single-source shortest path on the BerkStan edge list.

    Reads a tab-separated, header-less file of three integer columns, builds
    a graph from columns X1 (source) and X2 (destination), computes shortest
    paths from vertex 0 weighted by column X3 and calls the model's summary.
    """
    edge_table = SFrame.read_csv(
        '/home/gengl/Datasets/SSSP/BerkStan/edge.txt',
        delimiter='\t',
        header=False,
        column_type_hints=[int, int, int])
    web_graph = SGraph().add_edges(edge_table,
                                   src_field='X1',
                                   dst_field='X2')
    model = shortest_path.create(web_graph, source_vid=0, weight_field='X3')
    model.summary()
def CC():
    """Run connected components on the BerkStan edge list.

    Reads a tab-separated, header-less file of two integer columns, builds a
    graph from columns X1 (source) and X2 (destination), runs the verbose
    connected-components toolkit and calls the model's summary.
    """
    edge_table = SFrame.read_csv('/home/gengl/Datasets/CC/BerkStan/edge.txt',
                                 delimiter='\t',
                                 header=False,
                                 column_type_hints=[int, int])
    web_graph = SGraph().add_edges(edge_table,
                                   src_field='X1',
                                   dst_field='X2')
    model = connected_components.create(web_graph, verbose=True)
    model.summary()
Beispiel #11
0
 def showPath(self, highlight=None):
     """Display the graph described by the instance's CSV files.

     Edges come from ``self.edgesFn`` and vertices from ``self.verticesFn``;
     vertices are labelled with ``id`` (with ``vlabel_hover`` enabled) and
     edges with ``relation``.

     :param highlight: passed through unchanged to ``SGraph.show``.
     """
     edges_frame = SFrame.read_csv(self.edgesFn)
     vertices_frame = SFrame.read_csv(self.verticesFn)
     graph = SGraph(vertices=vertices_frame,
                    edges=edges_frame,
                    vid_field='name',
                    src_field='src',
                    dst_field='dst')
     display_options = {
         'vlabel': 'id',
         'elabel': 'relation',
         'vlabel_hover': True,
         'highlight': highlight,
         'arrows': True,
     }
     graph.show(**display_options)
     # Block for 30 seconds after the view has been requested.
     sleep(30)
Beispiel #12
0
def get_graph(X_train, k):
    """Assemble the user/movie graph from a ratings table.

    Vertices come from ``get_vertices(k, factor)`` where ``factor`` is
    ``sqrt(mean rating / k / 0.25)``; edges come from ``X_train`` itself,
    running from the ``uid`` column to the ``mid`` column.

    Side effect: ``X_train`` gains the columns ``uid`` and ``mid``, derived
    from ``userId`` and ``movieId`` through ``prefix('u')`` / ``prefix('m')``.
    The elapsed time is printed before returning.

    :param X_train: ratings table with ``rating``, ``userId`` and ``movieId``.
    :param k: passed to ``get_vertices`` and used in the factor above.
    :return: the assembled SGraph.
    """
    started_at = datetime.now()
    mean_rating = X_train['rating'].mean()
    initial_factor = (mean_rating / k / 0.25)**0.5
    vertex_table = get_vertices(k, initial_factor)

    for new_column, id_column, tag in (('uid', 'userId', 'u'),
                                       ('mid', 'movieId', 'm')):
        X_train[new_column] = X_train[id_column].apply(prefix(tag))

    graph = SGraph().add_vertices(vertex_table, vid_field='__id')
    graph = graph.add_edges(X_train, src_field='uid', dst_field='mid')
    print('get_graph %s' % (datetime.now() - started_at))
    return graph
def create_network_features(Returns, Network, name='Sales', Start=9, End=12):
    """Export per-quarter network features of the dealer graph to CSV.

    For every quarter in ``[Start, End)`` except 12, a graph is built from
    the rows of ``Returns`` (vertices, keyed by ``Mtin``) and ``Network``
    (edges, ``Mtin`` -> ``SellerBuyerTin``) belonging to that quarter.  The
    vertices are then annotated with pagerank, triangle count, in-degree and
    out-degree and written to a CSV file whose name embeds ``name`` and the
    quarter.

    :param Returns: table with at least ``TaxQuarter`` and ``Mtin`` columns.
    :param Network: table with ``TaxQuarter``, ``Mtin`` and
        ``SellerBuyerTin`` columns.
    :param name: label embedded in the output file name.
    :param Start: first quarter processed (inclusive).
    :param End: quarter at which processing stops (exclusive).
    """
    output_pattern = 'H:\\Ashwin\\dta\\bogusdealers\\NetworkFeatures{}{}.csv'
    for quarter in xrange(Start, End):
        if quarter != 12:
            quarter_returns = Returns[Returns['TaxQuarter'] == quarter]
            quarter_network = Network[Network['TaxQuarter'] == quarter]
            graph = SGraph(vertices=quarter_returns,
                           edges=quarter_network,
                           vid_field='Mtin',
                           src_field='Mtin',
                           dst_field='SellerBuyerTin')

            pagerank_model = graphlab.pagerank.create(graph)
            graph.vertices['pagerank'] = \
                pagerank_model['graph'].vertices['pagerank']

            triangle_model = graphlab.triangle_counting.create(graph)
            graph.vertices['triangle_count'] = \
                triangle_model['graph'].vertices['triangle_count']

            degree_vertices = degree_counting.create(graph)['graph'].vertices
            graph.vertices['in_degree'] = degree_vertices['in_degree']
            graph.vertices['out_degree'] = degree_vertices['out_degree']

            graph.vertices.export_csv(output_pattern.format(name, quarter))
Beispiel #14
0
def MP_graph(D, x):
    """Iterative computation over a bipartite SGraph built from ``D`` and ``x``.

    Creates N "x" vertices and M "z" vertices (ids offset by N) with an edge
    for every (x, z) pair, runs ``num_iter`` rounds of ``triple_apply`` and
    returns an (M, 1) array filled from the local ``z_vertices`` objects.
    Presumably a matching-pursuit variant (going by the name) -- TODO confirm.

    NOTE(review): several steps look like they cannot have the intended
    effect (see the inline notes); confirm against the GraphLab API before
    relying on the result.

    :param D: 2-D array-like exposing ``.shape`` (N, M) and ``D[i][j]``
    :param x: array-like indexed ``x[i]`` for i in [0, N)
    :return: numpy array of shape (M, 1)
    """
    N, M = D.shape
    z = np.zeros((M, 1))
    z_temp = np.zeros(M)
    # r is a copy of x; it is not read again in this function.
    r = np.copy(x)
    num_iter = 30
    # Create bipartite graph
    G = SGraph()
    x_vertices = [Vertex(i) for i in xrange(N)]
    z_vertices = [Vertex(j + N) for j in xrange(M)]
    D_edges = [Edge(i, j) for i in xrange(N) for j in xrange(N, N + M)]
    # NOTE(review): the results of these two calls are not assigned, and
    # z_vertices is passed as the second positional argument of add_vertices.
    # If add_vertices/add_edges return a new graph rather than mutating G,
    # G stays empty -- confirm.
    G.add_vertices(x_vertices, z_vertices)
    G.add_edges(D_edges)

    # Values are stored on the local Vertex objects.
    # NOTE(review): unclear whether this reaches the vertices held by G.
    for i in xrange(N):
        x_vertices[i]["value"] = x[i]
    for j in xrange(M):
        z_vertices[j]["value"] = 0.0
        z_vertices[j]["dummy"] = 0.0
        z_vertices[j]["max"] = 0.0
    for i in xrange(N):
        for j in xrange(M):
            # A fresh Edge object is created and discarded on every pass;
            # the edges in D_edges are not touched.
            Edge(x_vertices[i], z_vertices[j])["value"] = D[i][j]

    # triple_apply callbacks; none of them has a return statement.
    # NOTE(review): confirm triple_apply accepts callbacks returning None.
    def inner_prod(s, e, t):
        # accumulate edge value * source value into the target's "dummy"
        t["dummy"] += e["value"] * s["value"]

    def update_z(s, e, t):
        # only targets whose "max" field is non-zero are updated
        if not t["max"] == 0.0:
            t["value"] += e["value"] * s["value"]

    # Defined but never called in this function.
    def compute_residual(s, e, t):
        if not t["max"] == 0.0:
            s["value"] -= t["value"] * e["value"]

    for itr in xrange(num_iter):
        # Compute inner products with r
        print "NUM ITR = ", itr
        G = G.triple_apply(inner_prod, mutated_fields=["value", "dummy"])
        # Reset every "max" flag and collect the "dummy" values from the
        # local z_vertices objects (not from the graph returned above).
        for i in xrange(M):
            z_vertices[i]["max"] = 0.0
            z_temp[i] = z_vertices[i]["dummy"]
        # Flag the z vertex with the largest "dummy" value.
        max_pos = np.argmax(z_temp)
        z_vertices[max_pos]["max"] = z_temp[max_pos]
        G = G.triple_apply(update_z, mutated_fields=["max", "value"])

    # Copy the "value" of each local z vertex into the result column.
    for i in xrange(M):
        z[i] = z_vertices[i]["value"]

    return z
Beispiel #15
0
def PageRank():
    """Run PageRank on the clueweb_20M edge list.

    Reads a tab-separated, header-less file of two integer columns, builds a
    graph from columns X1 (source) and X2 (destination), runs the pagerank
    toolkit with the settings below and calls the model's summary.
    """
    edge_table = SFrame.read_csv('/clueweb/PageRank/clueweb_20M/edge_pair.txt',
                                 delimiter='\t',
                                 header=False,
                                 column_type_hints=[int, int])
    web_graph = SGraph().add_edges(edge_table,
                                   src_field='X1',
                                   dst_field='X2')
    settings = {
        'reset_probability': 0.2,
        'threshold': 1e-12,
        'max_iterations': 42,
        '_distributed': True,
    }
    model = pagerank.create(web_graph, **settings)
    model.summary()
Beispiel #16
0
def parallel(A, B, prior, observationSequence):
    """Train an HMM in parallel with GraphLab and print the resulting graph.

    One vertex named ``"<i>a"`` is created per state, carrying the state
    index, its row of ``B``, its self-transition ``A[i, i]`` and the work
    lists ``ait``/``bit``/``git``.  Every ordered pair of distinct states
    gets an edge holding ``A[i, j]`` as ``aij``.  The graph is handed to
    ``hmm.train`` and the vertices and edges of the result are printed.

    :param A: transition matrix indexed ``A[i, j]``.
    :param B: matrix whose row ``B[i, :]`` is stored on state i's vertex.
    :param prior: sequence indexed by state; seeds the first ``ait`` entry.
    :param observationSequence: forwarded unchanged to ``hmm.train``.
    """
    def state_name(index):
        return str(index) + "a"

    def state_vertex(i):
        attributes = {
            'i': i,
            'ait': [prior[i]] + ([0] * OBSERVATION_LENGTH),
            'bit': ([0] * OBSERVATION_LENGTH) + [1],
            'b': B[i, :],
            'git': [0] * (OBSERVATION_LENGTH + 1),
            'self': A[i, i],
            'git_sum': 0.0,
        }
        return Vertex(state_name(i), attr=attributes)

    state_vertices = [state_vertex(i) for i in xrange(NUM_STATES)]
    transitions = [
        Edge(state_name(i), state_name(j), attr={'aij': A[i, j], 'xi': 0.0})
        for i in xrange(NUM_STATES)
        for j in xrange(NUM_STATES)
        if i != j
    ]

    g = SGraph().add_vertices(state_vertices)
    g = g.add_edges(transitions)
    g = hmm.train(g, observationSequence, NITERS, NUM_STATES,
                  NUM_OBSERVATIONS)
    print(g.vertices)
    print(g.edges)
def PageRank():
    """Run PageRank on the BerkStan edge list.

    Reads a tab-separated, header-less file of two integer columns, builds a
    graph from columns X1 (source) and X2 (destination), runs the pagerank
    toolkit with the settings below and calls the model's summary.
    """
    edge_table = SFrame.read_csv(
        '/home/gengl/Datasets/PageRank/BerkStan/edge.txt',
        delimiter='\t',
        header=False,
        column_type_hints=[int, int])
    web_graph = SGraph().add_edges(edge_table,
                                   src_field='X1',
                                   dst_field='X2')
    settings = {
        'reset_probability': 0.2,
        'threshold': 0.0001,
        'max_iterations': 1000,
        '_distributed': True,
    }
    model = pagerank.create(web_graph, **settings)
    model.summary()
def build_weighted_graph(ing_comp_dict):
    """
    builds the weighted undirected graph that is the flavor network

    Every ingredient becomes exactly one vertex whose 'deg' attribute starts
    at 0.  Each unordered pair of ingredients that shares at least one
    compound is joined by a single edge whose 'weight' is the number of
    shared compounds.

    :param ing_comp_dict: ingredient:compound dictionary
    :return: SGraph that represents the flavor network
    """
    ingredients = list(ing_comp_dict)
    # Build every compound set once instead of once per ingredient pair.
    compound_sets = dict(
        (name, set(ing_comp_dict[name])) for name in ingredients)

    # One vertex per ingredient.  A second, attribute-less Vertex used to be
    # appended for every ingredient, duplicating ids and dropping 'deg'.
    vert_list = [Vertex(name, attr={'deg': 0}) for name in ingredients]

    edge_list = []
    for position, node_1_ingr in enumerate(ingredients):
        # Only pair with ingredients that come later, so each unordered pair
        # is visited once.
        for node_2_ingr in ingredients[position + 1:]:
            weight = len(
                compound_sets[node_1_ingr] & compound_sets[node_2_ingr])
            if weight > 0:
                edge_list.append(
                    Edge(node_1_ingr, node_2_ingr, attr={'weight': weight}))

    flav_network = SGraph()
    flav_network = flav_network.add_vertices(vert_list)
    flav_network = flav_network.add_edges(edge_list)
    return flav_network
Beispiel #19
0
def build_weighted_graph(ing_comp_dict):
    """
    Builds the weighted, undirected graph that is the flavor network

    Every ingredient becomes exactly one vertex whose 'deg' attribute starts
    at 0.  Each unordered pair of ingredients that shares at least one
    compound is joined by a single edge whose 'weight' is the number of
    shared compounds.

    :param ing_comp_dict: ingredient:compound dictionary
    :return: tuple of (SGraph that represents the flavor network,
        list of its Vertex objects, list of its Edge objects)
    """
    ingredients = list(ing_comp_dict)
    # Build every compound set once instead of once per ingredient pair.
    compound_sets = dict(
        (name, set(ing_comp_dict[name])) for name in ingredients)

    # One vertex per ingredient.  A second, attribute-less Vertex used to be
    # appended for every ingredient, duplicating ids and dropping 'deg'.
    vertices = [Vertex(name, attr={'deg': 0}) for name in ingredients]

    edge_list = []
    for position, ingredient_node_1 in enumerate(ingredients):
        # Only pair with ingredients that come later, so each unordered pair
        # is visited once.
        for ingredient_node_2 in ingredients[position + 1:]:
            weight = len(compound_sets[ingredient_node_1] &
                         compound_sets[ingredient_node_2])
            if weight > 0:
                edge_list.append(Edge(ingredient_node_1, ingredient_node_2,
                                      attr={'weight': weight}))

    flavor_network = SGraph()
    flavor_network = flavor_network.add_vertices(vertices)
    flavor_network = flavor_network.add_edges(edge_list)
    return flavor_network, vertices, edge_list
Beispiel #20
0
 def showPath(self, highlight=None):
     """Time how long it takes to load the instance's graph from CSV.

     Edges are read from ``self.edgesFn`` and vertices from
     ``self.verticesFn``; an SGraph is built from them and the elapsed time
     is printed.  Nothing is displayed in this variant.

     :param highlight: accepted for interface compatibility; not used here.
     """
     started_at = datetime.datetime.now()
     edges_frame = SFrame.read_csv(self.edgesFn)
     vertices_frame = SFrame.read_csv(self.verticesFn)
     graph = SGraph(vertices=vertices_frame,
                    edges=edges_frame,
                    vid_field='name',
                    src_field='src',
                    dst_field='dst')
     finished_at = datetime.datetime.now()
     print(finished_at - started_at)
def build_weighted_graph(ing_comp_dict):
    """
    builds the weighted undirected graph that is the flavor network

    Every ingredient becomes exactly one vertex whose 'deg' attribute starts
    at 0.  Each unordered pair of ingredients that shares at least one
    compound is joined by a single edge whose 'weight' is the number of
    shared compounds.

    :param ing_comp_dict: ingredient:compound dictionary
    :return: SGraph that represents the flavor network
    """
    ingredients = list(ing_comp_dict)
    # Build every compound set once instead of once per ingredient pair.
    compound_sets = dict(
        (name, set(ing_comp_dict[name])) for name in ingredients)

    # One vertex per ingredient.  A second, attribute-less Vertex used to be
    # appended for every ingredient, duplicating ids and dropping 'deg'.
    vert_list = [Vertex(name, attr={'deg': 0}) for name in ingredients]

    edge_list = []
    for position, node_1_ingr in enumerate(ingredients):
        # Only pair with ingredients that come later, so each unordered pair
        # is visited once.
        for node_2_ingr in ingredients[position + 1:]:
            weight = len(
                compound_sets[node_1_ingr] & compound_sets[node_2_ingr])
            if weight > 0:
                edge_list.append(
                    Edge(node_1_ingr, node_2_ingr, attr={'weight': weight}))

    flav_network = SGraph()
    flav_network = flav_network.add_vertices(vert_list)
    flav_network = flav_network.add_edges(edge_list)
    return flav_network
def SSSP():
    """Run single-source shortest path on the Google edge list and dump it.

    Reads a tab-separated, header-less file of three integer columns, builds
    a graph from columns X1 (source) and X2 (destination), computes shortest
    paths from vertex 0 weighted by column X3 and calls the model's summary.
    Afterwards, for every vertex id in [0, 875713), the last element of the
    path returned by ``get_path`` is written on its own line to
    '/home/gengl/sssp_graphlab'.  Vertex ids for which the lookup fails are
    skipped.
    """
    url = '/home/gengl/Datasets/SSSP/Google/edge.txt'
    data = SFrame.read_csv(url,
                           delimiter='\t',
                           header=False,
                           column_type_hints=[int, int, int])
    graph = SGraph().add_edges(data, src_field='X1', dst_field='X2')
    sp_model = shortest_path.create(graph, source_vid=0, weight_field='X3')
    sp_model.summary()
    with open('/home/gengl/sssp_graphlab', 'w') as fo:
        for vid in range(0, 875713):
            try:
                result_pair = sp_model.get_path(vid)
                fo.write(str(result_pair[-1]) + '\n')
            except Exception:
                # Best effort: skip ids whose path cannot be produced.
                # ``Exception`` (not a bare except) keeps Ctrl-C and
                # SystemExit able to stop this very long loop.
                pass
Beispiel #23
0
def MP_graph(D, x):
    """Iterative computation over a bipartite SGraph built from ``D`` and ``x``.

    Creates N "x" vertices and M "z" vertices (ids offset by N) with an edge
    for every (x, z) pair, runs ``num_iter`` rounds of ``triple_apply`` and
    returns an (M, 1) array filled from the local ``z_vertices`` objects.
    Presumably a matching-pursuit variant (going by the name) -- TODO confirm.

    NOTE(review): several steps look like they cannot have the intended
    effect (see the inline notes); confirm against the GraphLab API before
    relying on the result.

    :param D: 2-D array-like exposing ``.shape`` (N, M) and ``D[i][j]``
    :param x: array-like indexed ``x[i]`` for i in [0, N)
    :return: numpy array of shape (M, 1)
    """
    N, M = D.shape
    z = np.zeros((M, 1))
    z_temp = np.zeros(M)
    # r is a copy of x; it is not read again in this function.
    r = np.copy(x)
    num_iter = 30
    # Create bipartite graph
    G = SGraph()
    x_vertices = [Vertex(i) for i in xrange(N)]
    z_vertices = [Vertex(j + N) for j in xrange(M)]
    D_edges = [Edge(i, j) for i in xrange(N) for j in xrange(N, N + M)]
    # NOTE(review): the results of these two calls are not assigned, and
    # z_vertices is passed as the second positional argument of add_vertices.
    # If add_vertices/add_edges return a new graph rather than mutating G,
    # G stays empty -- confirm.
    G.add_vertices(x_vertices, z_vertices)
    G.add_edges(D_edges)

    # Values are stored on the local Vertex objects.
    # NOTE(review): unclear whether this reaches the vertices held by G.
    for i in xrange(N):
        x_vertices[i]["value"] = x[i]
    for j in xrange(M):
        z_vertices[j]["value"] = 0.0
        z_vertices[j]["dummy"] = 0.0
        z_vertices[j]["max"] = 0.0
    for i in xrange(N):
        for j in xrange(M):
            # A fresh Edge object is created and discarded on every pass;
            # the edges in D_edges are not touched.
            Edge(x_vertices[i], z_vertices[j])["value"] = D[i][j]

    # triple_apply callbacks; none of them has a return statement.
    # NOTE(review): confirm triple_apply accepts callbacks returning None.
    def inner_prod(s, e, t):
        # accumulate edge value * source value into the target's "dummy"
        t["dummy"] += e["value"] * s["value"]

    def update_z(s, e, t):
        # only targets whose "max" field is non-zero are updated
        if not t["max"] == 0.0:
            t["value"] += e["value"] * s["value"]

    # Defined but never called in this function.
    def compute_residual(s, e, t):
        if not t["max"] == 0.0:
            s["value"] -= t["value"] * e["value"]

    for itr in xrange(num_iter):
        # Compute inner products with r
        print "NUM ITR = ", itr
        G = G.triple_apply(inner_prod, mutated_fields=["value", "dummy"])
        # Reset every "max" flag and collect the "dummy" values from the
        # local z_vertices objects (not from the graph returned above).
        for i in xrange(M):
            z_vertices[i]["max"] = 0.0
            z_temp[i] = z_vertices[i]["dummy"]
        # Flag the z vertex with the largest "dummy" value.
        max_pos = np.argmax(z_temp)
        z_vertices[max_pos]["max"] = z_temp[max_pos]
        G = G.triple_apply(update_z, mutated_fields=["max", "value"])

    # Copy the "value" of each local z vertex into the result column.
    for i in xrange(M):
        z[i] = z_vertices[i]["value"]

    return z
def extract_backbone(flavor_network, alpha):
    """
    Creates a copy of the network that keeps only the edges passing the significance test
    :param flavor_network: full flavor ingredient network
    :param alpha: multiplier applied to a node's sigma when building its weight threshold
    :return: the pruned SGraph
    """
    def count_degrees(src, edge, dst):
        """
        triple_apply callback: adds one to the 'deg' field of both end points
        :param src: source vertex
        :param edge: edge joining them (returned untouched)
        :param dst: destination vertex
        :return: the updated (src, edge, dst) triple
        """
        src['deg'] += 1
        dst['deg'] += 1
        return src, edge, dst

    def node_moments(degree):
        # mean and standard deviation used for a node of the given degree
        mean = 2*degree/(degree+1)
        first_term = (20 + 4*degree)/((degree + 1)*(degree + 2)*(degree + 3))
        second_term = 4/(degree + 1)**2
        sigma = sqrt(degree**2*(first_term - second_term))
        return mean, sigma

    def is_significant(row, degree_of, alpha):
        # keep the edge when its weight reaches the threshold of either end
        observed = row['weight']
        dst_mean, dst_sigma = node_moments(float(degree_of[row['__dst_id']]))
        src_mean, src_sigma = node_moments(float(degree_of[row['__src_id']]))
        if observed >= abs(dst_mean + alpha*dst_sigma):
            return True
        return observed >= abs(src_mean + alpha*src_sigma)

    edge_frame = flavor_network.get_edges()
    # vertices without a degree yet start counting from zero
    vertex_frame = flavor_network.vertices.fillna('deg', 0)
    degree_graph = SGraph().add_vertices(vertex_frame).add_edges(edge_frame)
    degree_graph = degree_graph.triple_apply(count_degrees, mutated_fields=['deg'])
    degree_table = degree_graph.vertices.to_dataframe().set_index('__id')
    degree_of = degree_table.to_dict()['deg']

    # each surviving row is looked up again in list format before it is kept
    kept_edges = [
        degree_graph.get_edges(src_ids=row['__src_id'],
                               dst_ids=row['__dst_id'], format='list')[0]
        for row in degree_graph.get_edges()
        if is_significant(row, degree_of, alpha)
    ]
    return SGraph().add_vertices(vertex_frame).add_edges(kept_edges)
def build_data_graph():
  """Build the patient graph from the beneficiary and claims CSV files.

  Edges run from a patient id ('desynpuf_id') to a chronic condition, a
  diagnosis code, a procedure code or a drug substance; the 'relation' column
  of each edge tells which of the four it is.  Only patients with at least
  one chronic condition flagged as 1 are linked to diagnoses, procedures and
  drugs.  The graph summary is printed after each group of edges is added.

  :return: the assembled SGraph
  """
  file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
  claim_only_columns = ['clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year']

  beneficiaries = SFrame.read_csv(file_path + "beneficiary_summary_2008_2009.csv")
  bene_packed = beneficiaries.pack_columns(column_prefix = 'chron_', dtype = dict,
                                           new_column_name = 'chronic_conditions', remove_prefix = False)

  #Each row of bene_packed is expanded into one [name, value, patient id] row per entry
  #of its chronic_conditions dictionary.
  bene_chrons = bene_packed.flat_map(
    ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
    lambda row: [[cond_name, cond_value, row['desynpuf_id']]
                 for cond_name, cond_value in row['chronic_conditions'].iteritems()])

  #Keep only the conditions flagged with 1; the flag column is then no longer needed
  bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
  del bene_chrons['chronic_condition_value']
  bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

  bene_chrons['relation'] = 'had_chronic'
  g = SGraph().add_edges(bene_chrons, src_field = 'desynpuf_id', dst_field = 'chronic_condition')
  print(g.summary())

  #Distinct IDs of patients with chronic conditions, to avoid repetition in the joins below
  bene_with_chrons = SFrame(None)
  bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(), 'desynpuf_id')

  #One entry per remaining edge group, processed in order:
  #(csv file, extra read_csv options, columns to drop, relation label, destination column)
  edge_sources = [
    ("transformed_claim_diagnosis_codes.csv", {}, claim_only_columns, 'diagnosed_with', 'dgns_cd'),
    ("transformed_claim_prcdr_codes.csv", {'column_type_hints': {'prcdr_cd' : str}},
     claim_only_columns, 'underwent', 'prcdr_cd'),
    ("prescribed_drugs.csv", {}, [], 'had_drug', 'substancename'),
  ]
  for csv_name, read_options, unwanted_columns, relation, dst_column in edge_sources:
    records = SFrame.read_csv(file_path + csv_name, **read_options)
    for column in unwanted_columns:
      del records[column]
    #The same patient can have the same code several times a year, so take distinct rows
    records = records.unique()
    #Restrict to patients with some chronic condition; such a patient may have no rows here
    linked = bene_with_chrons.join(records)
    linked['relation'] = relation
    g = g.add_edges(linked, src_field = 'desynpuf_id', dst_field = dst_column)
    print(g.summary())

  return g
Beispiel #26
0
import graphlab as gl

from graphlab import SFrame, SGraph, Vertex, Edge
# Read the James Bond edge list from the remote CSV file.
edge_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')

# Build a graph whose edges run from the 'src' column to the 'dst' column.
g = SGraph()
g = g.add_edges(edge_data, src_field='src', dst_field='dst')
print g

# Save the graph under 'james_bond' and load it back through the gl alias.
g.save('james_bond')
new_graph = gl.load_sgraph('james_bond')

# Display the in-memory graph g (not the reloaded new_graph), labelling
# vertices by id and highlighting the two named vertices.
g.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)
# In[33]:

# Read one header-less, comma-separated CSV file from csvDataFolder,
# hinting int as the column type.
# NOTE(review): os and csvDataFolder are not defined in the visible source.
edges = gl.SFrame.read_csv(os.path.join(csvDataFolder, '001008_0.csv'),
                           header=False,
                           delimiter=',',
                           column_type_hints=int)
# NOTE(review): edegs_n is not defined anywhere in the visible source; this
# bare expression raises NameError unless it is defined elsewhere -- confirm.
edegs_n

# In[42]:

from graphlab import SGraph, Vertex
# Three sample vertices with differing attribute dictionaries.
g = SGraph().add_vertices([
    Vertex('cat', {'fluffy': 1}),
    Vertex('dog', {
        'fluffy': 1,
        'woof': 1
    }),
    Vertex('hippo', {})
])
# Write the vertex table in CSV format under resultFolder.
# NOTE(review): resultFolder is not defined in the visible source.
g.vertices.save(os.path.join(resultFolder, 'test_graph_vertices'),
                format='csv')

# In[59]:

#sframe reading
start_r = time.time()
# csvFiles = os.listdir(csvDataFolder)
csvFiles = [
    '000000_0.csv',
    '000001_0.csv',
    '000002_0.csv',
Beispiel #28
0
def build_data_graph():
    """Build the patient graph from the beneficiary and claims CSV files.

    Edges run from a patient id ('desynpuf_id') to a chronic condition, a
    diagnosis code, a procedure code or a drug substance; the 'relation'
    column of each edge tells which of the four it is.  Only patients with at
    least one chronic condition flagged as 1 are linked to diagnoses,
    procedures and drugs.  The graph summary is printed after each group of
    edges is added.

    :return: the assembled SGraph
    """
    file_path = "/Users/blahiri/healthcare/documents/recommendation_system/"
    claim_only_columns = [
        'clm_id', 'clm_from_dt', 'clm_thru_dt', 'claim_type', 'clm_thru_year'
    ]

    def expand_conditions(row):
        # One [name, value, patient id] row per entry of the packed dictionary.
        patient = row['desynpuf_id']
        return [[cond_name, cond_value, patient]
                for cond_name, cond_value in
                row['chronic_conditions'].iteritems()]

    def distinct_rows(csv_name, unwanted_columns, **read_options):
        # Load a CSV, drop the listed columns and keep distinct rows only
        # (the same patient can have the same code several times a year).
        table = SFrame.read_csv(file_path + csv_name, **read_options)
        for column in unwanted_columns:
            del table[column]
        return table.unique()

    beneficiaries = SFrame.read_csv(file_path +
                                    "beneficiary_summary_2008_2009.csv")
    bene_packed = beneficiaries.pack_columns(
        column_prefix='chron_',
        dtype=dict,
        new_column_name='chronic_conditions',
        remove_prefix=False)
    bene_chrons = bene_packed.flat_map(
        ["chronic_condition_name", "chronic_condition_value", "desynpuf_id"],
        expand_conditions)

    # Keep only the conditions flagged with 1; the flag column is then dropped.
    bene_chrons = bene_chrons[bene_chrons['chronic_condition_value'] == 1]
    del bene_chrons['chronic_condition_value']
    bene_chrons.rename({'chronic_condition_name': 'chronic_condition'})

    bene_chrons['relation'] = 'had_chronic'
    g = SGraph().add_edges(bene_chrons,
                           src_field='desynpuf_id',
                           dst_field='chronic_condition')
    print(g.summary())

    # Distinct IDs of patients with chronic conditions, to avoid repetition
    # in the joins below.
    bene_with_chrons = SFrame(None)
    bene_with_chrons.add_column(bene_chrons['desynpuf_id'].unique(),
                                'desynpuf_id')

    # Diagnosed conditions, restricted to patients with a chronic condition
    # (such a patient may have no diagnosis at all).
    diagnoses = distinct_rows("transformed_claim_diagnosis_codes.csv",
                              claim_only_columns)
    patient_diagnoses = bene_with_chrons.join(diagnoses)
    patient_diagnoses['relation'] = 'diagnosed_with'
    g = g.add_edges(patient_diagnoses,
                    src_field='desynpuf_id',
                    dst_field='dgns_cd')
    print(g.summary())

    # Procedures, restricted the same way; procedure codes are read as str.
    procedures = distinct_rows("transformed_claim_prcdr_codes.csv",
                               claim_only_columns,
                               column_type_hints={'prcdr_cd': str})
    patient_procedures = bene_with_chrons.join(procedures)
    patient_procedures['relation'] = 'underwent'
    g = g.add_edges(patient_procedures,
                    src_field='desynpuf_id',
                    dst_field='prcdr_cd')
    print(g.summary())

    # Prescribed drugs, restricted the same way; no columns are dropped.
    drugs = distinct_rows("prescribed_drugs.csv", [])
    patient_drugs = bene_with_chrons.join(drugs)
    patient_drugs['relation'] = 'had_drug'
    g = g.add_edges(patient_drugs,
                    src_field='desynpuf_id',
                    dst_field='substancename')
    print(g.summary())

    return g
Beispiel #29
0
#g = SGraph(vertices=vertex_data, edges=edge_data, vid_field='name',
#           src_field='src', dst_field='dst')

#targets = ['James Bond', 'Moneypenny']
#subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
#subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

#from graphlab import SGraph, Vertex, Edge

#g = SGraph()
#verts = [Vertex(0, attr={'breed': 'labrador'}),
#         Vertex(1, attr={'breed': 'labrador'}),
#         Vertex(2, attr={'breed': 'vizsla'})]

#g = g.add_vertices(verts)
#g = g.add_edges(Edge(1, 2))

#print g

from graphlab import SFrame, SGraph
# Download the two Bond CSV tables: one describes edges, the other vertices.
edge_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')
vertex_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv')

# Start from an empty graph, then attach the vertices (identified by their
# 'name' column) followed by the edges (endpoints in 'src' and 'dst').
g = SGraph()
g = g.add_vertices(vertex_data, vid_field='name')
g = g.add_edges(edge_data, src_field='src', dst_field='dst')

# Display the resulting graph.
g.show()


Beispiel #30
0
    path = sys.argv[1]
else:
    path = "./data/"

verbose = False
vertexFiles = [
    "City", "Country", "Region", "Advisor", "Category", "Founder",
    "FundingRound", "HQ", "keywords", "Member", "Office", "organizations",
    "PrimaryImage", "TeamMember", "Website", "companies_acquired_by_sap"
]
edgesFiles = [
    "GeoInformation", "acquisitions", "categories_keywords_edges",
    "investments", "keywords_descriptions_edges", "keywords_webpages_edges",
    "relationships", "companies_acquired_by_sap_edges"
]
g = SGraph()

for f in vertexFiles:
    content = SFrame.read_csv(path + f + '.csv',
                              na_values='null',
                              verbose=verbose)
    if 'path' in content.column_names():
        g = g.add_vertices(content, vid_field='path')
    elif 'url' in content.column_names():
        g = g.add_vertices(content, vid_field='url')
    else:
        print "Unknown vid field: ", content.column_names()
        sys.exit()

for f in edgesFiles:
    content = SFrame.read_csv(path + f + '.csv',
Beispiel #31
0
# Root output directory and first scale to process, both taken from the
# environment.
outputPath = os.environ.get("OUTPUT_PATH")
startScale = int(os.environ.get("START_SCALE"))

# The first line of './tmp' is comma-separated; its fields at index 1 and 2
# are parsed as integers into maxScale and realEndScale.
tagFile = './tmp'
with open(tagFile, 'r') as f:
    infor = f.readline().strip().split(",")
    maxScale = int(infor[1])
    realEndScale = int(infor[2])

# Scales are processed from startScale up to and including realEndScale.
scaleRange = range(startScale, realEndScale + 1)

for scale in scaleRange:

    # Header-less CSV for this scale; its first two columns are used as the
    # source and destination of each edge.
    inputPath = os.path.join(outputPath, 'tmp', 'AdjacentRelationships',
                             str(scale))
    url = inputPath
    data = SFrame.read_csv(url, header=False)
    if data.num_rows() == 0:
        # No edges at this scale: export an empty table with the same two
        # column names instead of running connected components.
        cc_ids = SFrame({"__id": [], "component_id": []})
    else:
        g = SGraph().add_edges(data,
                               src_field=data.column_names()[0],
                               dst_field=data.column_names()[1])
        cc = connected_components.create(g)
        cc_ids = cc.get('component_id')
    path = os.path.join(outputPath, 'tmp', 'ConnectedComponents', str(scale))
    # Use logical `not` here. The previous `~os.path.exists(path)` applied
    # bitwise NOT to a bool (~True == -2, ~False == -1), which is always
    # truthy, so makedirs ran unconditionally and raised OSError whenever the
    # directory already existed.
    if not os.path.exists(path):
        os.makedirs(path)

    # NOTE(review): the export target is the directory path itself, exactly
    # as before -- confirm export_csv accepts that.
    SFrame.export_csv(cc_ids, path)
Beispiel #32
0
#targets = ['James Bond', 'Moneypenny']
#subgraph = g.get_neighborhood(ids=targets, radius=1, full_subgraph=True)
#subgraph.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)

#from graphlab import SGraph, Vertex, Edge

#g = SGraph()
#verts = [Vertex(0, attr={'breed': 'labrador'}),
#         Vertex(1, attr={'breed': 'labrador'}),
#         Vertex(2, attr={'breed': 'vizsla'})]

#g = g.add_vertices(verts)
#g = g.add_edges(Edge(1, 2))

#print g

from graphlab import SFrame, SGraph
# Fetch the Bond edge and vertex tables.
edge_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')
vertex_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_vertices.csv')

# Column names that identify a vertex and the two endpoints of an edge.
graph_fields = {
    'vid_field': 'name',
    'src_field': 'src',
    'dst_field': 'dst',
}

# Build the graph from both tables in one constructor call, then display it.
g = SGraph(vertices=vertex_data, edges=edge_data, **graph_fields)
g.show()
Beispiel #33
0
import graphlab as gl
from graphlab import SGraph, Vertex, Edge

vertices = set()
edges = []

# Parse the edge list: each data line holds two integer vertex ids separated
# by whitespace. The file is opened in a `with` block so the handle is closed
# once parsing is done (it was previously left open).
with open('web-Google.txt', 'r') as f:
    for line in f:
        stripped = line.strip()
        # Skip blank lines and '#' comment lines; either would otherwise
        # crash the int()/unpacking below.
        # NOTE(review): assumes '#' marks header comments in this file.
        if not stripped or stripped.startswith('#'):
            continue
        v1, v2 = [int(x) for x in stripped.split()]
        vertices.add(v1)
        vertices.add(v2)
        edges.append(Edge(v1, v2))

print('In total {0} vertices and {1} edges'.format(len(vertices), len(edges)))

# Build the graph in one pass (all vertices, then all edges) and save it
# under the name 'page_graph'.
g = SGraph().add_vertices([Vertex(x) for x in vertices]).add_edges(edges)

g.save('page_graph')
Beispiel #34
0
def main():
    """Run label propagation on the Karate network and show the result.

    Builds an SGraph with vertices 0..33, each starting with its own id (as
    a string) in the 'label' field, and loads the edges from ./karate.txt
    (two whitespace-separated integer ids per line). It then repeatedly
    visits all vertices in random order, replacing a vertex's label with
    str(LPA(g, index)) whenever that differs from the current label, until
    a full pass changes nothing.

    Prints the pass counter after every pass, then the elapsed time of the
    whole propagation loop and the final pass count, and finally calls
    g.show(vlabel='label').
    """
    vertex_count = 34

    # Initialize the Karate graph with a unique label for each node.
    verts = [Vertex(i, attr={'label': str(i)}) for i in range(vertex_count)]
    g = SGraph().add_vertices(verts)

    # Read the edges from the Karate network file. They are collected first
    # and added in a single call, instead of rebuilding the graph once per
    # input line.
    fname = "./karate.txt"
    edges = []
    with open(fname) as f:
        for l in f:
            fields = l.split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            edges.append(Edge(int(fields[0]), int(fields[1])))
    if edges:
        g = g.add_edges(edges)

    # A real list so that it can be shuffled in place.
    ids = list(range(vertex_count))

    iteration = 0
    changed = True

    # Time the whole propagation loop. The timer is deliberately not reset
    # inside the loop, so the figure printed at the end covers every pass
    # rather than only the last one.
    start = time.time()
    while changed:
        # Pick the vertex visiting order randomly for this pass.
        rn.shuffle(ids)
        changed = False
        for index in ids:
            cur_max = LPA(g, index)
            new_label = str(cur_max)
            if new_label != g.get_vertices(ids=[index])['label'][0]:
                changed = True
                # Rewrite the label column: only the vertex whose '__id'
                # equals `index` receives the new label.
                g.vertices['label'] = g.vertices.apply(
                    lambda x: new_label if x['__id'] == index else x['label'])

        iteration += 1
        print(iteration)
    end = time.time()

    print(end - start)
    print(iteration)
    g.show(vlabel='label')