Example #1
    @classmethod
    def from_previous_reduction(cls, input_dir):
        # Rebuild the wrapper (and its child) from the graphs saved under
        # input_dir; input_dir is expected to end with a path separator.
        parent = gl.load_sgraph(input_dir + 'parent')
        verticy_descriptions = gl.load_sframe(input_dir + 'verticy_descriptions')
        child = gl.load_sgraph(input_dir + 'child')
        gw = cls()
        gw.g = parent
        gw.verticy_descriptions = verticy_descriptions
        gw.child = cls()
        gw.child.g = child
        return gw
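For context, a matching save step would need to write the same three objects under the same names. A minimal sketch, assuming the wrapper exposes g, verticy_descriptions, and a child wrapper as above (save_for_reduction is a hypothetical name, not from the original code):

def save_for_reduction(gw, output_dir):
    # Persist the three pieces from_previous_reduction expects to find.
    gw.g.save(output_dir + 'parent')
    gw.verticy_descriptions.save(output_dir + 'verticy_descriptions')
    gw.child.g.save(output_dir + 'child')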
Example #3
def create_initial_bayesian_network():
    '''
    Start from a randomly generated Bayesian network in which there is no
    edge between variables of the same type. First, create a blacklist.
    '''
    g = load_sgraph('data_graph')
    edges = g.get_edges()
    features = edges[['__dst_id', 'relation']].unique()
    features.rename({'__dst_id': 'feature_id', 'relation': 'feature_type'})

    bn = SGraph()
    bn = bn.add_vertices(features, vid_field='feature_id')
    n_features = features.num_rows()
    edges_data_graph = g.get_edges()
    n_patients = edges_data_graph['__src_id'].unique().size()

    random.seed(1234)
    for i in range(20):
        src = features['feature_id'][random.randint(0, n_features - 1)]
        dst = 'E8498'
        #dst = features['feature_id'][random.randint(0, n_features - 1)]
        bn = bn.add_edges(Edge(src, dst))
        print('Added edge between ' + src + ' and ' + dst)

    # BIC score of the initial network (get_bic_score is defined elsewhere).
    bic = get_bic_score(g, bn, n_patients)
    # Return the constructed network.
    return bn
    def __load_data_structure__(self, filepath):
        """Return the data structure if it can be loaded; otherwise return None and log a debug message."""
        # The type on disk is unknown, so try each supported loader in turn
        # and swallow the exceptions from the ones that do not match.
        try:
            return _gl.load_sframe(filepath)
        except Exception:
            pass
        try:
            return _gl.load_sgraph(filepath)
        except Exception:
            pass
        try:
            return _gl.load_model(filepath)
        except Exception:
            pass
        try:
            return _gl.SArray(data=filepath)
        except Exception:
            pass

        __LOGGER__.debug("Unable to load dependency, unsupported type at path: %s" % filepath)
        return None
def main():
    args = parse_args('Test the result of the CommunityDetection algorithm')
    result_graph = gl.load_sgraph(args.graph_name)
    expected = gl.SFrame.read_csv(args.expected_output, delimiter=' ',
                                  header=False, column_type_hints=long)
    for node in result_graph.vertices:
        # A row tests positive when the expected label for this vertex differs.
        test = expected.apply(lambda x: node['label'] != x['X2'] and
                                        node['__id'] == x['X1'])
        if test.sum() > 0:
            print('Not all values match, invalid algorithm')
            exit(1)
def load_nxgraph_from_sgraph(graph_path):
    import networkx as nx
    sg = gl.load_sgraph(graph_path)
    g = nx.Graph()

    # Copy the vertices and edges from the SGraph into a NetworkX graph.
    # NetworkX expects 3-tuples of (u, v, attr_dict), so each edge's 'attr'
    # field is assumed to hold a dict of edge attributes.
    g.add_nodes_from(list(sg.vertices['__id']))
    g.add_edges_from([(e['__src_id'], e['__dst_id'], e['attr'])
                      for e in sg.edges])
    return g
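The reverse conversion is symmetric. A minimal sketch, assuming each NetworkX edge's data dict should be stored back in a single 'attr' field (nx_to_sgraph is a hypothetical helper, not part of either library):

def nx_to_sgraph(nxg):
    # Hypothetical inverse of load_nxgraph_from_sgraph.
    sg = gl.SGraph()
    sg = sg.add_vertices([gl.Vertex(v) for v in nxg.nodes()])
    sg = sg.add_edges([gl.Edge(u, v, attr={'attr': d})
                       for u, v, d in nxg.edges(data=True)])
    return sg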
def get_pr_result_from_input(input_file):
    g = gl.load_sgraph(input_file, format='csv')
    pr = gl.pagerank.create(g)
    pr_out = pr['pagerank']
    with open("output/pr_movie_start_result.csv", "w") as csvfile:
        writer = csv.writer(csvfile)
        for row in pr_out:
            writer.writerow([row['__id'], row['pagerank'], row['delta']])
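Since pr['pagerank'] is an SFrame, it can also be written without the csv module; a one-line alternative (note that, unlike the loop above, SFrame.save emits a header row):

pr['pagerank'].save("output/pr_movie_start_result.csv", format='csv')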
    @classmethod
    def _load_graphlab_object(cls, obj_type, obj_path):
        if obj_type == 'model':
            return graphlab.load_model(obj_path)
        elif obj_type == 'sarray':
            return graphlab.SArray(obj_path)
        elif obj_type == 'sframe':
            return graphlab.load_sframe(obj_path)
        elif obj_type == 'sgraph':
            return graphlab.load_sgraph(obj_path)
        else:
            raise RuntimeError(str(obj_type) + ' is not supported')
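The if/elif chain can equally be driven by a dispatch table; a sketch covering the same four supported types (_LOADERS is an assumed class-attribute name):

    _LOADERS = {
        'model': graphlab.load_model,
        'sarray': graphlab.SArray,
        'sframe': graphlab.load_sframe,
        'sgraph': graphlab.load_sgraph,
    }

    @classmethod
    def _load_graphlab_object(cls, obj_type, obj_path):
        try:
            return cls._LOADERS[obj_type](obj_path)
        except KeyError:
            raise RuntimeError(str(obj_type) + ' is not supported')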
    def __init__(self, sf_path=None, g_path=None, cache_max=0.75):
        self.sf = None
        self.label = None
        self.bin_sfs = None
        self.reps = gl.SArray(dtype=str)
        self.hier_graph = None
        self.num_bins = 0
        self.features = None
        self.distance = None
        self.cache_max = cache_max
        # Prefer a saved SGraph if one is given; otherwise fall back to a
        # saved SFrame.
        if g_path:
            self.g = gl.load_sgraph(g_path)
            self.sf = self.g.vertices
        elif sf_path:
            self.sf = gl.load_sframe(sf_path)
def main():
    args = parse_args('Test the result of the CommunityDetection algorithm')
    result_graph = gl.load_sgraph(args.graph_name)
    expected = gl.SFrame.read_csv(args.expected_output, delimiter=' ',
                                  header=False, column_type_hints=long)

    for node in result_graph.vertices.sort('__id'):
        test = expected.apply(
            lambda x: float_not_equals(x['X2'], node['local_clustering_coefficient'])
                      and node['__id'] == x['X1'])
        if test.sum() > 0:
            print('Not all values match, invalid algorithm')
            exit(1)

    # The row with X1 == 0 holds the expected graph-wide average.
    expected_average_cc = expected.filter_by([0], 'X1')['X2'][0]
    actual_average_cc = result_graph.vertices['average_clustering_coefficient'][0]
    if float_not_equals(expected_average_cc, actual_average_cc):
        print('Average Clustering Coefficient is wrong: expected: "%s", but got: "%s"' % (
            expected_average_cc, actual_average_cc))
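float_not_equals is not shown in this snippet; a plausible definition, assuming the intent is a tolerance-based float comparison:

def float_not_equals(a, b, tolerance=1e-6):
    # Floats count as unequal only when they differ by more than tolerance.
    return abs(a - b) > tolerance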
def load_gl_object(filename):
    """
    Load a GLC data structure from a filename.

    Parameters
    ----------
    filename : str
        Filename for the archive.

    Returns
    -------
    The GLC object, or None if nothing could be loaded.
    """
    if not os.path.exists(filename):
        raise IOError("Loading error: %s is not a valid filename." % filename)

    # The type on disk is unknown, so try each supported loader in turn.
    try:
        return _gl.load_sframe(filename)
    except Exception:
        pass
    try:
        return _gl.load_sgraph(filename)
    except Exception:
        pass
    try:
        return _gl.load_model(filename)
    except Exception:
        pass
    try:
        return _gl.SArray(data=filename)
    except Exception:
        pass

    return None
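A quick illustration of the fallback order, using an illustrative path:

sf = _gl.SFrame({'x': [1, 2, 3]})
sf.save('/tmp/demo.sframe')
loaded = load_gl_object('/tmp/demo.sframe')  # load_sframe succeeds on the first try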
Example #15
    def get_nodes_and_edges(self):
        directory_names = os.listdir(self._input_directory_path)
        graph_tuples = []
        for directory_name in directory_names:
            file_names = os.listdir(self._input_directory_path +
                                    directory_name)
            for file_name in file_names:
                if ".sgraph" in file_name:
                    print("File name is: {0}".format(file_name))
                    # Dots are escaped so they only match literal dots.
                    pattern = r"^([^.]+)__([^.]+)\.[^.]+\.([^.]+)\.sgraph$"
                    match = re.match(pattern, file_name)
                    if match is None:
                        continue
                    category, sub_category, timestamp = match.groups()

                    sub_graph = gl.load_sgraph(self._input_directory_path +
                                               directory_name + "/" +
                                               file_name)
                    sub_graph.save(self._output_directory_path + file_name +
                                   ".csv",
                                   format='csv')

                    summary_dict = sub_graph.summary()
                    num_vertices = summary_dict['num_vertices']
                    num_edges = summary_dict['num_edges']

                    graph_tuples.append((category, sub_category, timestamp,
                                         num_vertices, num_edges))

        df = pd.DataFrame(
            graph_tuples,
            columns=['category', 'sub_category', 'date', 'nodes', 'edges'])
        df.to_csv(self._output_directory_path + "graph_summary.csv")
# Truncated excerpt: the initialise and propagate helpers, along with the
# CONVERGENCE_VALUE and NUM_NON_SUPERNODES constants, are defined above this
# fragment; only the tail of l2Norm survives, so its signature is restored here.
def l2Norm(src, edge, dst):
    if numpy.linalg.norm(numpy.array(dst['memVector']), ord=2) > 0.0:
        if numpy.linalg.norm(
                numpy.array(dst['prev']) - numpy.array(dst['memVector']),
                ord=2) < CONVERGENCE_VALUE:
            dst['isSuperNode'] = 1
    return (src, edge, dst)


def updatePrev(src, edge, dst):
    if src['isSuperNode'] == 0:
        src['prev'] = src['memVector']
    if dst['isSuperNode'] == 0:
        dst['prev'] = dst['memVector']
    return (src, edge, dst)


if __name__ == '__main__':
    graph = gl.load_sgraph("s3://sdurgam/GraphLab/Graph")
    graph = graph.triple_apply(initialise, mutated_fields=['prev'])

    convergence = graph.vertices['isSuperNode'].sum()

    while (convergence < NUM_NON_SUPERNODES):
        graph = graph.triple_apply(propagate, mutated_fields=['memVector'])
        graph = graph.triple_apply(l2Norm, mutated_fields=['isSuperNode'])
        graph = graph.triple_apply(updatePrev, mutated_fields=['prev'])
        graph.vertices['memVector'] = graph.vertices['memVector'].apply(
            lambda x: [0.0] * 92000)
        convergence = graph.vertices['isSuperNode'].sum()

    graph.save("s3://sdurgam/GraphLab/Graph")  # save() returns None, so the result is not assigned
Example #17
import graphlab as gl
from graphlab.data_structures.sgraph import SGraph as _SGraph
import graphlab.aggregate as _Aggregate
from graphlab import SArray
from graphlab import SFrame
from graphlab import Vertex
from graphlab import SGraph
from graphlab import Edge 

g = gl.load_sgraph('/home/tweninge/wiki.graph')  
  
def initVertex(g):
    g.vertices['dist'] = 8888
    g.vertices['sent'] = 0
    #g.vertices['from_last_art'] = 0
    #g.vertices['count'] = 0
    g.vertices['isDead'] = 0
    #g.vertices['vid_set'] = SArray.from_const({}, g.summary()['num_vertices'])
    # 'seen' serves two purposes: on category vertices it remembers the
    # articles, and on article vertices it acts as the vid_set. It is a dict
    # of the form {'id': [dist, from_last_art]}.
    g.vertices['seen'] = SArray.from_const({}, g.summary()['num_vertices'])
    #g.vertices['msg_q'] = SArray.from_const([], g.summary()['num_vertices'])  # no longer needed

#g = gl.load_graph('/Users/liuzuozhu/Downloads/web-Google.txt', format='snap')   

initVertex(g)
#print g.get_vertices()

# def initEdge(g):  
#     #g.edges.head(5) 
Example #18
import os
import graphlab as gl
data_file = 'US_business_links'
if os.path.exists(data_file):
    sg = gl.load_sgraph(data_file)
    # sg.save('1', format='csv')
else:
    url = 'https://static.turi.com/datasets/' + data_file
    sg = gl.load_sgraph(url)
    sg.save(data_file)
print(sg.summary())

pr = gl.pagerank.create(sg, max_iterations=10)
# print pr['pagerank']
print(pr.summary())

pr_out = pr['pagerank']
print(pr_out.topk('pagerank', k=10))

##Triangle counting
##The number of triangles in a vertex's immediate neighborhood is a measure of the "density" of the vertex's neighborhood.
tri = gl.triangle_counting.create(sg)
print(tri.summary())

tri_out = tri['triangle_count']
print(tri_out.topk('triangle_count', k=10))

##Because GraphLab Create SGraphs use directed edges, the shortest path toolkit also finds the shortest directed paths to a source vertex.

sssp = gl.shortest_path.create(sg, source_vid='Microsoft')
sssp.get_path(vid='Weyerhaeuser')
Example #19
import graphlab as gl

from graphlab import SFrame, SGraph, Vertex, Edge
edge_data = SFrame.read_csv(
    'http://s3.amazonaws.com/dato-datasets/bond/bond_edges.csv')

g = SGraph()
g = g.add_edges(edge_data, src_field='src', dst_field='dst')
print(g)

g.save('james_bond')
new_graph = gl.load_sgraph('james_bond')

g.show(vlabel='id', highlight=['James Bond', 'Moneypenny'], arrows=True)
Example #20
    results.save('buckets/' + b)
print('Nearest Neighbors Complete!\n')
# Create Graphs

# reload the SFrame
sf = gl.load_sframe('sydney_sf')

print('Creating Graphs...')
for i in range(1, 671):
    print('Creating Graph for B' + str(i))
    cur_edges = gl.load_sframe('buckets/B' + str(i))
    edge_verts = cur_edges['query_label'].append(cur_edges['reference_label'])
    edge_verts = edge_verts.unique()
    cur_verts = sf.filter_by(edge_verts, 'mongo_id')
    g = gl.SGraph(cur_verts, cur_edges,
                  vid_field='mongo_id',
                  src_field='query_label',
                  dst_field='reference_label')
    g.save('graphs/B' + str(i))
print('Graph Creation Complete!\n')

# Calculate Components
print('Calculating Components...')
for i in range(1, 671):
    print('Calculating Components for B' + str(i))
    g = gl.load_sgraph('graphs/B' + str(i))
    cc = gl.connected_components.create(g)
    cc.save('components/B' + str(i))
print('Success!')
exit()
def sgraph_to_csv(sgraph_path, output_path):
    # Re-save a binary SGraph in CSV format.
    sg = gl.load_sgraph(sgraph_path)
    sg.save(output_path, format='csv')
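A hypothetical call, reusing the bucket graphs saved in Example #20 (paths are illustrative):

sgraph_to_csv('graphs/B1', 'csv_out/B1')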
Example #22
import graphlab as gl

g = gl.load_sgraph('input/test.txt', format='snap')
pr = gl.pagerank.create(g)
pr_out = pr['pagerank']

print "#########"
for pr_out_index in pr_out:
    print pr_out_index

Example #23
import graphlab as gl

g = gl.load_sgraph('page_graph')

N = len(g.vertices)
beta = 0.02
epsilon = 150

f = open('topk_weight', 'w')

g.vertices['weight'] = 1.0
g.vertices['degree'] = 0


def increment_degree(src, edge, dst):
    src['degree'] += 1
    return (src, edge, dst)


def increment_weight(src, edge, dst):
    dst['weight_new'] += src['weight'] / src['degree']
    return (src, edge, dst)


g = g.triple_apply(increment_degree, mutated_fields=['degree'])

while True:
    g.vertices['weight_new'] = 0
    # triple_apply returns a new SGraph, so rebind g to keep the update.
    g = g.triple_apply(increment_weight, mutated_fields=['weight_new'])
    g.vertices['weight_new'] = beta / N + (
        1 - beta) * (g.vertices['weight_new'] +
Example #24
    itr = 0
    for i in range(0, 10):
        vector.append(array.index(max(array)) + NUM_FIRST + itr)
        array.remove(max(array))
        itr += 1
    return vector


def getNumRecos(src, edge, dst):
    src['rightRecos'] = len(set(src['recos']).intersection(src['groundTruth']))
    dst['rightRecos'] = len(set(dst['recos']).intersection(dst['groundTruth']))
    return (src, edge, dst)


if __name__ == "__main__":
    graph = gl.load_sgraph("s3://sank/GraphLab/Graph")
    groundTruth = gl.SArray("GroundTruth")

    graph.vertices['recos'] = graph.vertices['prev'].apply(
        lambda x: getRecos(x))

    graph.vertices['groundTruth'] = groundTruth
    graph.vertices['rightRecos'] = 0
    graph = graph.triple_apply(getNumRecos, mutated_fields=['rightRecos'])

    r1 = list(
        graph.vertices.sort('__id')['rightRecos'][NUM_SUPERNODES:]).count(1)
    r2 = list(
        graph.vertices.sort('__id')['rightRecos'][NUM_SUPERNODES:]).count(2)
    r3 = list(
        graph.vertices.sort('__id')['rightRecos'][NUM_SUPERNODES:]).count(3)