Ejemplo n.º 1
0
def kmeansDemo(conn):
    '''
       Demonstrate K-Means and visualize the resulting clusters.

       Steps performed:
         a) run KMeans.generateClusters on 'public.wine_bool_training_set'
            (column 'indep', 3 clusters) and keep the returned centroids;
         b) run it again with the 'kmeanspp' option;
         c) query the squared distance between every pair of rows of
            'wine_bool_training_set' and store each undirected pair once;
         d) query the closest centroid (from step a) for every row;
         e) pass the graph, the sorted distinct distances and the cluster
            assignment to kmeansViz.

       conn : project connection object. This function calls
              conn.getMADlibSchema() and conn.getConnection() on it and
              passes it to KMeans.

       Returns nothing.
    '''
    #a) K-Means with random initialization of centroids
    kmeans = KMeans(conn)
    logging.info('KMeans with random cluster initialization')
    #mdl_params (second returned value) is not used any further in this demo.
    mdl, mdl_params = kmeans.generateClusters('public.wine_bool_training_set','indep',3)
    #Turn the [..] string form of the centroids into {..} so that it can be
    #cast to double precision[] inside the membership query below.
    centroids_random_kmeans = str(mdl.get('centroids'))
    centroids_random_kmeans = centroids_random_kmeans.replace('[','{').replace(']','}')

    #b) KMeans Plus Plus
    logging.info('KMeans Plus Plus ')
    #The value returned by this run is not used below: the visualization
    #relies on the centroids obtained in (a).
    kmeans.generateClusters('public.wine_bool_training_set','indep',3,'kmeanspp')

    #Show a visualization of the clusters.
    #1) Compute the strength of the relationship between all pairs of points and capture this in a graph

    stmt = '''
              select t1.id as node1, 
                     t2.id as node2, 
                     {madlib_schema}.squared_dist_norm2(t1.indep, t2.indep) as dist 
              from {table_name} t1, {table_name} t2;  
    '''.format(
        table_name='wine_bool_training_set',
        madlib_schema=conn.getMADlibSchema()
    )
    #NOTE(review): results is presumably a pandas DataFrame -- confirm against psql.read_frame.
    results = psql.read_frame(stmt, conn.getConnection())
    #Look each column up once instead of once per row.
    node1_col = results.get('node1')
    node2_col = results.get('node2')
    dist_col = results.get('dist')
    dist_dict = {}
    edge_set = set()
    for r in range(len(results)):
        node1 = node1_col[r]
        node2 = node2_col[r]
        #We are building undirected graph, so don't add back edges:
        #(a, b) and (b, a) map to the same order-independent key.
        ed = tuple(sorted((node1, node2)))
        if ed not in edge_set:
            edge_set.add(ed)
            #setdefault replaces dict.has_key, which does not exist in Python 3.
            dist_dict.setdefault(node1, {})[node2] = dist_col[r]
    #2) Sorted distinct distances. No edge is pruned here; presumably kmeansViz
    #   applies the 90th percentile cut-off mentioned originally -- TODO confirm.
    dist_arr = sorted(set(dist_col))
    #3) Display the resulting graph where nodes are colored by their cluster number.
    # Also, nodes in the same cluster should be physically close to each other.
    #Get cluster allocation for deciding colors
    cluster_membership_query = '''
        select id as instance_id,
               ({madlib_schema}.closest_column(
                    '{centroids}'::double precision[],
                    indep, 
                    '{madlib_schema}.squared_dist_norm2'
                )
               ).column_id as cluster_num
        from {table_name};
    '''.format(
        centroids=centroids_random_kmeans,
        table_name='wine_bool_training_set',
        madlib_schema=conn.getMADlibSchema()
    )

    results = psql.read_frame(cluster_membership_query, conn.getConnection())
    #Map every row id to the number of its closest centroid.
    cluster_memberships = dict(zip(results.get('instance_id'),results.get('cluster_num')))

    #Visualize
    kmeansViz(dist_dict,dist_arr,cluster_memberships)
Ejemplo n.º 2
0
def kmeansDemo(conn):
    '''
       Demonstrate K-Means and visualize the resulting clusters.

       Steps performed:
         a) run KMeans.generateClusters on 'public.wine_bool_training_set'
            (column 'indep', 3 clusters) and keep the returned centroids;
         b) run it again with the 'kmeanspp' option;
         c) query the squared distance between every pair of rows of
            'wine_bool_training_set' and store each undirected pair once;
         d) query the closest centroid (from step a) for every row;
         e) pass the graph, the sorted distinct distances and the cluster
            assignment to kmeansViz.

       conn : project connection object. This function calls
              conn.getMADlibSchema() and conn.getConnection() on it and
              passes it to KMeans.

       Returns nothing.
    '''
    #a) K-Means with random initialization of centroids
    kmeans = KMeans(conn)
    logging.info('KMeans with random cluster initialization')
    #mdl_params (second returned value) is not used any further in this demo.
    mdl, mdl_params = kmeans.generateClusters('public.wine_bool_training_set',
                                              'indep', 3)
    #Turn the [..] string form of the centroids into {..} so that it can be
    #cast to double precision[] inside the membership query below.
    centroids_random_kmeans = str(mdl.get('centroids'))
    centroids_random_kmeans = centroids_random_kmeans.replace(
        '[', '{').replace(']', '}')

    #b) KMeans Plus Plus
    logging.info('KMeans Plus Plus ')
    #The value returned by this run is not used below: the visualization
    #relies on the centroids obtained in (a).
    kmeans.generateClusters('public.wine_bool_training_set', 'indep', 3,
                            'kmeanspp')

    #Show a visualization of the clusters.
    #1) Compute the strength of the relationship between all pairs of points and capture this in a graph

    stmt = '''
              select t1.id as node1, 
                     t2.id as node2, 
                     {madlib_schema}.squared_dist_norm2(t1.indep, t2.indep) as dist 
              from {table_name} t1, {table_name} t2;  
    '''.format(table_name='wine_bool_training_set',
               madlib_schema=conn.getMADlibSchema())
    #NOTE(review): results is presumably a pandas DataFrame -- confirm against psql.read_frame.
    results = psql.read_frame(stmt, conn.getConnection())
    #Look each column up once instead of once per row.
    node1_col = results.get('node1')
    node2_col = results.get('node2')
    dist_col = results.get('dist')
    dist_dict = {}
    edge_set = set()
    for r in range(len(results)):
        node1 = node1_col[r]
        node2 = node2_col[r]
        #We are building undirected graph, so don't add back edges:
        #(a, b) and (b, a) map to the same order-independent key.
        ed = tuple(sorted((node1, node2)))
        if ed not in edge_set:
            edge_set.add(ed)
            #setdefault replaces dict.has_key, which does not exist in Python 3.
            dist_dict.setdefault(node1, {})[node2] = dist_col[r]
    #2) Sorted distinct distances. No edge is pruned here; presumably kmeansViz
    #   applies the 90th percentile cut-off mentioned originally -- TODO confirm.
    dist_arr = sorted(set(dist_col))
    #3) Display the resulting graph where nodes are colored by their cluster number.
    # Also, nodes in the same cluster should be physically close to each other.
    #Get cluster allocation for deciding colors
    cluster_membership_query = '''
        select id as instance_id,
               ({madlib_schema}.closest_column(
                    '{centroids}'::double precision[],
                    indep, 
                    '{madlib_schema}.squared_dist_norm2'
                )
               ).column_id as cluster_num
        from {table_name};
    '''.format(centroids=centroids_random_kmeans,
               table_name='wine_bool_training_set',
               madlib_schema=conn.getMADlibSchema())

    results = psql.read_frame(cluster_membership_query, conn.getConnection())
    #Map every row id to the number of its closest centroid.
    cluster_memberships = dict(
        zip(results.get('instance_id'), results.get('cluster_num')))

    #Visualize
    kmeansViz(dist_dict, dist_arr, cluster_memberships)
Ejemplo n.º 3
0
def kmeansDemo(conn):
    '''
       Demonstrate K-Means and visualize the resulting clusters.

       Steps performed:
         a) run KMeans.generateClusters on 'public.wine_bool_training_set'
            (column 'indep', 3 clusters) and keep the returned centroids;
         b) run it again with the 'kmeanspp' option;
         c) query the squared distance between every pair of rows of
            'wine_bool_training_set' and store each undirected pair once;
         d) query the closest centroid (from step a) for every row;
         e) pass the graph, the sorted distinct distances and the cluster
            assignment to kmeansViz.

       conn : project connection object. This function calls
              conn.getCursor() on it and passes it to KMeans.

       Returns nothing.
    '''
    #a) K-Means with random initialization of centroids
    kmeans = KMeans(conn)
    #Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('\n\nKMeans with random cluster initialization')
    mdl = kmeans.generateClusters('public.wine_bool_training_set','indep',3)
    #Turn the [..] string form of the centroids into {..} so that it can be
    #cast to double precision[] inside the membership query below.
    centroids_random_kmeans = str(mdl.get('centroids'))
    centroids_random_kmeans = centroids_random_kmeans.replace('[','{').replace(']','}')

    #b) KMeans Plus Plus
    print('\n\nKMeans Plus Plus ')
    #The value returned by this run is not used below: the visualization
    #relies on the centroids obtained in (a).
    kmeans.generateClusters('public.wine_bool_training_set','indep',3,'kmeanspp')

    #Show a visualization of the clusters.
    #1) Compute the strength of the relationship between all pairs of points and capture this in a graph
    pairwise_query = '''
                      select t1.id as node1, 
                             t2.id as node2, 
                             madlib.squared_dist_norm2(t1.indep, t2.indep) as dist 
                      from {table_name} t1, {table_name} t2;  
                   '''.format(table_name='wine_bool_training_set')
    cursor = conn.getCursor()
    try:
        cursor.execute(pairwise_query)
        #Rows are read with .get(column) below, i.e. they are dict-like.
        result_set = list(cursor)
    finally:
        #Release the cursor even when the query or the fetch fails.
        cursor.close()

    dist_dict = {}
    edge_set = set()
    for r in result_set:
        node1 = r.get('node1')
        node2 = r.get('node2')
        #We are building undirected graph, so don't add back edges:
        #(a, b) and (b, a) map to the same order-independent key.
        ed = tuple(sorted((node1, node2)))
        if ed not in edge_set:
            edge_set.add(ed)
            #setdefault replaces dict.has_key, which does not exist in Python 3.
            dist_dict.setdefault(node1, {})[node2] = r.get('dist')

    #2) Sorted distinct distances. No edge is pruned here; presumably kmeansViz
    #   applies the 90th percentile cut-off mentioned originally -- TODO confirm.
    dist_arr = sorted(set(r.get('dist') for r in result_set))
    #3) Display the resulting graph where nodes are colored by their cluster number.
    # Also, nodes in the same cluster should be physically close to each other.
    #Get cluster allocation for deciding colors
    cluster_membership_query = '''
                                    select id as instance_id,
                                           (madlib.closest_column(
                                                                  '{centroids}'::double precision[],
                                                                  indep, 
                                                                  'madlib.squared_dist_norm2'
                                                                 )
                                           ).column_id as cluster_num
                                    from {table_name};
                               '''.format(
                                           centroids=centroids_random_kmeans,
                                           table_name='wine_bool_training_set'
                                         )

    #Map every row id to the number of its closest centroid.
    cluster_memberships = {}
    cursor = conn.getCursor()
    try:
        cursor.execute(cluster_membership_query)
        for r in cursor:
            cluster_memberships[r.get('instance_id')] = r.get('cluster_num')
    finally:
        cursor.close()
    #Visualize
    kmeansViz(dist_dict,dist_arr,cluster_memberships)