Example No. 1
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors and a set of centroids, and
    returns a new set of centroids along with the total distance from the points to the centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    This function uses Spark to distribute the aggregation among the nodes.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double,
                                shape=[None, num_features],
                                name='features')
        # The shape of the block is extracted as a TF tensor.
        num_points = tf.stack([tf.shape(points)[0]], name="num_points")
        distances = tf_compute_distances(points, start_centers)
        # The outputs of the program.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        # This could be done based on the indexes as well.
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        counts = tf.tile(tf.constant([1]), num_points, name='count')
        df2 = tfs.map_blocks([indexes, counts, min_distances], dataframe)
    # Perform the reduction: we regroup the points by their centroid indexes.
    gb = df2.groupBy("indexes")
    with tf.Graph().as_default() as g:
        # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
        x_input = tfs.block(df2, "features", tf_name="features_input")
        count_input = tfs.block(df2, "count", tf_name="count_input")
        md_input = tfs.block(df2,
                             "min_distances",
                             tf_name="min_distances_input")
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='features')
        count = tf.reduce_sum(count_input, [0], name='count')
        min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
        df3 = tfs.aggregate([x, count, min_distances], gb)
    # Get the new centroids
    df3_c = df3.collect()
    # The new centroids.
    new_centers = np.array(
        [np.array(row.features) / row['count'] for row in df3_c])
    total_distances = np.sum([row['min_distances'] for row in df3_c])
    return (new_centers, total_distances)
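A minimal driver loop around run_one_step, as a sketch: it assumes `dataframe` already carries the `features` column and that `init_centers` is a k x m NumPy array; `kmeans`, `max_iters` and `tol` are illustrative names, not part of the example above.

import numpy as np

def kmeans(dataframe, init_centers, max_iters=20, tol=1e-4):
    # Iterate run_one_step until the total distance stops improving.
    centers = init_centers
    last_distance = np.inf
    for _ in range(max_iters):
        (centers, total_distance) = run_one_step(dataframe, centers)
        if last_distance - total_distance < tol:
            break
        last_distance = total_distance
    return centers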
Example No. 2
def test_groupby_1(self):
    data = [Row(x=float(x), key=str(x % 2)) for x in range(4)]
    df = self.sql.createDataFrame(data)
    gb = df.groupBy("key")
    with tf.Graph().as_default() as g:
        x_input = tfs.block(df, "x", tf_name="x_input")
        x = tf.reduce_sum(x_input, [0], name='x')
        df2 = tfs.aggregate(x, gb)
    data2 = df2.collect()
    assert data2 == [Row(key='0', x=2.0), Row(key='1', x=4.0)], data2
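The test above relies on the naming convention that tfs.aggregate expects: each placeholder is named after its source column with an `_input` suffix, and each output tensor is named after the column it produces. A minimal two-column sketch of the same pattern, with hypothetical data (assumes a `sqlContext` as in the later examples):

from pyspark.sql import Row

# Hypothetical data: sums of both 'x' and 'y' are computed per key.
data = [Row(key=str(i % 2), x=float(i), y=float(i * i)) for i in range(4)]
df = sqlContext.createDataFrame(data)
gb = df.groupBy("key")
with tf.Graph().as_default() as g:
    # Placeholders follow the '<column>_input' naming convention.
    x_input = tfs.block(df, "x", tf_name="x_input")
    y_input = tfs.block(df, "y", tf_name="y_input")
    # Each output is named after the column it produces.
    x = tf.reduce_sum(x_input, [0], name='x')
    y = tf.reduce_sum(y_input, [0], name='y')
    df2 = tfs.aggregate([x, y], gb)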
Example No. 3
def covariance(df, feature_column, num_features, coarse_center):
    # First pass: for each row, compute the outer product x x^T and a
    # constant 1.0 that will later be summed to count the rows per group.
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column)
        count = tf.identity(tf.ones_like(features)[:, 0], name='count')
        out = tf.identity(tf.map_fn(lambda x: tf.einsum('i,j->ij', x, x),
                                    features,
                                    dtype=tf.double),
                          name='out')
        df1 = tfs.map_blocks([out, count], df)
    # Second pass: sum the features, the outer products and the counts
    # within each coarse cluster.
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column + '_input')
        out = tf.placeholder(tf.double, [None, num_features, num_features],
                             name='out_input')
        count = tf.placeholder(tf.double, [None], name='count_input')
        expected_mean = tf.identity(tf.reduce_sum(features, axis=0),
                                    name=feature_column)
        expected_out = tf.identity(tf.reduce_sum(out, axis=0), name='out')
        expected_count = tf.identity(tf.reduce_sum(count, axis=0),
                                     name='count')
        df2 = tfs.aggregate([expected_mean, expected_out, expected_count],
                            df1.groupby(coarse_center))
    # Third pass: combine the per-group sums into a covariance estimate.
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column)
        out = tf.placeholder(tf.double, [None, num_features, num_features],
                             name='out')
        count = tf.placeholder(tf.double, [None], name='count')

        def _combine(args):
            f, o, c = args
            return ((o + tf.transpose(o)) / (2 * c - 2)
                    - tf.einsum('i,j->ij', f, f))

        covariance = tf.identity(tf.map_fn(_combine, (features, out, count),
                                           dtype=tf.double),
                                 name='covariance')
        df3 = tfs.map_blocks(covariance, df2)
    return df3
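A possible invocation of the helper above, with illustrative data; the row values, column names and feature count here are assumptions, not part of the example:

from pyspark.sql import Row

# Hypothetical usage: two features, two coarse clusters.
rows = [Row(features=[float(i), float(i + 1)], coarse_center=i % 2)
        for i in range(6)]
df = tfs.analyze(sqlContext.createDataFrame(rows))
cov_df = covariance(df, "features", num_features=2, coarse_center="coarse_center")
cov_df.show()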
Example No. 4
    indexes = tf.argmin(distances, 1, name='indexes')
    min_distances = tf.reduce_min(distances, 1, name='min_distances')
    counts = tf.tile(tf.constant([1]), tf.stack([num_points]), name='count')
    df2 = tfs.map_blocks([indexes, counts, min_distances], df0)

# Perform the reduction
gb = df2.groupBy("indexes")
with tf.Graph().as_default() as g:
    # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
    x_input = tfs.block(df2, "features", tf_name="features_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
    x = tf.reduce_sum(x_input, [0], name='features')
    count = tf.reduce_sum(count_input, [0], name='count')
    min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
    df3 = tfs.aggregate([x, count, min_distances], gb)

# Get the new centroids
df3_c = df3.collect()
new_centers = np.array([np.array(row.features) / row['count'] for row in df3_c])
total_distances = np.sum([row['min_distances'] for row in df3_c])


def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors and a set of centroids, and
    returns a new set of centroids along with the total distance from the points to the centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.
    """
Example No. 5
from tensorframes.core import _java_api
japi = _java_api()
japi.initialize_logging()

data = [Row(x=float(x), key=str(x // 3)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
tfs.block(df, "x")

data = [Row(x=float(x), key=str(x // 3)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
gb = df.groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df, "x", tf_name="x_input")
    x = tf.reduce_sum(x_input, [0], name='x')
    df2 = tfs.aggregate(x, gb)


data = [Row(x=float(x)) for x in range(5)]
df = sqlContext.createDataFrame(data)
with tf.Graph().as_default() as g:
    # The placeholder that corresponds to column 'x'
    x = tf.placeholder(tf.double, shape=[None], name="x")
    # The output that adds 3 to x
    z = tf.add(x, 3, name='z')
    # The resulting dataframe
    df2 = tfs.map_blocks(z, df)

df2.show()
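The snippet above computes z = x + 3 block by block, so with inputs 0.0 through 4.0 the new column holds 3.0 through 7.0. A row-oriented sketch of the same computation, assuming tensorframes' row API (tfs.row and tfs.map_rows process one row at a time, so the placeholder has no leading block dimension):

with tf.Graph().as_default() as g:
    # One placeholder per row instead of per block.
    x = tfs.row(df, "x")
    z = tf.add(x, 3, name='z')
    df2 = tfs.map_rows(z, df)
df2.show()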

Example No. 6
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    invs = tf.inv(tf.to_double(x), name="invs")
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)


# The harmonic mean
gb = df2.select(col_key, "invs", "count").groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df2, "invs", tf_name="invs_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    x = tf.reduce_sum(x_input, [0], name='invs')
    count = tf.reduce_sum(count_input, [0], name='count')
    df3 = tfs.aggregate([x, count], gb)

with tf.Graph().as_default() as g:
    invs = tfs.block(df2, "invs")
    count = tfs.block(df2, "count")
    harmonic_mean = tf.div(tf.to_double(count), invs, name="harmonic_mean")
    df4 = tfs.map_blocks(harmonic_mean, df3).select("key", "harmonic_mean")

df4.collect()
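End to end, the three graphs above compute the per-key harmonic mean n / sum(1/x): the first pass emits 1/x and a 1 per row, the aggregation sums both per key, and the last pass divides. A plain NumPy sanity check of the same quantity (illustrative values only):

import numpy as np

# Harmonic mean of [1, 2, 4] computed locally.
xs = np.array([1.0, 2.0, 4.0])
harmonic = len(xs) / np.sum(1.0 / xs)  # 3 / 1.75 = ~1.714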


with tf.Graph().as_default() as g:
    x = tf.zeros([], tf.int32)
    y = tf.zeros(x)
    print(y.graph.as_graph_def())
Example No. 7
df = tfs.analyze(sqlContext.createDataFrame(data))

# The harmonic mean:
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    invs = tf.inv(tf.to_double(x), name="invs")
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)

# The harmonic mean
gb = df2.select(col_key, "invs", "count").groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df2, "invs", tf_name="invs_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    x = tf.reduce_sum(x_input, [0], name='invs')
    count = tf.reduce_sum(count_input, [0], name='count')
    df3 = tfs.aggregate([x, count], gb)

with tf.Graph().as_default() as g:
    invs = tfs.block(df2, "invs")
    count = tfs.block(df2, "count")
    harmonic_mean = tf.div(tf.to_double(count), invs, name="harmonic_mean")
    df4 = tfs.map_blocks(harmonic_mean, df3).select("key", "harmonic_mean")

df4.collect()
Example No. 8
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors and a set of centroids, and
    returns a new set of centroids along with the total distance from the points to the centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The shape of the block is extracted as a TF tensor.
        num_points = tf.shape(points)[0]
        # The centers are embedded in the TF program.
        centers = tf.constant(start_centers)
        # Computation of the minimum distance. This is a standard implementation that follows
        # what MLlib does.
        squares = tf.reduce_sum(tf.square(points), reduction_indices=1)
        center_squares = tf.reduce_sum(tf.square(centers), reduction_indices=1)
        prods = tf.matmul(points, centers, transpose_b=True)
        # This code simply expresses two outer products: center_squares * ones(num_points)
        # and ones(num_centroids) * squares
        t1a = tf.expand_dims(center_squares, 0)
        t1b = tf.stack([num_points, 1])
        t1 = tf.tile(t1a, t1b)
        t2a = tf.expand_dims(squares, 1)
        t2b = tf.stack([1, num_centroids])
        t2 = tf.tile(t2a, t2b)
        distances = t1 + t2 - 2 * prods
        # The outputs of the program.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        # This could be done based on the indexes as well.
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        counts = tf.tile(tf.constant([1]), tf.stack([num_points]), name='count')
        df2 = tfs.map_blocks([indexes, counts, min_distances], dataframe)
    # Perform the reduction: we regroup the points by their centroid indexes.
    gb = df2.groupBy("indexes")
    with tf.Graph().as_default() as g:
        # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
        x_input = tfs.block(df2, "features", tf_name="features_input")
        count_input = tfs.block(df2, "count", tf_name="count_input")
        md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='features')
        count = tf.reduce_sum(count_input, [0], name='count')
        min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
        df3 = tfs.aggregate([x, count, min_distances], gb)
    # Get the new centroids
    df3_c = df3.collect()
    # The new centroids.
    new_centers = np.array([np.array(row.features) / row['count'] for row in df3_c])
    total_distances = np.sum([row['min_distances'] for row in df3_c])
    return (new_centers, total_distances)
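The distance block above uses the identity ||p - c||^2 = ||p||^2 + ||c||^2 - 2 p.c, tiled across all point/centroid pairs. A NumPy sketch of the same trick, with illustrative shapes, checked against the direct computation:

import numpy as np

points = np.random.rand(5, 3)   # 5 points with 3 features
centers = np.random.rand(2, 3)  # 2 centroids
squares = np.sum(points ** 2, axis=1)          # ||p||^2, shape (5,)
center_squares = np.sum(centers ** 2, axis=1)  # ||c||^2, shape (2,)
prods = points.dot(centers.T)                  # p . c,   shape (5, 2)
distances = squares[:, None] + center_squares[None, :] - 2 * prods
# The direct computation agrees.
assert np.allclose(
    distances,
    ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1))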