Example No. 1
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors and a set of centroids, and
    returns a new set of centroids along with the total distance from the points to the centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    This function uses Spark to distribute the aggregation among the nodes.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double,
                                shape=[None, num_features],
                                name='features')
        # The shape of the block is extracted as a TF tensor.
        num_points = tf.stack([tf.shape(points)[0]], name="num_points")
        distances = tf_compute_distances(points, start_centers)
        # The outputs of the program.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        # This could be done based on the indexes as well.
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        counts = tf.tile(tf.constant([1]), num_points, name='count')
        df2 = tfs.map_blocks([indexes, counts, min_distances], dataframe)
    # Perform the reduction: we regroup the points by their centroid indexes.
    gb = df2.groupBy("indexes")
    with tf.Graph().as_default() as g:
        # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
        x_input = tfs.block(df2, "features", tf_name="features_input")
        count_input = tfs.block(df2, "count", tf_name="count_input")
        md_input = tfs.block(df2,
                             "min_distances",
                             tf_name="min_distances_input")
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='features')
        count = tf.reduce_sum(count_input, [0], name='count')
        min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
        df3 = tfs.aggregate([x, count, min_distances], gb)
    # Get the new centroids
    df3_c = df3.collect()
    # The new centroids.
    new_centers = np.array(
        [np.array(row.features) / row['count'] for row in df3_c])
    total_distances = np.sum([row['min_distances'] for row in df3_c])
    return (new_centers, total_distances)
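A minimal driver loop around run_one_step, as a sketch: it assumes `dataframe` already carries the `features` column and that `init_centers` is a k x m NumPy array; `kmeans`, `max_iters` and `tol` are illustrative names, not part of the example above.

import numpy as np

def kmeans(dataframe, init_centers, max_iters=20, tol=1e-4):
    # Iterate run_one_step until the total distance stops improving.
    centers = init_centers
    last_distance = np.inf
    for _ in range(max_iters):
        (centers, total_distance) = run_one_step(dataframe, centers)
        if last_distance - total_distance < tol:
            break
        last_distance = total_distance
    return centers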
Example No. 2
def test_groupby_1(self):
    data = [Row(x=float(x), key=str(x % 2)) for x in range(4)]
    df = self.sql.createDataFrame(data)
    gb = df.groupBy("key")
    with tf.Graph().as_default() as g:
        x_input = tfs.block(df, "x", tf_name="x_input")
        x = tf.reduce_sum(x_input, [0], name='x')
        df2 = tfs.aggregate(x, gb)
    data2 = df2.collect()
    assert data2 == [Row(key='0', x=2.0), Row(key='1', x=4.0)], data2
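The test above relies on the naming convention that tfs.aggregate expects: each placeholder is named after its source column with an `_input` suffix, and each output tensor is named after the column it produces. A minimal two-column sketch of the same pattern, with hypothetical data (assumes a `sqlContext` as in the later examples):

from pyspark.sql import Row

# Hypothetical data: sums of both 'x' and 'y' are computed per key.
data = [Row(key=str(i % 2), x=float(i), y=float(i * i)) for i in range(4)]
df = sqlContext.createDataFrame(data)
gb = df.groupBy("key")
with tf.Graph().as_default() as g:
    # Placeholders follow the '<column>_input' naming convention.
    x_input = tfs.block(df, "x", tf_name="x_input")
    y_input = tfs.block(df, "y", tf_name="y_input")
    # Each output is named after the column it produces.
    x = tf.reduce_sum(x_input, [0], name='x')
    y = tf.reduce_sum(y_input, [0], name='y')
    df2 = tfs.aggregate([x, y], gb)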
Example No. 3
def covariance(df, feature_column, num_features, coarse_center):
    # First pass: for each row, compute the outer product x x^T and a
    # constant 1.0 that will later be summed to count the rows per group.
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column)
        count = tf.identity(tf.ones_like(features)[:, 0], name='count')
        out = tf.identity(tf.map_fn(lambda x: tf.einsum('i,j->ij', x, x),
                                    features,
                                    dtype=tf.double),
                          name='out')
        df1 = tfs.map_blocks([out, count], df)
    # Second pass: sum the features, the outer products and the counts
    # within each coarse cluster.
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column + '_input')
        out = tf.placeholder(tf.double, [None, num_features, num_features],
                             name='out_input')
        count = tf.placeholder(tf.double, [None], name='count_input')
        expected_mean = tf.identity(tf.reduce_sum(features, axis=0),
                                    name=feature_column)
        expected_out = tf.identity(tf.reduce_sum(out, axis=0), name='out')
        expected_count = tf.identity(tf.reduce_sum(count, axis=0),
                                     name='count')
        df2 = tfs.aggregate([expected_mean, expected_out, expected_count],
                            df1.groupby(coarse_center))
    # Third pass: combine the per-group sums into a covariance estimate.
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column)
        out = tf.placeholder(tf.double, [None, num_features, num_features],
                             name='out')
        count = tf.placeholder(tf.double, [None], name='count')

        def _combine(args):
            f, o, c = args
            return ((o + tf.transpose(o)) / (2 * c - 2)
                    - tf.einsum('i,j->ij', f, f))

        covariance = tf.identity(tf.map_fn(_combine, (features, out, count),
                                           dtype=tf.double),
                                 name='covariance')
        df3 = tfs.map_blocks(covariance, df2)
    return df3
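A possible invocation of the helper above, with illustrative data; the row values, column names and feature count here are assumptions, not part of the example:

from pyspark.sql import Row

# Hypothetical usage: two features, two coarse clusters.
rows = [Row(features=[float(i), float(i + 1)], coarse_center=i % 2)
        for i in range(6)]
df = tfs.analyze(sqlContext.createDataFrame(rows))
cov_df = covariance(df, "features", num_features=2, coarse_center="coarse_center")
cov_df.show()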
Example No. 4
    indexes = tf.argmin(distances, 1, name='indexes')
    min_distances = tf.reduce_min(distances, 1, name='min_distances')
    counts = tf.tile(tf.constant([1]), tf.stack([num_points]), name='count')
    df2 = tfs.map_blocks([indexes, counts, min_distances], df0)

# Perform the reduction
gb = df2.groupBy("indexes")
with tf.Graph().as_default() as g:
    # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
    x_input = tfs.block(df2, "features", tf_name="features_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
    x = tf.reduce_sum(x_input, [0], name='features')
    count = tf.reduce_sum(count_input, [0], name='count')
    min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
    df3 = tfs.aggregate([x, count, min_distances], gb)

# Get the new centroids
df3_c = df3.collect()
new_centers = np.array([np.array(row.features) / row['count'] for row in df3_c])
total_distances = np.sum([row['min_distances'] for row in df3_c])


def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors and a set of centroids, and
    returns a new set of centroids along with the total distance from the points to the centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.
    """
Example No. 5
from tensorframes.core import _java_api
japi = _java_api()
japi.initialize_logging()

data = [Row(x=float(x), key=str(x // 3)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
tfs.block(df, "x")

data = [Row(x=float(x), key=str(x // 3)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
gb = df.groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df, "x", tf_name="x_input")
    x = tf.reduce_sum(x_input, [0], name='x')
    df2 = tfs.aggregate(x, gb)


data = [Row(x=float(x)) for x in range(5)]
df = sqlContext.createDataFrame(data)
with tf.Graph().as_default() as g:
    # The placeholder that corresponds to column 'x'
    x = tf.placeholder(tf.double, shape=[None], name="x")
    # The output that adds 3 to x
    z = tf.add(x, 3, name='z')
    # The resulting dataframe
    df2 = tfs.map_blocks(z, df)

df2.show()
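The snippet above computes z = x + 3 block by block, so with inputs 0.0 through 4.0 the new column holds 3.0 through 7.0. A row-oriented sketch of the same computation, assuming tensorframes' row API (tfs.row and tfs.map_rows process one row at a time, so the placeholder has no leading block dimension):

with tf.Graph().as_default() as g:
    # One placeholder per row instead of per block.
    x = tfs.row(df, "x")
    z = tf.add(x, 3, name='z')
    df2 = tfs.map_rows(z, df)
df2.show()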

Example No. 6
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    invs = tf.inv(tf.to_double(x), name="invs")
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)


# The harmonic mean
gb = df2.select(col_key, "invs", "count").groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df2, "invs", tf_name="invs_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    x = tf.reduce_sum(x_input, [0], name='invs')
    count = tf.reduce_sum(count_input, [0], name='count')
    df3 = tfs.aggregate([x, count], gb)

with tf.Graph().as_default() as g:
    invs = tfs.block(df2, "invs")
    count = tfs.block(df2, "count")
    harmonic_mean = tf.div(tf.to_double(count), invs, name="harmonic_mean")
    df4 = tfs.map_blocks(harmonic_mean, df3).select("key", "harmonic_mean")

df4.collect()
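End to end, the three graphs above compute the per-key harmonic mean n / sum(1/x): the first pass emits 1/x and a 1 per row, the aggregation sums both per key, and the last pass divides. A plain NumPy sanity check of the same quantity (illustrative values only):

import numpy as np

# Harmonic mean of [1, 2, 4] computed locally.
xs = np.array([1.0, 2.0, 4.0])
harmonic = len(xs) / np.sum(1.0 / xs)  # 3 / 1.75 = ~1.714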


with tf.Graph().as_default() as g:
    x = tf.zeros([], tf.int32)
    y = tf.zeros(x)
    print(y.graph.as_graph_def())
Example No. 7
df = tfs.analyze(sqlContext.createDataFrame(data))

# The harmonic mean:
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    invs = tf.inv(tf.to_double(x), name="invs")
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)

# The harmonic mean
gb = df2.select(col_key, "invs", "count").groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df2, "invs", tf_name="invs_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    x = tf.reduce_sum(x_input, [0], name='invs')
    count = tf.reduce_sum(count_input, [0], name='count')
    df3 = tfs.aggregate([x, count], gb)

with tf.Graph().as_default() as g:
    invs = tfs.block(df2, "invs")
    count = tfs.block(df2, "count")
    harmonic_mean = tf.div(tf.to_double(count), invs, name="harmonic_mean")
    df4 = tfs.map_blocks(harmonic_mean, df3).select("key", "harmonic_mean")

df4.collect()
Example No. 8
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors and a set of centroids, and
    returns a new set of centroids along with the total distance from the points to the centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The shape of the block is extracted as a TF tensor.
        num_points = tf.shape(points)[0]
        # The centers are embedded in the TF program.
        centers = tf.constant(start_centers)
        # Computation of the minimum distance. This is a standard implementation that follows
        # what MLlib does.
        squares = tf.reduce_sum(tf.square(points), reduction_indices=1)
        center_squares = tf.reduce_sum(tf.square(centers), reduction_indices=1)
        prods = tf.matmul(points, centers, transpose_b=True)
        # This code simply expresses two outer products: center_squares * ones(num_points)
        # and ones(num_centroids) * squares
        t1a = tf.expand_dims(center_squares, 0)
        t1b = tf.stack([num_points, 1])
        t1 = tf.tile(t1a, t1b)
        t2a = tf.expand_dims(squares, 1)
        t2b = tf.stack([1, num_centroids])
        t2 = tf.tile(t2a, t2b)
        distances = t1 + t2 - 2 * prods
        # The outputs of the program.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        # This could be done based on the indexes as well.
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        counts = tf.tile(tf.constant([1]), tf.stack([num_points]), name='count')
        df2 = tfs.map_blocks([indexes, counts, min_distances], dataframe)
    # Perform the reduction: we regroup the points by their centroid indexes.
    gb = df2.groupBy("indexes")
    with tf.Graph().as_default() as g:
        # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
        x_input = tfs.block(df2, "features", tf_name="features_input")
        count_input = tfs.block(df2, "count", tf_name="count_input")
        md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='features')
        count = tf.reduce_sum(count_input, [0], name='count')
        min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
        df3 = tfs.aggregate([x, count, min_distances], gb)
    # Get the new centroids
    df3_c = df3.collect()
    # The new centroids.
    new_centers = np.array([np.array(row.features) / row['count'] for row in df3_c])
    total_distances = np.sum([row['min_distances'] for row in df3_c])
    return (new_centers, total_distances)
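The distance block above uses the identity ||p - c||^2 = ||p||^2 + ||c||^2 - 2 p.c, tiled across all point/centroid pairs. A NumPy sketch of the same trick, with illustrative shapes, checked against the direct computation:

import numpy as np

points = np.random.rand(5, 3)   # 5 points with 3 features
centers = np.random.rand(2, 3)  # 2 centroids
squares = np.sum(points ** 2, axis=1)          # ||p||^2, shape (5,)
center_squares = np.sum(centers ** 2, axis=1)  # ||c||^2, shape (2,)
prods = points.dot(centers.T)                  # p . c,   shape (5, 2)
distances = squares[:, None] + center_squares[None, :] - 2 * prods
# The direct computation agrees.
assert np.allclose(
    distances,
    ((points[:, None, :] - centers[None, :, :]) ** 2).sum(axis=-1))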