Example #1
    def _transform(self, dataset):
        graph_def = self._optimize_for_inference()
        input_mapping = self.getInputMapping()
        output_mapping = self.getOutputMapping()

        graph = tf.Graph()
        with tf.Session(graph=graph):
            analyzed_df = tfs.analyze(dataset)
            out_tnsr_op_names = [
                tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping
            ]
            # Load graph
            tf.import_graph_def(graph_def=graph_def,
                                name='',
                                return_elements=out_tnsr_op_names)

            # Feed dict maps from placeholder name to DF column name
            feed_dict = {
                self._getSparkDlOpName(tnsr_name): col_name
                for col_name, tnsr_name in input_mapping
            }
            fetches = [
                tfx.get_tensor(tnsr_name, graph)
                for tnsr_name in out_tnsr_op_names
            ]

            out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)
            # We still have to rename output columns
            for tnsr_name, new_colname in output_mapping:
                old_colname = tfx.op_name(tnsr_name, graph)
                if old_colname != new_colname:
                    out_df = out_df.withColumnRenamed(old_colname, new_colname)

        return out_df
Example #2
def assign_center(df, feature_column, residual_column, assigned_coarse_column,
                  assigned_pq_column, coarse_centers, pq_centers, m):
    """
    Assign each point to its coarse index and pq indexes

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df
    :param residual_column: string, the points residual column to be saved
    :param assigned_coarse_column: string, the output column name for coarse index.
    :param assigned_pq_column: string, the output column name for pq indexes.
    :param coarse_centers: numpy.array, [num_centroids, num_features] the coarse cluster centers
    :param pq_centers: numpy.array, [num_centroids, num_features] the pq cluster centers
    :param m: int, number of groups a point is split into for pq
    :return: dataframe, contains two extra columns, `assigned_coarse_column` and `assigned_pq_column`
    """
    df = residual_of_closest(df, feature_column, residual_column,
                             coarse_centers)
    num_features = coarse_centers.shape[1]
    with tf.Graph().as_default():
        points = tf.placeholder(tf.double,
                                shape=[None, num_features],
                                name=feature_column)
        residuals = tf.placeholder(tf.double,
                                   shape=[None, num_features],
                                   name=residual_column)
        assigned_coarse = _assign_center(points, coarse_centers,
                                         assigned_coarse_column)
        assigned_pq = _assign_center(residuals, pq_centers, assigned_pq_column,
                                     m)
        return tfs.map_blocks([assigned_coarse, assigned_pq], df)
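
The helper `_assign_center` is not included in this snippet. A minimal sketch of what it might look like for the plain case (ignoring the optional `m` group argument used for product quantization), assuming the usual `import tensorflow as tf` and that `centers` is a NumPy array; naming the output op after `assigned_column` is what makes `tfs.map_blocks` emit a column of that name:

def _assign_center(points, centers, assigned_column='assigned', m=None):
    # Hypothetical sketch: index of the nearest center for each point
    # (the m-group splitting used for product quantization is omitted).
    centers_t = tf.constant(centers)                                  # [k, num_features]
    diffs = tf.expand_dims(points, 1) - tf.expand_dims(centers_t, 0)  # [n, k, num_features]
    sq_dists = tf.reduce_sum(tf.square(diffs), axis=2)                # [n, k]
    return tf.argmin(sq_dists, axis=1, name=assigned_column)
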
Example #3
    def _transform(self, dataset):
        if len([field for field in dataset.schema if field.dataType == DoubleType()]) > 0:
            logger.warn("Detected DoubleType columns in dataframe passed to transform(). In "
                        "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                        "fed to input tensors of type tf.float64. To feed dataframe data to "
                        "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                        "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

        graph_def = self._optimize_for_inference()
        input_mapping = self.getInputMapping()
        output_mapping = self.getOutputMapping()

        graph = tf.Graph()
        with tf.Session(graph=graph):
            analyzed_df = tfs.analyze(dataset)
            out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
            # Load graph
            tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)
            # Feed dict maps from placeholder name to DF column name
            feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
            fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]
            out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)
            # We still have to rename output columns
            for tnsr_name, new_colname in output_mapping:
                old_colname = tfx.op_name(tnsr_name, graph)
                if old_colname != new_colname:
                    out_df = out_df.withColumnRenamed(old_colname, new_colname)

        return out_df
Example #4
    def _transform(self, dataset):
        if any([field.dataType == DoubleType() for field in dataset.schema]):
            logger.warning("Detected DoubleType columns in dataframe passed to transform(). In "
                           "Deep Learning Pipelines 1.0 and above, DoubleType columns can only be "
                           "fed to input tensors of type tf.float64. To feed dataframe data to "
                           "tensors of other types (e.g. tf.float32, tf.int32, tf.int64), use the "
                           "corresponding Spark SQL data types (FloatType, IntegerType, LongType).")

        graph_def = self._optimize_for_inference()
        input_mapping = self.getInputMapping()
        output_mapping = self.getOutputMapping()

        graph = tf.Graph()
        with tf.Session(graph=graph):
            analyzed_df = tfs.analyze(dataset)
            out_tnsr_op_names = [tfx.op_name(tnsr_name) for tnsr_name, _ in output_mapping]
            # Load graph
            tf.import_graph_def(graph_def=graph_def, name='', return_elements=out_tnsr_op_names)
            # Feed dict maps from placeholder name to DF column name
            feed_dict = {tfx.op_name(tnsr_name): col_name for col_name, tnsr_name in input_mapping}
            fetches = [tfx.get_tensor(tnsr_name, graph) for tnsr_name in out_tnsr_op_names]
            out_df = tfs.map_blocks(fetches, analyzed_df, feed_dict=feed_dict)
            # We still have to rename output columns
            for tnsr_name, new_colname in output_mapping:
                old_colname = tfx.op_name(tnsr_name, graph)
                if old_colname != new_colname:
                    out_df = out_df.withColumnRenamed(old_colname, new_colname)

        return out_df
Example #5
def residual_of_closest(df,
                        feature_column,
                        residual_column,
                        centers,
                        assigned_column='assigned'):
    """
    Residual between points and their closest center

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df
    :param residual_column: string, the output column name for the residual between each point and its closest center.
    :param centers: numpy.array, [num_centroids, num_features] the k cluster centers
    :param assigned_column: string, the output column name for index of closest center.
    :return: dataframe, contains two extra columns, `residual_column`, `assigned_column`
    """
    df = tfs.analyze(df)
    num_features = centers.shape[1]
    with tf.Graph().as_default():
        points = tf.placeholder(tf.double,
                                shape=[None, num_features],
                                name=feature_column)
        assigned = _assign_center(points, centers)
        residual = _residual_of_assigned(points, assigned, centers,
                                         residual_column)
        return tfs.map_blocks([assigned, residual], df)
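
`_residual_of_assigned` is likewise defined elsewhere. A plausible sketch, assuming it subtracts each point's assigned center (indexed by the `assigned` tensor) from the point and names the result after `residual_column`:

def _residual_of_assigned(points, assigned, centers, residual_column):
    # Hypothetical sketch: residual = point - its closest center.
    centers_t = tf.constant(centers)
    return tf.subtract(points, tf.gather(centers_t, assigned), name=residual_column)
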
Example #6
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors, a set of centroids, and returns
    a new set of centroids along with the total distance of points to centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    This function uses Spark to distribute the aggregation amongst the nodes.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double,
                                shape=[None, num_features],
                                name='features')
        # The shape of the block is extracted as a TF variable.
        num_points = tf.stack([tf.shape(points)[0]], name="num_points")
        distances = tf_compute_distances(points, start_centers)
        # The outputs of the program.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        # This could be done based on the indexes as well.
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        counts = tf.tile(tf.constant([1]), num_points, name='count')
        df2 = tfs.map_blocks([indexes, counts, min_distances], dataframe)
    # Perform the reduction: we regroup the points by their centroid indexes.
    gb = df2.groupBy("indexes")
    with tf.Graph().as_default() as g:
        # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
        x_input = tfs.block(df2, "features", tf_name="features_input")
        count_input = tfs.block(df2, "count", tf_name="count_input")
        md_input = tfs.block(df2,
                             "min_distances",
                             tf_name="min_distances_input")
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='features')
        count = tf.reduce_sum(count_input, [0], name='count')
        min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
        df3 = tfs.aggregate([x, count, min_distances], gb)
    # Get the new centroids
    df3_c = df3.collect()
    # The new centroids.
    new_centers = np.array(
        [np.array(row.features) / row['count'] for row in df3_c])
    total_distances = np.sum([row['min_distances'] for row in df3_c])
    return (new_centers, total_distances)
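
`tf_compute_distances` is referenced but not defined in this snippet. A sketch under the assumption that it returns the matrix of squared Euclidean distances between every point and every center, mirroring the inline computation in the later K-Means examples (expects `import numpy as np` and `import tensorflow as tf`):

def tf_compute_distances(points, start_centers):
    # Hypothetical sketch: pairwise squared distances, shape [num_points, num_centroids].
    num_centroids = np.shape(start_centers)[0]
    num_points = tf.shape(points)[0]
    centers = tf.constant(start_centers)
    squares = tf.reduce_sum(tf.square(points), axis=1)
    center_squares = tf.reduce_sum(tf.square(centers), axis=1)
    prods = tf.matmul(points, centers, transpose_b=True)
    t1 = tf.tile(tf.expand_dims(center_squares, 0), tf.stack([num_points, 1]))
    t2 = tf.tile(tf.expand_dims(squares, 1), tf.stack([1, num_centroids]))
    return t1 + t2 - 2 * prods
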
Example #7
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors, a set of centroids, and returns
    a new set of centroids along with the total distance of points to centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    This function uses Spark to distribute the aggregation amongst the nodes.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The shape of the block is extracted as a TF variable.
        num_points = tf.stack([tf.shape(points)[0]], name="num_points")
        distances = tf_compute_distances(points, start_centers)
        # The outputs of the program.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        # This could be done based on the indexes as well.
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        counts = tf.tile(tf.constant([1]), num_points, name='count')
        df2 = tfs.map_blocks([indexes, counts, min_distances], dataframe)
    # Perform the reduction: we regroup the points by their centroid indexes.
    gb = df2.groupBy("indexes")
    with tf.Graph().as_default() as g:
        # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
        x_input = tfs.block(df2, "features", tf_name="features_input")
        count_input = tfs.block(df2, "count", tf_name="count_input")
        md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='features')
        count = tf.reduce_sum(count_input, [0], name='count')
        min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
        df3 = tfs.aggregate([x, count, min_distances], gb)
    # Get the new centroids
    df3_c = df3.collect()
    # The new centroids.
    new_centers = np.array([np.array(row.features) / row['count'] for row in df3_c])
    total_distances = np.sum([row['min_distances'] for row in df3_c])
    return (new_centers, total_distances)
Example #8
 def test_map_blocks_1(self):
     data = [Row(x=float(x)) for x in range(10)]
     df = self.sql.createDataFrame(data)
     with tf.Graph().as_default() as g:
         # The placeholder that corresponds to column 'x'
         x = tf.placeholder(tf.double, shape=[None], name="x")
         # The output that adds 3 to x
         z = tf.add(x, 3, name='z')
         # The resulting dataframe
         df2 = tfs.map_blocks(z, df)
     data2 = df2.collect()
     assert data2[0].z == 3.0, data2
Example #9
 def test_map_blocks_trimmed_1(self):
     data = [Row(x=float(x)) for x in range(3)]
     df = self.sql.createDataFrame(data)
     with tf.Graph().as_default() as g:
         # The placeholder that corresponds to column 'x'
         x = tf.placeholder(tf.double, shape=[None], name="x")
         # The output discards the input and returns a single row of data
         z = tf.constant([2], name='z')
         # The resulting dataframe
         df2 = tfs.map_blocks(z, df, trim=True)
     data2 = df2.collect()
     assert data2[0].z == 2, data2
Example #10
 def test_map_blocks_2(self):
     data = [dict(x=float(x)) for x in range(10)]
     df = pd.DataFrame(data)
     with tf.Graph().as_default() as g:
         # The placeholder that corresponds to column 'x'
         x = tf.placeholder(tf.double, shape=[None], name="x")
         # The output that adds 3 to x
         z = tf.add(x, 3, name='z')
         # The resulting dataframe
         df2 = tfs.map_blocks(z, df)
     data2 = df2
     assert data2.z[0] == 3.0, data2
Example #11
 def test_map_blocks_0(self):
     data = [Row(x=float(x)) for x in range(10)]
     df = self.sql.createDataFrame(data)
     with tf.Graph().as_default() as g:
         # The placeholder that corresponds to column 'x'
         x = tf.placeholder(tf.double, shape=[None], name="x")
         # The output that adds 3 to x
         y = tf.Variable(3.0, dtype=tf.double, name='y')
         z = tf.add(x, y, name='z')
         # The resulting dataframe
         df2 = tfs.map_blocks(z, df)
     data2 = df2.collect()
     assert data2[0].z == 3.0, data2
Example #12
def covariance(df, feature_column, num_features, coarse_center):
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column)
        count = tf.identity(tf.ones_like(features)[:, 0], name='count')
        out = tf.identity(tf.map_fn(lambda x: tf.einsum('i,j->ij', x, x),
                                    features,
                                    dtype=tf.double),
                          name='out')
        df1 = tfs.map_blocks(out, df)
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column + '_input')
        out = tf.placeholder(tf.double, [None, num_features, num_features],
                             name='out_input')
        count = tf.placeholder(tf.double, [None], name='count_input')
        expected_mean = tf.identity(tf.reduce_sum(features, axis=0),
                                    name=feature_column)
        expected_out = tf.identity(tf.reduce_sum(out, axis=0), name='out')
        expected_count = tf.identity(tf.reduce_sum(count, axis=0),
                                     name='count')
        df2 = tfs.aggregate([expected_mean, expected_out, expected_count],
                            df1.groupby(coarse_center))
    with tf.Graph().as_default():
        features = tf.placeholder(tf.double, [None, num_features],
                                  name=feature_column)
        out = tf.placeholder(tf.double, [None, num_features, num_features],
                             name='out')
        count = tf.placeholder(tf.double, [None], name='count')
        covariance = tf.identity(
            tf.map_fn(
                # foc unpacks to the per-row (features, out, count) slices.
                lambda foc: (foc[1] + tf.transpose(foc[1])) / (2 * foc[2] - 2)
                - tf.einsum('i,j->ij', foc[0], foc[0]),
                (features, out, count),
                dtype=tf.double),
            name='covariance')
        df3 = tfs.map_blocks(covariance, df2)
    return df3
Example #13
def tf_serving_with_dataframe(df, model_base_path, model_version=None):
    """

    :param df: spark dataframe, batch input for the model
    :param model_base_path: str, tensorflow saved Model model base path
    :param model_version: int, tensorflow saved Model model version, default None
    :return: spark dataframe, with predicted result.
    """
    import tensorframes as tfs
    g, feed_tensors, fetch_tensors = load_model(model_base_path, model_version)
    with g.as_default():
        df = rename_by_mapping(df, feed_tensors)
        df = tfs.analyze(df)
        df = tfs.map_blocks(fetch_tensors.values(), df)
        df = rename_by_mapping(df, feed_tensors, reverse=True)
        return rename_by_mapping(df, fetch_tensors, reverse=True)
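
`load_model` and `rename_by_mapping` belong to the surrounding module and are not shown. A hedged sketch of `rename_by_mapping`, assuming the mapping is a dict from DataFrame column names to the corresponding tensors and that `reverse=True` undoes the rename:

def rename_by_mapping(df, tensor_map, reverse=False):
    # Hypothetical sketch: rename columns to the tensors' op names so the
    # placeholders can be matched to columns, or rename them back afterwards.
    for col_name, tnsr in tensor_map.items():
        op_name = tnsr.name.split(':')[0]
        old, new = (op_name, col_name) if reverse else (col_name, op_name)
        df = df.withColumnRenamed(old, new)
    return df
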
Example #14
def simple_example_1():
    spark = SparkSession.builder.appName(
        'simple-tensorframes-example-1').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    rdd = [Row(x=float(x)) for x in range(10)]
    df = spark.createDataFrame(rdd)

    df.show()

    # Execute the tensor graph.
    with tf.Graph().as_default() as graph:
        # A block placeholder.
        x = tfs.block(df, 'x')
        z = tf.add(x, 3, name='z')

        # Tensor -> dataframe.
        df2 = tfs.map_blocks(z, df)

    print('z =', z)
    df2.show()
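
A minimal way to run the example above, assuming the script is submitted with Spark and the tensorframes package available:

if __name__ == '__main__':
    simple_example_1()
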
Example #15
    centers = tf.constant(init_centers)
    squares = tf.reduce_sum(tf.square(points), reduction_indices=1)
    center_squares = tf.reduce_sum(tf.square(centers), reduction_indices=1)
    prods = tf.matmul(points, centers, transpose_b = True)
    t1a = tf.expand_dims(center_squares, 0)
    t1b = tf.stack([num_points, 1])
    t1 = tf.tile(t1a, t1b)
    t2a = tf.expand_dims(squares, 1)
    t2b = tf.stack([1, k])
    t2 = tf.tile(t2a, t2b)
    distances = t1 + t2 - 2 * prods
    # TODO cast
    indexes = tf.argmin(distances, 1, name='indexes')
    min_distances = tf.reduce_min(distances, 1, name='min_distances')
    counts = tf.tile(tf.constant([1]), tf.stack([num_points]), name='count')
    df2 = tfs.map_blocks([indexes, counts, min_distances], df0)

# Perform the reduction
gb = df2.groupBy("indexes")
with tf.Graph().as_default() as g:
    # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
    x_input = tfs.block(df2, "features", tf_name="features_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
    x = tf.reduce_sum(x_input, [0], name='features')
    count = tf.reduce_sum(count_input, [0], name='count')
    min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
    df3 = tfs.aggregate([x, count, min_distances], gb)

# Get the new centroids
df3_c = df3.collect()
Example #16
from tensorframes.core import _java_api
japi = _java_api()
japi.initialize_logging()

data = [Row(x=float(x), key=str(x / 3)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
tfs.block(df, "x")

data = [Row(x=float(x), key=str(x / 3)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
gb = df.groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df, "x", tf_name="x_input")
    x = tf.reduce_sum(x_input, [0], name='x')
    df2 = tfs.aggregate(x, gb)


data = [Row(x=float(x)) for x in range(5)]
df = sqlContext.createDataFrame(data)
with tf.Graph().as_default() as g:
    # The placeholder that corresponds to column 'x'
    x = tf.placeholder(tf.double, shape=[None], name="x")
    # The output that adds 3 to x
    z = tf.add(x, 3, name='z')
    # The resulting dataframe
    df2 = tfs.map_blocks(z, df)

df2.show()

Example #17
def run_one_step2(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors, a set of centroids, and returns
    a new set of centroids along with the total distance of points to centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    This function performs most of the aggregation in TensorFlow.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The distances
        distances = tf_compute_distances(points, start_centers)
        # The rest of this block performs a pre-aggregation step in TF, to limit the
        # communication between TF and Spark.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        num_points = tf.stack([tf.shape(points)[0]], name="num_points")
        counts = tf.tile(tf.constant([1]), num_points, name='count')
        # These compute the aggregate based on the indexes.
        block_points = tf.unsorted_segment_sum(points, indexes, num_centroids, name="block_points")
        block_counts = tf.unsorted_segment_sum(counts, indexes, num_centroids, name="block_counts")
        block_distances = tf.reduce_sum(min_distances, name="block_distances")
        # One leading dimension is added to express the fact that the previous elements are just
        # one row in the final dataframe.
        # The final dataframe has one row per block.
        agg_points = tf.expand_dims(block_points, 0, name="agg_points")
        agg_counts = tf.expand_dims(block_counts, 0, name="agg_counts")
        agg_distances = tf.expand_dims(block_distances, 0, name="agg_distances")
        # Using trimming to drop the original data (we are just returning one row of data per
        # block).
        df2 = tfs.map_blocks([agg_points, agg_counts, agg_distances],
                             dataframe, trim=True)
    # Now we simply collect and sum the elements
    with tf.Graph().as_default() as g:
        # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
        x_input = tf.placeholder(tf.double,
                                 shape=[None, num_centroids, num_features],
                                 name='agg_points_input')
        count_input = tf.placeholder(tf.int32,
                                     shape=[None, num_centroids],
                                     name='agg_counts_input')
        md_input = tf.placeholder(tf.double,
                                  shape=[None],
                                  name='agg_distances_input')
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='agg_points')
        count = tf.reduce_sum(count_input, [0], name='agg_counts')
        min_distances = tf.reduce_sum(md_input, [0], name='agg_distances')
        (x_, count_, total_distances) = tfs.reduce_blocks([x, count, min_distances], df2)
    # The new centers
    new_centers = (x_.T / (count_ + 1e-7)).T
    return (new_centers, total_distances)
Example #18
def run_one_step(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors, a set of centroids, and returns
    a new set of centroids along with the total distance of points to centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The shape of the block is extracted as a TF variable.
        num_points = tf.shape(points)[0]
        # The centers are embedded in the TF program.
        centers = tf.constant(start_centers)
        # Computation of the minimum distance. This is a standard implementation that follows
        # what MLlib does.
        squares = tf.reduce_sum(tf.square(points), reduction_indices=1)
        center_squares = tf.reduce_sum(tf.square(centers), reduction_indices=1)
        prods = tf.matmul(points, centers, transpose_b = True)
        # This code simply expresses two outer products: center_squares * ones(num_points)
        # and ones(num_centroids) * squares
        t1a = tf.expand_dims(center_squares, 0)
        t1b = tf.stack([num_points, 1])
        t1 = tf.tile(t1a, t1b)
        t2a = tf.expand_dims(squares, 1)
        t2b = tf.stack([1, num_centroids])
        t2 = tf.tile(t2a, t2b)
        distances = t1 + t2 - 2 * prods
        # The outputs of the program.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        # This could be done based on the indexes as well.
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        counts = tf.tile(tf.constant([1]), tf.stack([num_points]), name='count')
        df2 = tfs.map_blocks([indexes, counts, min_distances], dataframe)
    # Perform the reduction: we regroup the points by their centroid indexes.
    gb = df2.groupBy("indexes")
    with tf.Graph().as_default() as g:
        # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
        x_input = tfs.block(df2, "features", tf_name="features_input")
        count_input = tfs.block(df2, "count", tf_name="count_input")
        md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='features')
        count = tf.reduce_sum(count_input, [0], name='count')
        min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
        df3 = tfs.aggregate([x, count, min_distances], gb)
    # Get the new centroids
    df3_c = df3.collect()
    # The new centroids.
    new_centers = np.array([np.array(row.features) / row['count'] for row in df3_c])
    total_distances = np.sum([row['min_distances'] for row in df3_c])
    return (new_centers, total_distances)
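
A small driver loop for `run_one_step`, with hypothetical names: `df` is assumed to be a dataframe with a 'features' column and `init_centers` an initial k x m NumPy array:

# Hypothetical usage: run a fixed number of K-Means iterations and track the total distance.
centers = init_centers
for it in range(20):
    centers, total_distance = run_one_step(df, centers)
    print('iteration %d, total distance %f' % (it, total_distance))
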
Example #19
import tensorflow as tf
import tensorframes as tfs
from pyspark.sql import Row

data = [Row(x=float(x)) for x in range(10)]
df = sqlContext.createDataFrame(data)
with tf.Graph().as_default() as g:
    # The TensorFlow placeholder that corresponds to column 'x'.
    # The shape of the placeholder is automatically inferred from the DataFrame.
    x = tfs.block(df, "x")
    # The output that adds 3 to x
    z = tf.add(x, 3, name='z')
    # The resulting dataframe
    df2 = tfs.map_blocks(z, df)

# The transform is lazy as for most DataFrame operations. This will trigger it:
df2.collect()
Example #20
# The input data
data = [Row(x=[float(x), float(2 * x)], key=str(x % 2)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
df = tfs.analyze(sqlContext.createDataFrame(data))

# The harmonic mean:
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    invs = tf.reciprocal(tf.to_double(x), name="invs")
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)


# The harmonic mean
gb = df2.select(col_key, "invs", "count").groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df2, "invs", tf_name="invs_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    x = tf.reduce_sum(x_input, [0], name='invs')
    count = tf.reduce_sum(count_input, [0], name='count')
    df3 = tfs.aggregate([x, count], gb)

with tf.Graph().as_default() as g:
    invs = tfs.block(df2, "invs")
    count = tfs.block(df2, "count")
    harmonic_mean = tf.div(tf.to_double(count), invs, name="harmonic_mean")
Example #21
# The input data
data = [Row(x=[float(x), float(2 * x)], key=str(x % 2)) for x in range(1, 6)]
df = sqlContext.createDataFrame(data)
df = tfs.analyze(sqlContext.createDataFrame(data))

# The harmonic mean:
# TODO(tjh) make a test out of this, it found some bugs
# - non numeric columns (string)
# - unused columns
# - output that has a child
col_name = "x"
col_key = "key"
with tf.Graph().as_default() as g:
    x = tfs.block(df, col_name)
    invs = tf.reciprocal(tf.to_double(x), name="invs")
    df2 = tfs.map_blocks([invs, tf.ones_like(invs, name="count")], df)

# The harmonic mean
gb = df2.select(col_key, "invs", "count").groupBy("key")
with tf.Graph().as_default() as g:
    x_input = tfs.block(df2, "invs", tf_name="invs_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    x = tf.reduce_sum(x_input, [0], name='invs')
    count = tf.reduce_sum(count_input, [0], name='count')
    df3 = tfs.aggregate([x, count], gb)

with tf.Graph().as_default() as g:
    invs = tfs.block(df2, "invs")
    count = tfs.block(df2, "count")
    harmonic_mean = tf.div(tf.to_double(count), invs, name="harmonic_mean")
    df4 = tfs.map_blocks(harmonic_mean, df3).select("key", "harmonic_mean")
Example #22
def infer(df, model_file=None, aggregate=True):
    """
    Predict.

    Expects the dataframe to contain a column called `coordinates` of the
    data type array<array<double>>.

    Parameters
    ----------
    df: A pyspark.sql.dataframe.DataFrame.
        Expects a column called `coordinates` of array<array<double>> type.
    model_file: String.
                Full path to a .pb tensorflow model file.
                Expects a corresponding metadata json file in the same directory.
                If not provided, loads the sample model.
    aggregate: Boolean.
               Whether to aggregate piece-wise results into a prediction for the full trace.

    Returns
    -------
    A pyspark.sql.dataframe.DataFrame with two extra columns
    `probas` (array<double>) for the probabilities of each class, and
    `pred_modality` (string) for the class that has the highest probability.
    """

    # Use sample model if a model is not provided.
    if model_file is None:
        dir, _ = os.path.split(__file__)
        model_file = os.path.join(
            dir, "sample_model/sample_model_optimised_frozen.pb")

    # Load model metadata
    metadata = load_model_metadata(model_file)
    assert metadata is not None

    # Preprocess data
    with_ids_and_labels_df = include_id_and_label(
        df)  # To be joined with prediction
    with_ids_and_labels_df.persist()

    with_word_vecs_df, _, _ = include_word_vecs(with_ids_and_labels_df,
                                                metadata)
    with_phrases_df = create_phrases(
        with_word_vecs_df,
        MODEL_INPUT_CONFIG["WORD_VEC_COL"],
        MODEL_INPUT_CONFIG["ID_COL"],
        MODEL_INPUT_CONFIG["WORD_POS_COL"],
        desired_phrase_length=metadata["desired_phrase_length"],
    )
    with_phrases_df.persist()

    # Read in serialized tensorflow graph
    with tf.gfile.FastGFile(model_file, "rb") as f:
        model_graph = f.read()

    with tf.Graph().as_default() as g:
        # Reconstruct tf graph (parse serialised graph)
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(model_graph)

        input_op_name = [
            n.name for n in graph_def.node
            if n.op.startswith("Placeholder") and n.name.startswith("input")
        ][0]
        output_op_name = [
            n.name for n in graph_def.node
            if n.op.startswith("Softmax") and n.name.startswith("output")
        ][0]

        # Add metadata on the input size to the dataframe for tensorframes
        input_shape = [None, *metadata["input_shape"]]
        model_input_df = tfs.append_shape(
            with_phrases_df,
            with_phrases_df[MODEL_INPUT_CONFIG["INPUT_COL"]],
            shape=input_shape,
        )

        # Load graph
        [input_op, output_op] = tf.import_graph_def(
            graph_def, return_elements=[input_op_name, output_op_name])

        # Predict
        model_output_df = tfs.map_blocks(
            output_op.outputs,
            model_input_df,
            feed_dict={input_op.name: MODEL_INPUT_CONFIG["INPUT_COL"]},
        )

        # Rename column
        output_col = list(
            set(model_output_df.columns) - set(with_phrases_df.columns))[
                0]  # Something like 'import/output/Softmax', but might change
        phrasewise_res_df = model_output_df.withColumnRenamed(
            output_col, "probas").withColumn("pred_label",
                                             argmax(col("probas")))

        if aggregate:
            phrasewise_res_df.persist()

            # Average piece-wise probabilities into full-trace probabilities, and
            # find the label with the highest probability.
            with_avg_prob_df = avg_probability(phrasewise_res_df, "id",
                                               "probas",
                                               len(metadata["classes"]))

            # Convert integer labels into string classes
            with_predicted_labels_df = reverse_create_label(
                with_avg_prob_df,
                "sentence_pred_label",
                "pred_modality",
                metadata["classes"],
            ).withColumnRenamed("sentence_probas", "probas")

            # Join prediction with the original dataframe to get the coordinates
            # Left join to handle edge case in which trace has fewer than three
            res_df = with_ids_and_labels_df.join(
                with_predicted_labels_df,
                on=MODEL_INPUT_CONFIG["ID_COL"],
                how="left")

        else:
            # TO-DO: return pieces of coordinates rather than phrases
            res_df = reverse_create_label(phrasewise_res_df, "pred_label",
                                          "pred_modality", metadata["classes"])

        # clean up
        with_ids_and_labels_df.unpersist()
        with_phrases_df.unpersist()
        phrasewise_res_df.unpersist()

        n_pre_infer = df.count()
        n_post_infer = res_df.count()
        if aggregate and n_pre_infer != n_post_infer:
            raise Exception("Some traces dropped during inference!")

        res_df.persist()
        return res_df.withColumn(
            "probas",
            coalesce(col("probas"),
                     array([lit(0.0), lit(0.0), lit(0.0)]))).withColumn(
                         "pred_modality",
                         coalesce(col("pred_modality"), lit("NA")))
Example #23
def run_one_step2(dataframe, start_centers):
    """
    Performs one iteration of K-Means.

    This function takes a dataframe with dense feature vectors, a set of centroids, and returns
    a new set of centroids along with the total distance of points to centroids.

    This function calculates for each point the closest centroid and then aggregates the newly
    formed clusters to find the new centroids.

    This function performs most of the aggregation in TensorFlow.

    :param dataframe: a dataframe containing a column of features (an array of doubles)
    :param start_centers: a k x m matrix with k the number of centroids and m the number of features
    :return: a k x m matrix, and a positive double
    """
    # The dimensions in the problem
    (num_centroids, num_features) = np.shape(start_centers)
    # For each feature vector, compute the nearest centroid and the distance to that centroid.
    # The index of the nearest centroid is stored in the 'indexes' column.
    # We also add a column of 1's that will be reduced later to count the number of elements in
    # each cluster.
    with tf.Graph().as_default() as g:
        # The placeholder for the input: we use the block format
        points = tf.placeholder(tf.double, shape=[None, num_features], name='features')
        # The distances
        distances = tf_compute_distances(points, start_centers)
        # The rest of this block performs a pre-aggregation step in TF, to limit the
        # communication between TF and Spark.
        # The closest centroids are extracted.
        indexes = tf.argmin(distances, 1, name='indexes')
        min_distances = tf.reduce_min(distances, 1, name='min_distances')
        num_points = tf.stack([tf.shape(points)[0]], name="num_points")
        counts = tf.tile(tf.constant([1]), num_points, name='count')
        # These compute the aggregate based on the indexes.
        block_points = tf.unsorted_segment_sum(points, indexes, num_centroids, name="block_points")
        block_counts = tf.unsorted_segment_sum(counts, indexes, num_centroids, name="block_counts")
        block_distances = tf.reduce_sum(min_distances, name="block_distances")
        # One leading dimension is added to express the fact that the previous elements are just
        # one row in the final dataframe.
        # The final dataframe has one row per block.
        agg_points = tf.expand_dims(block_points, 0, name="agg_points")
        agg_counts = tf.expand_dims(block_counts, 0, name="agg_counts")
        agg_distances = tf.expand_dims(block_distances, 0, name="agg_distances")
        # Using trimming to drop the original data (we are just returning one row of data per
        # block).
        df2 = tfs.map_blocks([agg_points, agg_counts, agg_distances],
                             dataframe, trim=True)
    # Now we simply collect and sum the elements
    with tf.Graph().as_default() as g:
        # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
        x_input = tf.placeholder(tf.double,
                                 shape=[None, num_centroids, num_features],
                                 name='agg_points_input')
        count_input = tf.placeholder(tf.int32,
                                     shape=[None, num_centroids],
                                     name='agg_counts_input')
        md_input = tf.placeholder(tf.double,
                                  shape=[None],
                                  name='agg_distances_input')
        # Each operation is just the sum.
        x = tf.reduce_sum(x_input, [0], name='agg_points')
        count = tf.reduce_sum(count_input, [0], name='agg_counts')
        min_distances = tf.reduce_sum(md_input, [0], name='agg_distances')
        (x_, count_, total_distances) = tfs.reduce_blocks([x, count, min_distances], df2)
    # The new centers
    new_centers = (x_.T / (count_ + 1e-7)).T
    return (new_centers, total_distances)
Example #24
    centers = tf.constant(init_centers)
    squares = tf.reduce_sum(tf.square(points), reduction_indices=1)
    center_squares = tf.reduce_sum(tf.square(centers), reduction_indices=1)
    prods = tf.matmul(points, centers, transpose_b=True)
    t1a = tf.expand_dims(center_squares, 0)
    t1b = tf.stack([num_points, 1])
    t1 = tf.tile(t1a, t1b)
    t2a = tf.expand_dims(squares, 1)
    t2b = tf.stack([1, k])
    t2 = tf.tile(t2a, t2b)
    distances = t1 + t2 - 2 * prods
    # TODO cast
    indexes = tf.argmin(distances, 1, name='indexes')
    min_distances = tf.reduce_min(distances, 1, name='min_distances')
    counts = tf.tile(tf.constant([1]), tf.stack([num_points]), name='count')
    df2 = tfs.map_blocks([indexes, counts, min_distances], df0)

# Perform the reduction
gb = df2.groupBy("indexes")
with tf.Graph().as_default() as g:
    # Look at the documentation of tfs.aggregate for the naming conventions of the placeholders.
    x_input = tfs.block(df2, "features", tf_name="features_input")
    count_input = tfs.block(df2, "count", tf_name="count_input")
    md_input = tfs.block(df2, "min_distances", tf_name="min_distances_input")
    x = tf.reduce_sum(x_input, [0], name='features')
    count = tf.reduce_sum(count_input, [0], name='count')
    min_distances = tf.reduce_sum(md_input, [0], name='min_distances')
    df3 = tfs.aggregate([x, count, min_distances], gb)

# Get the new centroids
df3_c = df3.collect()
Example #25
def m_kmeans(df,
             feature_column,
             num_centroids_each,
             num_features,
             m_groups,
             max_iter=10):
    """
    M K-means algorithm applied on a dataframe of points

    :param df: dataframe, contains all points
    :param feature_column: string, the points column within df
    :param num_centroids_each: int, k clusters
    :param num_features: int, dimension of a point vector
    :param m_groups: int, number of groups a point is split into
    :param max_iter: int, maximum number of iterations

    :return: numpy.array: [num_centroids, num_features], the k cluster centers with m groups concatenated
    """
    initial_centers = df.select(feature_column).take(num_centroids_each)
    centers = np.array(initial_centers).reshape(num_centroids_each,
                                                num_features)
    m_slice = [slice(min(r), max(r) + 1)
               for r in np.array_split(range(num_features), m_groups)]
    slices = np.array_split(range(m_groups * num_centroids_each), m_groups)
    df = tfs.analyze(df)

    while max_iter > 0:
        max_iter -= 1

        with tf.Graph().as_default():
            points = tf.placeholder(tf.double,
                                    shape=[None, num_features],
                                    name=feature_column)
            counts, vector_sums = calculate_new_centers_for_m_slice(
                m_slice, points, tf.nn.l2_normalize(centers, dim=1),
                num_centroids_each)
            counts = tf.identity(counts, name='counts')
            vector_sums = tf.identity(vector_sums, name='vector_sums')
            df2 = tfs.map_blocks([counts, vector_sums], df, trim=True)

        with tf.Graph().as_default():
            counts = tf.placeholder(
                tf.int64,
                shape=[None, num_centroids_each * m_groups],
                name='counts_input')
            vector_sums = tf.placeholder(
                tf.double,
                shape=[None, num_centroids_each * m_groups,
                       num_features // m_groups],
                name='vector_sums_input')
            count = tf.reduce_sum(counts, axis=0, name='counts')
            vector_sum = tf.reduce_sum(vector_sums, axis=0, name='vector_sums')
            d_count, d_vector_sum = tfs.reduce_blocks([count, vector_sum], df2)
            new_centers = d_vector_sum / (d_count[:, np.newaxis] + 1e-7)
            new_centers = np.concatenate([new_centers[i] for i in slices],
                                         axis=1)
        if np.allclose(centers, new_centers):
            break
        else:
            centers = new_centers

    return new_centers
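
A hypothetical invocation, assuming `df` carries an array<double> column named 'features' with 32 values per row that should be quantized into 4 groups of 8 centroids each:

centers = m_kmeans(df, 'features', num_centroids_each=8, num_features=32,
                   m_groups=4, max_iter=10)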