Example #1
    def _call_scala(self, func):
        from sparktk.frame.frame import Frame
        scala_dicom = self._get_new_scala()
        results = func(scala_dicom)
        self._metadata = Frame(self._tc, scala_dicom.metadata())
        self._pixeldata = Frame(self._tc, scala_dicom.pixeldata())
        return results
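This helper centralizes Scala-side calls so that the Python-side metadata and pixeldata frames are rebuilt after every mutation. Below is a minimal sketch of a caller, assuming a hypothetical Scala-side method dropRowsByKeywords (not part of the snippet above):

    def drop_rows_by_keywords(self, keywords_values_dict):
        # run the Scala-side mutation; _call_scala refreshes self._metadata
        # and self._pixeldata afterwards
        def call(scala_dicom):
            scala_dicom.dropRowsByKeywords(keywords_values_dict)  # hypothetical Scala method
        self._call_scala(call)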
Example #2
    def predict(self, frame, ts_column, x_columns):
        """
        New frame with column of predicted y values

        Predict the time series values for a test frame, based on the specified x values.  Creates a new frame
        revision with the existing columns and a new predicted_y column.

        :param frame: (Frame) Frame used for predicting the ts values
        :param ts_column: (str) Name of the time series column
        :param x_columns: (List[str]) Names of the column(s) that contain the values of the exogenous inputs.
        :return: (Frame) A new frame containing the original frame's columns and a column *predicted_y*
        """
        if not isinstance(frame, self._tc.frame.Frame):
            raise TypeError(
                "'frame' parameter should be a spark-tk Frame object.")
        if not isinstance(ts_column, basestring):
            raise TypeError(
                "'ts_column' parameter should be a string (name of the column that has the timeseries value)."
            )
        if not isinstance(x_columns, list) or not all(
                isinstance(c, basestring) for c in x_columns):
            raise TypeError(
                "'x_columns' parameter should be a list of strings (names of the exogenous columns)."
            )
        if len(x_columns) == 0:
            raise ValueError("'x_columns' should not be empty.")
        scala_x_columns = self._tc.jutils.convert.to_scala_vector_string(
            x_columns)
        from sparktk.frame.frame import Frame
        return Frame(
            self._tc,
            self._scala.predict(frame._scala, ts_column, scala_x_columns))
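A hedged usage sketch (the trained model, frame, and column names below are assumptions, not part of the snippet):

    >>> # model is a trained time series model, e.g. from tc.models.timeseries.arx.train(...)
    >>> predicted_frame = model.predict(test_frame, "y", ["visitors", "wkends"])
    >>> # predicted_frame contains the original columns plus 'predicted_y'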
Example #3
    def __init__(self, tc, scala_result):
        self._tc = tc
        self._num_features = scala_result.numFeatures()
        self._num_classes = scala_result.numClasses()
        self._coefficients = self._tc.jutils.convert.scala_map_to_python(scala_result.coefficients())
        self._degrees_freedom = self._tc.jutils.convert.scala_map_to_python(scala_result.degreesFreedom())

        scala_option_frame = self._tc.jutils.convert.from_scala_option(scala_result.covarianceMatrix())
        if scala_option_frame:
            from sparktk.frame.frame import Frame
            self._covariance_matrix = Frame(self._tc, scala_option_frame)
        else:
            self._covariance_matrix = None

        scala_option_map = self._tc.jutils.convert.from_scala_option(scala_result.standardErrors())
        if scala_option_map:
            self._standard_errors = self._tc.jutils.convert.scala_map_to_python(scala_option_map)
        else:
            self._standard_errors = None

        scala_option_map = self._tc.jutils.convert.from_scala_option(scala_result.waldStatistic())
        if scala_option_map:
            self._wald_statistic = self._tc.jutils.convert.scala_map_to_python(scala_option_map)
        else:
            self._wald_statistic = None

        scala_option_map = self._tc.jutils.convert.from_scala_option(scala_result.pValue())
        if scala_option_map:
            self._p_value = self._tc.jutils.convert.scala_map_to_python(scala_option_map)
        else:
            self._p_value = None
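This constructor only unpacks the Scala result object into private attributes. A read-only property per attribute (a sketch, assuming the class follows the usual sparktk pattern) would expose them:

    @property
    def coefficients(self):
        # model coefficients as a Python dict, unpacked in __init__
        return self._coefficients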
Example #4
def loopy_belief_propagation(self, prior, edge_weight, max_iterations=10):
    """

    Performs loopy belief propagation on a graph representing a Potts model,
    optimizing based on user-provided priors.

    Parameters
    ----------

    :param prior: (String) The name of the column containing space-delimited strings of floats representing the prior distribution on each vertex
    :param edge_weight: (String) The name of the column containing the edge weights
    :param max_iterations: (int) The maximum number of iterations to run

    Examples
    --------

        >>> vertex_schema = [('id', int), ('label', float), ("prior_val", str), ("was_labeled", int)]
        >>> vertex_rows = [ [1, 1, "0.7 0.3", 1], [2, 1, "0.7 0.3", 1], [3, 5, "0.7 0.3", 0], [4, 5, "0.7 0.3", 0], [5, 5, "0.7 0.3", 1] ]

        >>> edge_schema = [('src', int), ('dst', int), ('weight', int)]
        >>> edge_rows = [ [1, 2, 2], [1, 3, 1], [2, 3, 1], [1, 4, 1], [4, 5, 1] ]

        >>> vertex_frame = tc.frame.create(vertex_rows, vertex_schema)
        >>> edge_frame = tc.frame.create(edge_rows, edge_schema)

        >>> graph = tc.graph.create(vertex_frame, edge_frame)
        >>> vertex_frame.inspect()
        [#]  id  label  prior_val  was_labeled
        ======================================
        [0]   1    1.0  0.7 0.3              1
        [1]   2    1.0  0.7 0.3              1
        [2]   3    5.0  0.7 0.3              0
        [3]   4    5.0  0.7 0.3              0
        [4]   5    5.0  0.7 0.3              1

        >>> result = graph.loopy_belief_propagation("prior_val", "weight", 2)
        >>> result.inspect() 
        [#]  id  label  prior_val  was_labeled
        ======================================
        [0]   1    1.0  0.7 0.3              1
        [1]   2    1.0  0.7 0.3              1
        [2]   3    5.0  0.7 0.3              0
        [3]   4    5.0  0.7 0.3              0
        [4]   5    5.0  0.7 0.3              1
        <BLANKLINE>
        [#]  posterior
        ==============================================
        [0]  [0.9883347610773112,0.011665238922688819]
        [1]  [0.9743014865548763,0.025698513445123698]
        [2]  [0.9396772870897875,0.06032271291021254]
        [3]  [0.9319529856190276,0.06804701438097235]
        [4]  [0.8506957305238876,0.1493042694761125]



    """
    from sparktk.frame.frame import Frame
    return Frame(
        self._tc,
        self._scala.loopyBeliefPropagation(prior, edge_weight, max_iterations))
Example #5
    def predict(self,
                frame,
                input_source_column_name,
                input_dest_column_name,
                output_user_column_name="user",
                output_product_column_name="product",
                output_rating_column_name="rating"):
        """
        Predicts ratings for the given frame using the trained model.

        :param frame: (Frame) frame to predict based on generated model
        :param input_source_column_name: (str) source column name.
        :param input_dest_column_name: (str) destination column name.
        :param output_user_column_name: (str) A user column name for the output frame
        :param output_product_column_name: (str) A product column name for the output frame
        :param output_rating_column_name: (str) A rating column name for the output frame
        :return: (Frame) returns predicted rating frame with specified output columns
        """
        from sparktk.frame.frame import Frame
        return Frame(
            self._tc,
            self._scala.predict(frame._scala, input_source_column_name,
                                input_dest_column_name,
                                output_user_column_name,
                                output_product_column_name,
                                output_rating_column_name))
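A hedged usage sketch (the trained collaborative filtering model and frame are assumptions):

    >>> # frame has the source and destination columns used in training
    >>> predictions = model.predict(frame, "source", "dest")
    >>> # output columns default to 'user', 'product' and 'rating'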
Example #6
    def predict(self,
                frame,
                columns=None,
                mean_centered=None,
                k=None,
                t_squared_index=False):
        """
        Predicts the labels for the observation columns in the given input frame. Creates a new frame
        with the existing columns and a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param columns: (List[str]) Names of the observation columns.
        :param mean_centered: (boolean) whether to mean center the columns. Default is true
        :param k: (int) the number of principal components to be computed, must be <= the k used in training.  Default is the trained k
        :param t_squared_index: (boolean) whether the t-square index is to be computed. Default is false
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        if mean_centered is None:
            mean_centered = self.mean_centered
        from sparktk.frame.frame import Frame
        return Frame(
            self._tc,
            self._scala.predict(
                frame._scala,
                self._tc.jutils.convert.to_scala_option_list_string(columns),
                mean_centered, self._tc.jutils.convert.to_scala_option(k),
                t_squared_index))
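A hedged usage sketch (the trained PCA model and column names are assumptions):

    >>> # project the observation columns onto the first two principal components
    >>> projected = model.predict(test_frame, columns=["x1", "x2", "x3"], k=2, t_squared_index=True)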
Example #7
def quantiles(self, column_name, quantiles):
    """
    Returns a new frame with Quantiles and their values.

    Parameters
    ----------

    :param column_name: (str) The column to calculate quantiles on
    :param quantiles: (List[float]) The quantiles being requested
    :return: (Frame) A new frame with two columns (float): requested Quantiles and their respective values.

    Calculates quantiles on the given column.

    Examples
    --------
    <hide>

        >>> data = [[100],[250],[95],[179],[315],[660],[540],[420],[250],[335]]
        >>> schema = [('final_sale_price', int)]

        >>> my_frame = tc.frame.create(data, schema)
        <progress>

    </hide>

    Consider Frame *my_frame*, which accesses a frame that contains a single
    column *final_sale_price*:

        >>> my_frame.inspect()
        [#]  final_sale_price
        =====================
        [0]               100
        [1]               250
        [2]                95
        [3]               179
        [4]               315
        [5]               660
        [6]               540
        [7]               420
        [8]               250
        [9]               335

    To calculate 10th, 50th, and 100th quantile:

        >>> quantiles_frame = my_frame.quantiles('final_sale_price', [10, 50, 100])
        <progress>

    A new Frame containing the requested Quantiles and their respective values
    will be returned:

        >>> quantiles_frame.inspect()
        [#]  Quantiles  final_sale_price_QuantileValue
        ==============================================
        [0]       10.0                            95.0
        [1]       50.0                           250.0
        [2]      100.0                           660.0

    """
    from sparktk.frame.frame import Frame
    return Frame(self._tc, self._scala.quantiles(column_name, self._tc.jutils.convert.to_scala_list_double(quantiles)))
Example #8
def covariance_matrix(self, data_column_names):
    """
    Calculate covariance matrix for two or more columns.

    Parameters
    ----------

    :param data_column_names: (List[str]) The names of the columns from which to compute the matrix.
                              Names should refer to a single column of type vector, or two or more
                              columns of numeric scalars.
    :return: (Frame) A matrix with the covariance values for the columns.

    Notes
    -----
    This function applies only to columns containing numerical data.

    Examples
    --------
    Consider Frame *my_frame*, which contains the data

        <hide>
        >>> s = [("idnum", int), ("x1", float), ("x2", float), ("x3", float), ("x4", float)]
        >>> rows = [ [0, 1.0, 4.0, 0.0, -1.0], [1, 2.0, 3.0, 0.0, -1.0], [2, 3.0, 2.0, 1.0, -1.0], [3, 4.0, 1.0, 2.0, -1.0], [4, 5.0, 0.0, 2.0, -1.0]]
        >>> my_frame = tc.frame.create(rows, s)
        -etc-

        </hide>
        >>> my_frame.inspect()
        [#]  idnum  x1   x2   x3   x4
        ===============================
        [0]      0  1.0  4.0  0.0  -1.0
        [1]      1  2.0  3.0  0.0  -1.0
        [2]      2  3.0  2.0  1.0  -1.0
        [3]      3  4.0  1.0  2.0  -1.0
        [4]      4  5.0  0.0  2.0  -1.0


    my_frame.covariance_matrix computes the covariance on each pair of columns in the user-provided list.

        >>> cov_matrix = my_frame.covariance_matrix(my_frame.column_names)
        <progress>

    The resulting table (specifying all columns) is:

        >>> cov_matrix.inspect()
        [#]  idnum  x1    x2    x3    x4
        =================================
        [0]    2.5   2.5  -2.5   1.5  0.0
        [1]    2.5   2.5  -2.5   1.5  0.0
        [2]   -2.5  -2.5   2.5  -1.5  0.0
        [3]    1.5   1.5  -1.5   1.0  0.0
        [4]    0.0   0.0   0.0   0.0  0.0

    """
    from sparktk.frame.frame import Frame
    return Frame(self._tc,
                 self._scala.covarianceMatrix(self._tc.jutils.convert.to_scala_list_string(data_column_names)))
Example #9
def degree_centrality(self, degree_option='undirected'):
    """
    **Degree Centrality Calculation**

    A fundamental quantity in graph analysis is the degree centrality of a vertex:
    The degree of a vertex is the number of edges adjacent to it, normalized against the
    highest possible degree (number of vertices in graph - 1).

    For a directed edge relation, a vertex has both an out-degree (the number of
    edges leaving the vertex) and an in-degree (the number of edges entering the
    vertex).

    Parameters
    ----------

    :param degree_option: (String) Either "in", "out" or "undirected". Describes the direction of edges to consider

    :return: (Frame) Frame containing the vertex ids and their degree centrality values

    Examples
    --------

        >>> vertex_schema = [('id', int)]
        >>> edge_schema = [('src', int), ('dst', int)]

        >>> vertex_rows = [ [1], [2], [3], [4], [5] ]
        >>> edge_rows = [ [1, 2], [1, 3], [2, 3], [1, 4], [4, 5] ]
        >>> vertex_frame = tc.frame.create(vertex_rows, vertex_schema)
        >>> edge_frame = tc.frame.create(edge_rows, edge_schema)

        >>> graph = tc.graph.create(vertex_frame, edge_frame)

        >>> result = graph.degree_centrality(degree_option="out")
        >>> result.inspect() 
        [#]  id  degree_centrality
        ==========================
        [0]   1               0.75
        [1]   2               0.25
        [2]   3                0.0
        [3]   4               0.25
        [4]   5                0.0



        >>> result = graph.degree_centrality(degree_option="in")
        >>> result.inspect()
        [#]  id  degree_centrality
        ==========================
        [0]   1                0.0
        [1]   2               0.25
        [2]   3                0.5
        [3]   4               0.25
        [4]   5               0.25

    """
    from sparktk.frame.frame import Frame
    return Frame(self._tc, self._scala.degreeCentrality(degree_option))
Example #10
def degrees(self, degree_option='undirected'):
    """
    **Degree Calculation**

    A fundamental quantity in graph analysis is the degree of a vertex:
    The degree of a vertex is the number of edges adjacent to it.

    For a directed edge relation, a vertex has both an out-degree (the number of
    edges leaving the vertex) and an in-degree (the number of edges entering the
    vertex).

    Parameters
    ----------

    :param degree_option: (String) Either "in", "out" or "undirected". Describes the direction of edges to consider

    :return: (Frame) Frame containing the vertex ids and their degrees

    Examples
    --------

        >>> vertex_schema = [('id', int)]
        >>> edge_schema = [('src', int), ('dst', int)]

        >>> vertex_rows = [ [1], [2], [3], [4], [5] ]
        >>> edge_rows = [ [1, 2], [1, 3], [2, 3], [1, 4], [4, 5] ]
        >>> vertex_frame = tc.frame.create(vertex_rows, vertex_schema)
        >>> edge_frame = tc.frame.create(edge_rows, edge_schema)

        >>> graph = tc.graph.create(vertex_frame, edge_frame)

        >>> result = graph.degrees(degree_option="out")
        >>> result.inspect() 
        [#]  Vertex  Degree
        ===================
        [0]       1       3
        [1]       2       1
        [2]       3       0
        [3]       4       1
        [4]       5       0


        >>> result = graph.degrees(degree_option="in")
        >>> result.inspect()
        [#]  Vertex  Degree
        ===================
        [0]       1       0
        [1]       2       1
        [2]       3       2
        [3]       4       1
        [4]       5       1

    """
    from sparktk.frame.frame import Frame
    return Frame(self._tc, self._scala.degree(degree_option))
Example #11
    def predict(self,
                frame,
                mean_centered=True,
                t_squared_index=False,
                observation_columns=None,
                c=None):
        """
        Predicting on a frame's columns using an Intel DAAL PcaModel.

        Parameters
        ----------

        :param frame: (Frame) Frame whose principal components are to be computed.
        :param mean_centered: (Optional(bool)) Option to mean center the columns.
        :param t_squared_index: (Optional(bool)) Indicator for whether the t-squared index is to be computed.
        :param observation_columns: (Optional(list[str])) List of observation column name(s) to be used for prediction.
                                    Default is the list of column name(s) used to train the model.
        :param c: (Optional(int)) The number of principal components to be predicted. 'c' cannot be greater than the
                  count used to train the model.  Default is the count used to train the model.
        :return: (Frame) A frame with the existing columns and the following additional columns:
                 'c' additional columns: containing the projections of the frame onto V
                 't_squared_index': column storing the t-squared-index value, if requested
        """
        from sparktk.frame.frame import Frame
        if not isinstance(frame, Frame):
            raise TypeError(
                "'frame' parameter should be a frame, but received type: %s." %
                type(frame))
        if not isinstance(mean_centered, bool):
            raise TypeError(
                "'mean_centered' parameter should be a boolean, but received type: %s."
                % type(mean_centered))
        if not isinstance(t_squared_index, bool):
            raise TypeError(
                "'t_squared_index' parameter should be a boolean, but received type: %s."
                % type(t_squared_index))
        if c is not None and not isinstance(c, int):
            raise TypeError(
                "If optional parameter 'c' is specified, it should be an int, but received type: %s."
                % (type(c)))
        if observation_columns is not None and not isinstance(
                observation_columns, list):
            raise TypeError(
                "observation_columns must be a list of strings (or None), but received type: %s"
                % type(observation_columns))

        predict_frame = self._scala.predict(
            frame._scala, mean_centered, t_squared_index,
            self._tc.jutils.convert.to_scala_option_list_string(
                observation_columns),
            self._tc.jutils.convert.to_scala_option(c))

        return Frame(self._tc, predict_frame)
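A hedged usage sketch (the trained Intel DAAL PcaModel is an assumption):

    >>> # compute 2 principal components and the t-squared index for the frame
    >>> predicted_frame = model.predict(frame, mean_centered=True, t_squared_index=True, c=2)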
Example #12
def page_rank(self,
              convergence_tolerance=None,
              reset_probability=None,
              max_iterations=None):
    """
    **Page Rank**

    Page Rank is a popular statistic that ranks vertices based off of
    connectivity in the global graph

    Exactly 1 of convergence_tolerance and max_iterations must be set (termination criteria)

    Parameters
    ----------

    :param convergence_tolerance: (Float) If the difference between successive iterations is less than this, the algorithm terminates. Mutually exclusive with max_iterations
    :param reset_probability: (Float) Value for the reset probability in the page rank algorithm
    :param max_iterations: (Int) Maximum number of iterations the page rank should run before terminating. Mutually exclusive with convergence_tolerance

    :return: (Frame) Frame containing the vertex ids and their page rank

    Examples
    --------

        >>> vertex_schema = [('id', int)]
        >>> edge_schema = [('src', int), ('dst', int)]

        >>> vertex_rows = [ [1], [2], [3], [4], [5] ]
        >>> edge_rows = [ [1, 2], [1, 3], [2, 3], [1, 4], [4, 5] ]
        >>> vertex_frame = tc.frame.create(vertex_rows, vertex_schema)
        >>> edge_frame = tc.frame.create(edge_rows, edge_schema)

        >>> graph = tc.graph.create(vertex_frame, edge_frame)

        >>> result = graph.page_rank(max_iterations=20)
        >>> result.inspect()
        [#]  Vertex  PageRank
        =====================
        [0]       1      0.15
        [1]       2    0.1925
        [2]       3  0.356125
        [3]       4    0.1925
        [4]       5  0.313625

    """
    from sparktk.frame.frame import Frame
    return Frame(
        self._tc,
        self._scala.pageRank(
            self._tc.jutils.convert.to_scala_option(max_iterations),
            self._tc.jutils.convert.to_scala_option(reset_probability),
            self._tc.jutils.convert.to_scala_option(convergence_tolerance)))
Example #13
def ecdf(self, column):
    """
    Builds a new frame with columns for data and distribution.

    Parameters
    ----------

    :param column: (str) The name of the input column containing the sample.
    :return: (Frame) A new Frame containing each distinct value in the sample and its corresponding ECDF value.

    Generates the :term:`empirical cumulative distribution` for the input column.

    Examples
    --------

    Consider the following sample data set in the frame *frame*, containing several numbers.

    <hide>
        >>> frame = tc.frame.create([[1], [3], [1], [0], [2], [1], [4], [3]], [('numbers', int)])
        -etc-

    </hide>

        >>> frame.inspect()
        [#]  numbers
        ============
        [0]        1
        [1]        3
        [2]        1
        [3]        0
        [4]        2
        [5]        1
        [6]        4
        [7]        3

        >>> ecdf_frame = frame.ecdf('numbers')
        <progress>

        >>> ecdf_frame.inspect()
        [#]  numbers  numbers_ecdf
        ==========================
        [0]        0         0.125
        [1]        1           0.5
        [2]        2         0.625
        [3]        3         0.875
        [4]        4           1.0

    """
    from sparktk.frame.frame import Frame
    return Frame(self._tc, self._scala.ecdf(column))
Example #14
def clustering_coefficient(self):
    """
    The clustering coefficient of a vertex provides a measure of how
    tightly clustered that vertex's neighborhood is.
    
    Formally:
    
    .. math::
    
       cc(v)  = \frac{ \| \{ (u,v,w) \in V^3: \ \{u,v\}, \{u, w\}, \{v,w \} \in \
           E \} \| }{\| \{ (u,v,w) \in V^3: \ \{v, u \}, \{v, w\} \in E \} \|}
    
    For further reading on clustering
    coefficients, see http://en.wikipedia.org/wiki/Clustering_coefficient.
    
    This method returns a frame with each vertex id associated with its local
    clustering coefficient.

    Parameters
    ----------

    :return: (Frame) Frame containing the vertex ids and their clustering coefficients

    Examples
    --------

        >>> vertex_schema = [('id', int)]
        >>> edge_schema = [('src', int), ('dst', int)]

        >>> vertex_rows = [ [1], [2], [3], [4], [5] ]
        >>> edge_rows = [ [1, 2], [1, 3], [2, 3], [1, 4], [4, 5] ]
        >>> vertex_frame = tc.frame.create(vertex_rows, vertex_schema)
        >>> edge_frame = tc.frame.create(edge_rows, edge_schema)

        >>> graph = tc.graph.create(vertex_frame, edge_frame)

        >>> result = graph.clustering_coefficient()
        >>> result.inspect()
        [#]  id  clustering_coefficient
        ===============================
        [0]   1          0.333333333333
        [1]   2                     1.0
        [2]   3                     1.0
        [3]   4                     0.0
        [4]   5                     0.0

    """
    from sparktk.frame.frame import Frame
    return Frame(self._tc, self._scala.clusteringCoefficient())
Example #15
from sparktk.tkcontext import TkContext

def import_jdbc(connection_url, table_name, tc=TkContext.implicit):
    """
    Import data from jdbc table into frame.

    Parameters
    ----------

    :param connection_url: (str) JDBC connection url to database server
    :param table_name: (str) JDBC table name
    :return: (Frame) returns frame with jdbc table data

    Examples
    --------
    Load a frame from a jdbc table specifying the connection url to the database server.

    <skip>
        >>> url = "jdbc:postgresql://localhost/postgres"
        >>> tb_name = "demo_test"

        >>> frame = tc.frame.import_jdbc(url, tb_name)
        -etc-

        >>> frame.inspect()
        [#]  a  b    c   d
        ==================
        [0]  1  0.2  -2  5
        [1]  2  0.4  -1  6
        [2]  3  0.6   0  7
        [3]  4  0.8   1  8

        >>> frame.schema
        [(u'a', int), (u'b', float), (u'c', int), (u'd', int)]
    </skip>
    """
    if not isinstance(connection_url, basestring):
        raise ValueError(
            "connection url parameter must be a string, but is {0}.".format(
                type(connection_url)))
    if not isinstance(table_name, basestring):
        raise ValueError(
            "table name parameter must be a string, but is {0}.".format(
                type(table_name)))
    TkContext.validate(tc)

    scala_frame = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.constructors.Import.importJdbc(
        tc.jutils.get_scala_sc(), connection_url, table_name)

    from sparktk.frame.frame import Frame
    return Frame(tc, scala_frame)
Example #16
def betweenness_centrality(self, edge_weight=None, normalize=True):
    """
    **Betweenness Centrality**

    Calculates the betweenness centrality exactly, with an optional weights parameter
    for the distance between the vertices.


    Parameters
    ----------

    :param edge_weight: (Optional(str)) The name of the column containing the edge weights.
           If None, every edge is assigned a weight of 1.
    :param normalize: (Optional(bool)) If true, normalize the betweenness centrality values
           by the number of pairwise paths possible

    :return: (Frame) Frame containing the vertex IDs and their corresponding betweenness centrality value

    Examples
    --------

        >>> vertex_schema = [('id', int)]
        >>> edge_schema = [('src', int), ('dst', int)]

        >>> vertex_rows = [ [1], [2], [3], [4], [5] ]
        >>> edge_rows = [ [1, 2], [1, 3], [2, 3], [1, 4], [4, 5] ]
        >>> vertex_frame = tc.frame.create(vertex_rows, vertex_schema)
        >>> edge_frame = tc.frame.create(edge_rows, edge_schema)

        >>> graph = tc.graph.create(vertex_frame, edge_frame)

        >>> result = graph.betweenness_centrality()
        >>> result.inspect() 
        [#]  id  betweenness_centrality
        ===============================
        [0]   1          0.666666666667
        [1]   2                     0.0
        [2]   3                     0.0
        [3]   4                     0.5
        [4]   5                     0.0



    """
    from sparktk.frame.frame import Frame
    return Frame(
        self._tc,
        self._scala.betweennessCentrality(
            self._tc.jutils.convert.to_scala_option(edge_weight), normalize))
Example #17
    def predict(self, frame, observation_columns):
        """
        Predict values for a frame using a trained Linear Regression model

        :param frame: (Frame) The frame to predict on
        :param observation_columns: (Optional(List[str])) List of column(s) containing the observations
        :return: (Frame) returns frame with predicted column added
        """
        from sparktk.frame.frame import Frame
        return Frame(
            self._tc,
            self._scala.predict(
                frame._scala,
                self._tc.jutils.convert.to_scala_option_list_string(
                    observation_columns)))
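A hedged usage sketch (the trained linear regression model and column name are assumptions):

    >>> # pass None for observation_columns to fall back to the columns used in training
    >>> predicted_frame = model.predict(test_frame, ["x1"])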
Example #18
    def predict(self, frame, columns=None):
        """
        Predicts the labels for the observation columns in the given input frame. Creates a new frame
        with the existing columns and a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param columns: (Optional(List[str])) Names of the observation columns.
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        c = self.__columns_to_option(columns)
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, c))
Example #19
    def predict(self, frame, observation_columns=None, comparison_frame=None):
        """
        Predict values for a frame using a trained CoxPH model

        Parameters
        ----------

        :param frame: (Frame) The frame to predict on
        :param observation_columns: (Optional(List[str])) List of column(s) containing the observations. Default is the list of covariate columns
        :param comparison_frame: (Optional(Frame)) Frame to compare against. Default is the training frame
        :return: (Frame) returns frame with predicted column added
        """
        observation_columns = self.__columns_to_option(observation_columns)
        comparison_frame = self.__frame_to_option(comparison_frame)
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, observation_columns, comparison_frame))
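A hedged usage sketch (the trained CoxPH model is an assumption):

    >>> # with only the frame supplied, the covariate columns and the training
    >>> # frame are used as the defaults
    >>> predicted_frame = model.predict(test_frame)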
Example #20
def label_propagation(self, max_iterations):
    """

    Assigns labels based on proximity to different vertices. The labels
    are initially one unique label per vertex (the vertex id), and as the
    algorithm runs some of these get erased.

    Note this algorithm is neither guaranteed to converge, nor guaranteed to
    converge to the correct value.

    This calls GraphFrames' label propagation, which can be found at
    http://graphframes.github.io/api/scala/index.html#org.graphframes.lib.LabelPropagation

    Parameters
    ----------

    :param max_iterations: (int) The maximum number of iterations to run
    :return: (Frame) Frame containing the vertex ids and the community they are a member of

    Examples
    --------

        >>> vertex_schema = [('id', int)]
        >>> vertex_rows = [ [1], [2], [3], [4], [5] ]

        >>> edge_rows = [ [1, 2], [1, 3], [2, 3], [1, 4], [4, 5] ]
        >>> edge_schema = [('src', int), ('dst', int)]

        >>> vertex_frame = tc.frame.create(vertex_rows, vertex_schema)
        >>> edge_frame = tc.frame.create(edge_rows, edge_schema)

        >>> graph = tc.graph.create(vertex_frame, edge_frame)

        >>> result = graph.label_propagation(2)
        >>> result.inspect()
        [#]  id       label
        ===================
        [0]   1  8589934594
        [1]   2  8589934593
        [2]   3  8589934593
        [3]   4  8589934593
        [4]   5  8589934594

    """
    from sparktk.frame.frame import Frame
    return Frame(self._tc, self._scala.labelPropagation(max_iterations))
Example #21
    def predict(self, frame, observation_columns=None):
        """
        Predict labels for data points using trained logistic regression model.

        Predict the labels for a test frame using the trained logistic regression model, and create a new frame revision with
        the existing columns and a new predicted labels column.

        Parameters
        ----------

        :param frame: (Frame) A frame whose labels are to be predicted. By default, predict is run on the same columns
                              over which the model is trained.
        :param observation_columns: (None or list[str]) Column(s) containing the observations whose labels are
                                            to be predicted. Default is the observation columns the model was trained on.
        :return: (Frame) Frame containing the original frame's columns and a column with the predicted label.
        """
        columns_option = self._tc.jutils.convert.to_scala_option_list_string(self.__get_observation_columns(observation_columns))
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, columns_option))
Example #22
    def predict(self, frame, observation_columns=None):
        """
        Predicts the labels for the observation columns in the given input frame. Creates a new frame
        with the existing columns and a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param observation_columns: (List[str]) Names of the observation columns.
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        from sparktk.arguments import affirm_type
        from sparktk.frame.frame import Frame
        columns_list = affirm_type.list_of_str(observation_columns,
                                               "observation_columns",
                                               allow_none=True)
        columns_option = self._tc.jutils.convert.to_scala_option_list_string(
            columns_list)
        return Frame(self._tc, self._scala.predict(frame._scala,
                                                   columns_option))
Example #23
    def predict(self, frame, columns=None):
        """
        Predict the values for the data points.

        Predict the values for a test frame using trained Random Forest Classifier model, and create a new frame revision
        with existing columns and a new predicted value's column.

        Parameters
        ----------

        :param frame: (Frame) A frame whose labels are to be predicted. By default, predict is run on the same columns
                      over which the model is trained.
        :param columns: (Optional(list[str])) Column(s) containing the observations whose labels are to be predicted.
                        By default, we predict the labels over columns the Random Forest model was trained on.
        :return: (Frame) A new frame consisting of the existing columns of the frame and a new column with predicted
                 value for each observation.
        """

        c = self.__columns_to_option(columns)
        from sparktk.frame.frame import Frame
        return Frame(self._tc, self._scala.predict(frame._scala, c))
Example #24
    def predict(self, frame, observation_columns=None):
        """
        Predict the values for the data points.

        Predict the values for a test frame using trained Random Forest Classifier model, and create a new frame revision
        with existing columns and a new predicted value's column.

        Parameters
        ----------

        :param frame: (Frame) A frame whose labels are to be predicted. By default, predict is run on the same columns
                      over which the model is trained.
        :param observation_columns: (Optional(list[str])) Column(s) containing the observations whose labels are to be predicted.
                        By default, we predict the labels over columns the Random Forest model was trained on.
        :return: (Frame) A new frame consisting of the existing columns of the frame and a new column with predicted
                 value for each observation.
        """

        from sparktk.arguments import require_type, affirm_type
        from sparktk.frame.frame import Frame
        require_type(Frame, frame, 'frame')
        column_list = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)
        columns_option = self._tc.jutils.convert.to_scala_option_list_string(column_list)
        return Frame(self._tc, self._scala.predict(frame._scala, columns_option))
Example #25
def connected_components(self):
    """

    Connected components groups the vertices of a graph according to whether
    there is a path between them. This method returns a frame with the
    vertices and their corresponding component.

    Parameters
    ----------

    :return: (Frame) Frame containing the vertex ids and their components

    Examples
    --------

        >>> vertex_schema = [('id', int)]
        >>> edge_schema = [('src', int), ('dst', int)]

        >>> vertex_rows = [ [1], [2], [3], [4], [5] ]
        >>> edge_rows = [ [1, 2], [1, 3], [2, 3], [4, 5] ]
        >>> vertex_frame = tc.frame.create(vertex_rows, vertex_schema)
        >>> edge_frame = tc.frame.create(edge_rows, edge_schema)

        >>> graph = tc.graph.create(vertex_frame, edge_frame)

        >>> result = graph.connected_components()
        >>> result.inspect() 
        [#]  id  component
        ==================
        [0]   1          1
        [1]   2          1
        [2]   3          1
        [3]   4          4
        [4]   5          4

    """
    from sparktk.frame.frame import Frame
    return Frame(self._tc, self._scala.connectedComponents())
Example #26
    def predict(self, frame, observation_columns=None, label_column=None):
        """
        Predict the cluster assignments for the data points, using the trained model.

        Parameters
        ----------

        :param frame: (Frame) A frame whose labels are to be predicted.
        :param observation_columns: (Optional(list[str])) Column(s) containing the observations whose clusters are to be
                                    predicted.  Default is to predict the clusters over the columns that the KMeans model
                                    was trained on.
        :param label_column: (Optional(str)) Name of the output column with index of cluster each observation belongs to.
        :return: (Frame) A new frame consisting of the existing columns of the frame and the following new columns:
                 'k' columns: each containing the squared distance of the observation to the corresponding cluster center
                 'predicted_cluster' column: the cluster assignment for the observation

        """
        from sparktk.frame.frame import Frame
        if not isinstance(frame, Frame):
            raise TypeError(
                "'frame' parameter should be a frame, but received type: %s." %
                type(frame))
        if observation_columns is not None and (
                not isinstance(observation_columns, list) or not all(
                    isinstance(column, basestring)
                    for column in observation_columns)):
            raise TypeError(
                "observation_columns must be a list of strings (or None)")
        if label_column is not None and not isinstance(label_column, basestring):
            raise TypeError("label_column must be a string (or None)")

        predict_frame = self._scala.predict(
            frame._scala,
            self._tc.jutils.convert.to_scala_option(label_column),
            self._tc.jutils.convert.to_scala_option_list_string(
                observation_columns))

        return Frame(self._tc, predict_frame)
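A hedged usage sketch (the trained KMeans model and column names are assumptions):

    >>> predicted_frame = model.predict(test_frame, observation_columns=["x1", "x2"], label_column="cluster")
    >>> # adds one squared-distance column per cluster plus the cluster assignment column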
Example #27
def triangle_count(self):
    """
    Counts the number of triangles each vertex is a part of

    Parameters
    ----------

    :return: (Frame) Frame containing the vertex ids and the number of triangles each vertex is part of

    Examples
    --------

        >>> vertex_schema = [('id', int)]
        >>> edge_schema = [('src', int), ('dst', int)]

        >>> vertex_rows = [ [1], [2], [3], [4], [5] ]
        >>> edge_rows = [ [1, 2], [1, 3], [2, 3], [1, 4], [4, 5] ]
        >>> vertex_frame = tc.frame.create(vertex_rows, vertex_schema)
        >>> edge_frame = tc.frame.create(edge_rows, edge_schema)

        >>> graph = tc.graph.create(vertex_frame, edge_frame)

        >>> result = graph.triangle_count()
        >>> result.inspect()
        [#]  count  id 
        ==============
        [0]      1   1
        [1]      1   2
        [2]      1   3
        [3]      0   4
        [4]      0   5


    """
    from sparktk.frame.frame import Frame
    return Frame(self._tc, self._scala.triangleCount())
Example #28
    def predict(self, frame, columns=None):
        """
        Predict labels for data points using trained multinomial Naive Bayes model.

        Parameters
        ----------

        :param frame: (Frame) A frame whose labels are to be predicted.
        :param columns: (Optional(list[str])) Column(s) containing the observations whose labels are to be predicted.
                        By default, we predict the labels over columns the NaiveBayesModel was trained on.
        :return: (Frame) Frame containing the original frame's columns and a column with the predicted label.
        """
        from sparktk.frame.frame import Frame
        if not isinstance(frame, Frame):
            raise TypeError(
                "frame parameter must be a sparktk frame, but received: %s" %
                type(frame))
        if columns is not None and not isinstance(columns, list):
            raise TypeError(
                "columns parameter must be a list of strings (or None), but received %s"
                % type(columns))
        scala_columns = self._tc.jutils.convert.to_scala_option_list_string(
            columns)
        return Frame(self._tc, self._scala.predict(frame._scala,
                                                   scala_columns))
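A hedged usage sketch (the trained multinomial Naive Bayes model is an assumption):

    >>> # columns defaults to the columns the model was trained on
    >>> predicted_frame = model.predict(test_frame)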
Example #29
def join_right(self, right, left_on, right_on=None, use_broadcast_left=False):
    """
    join_right performs a right join (right outer) operation between two frames, creating a new frame.


    Parameters
    ----------

    :param right: (Frame) Another frame to join with
    :param left_on: (List[str]) Names of the columns in the left frame used to match up the two frames.
    :param right_on: (Optional[List[str]]) Names of the columns in the right frame used to match up the two frames. Default is the same as the left frame.
    :param use_broadcast_left: (bool) If left table is small enough to fit in the memory of a single machine,
            you can set use_broadcast_left to True to possibly improve performance using broadcast join. Default is False.

    :returns: (Frame) A new frame with the results of the join

    Create a new frame from a SQL JOIN operation with another frame.
    The frame on the 'left' is the currently active frame.
    The frame on the 'right' is another frame.
    This method takes column(s) in the left frame and matches their values
    with column(s) in the right frame.
    'right' join works similarly to join_left, except it keeps all the data
    from the right frame and only the data from the left frame when it
    matches.

    Notes
    -----
    When a column is named the same in both frames, it will result in two
    columns in the new frame.
    The column from the *left* frame (originally the current frame) will be
    copied and the column name will have the string "_L" added to it.
    The same thing will happen with the column from the *right* frame,
    except its name has the string "_R" appended. The order of columns
    after this method is called is not guaranteed.

    It is recommended that you rename the columns to meaningful terms prior
    to using the ``join`` method.

    Examples
    --------

    <hide>

    >>> codes = tc.frame.create([[1], [3], [1], [0], [2], [1], [5], [3]], [('numbers', int)])
    -etc-

    >>> colors = tc.frame.create([[1, 'red'], [2, 'yellow'], [3, 'green'], [4, 'blue']], [('numbers', int), ('color', str)])
    -etc-

    >>> country_code_rows = [[1, 354, "a"],[2, 91, "a"],[2, 100, "b"],[3, 47, "a"],[4, 968, "c"],[5, 50, "c"]]
    >>> country_code_schema = [("country_code", int),("area_code", int),("test_str",str)]
    -etc-

    >>> country_name_rows = [[1, "Iceland", "a"],[1, "Ice-land", "a"],[2, "India", "b"],[3, "Norway", "a"],[4, "Oman", "c"],[6, "Germany", "c"]]
    >>> country_names_schema = [("country_code", int),("country_name", str),("test_str",str)]
    -etc-

    >>> country_codes_frame = tc.frame.create(country_code_rows, country_code_schema)
    -etc-

    >>> country_names_frame= tc.frame.create(country_name_rows, country_names_schema)
    -etc-

    </hide>

    Consider two frames: codes and colors

    >>> codes.inspect()
    [#]  numbers
    ============
    [0]        1
    [1]        3
    [2]        1
    [3]        0
    [4]        2
    [5]        1
    [6]        5
    [7]        3


    >>> colors.inspect()
    [#]  numbers  color
    ====================
    [0]        1  red
    [1]        2  yellow
    [2]        3  green
    [3]        4  blue

    >>> j_right = codes.join_right(colors, 'numbers')
    <progress>

    >>> j_right.inspect()
    [#]  numbers_R  color
    ======================
    [0]          1  red
    [1]          1  red
    [2]          1  red
    [3]          2  yellow
    [4]          3  green
    [5]          3  green
    [6]          4  blue


    (The join adds an extra column *_R which is the join column from the right frame; it may be disregarded)

    Consider two frames: country_codes_frame and country_names_frame

    >>> country_codes_frame.inspect()
    [#]  country_code  area_code  test_str
    ======================================
    [0]             1        354  a
    [1]             2         91  a
    [2]             2        100  b
    [3]             3         47  a
    [4]             4        968  c
    [5]             5         50  c


    >>> country_names_frame.inspect()
    [#]  country_code  country_name  test_str
    =========================================
    [0]             1  Iceland       a
    [1]             1  Ice-land      a
    [2]             2  India         b
    [3]             3  Norway        a
    [4]             4  Oman          c
    [5]             6  Germany       c

    Join them on the 'country_code' and 'test_str' columns

    >>> composite_join_right = country_codes_frame.join_right(country_names_frame, ['country_code', 'test_str'])
    <progress>

    >>> composite_join_right.inspect()
    [#]  area_code  country_code_R  country_name  test_str_R
    ========================================================
    [0]       None               6  Germany       c
    [1]        354               1  Iceland       a
    [2]        354               1  Ice-land      a
    [3]        100               2  India         b
    [4]         47               3  Norway        a
    [5]        968               4  Oman          c

    Right join broadcasting left table

    >>> j_right = codes.join_right(colors, 'numbers', use_broadcast_left=True)
    <progress>

    >>> j_right.inspect()
    [#]  numbers_R  color
    ======================
    [0]          1  red
    [1]          1  red
    [2]          1  red
    [3]          2  yellow
    [4]          3  green
    [5]          3  green
    [6]          4  blue

    >>> composite_join_right = country_codes_frame.join_right(country_names_frame, ['country_code', 'test_str'], use_broadcast_left=True)
    <progress>

    >>> composite_join_right.inspect()
    [#]  area_code  country_code_R  country_name  test_str_R
    ========================================================
    [0]        354               1  Iceland       a
    [1]        354               1  Ice-land      a
    [2]        100               2  India         b
    [3]         47               3  Norway        a
    [4]        968               4  Oman          c
    [5]       None               6  Germany       c

    """
    if left_on is None:
        raise ValueError(
            "Please provide column name on which join should be performed")
    elif isinstance(left_on, basestring):
        left_on = [left_on]
    if right_on is None:
        right_on = left_on
    elif isinstance(right_on, basestring):
        right_on = [right_on]
    if len(left_on) != len(right_on):
        raise ValueError("Please provide equal number of join columns")

    from sparktk.frame.frame import Frame
    return Frame(
        self._tc,
        self._scala.joinRight(
            right._scala,
            self._tc.jutils.convert.to_scala_list_string(left_on),
            self._tc.jutils.convert.to_scala_option(
                self._tc.jutils.convert.to_scala_list_string(right_on)),
            use_broadcast_left))
Example #30
def power_iteration_clustering(self,
                               source_column,
                               destination_column,
                               similarity_column,
                               k=2,
                               max_iterations=100,
                               initialization_mode="random"):
    """
    Power Iteration Clustering finds a low-dimensional embedding of a dataset using truncated power iteration on a
    normalized pair-wise similarity matrix of the data.

    Parameters
    ----------

    :param source_column: (str) Name of the column containing the source node
    :param destination_column: (str) Name of the column containing the destination node
    :param similarity_column: (str) Name of the column containing the similarity
    :param k: (Optional(int)) Number of clusters to cluster the graph into. Default is 2
    :param max_iterations: (Optional(int)) Maximum number of iterations of the power iteration loop. Default is 100
    :param initialization_mode: (Optional(str)) Initialization mode of power iteration clustering. This can be either
     "random" to use a random vector as vertex properties, or "degree" to use normalized sum similarities. Default is "random".
    :return: (namedtuple) Returns a namedtuple containing the results frame (node and cluster), k (number of clusters),
     and cluster_sizes (a map of clusters and their respective sizes)

    Example
    -------

        >>> frame = tc.frame.create([[1,2,1.0],
        ...                         [1,3,0.3],
        ...                         [2,3,0.3],
        ...                         [3,0,0.03],
        ...                         [0,5,0.01],
        ...                         [5,4,0.3],
        ...                         [5,6,1.0],
        ...                         [4,6,0.3]],
        ...                         [('Source', int), ('Destination', int), ('Similarity',float)])

        >>> frame.inspect()
        [#]  Source  Destination  Similarity
        ====================================
        [0]       1            2         1.0
        [1]       1            3         0.3
        [2]       2            3         0.3
        [3]       3            0        0.03
        [4]       0            5        0.01
        [5]       5            4         0.3
        [6]       5            6         1.0
        [7]       4            6         0.3

        >>> x = frame.power_iteration_clustering('Source', 'Destination', 'Similarity', k=3)

        >>> x.frame.inspect()
        [#]  id  cluster
        ================
        [0]   4        2
        [1]   0        3
        [2]   6        2
        [3]   2        1
        [4]   1        1
        [5]   3        1
        [6]   5        2

        >>> x.k
        3
        >>> x.cluster_sizes
        {u'2': 3, u'3': 1, u'1': 3}

    """
    result = self._scala.powerIterationClustering(source_column,
                                                  destination_column,
                                                  similarity_column, k,
                                                  max_iterations,
                                                  initialization_mode)
    k_val = result.k()
    cluster_sizes = self._tc.jutils.convert.scala_map_to_python(
        result.clusterSizes())
    from sparktk.frame.frame import Frame
    from collections import namedtuple
    # PicResult is assumed to be defined at module level in sparktk; redefined here so the snippet is self-contained
    PicResult = namedtuple("PicResult", ["frame", "k", "cluster_sizes"])
    py_frame = Frame(self._tc, result.clusterMapFrame())
    return PicResult(frame=py_frame, k=k_val, cluster_sizes=cluster_sizes)