Example #1
0
 def test_list_of_str_bad_len(self):
     # A three-item list is passed while length=2 is requested; the call is
     # expected to raise ValueError with a message ending in the text below.
     wanted_suffix = "Expected list of length 2."
     try:
         affirm_type.list_of_str(["a", "b", "c"], "a", length=2)
     except ValueError as error:
         text = str(error)
         note = "expected error message should have ended with '%s', message =%s" % (wanted_suffix, text)
         self.assertTrue(text.endswith(wanted_suffix), note)
     else:
         self.fail("A ValueError should have been raised")
Example #2
0
 def test_list_of_str_value_error(self):
     # A list of ints is handed to list_of_str; the call is expected to raise
     # ValueError whose message contains the text below.
     wanted = "Expected str or list of str"
     not_strings = [1, 2, 3]
     try:
         affirm_type.list_of_str(not_strings, "a")
     except ValueError as error:
         text = str(error)
         self.assertTrue(wanted in text, "\nexpected=%s\nmessage =%s" % (wanted, text))
     else:
         self.fail("A ValueError should have been raised")
Example #3
0
 def test_list_of_str_type_error(self):
     # A float is neither a str nor a list; the call is expected to raise
     # TypeError whose message contains the text below.
     wanted = "Expected type str or list of str."
     not_a_list = 3.14
     try:
         affirm_type.list_of_str(not_a_list, "a")
     except TypeError as error:
         text = str(error)
         self.assertTrue(wanted in text, "\nexpected=%s\nmessage =%s" % (wanted, text))
     else:
         self.fail("A TypeError should have been raised")
Example #4
0
def collect(self, columns=None):
    """
    Brings all the rows of data from the frame into a local python list of lists

    (Use the 'take' operation for control over row count and offset of the collected data)

    Parameters
    ----------

    :param columns: (Optional[str or List[str]]) If not None, only the given columns' data will be provided.
                    A single column name may be given as a plain str.  By default, all columns are included.
    :return: (List[List[*]]) the frame data represented as a list of lists; an empty list is returned
             when an empty list of columns is given

    Examples
    --------

        >>> schema = [('name',str), ('age', int), ('tenure', int), ('phone', str)]
        >>> rows = [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
        >>> frame = tc.frame.create(rows, schema)
        >>> frame.collect()
        [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]

        >>> frame.collect(['name', 'phone'])
        [['Fred', '555-1234'], ['Susan', '555-0202'], ['Thurston', '555-4510'], ['Judy', '555-2183']]

        <hide>
        >>> tmp = frame._scala
        >>> frame.collect(['name', 'phone'])
        [[u'Fred', u'555-1234'], [u'Susan', u'555-0202'], [u'Thurston', u'555-4510'], [u'Judy', u'555-2183']]

        </hide>

    """
    if columns is not None:
        # Keep the validated value rather than discarding it: list_of_str returns the
        # list form of its argument, so a lone column name reaches the code below as a list.
        columns = affirm_type.list_of_str(columns, "columns")
        if not columns:
            return []

    if self._is_scala:
        scala_data = self._scala.collect(self._tc.jutils.convert.to_scala_option_list_string(columns))
        # Narrow the schema to the requested columns so row conversion lines up with the data.
        schema = get_schema_for_columns(self.schema, columns) if columns else self.schema
        return TakeCollectHelper.scala_rows_to_python(self._tc, scala_data, schema)

    if columns:
        select = TakeCollectHelper.get_select_columns_function(self.schema, columns)
        return self._python.rdd.map(select).collect()
    return self._python.rdd.collect()
Example #5
0
def train(frame,
          observation_columns,
          label_column,
          intercept=True,
          num_iterations=100,
          step_size=1.0,
          reg_type=None,
          reg_param=0.01,
          mini_batch_fraction=1.0):
    """
    Trains an SVM model on the given frame and returns it

    Parameters
    ----------

    :param frame: (Frame) frame of training data; must not be None
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param label_column: (str) Column containing the label for each observation
    :param intercept: (boolean) Flag indicating if the algorithm adds an intercept. Default is true
    :param num_iterations: (int) Number of iterations for SGD. Default is 100
    :param step_size: (float) Initial step size for SGD optimizer for the first step. Default is 1.0
    :param reg_type: (Optional(str)) Regularization "L1" or "L2". The python default is None, which is
                     forwarded as an option value (presumably the Scala side then applies "L2" -- confirm)
    :param reg_param: (float) Regularization parameter. Default is 0.01
    :param mini_batch_fraction: (float) Set fraction of data to be used for each SGD iteration. Default is 1.0;
                                corresponding to deterministic/classical gradient descent
    :return: (SvmModel) The SVM trained model (with SGD)
    :raises ValueError: if frame is None

    Notes
    -----
    Support Vector Machine is a supervised algorithm used to perform binary classification. A Support Vector Machine
    constructs a high dimensional hyperplane which is said to achieve a good separation when a hyperplane has the
    largest distance to the nearest training-data point of any class. This model runs the MLLib implementation of SVM
    with SGD optimizer. The SVM model is initialized, trained on columns of a frame, used to predict the labels
    of observations in a frame, and tests the predicted labels against the true labels. During testing, labels of the
    observations are predicted and tested against the true labels using built-in binary Classification Metrics.

    """
    if frame is None:
        raise ValueError("frame cannot be None")

    context = frame._tc
    scala_trainer = get_scala_obj(context)
    checked_columns = affirm_type.list_of_str(observation_columns, "observation_columns")

    # Arguments are assembled in the positional order of the Scala-side train call.
    scala_args = (
        frame._scala,
        context.jutils.convert.to_scala_list_string(checked_columns),
        label_column,
        intercept,
        num_iterations,
        step_size,
        context.jutils.convert.to_scala_option(reg_type),
        reg_param,
        mini_batch_fraction,
    )
    trained = scala_trainer.train(*scala_args)

    return SvmModel(context, trained)
Example #6
0
def train(frame,
          time_column,
          covariate_columns,
          censor_column,
          convergence_tolerance=1E-6,
          max_steps=100):
    """
    Creates a CoxProportionalHazardsModel by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) A frame to train the model on
    :param time_column: (str) Column name containing the time of occurrence of each observation.
    :param covariate_columns: (str or Seq[str]) Column(s) containing the covariates.
    :param censor_column: (str) Column name containing censor value of each observation.
    :param convergence_tolerance: (float) Parameter for the convergence tolerance for iterative algorithms. Default is 1E-6
    :param max_steps: (int) Parameter for maximum number of steps. Default is 100
    :return: (CoxProportionalHazardsModel) A trained coxPh model
    """
    from sparktk.frame.frame import Frame
    # NOTE(review): elsewhere require_type is given the bare argument name as its last
    # argument; here full sentences are passed.  Left unchanged -- confirm the intended text.
    require_type(Frame, frame, "frame cannot be None")
    require_type.non_empty_str(time_column, "time_column")
    require_type.non_empty_str(censor_column, "censor_column")
    require_type(float, convergence_tolerance, "convergence_tolerance should be float")
    require_type.non_negative_int(max_steps, "max_steps")
    # Keep the validated value rather than discarding it: list_of_str returns the list
    # form of its argument, so a lone column name is converted below as a one-item list.
    covariate_columns = affirm_type.list_of_str(covariate_columns, "covariate_columns")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_covariate_columns = tc.jutils.convert.to_scala_vector_string(covariate_columns)

    scala_model = _scala_obj.train(frame._scala,
                                   time_column,
                                   scala_covariate_columns,
                                   censor_column,
                                   convergence_tolerance,
                                   max_steps)
    return CoxProportionalHazardsModel(tc, scala_model)
Example #7
0
    def predict(self, frame, observation_columns=None):
        """
        Runs the model's prediction on the given frame and returns the result as a new frame
        holding the existing columns plus a new predicted column.

        Parameters
        ----------

        :param frame: (Frame) Frame used for predicting the values
        :param observation_columns: (Optional[str or List[str]]) Names of the observation columns.
                                    None (the default) is accepted.
        :return: (Frame) A new frame containing the original frame's columns and a prediction column
        """
        wanted = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)
        scala_columns = self._tc.jutils.convert.to_scala_option_list_string(wanted)
        predicted = self._scala.predict(frame._scala, scala_columns)
        return Frame(self._tc, predicted)
    def test(self, frame, observation_columns=None, label_column=None):
        """
        Evaluates the trained model against the given frame

        Parameters
        ----------

        :param frame: (Frame) The frame to predict on
        :param observation_columns: Optional(List[str]) List of column(s) containing the observations
        :param label_column: Optional(String) Column name containing the label for each observation
        :return: (RegressionTestMetrics) RegressionTestMetrics object consisting of results from model test
        """
        require_type(Frame, frame, 'frame')
        checked = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)
        to_scala = self._tc.jutils.convert
        scala_columns = to_scala.to_scala_option_list_string(checked)
        scala_label = to_scala.to_scala_option(label_column)
        raw_metrics = self._scala.test(frame._scala, scala_columns, scala_label)
        return RegressionTestMetrics(raw_metrics)
Example #9
0
    def predict(self, frame, observation_columns=None):
        """
        Predict the values for the data points.

        Uses the trained Random Forest Classifier model to predict values for a test frame and returns a new
        frame revision with the existing columns and a new predicted value's column.

        Parameters
        ----------

        :param frame: (Frame) A frame whose labels are to be predicted. By default, predict is run on the same columns
                      over which the model is trained.
        :param observation_columns: (Optional(list[str])) Column(s) containing the observations whose labels are to be
                        predicted. By default, we predict the labels over columns the Random Forest model was trained on.
        :return: (Frame) A new frame consisting of the existing columns of the frame and a new column with predicted
                 value for each observation.
        """

        require_type(Frame, frame, 'frame')
        checked = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)
        scala_columns = self._tc.jutils.convert.to_scala_option_list_string(checked)
        scala_result = self._scala.predict(frame._scala, scala_columns)
        return Frame(self._tc, scala_result)
Example #10
0
    def test(self, frame, observation_columns=None, label_column=None):
        """
        Predict test frame labels and return metrics.

        Parameters
        ----------

        :param frame: (Frame) The frame whose labels are to be predicted
        :param observation_columns: (Optional(list[str])) Column(s) containing the observations whose labels are to be
                                    predicted. By default, the same observation column names from training are used
        :param label_column: (str) Column containing the name of the label
                                    By default, the same label column name from training is used
        :return: (ClassificationMetricsValue) Binary classification metrics comprised of:
                accuracy (double)
                The proportion of predictions that are correctly identified
                confusion_matrix (dictionary)
                A table used to describe the performance of a classification model
                f_measure (double)
                The harmonic mean of precision and recall
                precision (double)
                The proportion of predicted positive instances that are correctly identified
                recall (double)
                The proportion of positive instances that are correctly identified.
        """
        require_type(Frame, frame, 'frame')
        checked = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)

        context = self._tc
        scala_metrics = self._scala.test(
            frame._scala,
            context.jutils.convert.to_scala_option_list_string(checked),
            context.jutils.convert.to_scala_option(label_column))
        return ClassificationMetricsValue(context, scala_metrics)
Example #11
0
def take(self, n, offset=0, columns=None):
    """
    Get data subset.

    Take a subset of the currently active Frame.

    (See 'collect' operation to simply get all the data from the Frame)

    Parameters
    ----------

    :param n: (int) The number of rows to get from the frame (warning: do not overwhelm the python session
                    by taking too much)
    :param offset: (Optional[int]) The number of rows to skip before starting to copy.
    :param columns: (Optional[str or list[str]]) If not None, only the given columns' data will be provided.
                    By default, all columns are included.
    :return: (list[list[data]]) raw frame data; an empty list is returned when an empty list of columns is given

    Examples
    --------

    <hide>
        >>> schema = [('name',str), ('age', int), ('tenure', int), ('phone', str)]
        >>> rows = [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
        >>> frame = tc.frame.create(rows, schema)
        -etc-
    </hide>

    Consider the following frame:
        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

    Use take to get the first two rows and look at the schema and data in the result:

        >>> frame.take(2)
        [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202']]

    Limit the columns in our result to just the name and age column:

        >>> frame.take(2, columns=['name', 'age'])
        [['Fred', 39], ['Susan', 33]]

    <hide>
        >>> tmp = frame._scala  # flip over to scala and try
        >>> frame.take(2, columns=['name', 'age'])
        [[u'Fred', 39], [u'Susan', 33]]

    </hide>

    """
    # Both counts are validated once here, before either backend is touched.
    require_type.non_negative_int(n, "n")
    require_type.non_negative_int(offset, "offset")
    if columns is not None:
        columns = affirm_type.list_of_str(columns, "columns")
        if not columns:
            return []

    if self._is_scala:
        scala_data = self._scala.take(n, offset, self._tc.jutils.convert.to_scala_option_list_string(columns))
        # Narrow the schema to the requested columns so row conversion lines up with the data.
        schema = get_schema_for_columns(self.schema, columns) if columns else self.schema
        return TakeCollectHelper.scala_rows_to_python(self._tc, scala_data, schema)

    # Python backend: n needs no second check, it was validated above.
    if offset:
        return _take_offset(self, n, offset, columns)
    if columns:
        select_columns = TakeCollectHelper.get_select_columns_function(self.schema, columns)
        return self._python.rdd.map(select_columns).take(n)
    return self._python.rdd.take(n)
Example #12
0
 def __get_observation_columns(self, observation_columns):
     # None is passed through untouched; anything else goes through
     # affirm_type.list_of_str and its result is returned.
     if observation_columns is not None:
         return affirm_type.list_of_str(observation_columns, "observation_columns")
     return None
 def test_list_of_str_single_string(self):
     # A lone str is expected to come back wrapped in a one-element list.
     single = "uno"
     self.assertEqual([single], affirm_type.list_of_str(single, "a"))
 def test_list_of_str(self):
     # A list of str is expected to come back equal to the input.
     words = ["uno", "dos", "tres"]
     self.assertEqual(words, affirm_type.list_of_str(words, "a"))
Example #15
0
 def test_list_of_float_none(self):
     # With allow_none=True, None is expected to come back as None.
     # NOTE(review): the name says "float" but the call under test is list_of_str -- confirm intent.
     self.assertEqual(None, affirm_type.list_of_str(None, "a", allow_none=True))
Example #16
0
 def test_list_of_float_none(self):
     # None in, None out, provided allow_none=True is passed.
     # NOTE(review): named for "float" yet it exercises list_of_str -- confirm intent.
     missing = None
     outcome = affirm_type.list_of_str(missing, "a", allow_none=True)
     self.assertEqual(missing, outcome)
Example #17
0
 def test_list_of_str_single_string(self):
     # Passing one bare string should yield a list containing just that string.
     word = "uno"
     wrapped = affirm_type.list_of_str(word, "a")
     self.assertEqual([word], wrapped)
Example #18
0
 def test_list_of_str(self):
     # A well-formed list of strings should be returned equal to what was given.
     given = ["uno", "dos", "tres"]
     returned = affirm_type.list_of_str(given, "a")
     self.assertEqual(given, returned)
Example #19
0
def take(self, n, offset=0, columns=None):
    """
    Get data subset.

    Take a subset of the currently active Frame.

    (See 'collect' operation to simply get all the data from the Frame)

    Parameters
    ----------

    :param n: (int) The number of rows to get from the frame (warning: do not overwhelm the python session
                    by taking too much)
    :param offset: (Optional[int]) The number of rows to skip before starting to copy.
    :param columns: (Optional[str or list[str]]) If not None, only the given columns' data will be provided.
                    By default, all columns are included.
    :return: (list[list[data]]) raw frame data; an empty list is returned when an empty list of columns is given

    Examples
    --------

    <hide>
        >>> schema = [('name',str), ('age', int), ('tenure', int), ('phone', str)]
        >>> rows = [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202'], ['Thurston', 65, 26, '555-4510'], ['Judy', 44, 14, '555-2183']]
        >>> frame = tc.frame.create(rows, schema)
        -etc-
    </hide>

    Consider the following frame:
        >>> frame.inspect()
        [#]  name      age  tenure  phone
        ====================================
        [0]  Fred       39      16  555-1234
        [1]  Susan      33       3  555-0202
        [2]  Thurston   65      26  555-4510
        [3]  Judy       44      14  555-2183

    Use take to get the first two rows and look at the schema and data in the result:

        >>> frame.take(2)
        [['Fred', 39, 16, '555-1234'], ['Susan', 33, 3, '555-0202']]

    Limit the columns in our result to just the name and age column:

        >>> frame.take(2, columns=['name', 'age'])
        [['Fred', 39], ['Susan', 33]]

    <hide>
        >>> tmp = frame._scala  # flip over to scala and try
        >>> frame.take(2, columns=['name', 'age'])
        [[u'Fred', 39], [u'Susan', 33]]

    </hide>

    """
    # n and offset are checked exactly once, up front, for both backends.
    require_type.non_negative_int(n, "n")
    require_type.non_negative_int(offset, "offset")
    if columns is not None:
        columns = affirm_type.list_of_str(columns, "columns")
        if not columns:
            return []

    if self._is_scala:
        scala_columns = self._tc.jutils.convert.to_scala_option_list_string(columns)
        scala_data = self._scala.take(n, offset, scala_columns)
        # Use a schema restricted to the requested columns when a selection was made.
        if columns:
            schema = get_schema_for_columns(self.schema, columns)
        else:
            schema = self.schema
        data = TakeCollectHelper.scala_rows_to_python(self._tc, scala_data, schema)
    elif offset:
        data = _take_offset(self, n, offset, columns)
    elif columns:
        select_columns = TakeCollectHelper.get_select_columns_function(self.schema, columns)
        data = self._python.rdd.map(select_columns).take(n)
    else:
        data = self._python.rdd.take(n)
    return data
Example #20
0
def train(frame,
          observation_columns,
          label_column,
          num_trees = 1,
          impurity = "variance",
          max_depth = 4,
          max_bins = 100,
          min_instances_per_node = 1,
          sub_sampling_rate = 1.0,
          feature_subset_category = "auto",
          seed = None,
          categorical_features_info = None):
    """
    Creates a Random Forest Regressor Model by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) frame of training data
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param label_column: (str) Column name containing the label for each observation
    :param num_trees: (int) Number of trees in the random forest. Default is 1
    :param impurity: (str) Criterion used for information gain calculation. Default value is "variance".
    :param max_depth: (int) Maximum depth of the tree. Default is 4
    :param max_bins: (int) Maximum number of bins used for splitting features. Default is 100
    :param min_instances_per_node: (int) Minimum number of records each child node must have after a split. Default is 1
    :param sub_sampling_rate: (double) Fraction between 0..1 of the training data used for learning each decision tree.
                              Default is 1.0
    :param feature_subset_category: (str) Subset of observation columns, i.e., features,
                                 to consider when looking for the best split.
                                 Supported values "auto","all","sqrt","log2","onethird".
                                 If "auto" is set, this is based on num_trees: if num_trees == 1, set to "all"
                                 ; if num_trees > 1, set to "sqrt".
    :param seed: (Optional(int)) Random seed for bootstrapping and choosing feature subsets. Default is a randomly
                 chosen seed (an integer built from two random bytes).
    :param categorical_features_info: (Optional(Dict(str:int))) Arity of categorical features. Entry (name-> k) indicates
                                      that feature 'name' is categorical with 'k' categories indexed from 0:{0,1,...,k-1}

    :return: (RandomForestRegressorModel) The trained random forest regressor model
    :raises ValueError: if sub_sampling_rate is outside the range 0..1

    Notes
    -----
    Random Forest is a supervised ensemble learning algorithm used to perform regression. A Random Forest
    Regressor model is initialized, trained on columns of a frame, and used to predict the value of each
    observation in the frame. This model runs the Spark ML implementation of Random Forest. During training,
    the decision trees are trained in parallel. During prediction, the average over-all tree's predicted
    value is the predicted value of the random forest.

    """
    import binascii  # local import: only needed to build the default seed

    require_type(Frame, frame, 'frame')
    column_list = affirm_type.list_of_str(observation_columns, "observation_columns")
    require_type.non_empty_str(label_column, "label_column")
    require_type.non_negative_int(num_trees, "num_trees")
    require_type.non_empty_str(impurity, "impurity")
    require_type.non_negative_int(max_depth, "max_depth")
    require_type.non_negative_int(max_bins, "max_bins")
    require_type.non_negative_int(min_instances_per_node, "min_instances_per_node")
    require_type(float, sub_sampling_rate, "sub_sampling_rate")
    if sub_sampling_rate > 1 or sub_sampling_rate < 0:
        raise ValueError("'sub_sampling_rate' parameter must have a value between 0 and 1")
    require_type.non_empty_str(feature_subset_category, "feature_subset_category")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    if seed is None:
        # Two random bytes -> integer in 0..65535.  hexlify is used instead of
        # str.encode('hex') because the latter only exists on Python 2.
        seed = int(binascii.hexlify(os.urandom(2)), 16)
    scala_model = _scala_obj.train(frame._scala,
                                   tc.jutils.convert.to_scala_list_string(column_list),
                                   label_column,
                                   num_trees,
                                   impurity,
                                   max_depth,
                                   max_bins,
                                   min_instances_per_node,
                                   sub_sampling_rate,
                                   feature_subset_category,
                                   seed,
                                   __get_categorical_features_info(tc, categorical_features_info))

    return RandomForestRegressorModel(tc, scala_model)
Example #21
0
 def test(self, frame, observation_columns=None, label_column=None):
     """Run the trained model's test on the given frame and wrap the result as ClassificationMetricsValue"""
     # Lookups happen in the same order as in a single nested call expression:
     # the scala method, the scala frame, then the two converted arguments.
     run_test = self._scala.test
     scala_frame = frame._scala
     to_option_list = self._tc.jutils.convert.to_scala_option_list_string
     scala_columns = to_option_list(
         affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True))
     scala_label = self._tc.jutils.convert.to_scala_option(label_column)
     raw_metrics = run_test(scala_frame, scala_columns, scala_label)
     return ClassificationMetricsValue(self._tc, raw_metrics)