Example #1
    def __init__(self, tc, source_or_vertices_frame, edges_frame=None):
        self._tc = tc
        self._scala = None

        # (note that the Scala code will validate appropriate frame schemas)

        if isinstance(source_or_vertices_frame, Frame):
            # Python Vertices and Edges Frames
            vertices_frame = source_or_vertices_frame
            require_type(Frame,
                         edges_frame,
                         'edges_frame',
                         "Providing a vertices frame requires also providing an edges frame")
            self._scala = self._create_scala_graph_from_scala_frames(self._tc,
                                                                     vertices_frame._scala,
                                                                     edges_frame._scala)
        else:
            source = source_or_vertices_frame
            require_type(None,
                         edges_frame,
                         'edges_frame',
                         'If edges_frame is provided, then a valid vertex frame must be provided as the first arg, instead of type %s' % type(source))
            if self._is_scala_graph(source):
                # Scala Graph
                self._scala = source
            elif isinstance(source, GraphFrame):
                # python GraphFrame
                scala_graphframe = source._jvm_graph
                self._scala = self._create_scala_graph_from_scala_graphframe(self._tc, scala_graphframe)
            elif self._is_scala_graphframe(source):
                # scala GraphFrame
                self._scala = self._create_scala_graph_from_scala_graphframe(self._tc, source)
            else:
                raise TypeError("Cannot create from source type %s" % type(source))
Example #2
    def recommend(self,
                  entity_id,
                  number_of_recommendations=1,
                  recommend_products=True):
        """
        Recommend products to users or vice versa.

        :param entity_id: (int) A user/product id
        :param number_of_recommendations: (int) Number of recommendations
        :param recommend_products: (bool) True - recommend products for the user; False - recommend users for the product
        :return: A list of recommendations (each one converted from a Scala map to a Python dict)
        """
        require_type(int, entity_id, "entity_id")
        require_type.non_negative_int(number_of_recommendations,
                                      "number_of_recommendations")
        require_type(bool, recommend_products, "recommend_products")

        # returns scala list of scala map
        scala_list_of_scala_map = self._scala.recommend(
            entity_id, number_of_recommendations, recommend_products)

        # First convert to python list of scala map
        python_list_of_scala_map = self._tc.jutils.convert.from_scala_seq(
            scala_list_of_scala_map)

        # Convert to Python list of python map
        python_list_of_python_map = []
        for scala_map in python_list_of_scala_map:
            python_list_of_python_map.append(
                self._tc.jutils.convert.scala_map_to_python(scala_map))

        return python_list_of_python_map
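
A hedged usage sketch of the method above, assuming a trained collaborative-filtering model object named `model` is in scope (the name and ids are illustrative):

# Illustrative only: `model` is assumed to expose the recommend() method shown above.
top_three = model.recommend(entity_id=1, number_of_recommendations=3, recommend_products=True)
for recommendation in top_three:
    print recommendation   # each item was converted from a Scala map by scala_map_to_python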
Example #4
 def test_implicit(self):
     try:
         require_type(int, implicit, "a")
     except ValueError as e:
         self.assertEqual("Missing value for arg 'a'.  This value is normally filled implicitly, however, if this method is called standalone, it must be set explicitly", str(e))
     else:
         self.fail("A ValueError should have been raised")
Example #5
    def __init__(self, tc, source_or_vertices_frame, edges_frame=None):
        self._tc = tc
        self._scala = None

        # (note that the Scala code will validate appropriate frame schemas)

        if isinstance(source_or_vertices_frame, Frame):
            # Python Vertices and Edges Frames
            vertices_frame = source_or_vertices_frame
            require_type(edges_frame,
                         'edges_frame',
                         Frame,
                         "Providing a vertices frame requires also providing an edges frame")
            self._scala = self.create_scala_graph_from_scala_frames(self._tc,
                                                                    vertices_frame._scala,
                                                                    edges_frame._scala)
        else:
            source = source_or_vertices_frame
            require_type(edges_frame,
                         'edges_frame',
                         None,
                         'If edges_frame is provided, then a valid vertex frame must be provided as the first arg, instead of type %s' % type(source))
            if self._is_scala_graph(source):
                # Scala Graph
                self._scala = source
            elif isinstance(source, GraphFrame):
                # python GraphFrame
                scala_graphframe = source._jvm_graph
                self._scala = self.create_scala_graph_from_scala_graphframe(self._tc, scala_graphframe)
            elif self._is_scala_graphframe(source):
                # scala GraphFrame
                self._scala = self.create_scala_graph_from_scala_graphframe(self._tc, source)
            else:
                raise TypeError("Cannot create from source type %s" % type(source))
Example #6
    def validate(tc, arg_name='tc'):
        """
        Raises a ValueError if the tc variable is not of type TkContext

        Since tc is so commonly used as an implicit variable, it's worth the special code here to save a lot of imports otherwise

        """
        require_type(tc, arg_name, TkContext)
Example #8
 def test_basic_negative(self):
     try:
         require_type(int, "12", "a")
     except TypeError as e:
         msg = str(e)
         expected = "Expected type <type 'int'>"
         self.assertTrue(expected in msg, "\nexpected=%s\nmessage =%s" % (expected, msg))
     else:
         self.fail("A TypeError should have been raised")
Example #11
def export_to_hive(self, hive_table_name, overwrite=False):
    """
    Write current frame to Hive table.

    Unless overwrite is set to True, the table must not already exist in Hive. Hive does not support case-sensitive
    table names or column names, so column names with uppercase letters will be converted to lower case by Hive.

    Parameters
    ----------

    :param hive_table_name: (str) hive table name
    :param overwrite: (Optional(bool)) Specify whether or not to overwrite the hive table if it already exists.  If
                      overwrite is set to False, and the table already exists, an exception is thrown.

    Example
    --------
        <skip>
        >>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
        >>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
        >>> my_frame = tc.frame.create(data, schema)
        <progress>

        </skip>

    hive_table_name: (str) Name of the Hive table.  A new table with the given name will be created if it does not already exist.

    <skip>
        >>> my_frame.export_to_hive("demo_test_hive")
        <progress>

    </skip>

    Verify exported frame in hive

    From bash shell

        $hive
        hive> show tables

    You should see demo_test_hive table.

    Run hive> select * from demo_test_hive; (to verify frame).

    To overwrite a table that already exists, set the overwrite parameter to 'True':

        <skip>
        >>> my_frame.export_to_hive("demo_test_hive", overwrite=True)
        </skip>

    """

    require_type.non_empty_str(hive_table_name, "hive_table_name")
    require_type(bool, overwrite, "overwrite")

    self._scala.exportToHive(hive_table_name, overwrite)
Example #12
    def test(self, frame, observation_columns=None, label_column=None):
        """
        Test the frame given the trained model

        Parameters
        ----------

        :param frame: (Frame) The frame to predict on
        :param observation_columns: Optional(List[str]) List of column(s) containing the observations
        :param label_column: Optional(String) Column name containing the label for each observation
        :return: (RegressionTestMetrics) RegressionTestMetrics object consisting of results from model test
        """
        require_type(Frame, frame, 'frame')
        column_list = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)
        obs = self._tc.jutils.convert.to_scala_option_list_string(column_list)
        label = self._tc.jutils.convert.to_scala_option(label_column)
        return RegressionTestMetrics(self._scala.test(frame._scala, obs, label))
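
A hedged usage sketch, assuming `model` is a trained regression model exposing the test() method above and `test_frame` shares the training schema (frame and column names are illustrative):

metrics = model.test(test_frame, observation_columns=["x1", "x2"], label_column="y")
print metrics   # RegressionTestMetrics summarizing the results of the test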
Example #13
def expand_kwarg_grids(dictionaries):
    """
    Expands a list of kwarg dictionaries, producing one dictionary per value for any GridValues entry
    :param dictionaries: Parameters for the model (list of dict)
    :return: Expanded list of parameter dictionaries for the model
    """
    arguments.require_type(list, dictionaries, "dictionaries")
    new_dictionaries = []
    for dictionary in dictionaries:
        arguments.require_type(dict, dictionary, "item in dictionaries")
        for k, v in dictionary.items():
            if isinstance(v, GridValues):
                for a in v.args:
                    d = dictionary.copy()
                    d[k] = a
                    new_dictionaries.append(d)
                break
    if new_dictionaries:
        return expand_kwarg_grids(new_dictionaries)
    return dictionaries
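
A small worked input/output pair makes the recursion concrete. GridValues here stands in for the object that sparktk's grid_values helper is assumed to return; the parameter names and values are illustrative:

# Input: one dictionary with a grid over max_depth
#   [{"max_depth": GridValues(2, 4), "num_trees": 10}]
#
# After expand_kwarg_grids, each grid value yields its own dictionary:
#   [{"max_depth": 2, "num_trees": 10},
#    {"max_depth": 4, "num_trees": 10}]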
Example #14
def train(frame,
          time_column,
          covariate_columns,
          censor_column,
          convergence_tolerance=1E-6,
          max_steps=100):
    """
    Creates a CoxProportionalHazardsModel by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) A frame to train the model on
    :param time_column: (str) Column name containing the time of occurrence of each observation.
    :param covariate_columns: (Seq[str]) List of column(s) containing the covariates.
    :param censor_column: (str) Column name containing censor value of each observation.
    :param convergence_tolerance: (float) Parameter for the convergence tolerance for iterative algorithms. Default is 1E-6
    :param max_steps: (int) Parameter for maximum number of steps. Default is 100
    :return: (CoxProportionalHazardsModel) A trained coxPh model
    """
    from sparktk.frame.frame import Frame
    require_type(Frame, frame, "frame cannot be None")
    require_type.non_empty_str(time_column, "time_column")
    require_type.non_empty_str(censor_column, "censor_column")
    require_type(float, convergence_tolerance, "convergence_tolerance should be float")
    require_type.non_negative_int(max_steps, "max_steps")
    affirm_type.list_of_str(covariate_columns, "covariate_columns")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_covariate_columns = tc.jutils.convert.to_scala_vector_string(covariate_columns)

    scala_model = _scala_obj.train(frame._scala,
                                   time_column,
                                   scala_covariate_columns,
                                   censor_column,
                                   convergence_tolerance,
                                   max_steps)
    return CoxProportionalHazardsModel(tc, scala_model)
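
A hedged call sketch for the function above, assuming a frame with a time column, a censor column, and numeric covariates (column names are illustrative):

model = train(frame,
              time_column="time",
              covariate_columns=["age", "weight"],
              censor_column="censor",
              convergence_tolerance=1e-6,
              max_steps=100)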
Example #15
    def predict(self, frame, observation_columns=None):
        """
        Predict the values for the data points.

        Predict the values for a test frame using the trained Random Forest Classifier model, and create a new frame
        with the existing columns and a new column of predicted values.

        Parameters
        ----------

        :param frame: (Frame) A frame whose labels are to be predicted. By default, predict is run on the same columns
                      over which the model is trained.
        :param observation_columns: (Optional(list[str])) Column(s) containing the observations whose labels are to be predicted.
                        By default, we predict the labels over columns the Random Forest model was trained on.
        :return: (Frame) A new frame consisting of the existing columns of the frame and a new column with predicted
                 value for each observation.
        """

        require_type(Frame, frame, 'frame')
        column_list = affirm_type.list_of_str(observation_columns, "observation_columns", allow_none=True)
        columns_option = self._tc.jutils.convert.to_scala_option_list_string(column_list)
        return Frame(self._tc, self._scala.predict(frame._scala, columns_option))
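
A hedged usage sketch, assuming `model` is a trained Random Forest Classifier and the frame contains the observation columns used in training (names are illustrative):

predicted_frame = model.predict(frame, observation_columns=["x1", "x2"])
predicted_frame.inspect()   # existing columns plus a new predicted-value column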
Example #16
    def validate(tc, arg_name='tc'):
        """
        Validates that the given tc object is indeed a TkContext.  Raises a ValueError if it is not.

        Examples
        --------

            <hide>
            >>> from sparktk import TkContext

            </hide>

            >>> TkContext.validate(tc)

            >>> try:
            ...     TkContext(25)
            ... except TypeError:
            ...     print "Not a TkContext!"
            Not a TkContext!

        """
        # Since tc is so commonly used as an implicit variable, it's worth special code here to save a lot of imports
        require_type(TkContext, tc, arg_name)
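
The validate helper is typically paired with a TkContext.implicit default, as import_csv_raw does later in this listing. A hedged sketch of that pattern (the function name and body are illustrative):

from sparktk import TkContext

def my_operation(path, tc=TkContext.implicit):
    # Raises if the caller never bound a real TkContext to the implicit arg.
    TkContext.validate(tc)
    return path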
Example #18
    def test(self, frame, observation_columns=None, label_column=None):
        """
        Predict test frame labels and return metrics.

        Parameters
        ----------

        :param frame: (Frame) The frame whose labels are to be predicted
        :param observation_columns: (Optional(list[str])) Column(s) containing the observations whose labels are to be predicted.
                                    By default, the same observation column names from training are used
        :param label_column: (str) Name of the column containing the label for each observation.
                                    By default, the same label column name from training is used
        :return: (ClassificationMetricsValue) Binary classification metrics comprised of:
                accuracy (double) - The proportion of predictions that are correctly identified
                confusion_matrix (dictionary) - A table used to describe the performance of a classification model
                f_measure (double) - The harmonic mean of precision and recall
                precision (double) - The proportion of predicted positive instances that are correctly identified
                recall (double) - The proportion of positive instances that are correctly identified.
        """
        require_type(Frame, frame, 'frame')
        column_list = affirm_type.list_of_str(observation_columns,
                                              "observation_columns",
                                              allow_none=True)

        return ClassificationMetricsValue(
            self._tc,
            self._scala.test(
                frame._scala,
                self._tc.jutils.convert.to_scala_option_list_string(
                    column_list),
                self._tc.jutils.convert.to_scala_option(label_column)))
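
A hedged usage sketch, assuming `model` is a trained classifier exposing the test() method above; the metric names come from the docstring, and attribute-style access is assumed:

metrics = model.test(test_frame, observation_columns=["x1", "x2"], label_column="label")
print metrics.accuracy            # assumed attribute, per the docstring's metric list
print metrics.confusion_matrix
print metrics.f_measure, metrics.precision, metrics.recall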
Example #19
def export_to_jdbc(self, connection_url, table_name, overwrite=False):
    """
    Write current frame to JDBC table

    Parameters
    ----------

    :param connection_url: (str) JDBC connection url to database server
    :param table_name: (str) JDBC table name
    :param overwrite: (Optional(bool)) Specify whether or not to overwrite the existing table, if one already exists with the
                      the same name.  If overwrite is set to False and a table with the same name already exists, an
                      exception is thrown.

    Example
    -------

    <skip>

        >>> from sparktk import TkContext
        >>> tc = TkContext(sc)
        >>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
        >>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
        >>> my_frame = tc.frame.create(data, schema)
        <progress>
    </skip>

    connection_url: (str) "jdbc:{database_type}://{host}/{database_name}"

    Sample connection string for postgres
    ex: jdbc:postgresql://localhost/postgres [standard connection string to connect to the default 'postgres' database]

    table_name: (str) Table name.  A new table with the given name will be created if it does not already exist.

    <skip>
        >>> my_frame.export_to_jdbc("jdbc:postgresql://localhost/postgres", "demo_test")
        <progress>
    </skip>

    Verify exported frame in postgres

        From bash shell

        $ sudo -su postgres psql
        postgres=# \d

    You should see the demo_test table.

    Run postgres=# select * from demo_test; (to verify the frame).

    Notes
    -----

        java.sql.SQLException: No suitable driver found for <jdbcUrl>

    If this error is encountered while running your application, then your JDBC library cannot be found by the node
    running the application. If you're running in Local mode, make sure that you have used the --driver-class-path
    parameter. If a Spark cluster is involved, make sure that each cluster member has a copy of the library, and that
    each node of the cluster has been restarted since you modified the spark-defaults.conf file.  See this
    [site](https://sparkour.urizone.net/recipes/using-jdbc/).

    Sparktk does not come with any JDBC drivers.  A driver compatible with the JDBC data sink must be supplied when
    creating the TkContext instance:

        <skip>
        >>> tc = sparktk.TkContext(pyspark_submit_args='--jars myJDBCDriver.jar')
        </skip>
    """

    require_type.non_empty_str(connection_url, "connection_url")
    require_type.non_empty_str(table_name, "table_name")
    require_type(bool, overwrite, "overwrite")

    self._scala.exportToJdbc(connection_url, table_name, overwrite)
Example #20
def import_csv_raw(path, delimiter=",", header=False, tc=TkContext.implicit):
    """
    Creates a frame by importing the data as strings from the specified csv file.  If the csv file has a header row,
    those values will be used as column names.  Otherwise, columns will be named generically, like 'C0', 'C1', 'C2', etc.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (str) A string which indicates the separation of data fields.  This is usually a single character
                      and could be a non-visible character, such as a tab. The default delimiter is a comma (,).
    :param header: (bool) Boolean value indicating if the first line of the file will be used to name columns, and not
                   be included in the data.  The default value is false.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Import raw data from a csv file by specifying the path to the file, delimiter, and header option.  All data will
    be brought in the frame as strings, and columns will be named according to the header row, if there was one.

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv_raw(file_path, delimiter="|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]  1     Portland     609456           583776           4.40%   Multnomah
        [1]  2     Salem        160614           154637           3.87%   Marion
        [2]  3     Eugene       159190           156185           1.92%   Lane
        [3]  4     Gresham      109397           105594           3.60%   Multnomah
        [4]  5     Hillsboro    97368            91611            6.28%   Washington
        [5]  6     Beaverton    93542            89803            4.16%   Washington
        [6]  15    Grants Pass  35076            34533            1.57%   Josephine
        [7]  16    Oregon City  34622            31859            8.67%   Clackamas
        [8]  17    McMinnville  33131            32187            2.93%   Yamhill
        [9]  18    Redmond      27427            26215            4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'str'>), ('city', <type 'str'>), ('population_2013', <type 'str'>), ('population_2010', <type 'str'>), ('change', <type 'str'>), ('county', <type 'str'>)]


    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=str(header).lower(),
            inferschema="false").load(path, schema=None)

    df_schema = []

    for column in df.schema.fields:
        try:
            datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(
                type(column.dataType))
        except ValueError:
            raise TypeError(
                "Unsupported data type ({0}) for column {1}.".format(
                    str(column.dataType), column.name))
        df_schema.append((column.name, datatype))

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(
        df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
Example #21
def export_to_tensorflow(self, path, overwrite=False):
    """
    Export frame to TensorFlow Records file on given path

    TensorFlow records are the standard data format for TensorFlow. The recommended format for TensorFlow is a TFRecords file
    containing tf.train.Example protocol buffers, which contain Features as a field. For more information, see:
    https://www.tensorflow.org/how_tos/reading_data

    During export, the API parses Spark SQL DataTypes to TensorFlow compatible DataTypes as below:

    * IntegerType or LongType =>  Int64List
    * FloatType or DoubleType => FloatList
    * ArrayType(Double) [Vector] => FloatList
    * Any other DataType (Ex: String) => BytesList

    Parameters
    ----------

    :param path: (str) HDFS/Local path to export current frame as TensorFlow records
    :param overwrite: (Optional[bool]) Specify whether or not to overwrite the existing file, if a file already exists
                      at the specified path.  If overwrite is set to False, and a file already exists, an exception
                      is thrown.


    Examples
    --------

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.sort("rank")

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]     7  Bend                   81236            76639  6.00%   Deschutes
        [7]     8  Medford                77677            74907  3.70%   Jackson
        [8]     9  Springfield            60177            59403  1.30%   Lane
        [9]    10  Corvallis              55298            54462  1.54%   Benton

        >>> destPath = "../tests/sandbox/output24.tfr"

        >>> import os
        >>> if os.path.exists(destPath): os.remove(destPath)

        >>> frame.export_to_tensorflow(destPath)

    Check for output24.tfr in specified destination path either on Local or HDFS file system.

    An existing file can be overwritten by setting the overwrite parameter to True when using the export_to_tensorflow
    operation.  To demonstrate this, we will modify the frame by removing some columns, and then export the frame to
    the same path that was previously used.  Note that if the overwrite parameter is not set to True, an exception
    would be thrown, since there is already a file at the specified path.

        >>> frame.drop_columns(["population_2010", "change"])
        >>> frame.export_to_tensorflow(destPath, overwrite=True)

    """

    require_type.non_empty_str(path, "path")
    require_type(bool, overwrite, "overwrite")

    self._scala.exportToTensorflow(path, overwrite)
Example #22
def join_cross(self, right):
    """
    The join_cross operation performs a cross join operation on two frames, and returns a frame that contains the
    cartesian product of the two frames. Each row from the current frame is combined with each row from the right frame.

    Parameters
    ----------

    :param right: (Frame) The right frame in the cross join operation.
    :returns: (Frame) A new frame with the results of the cross join.

    Notes
    -----
    The frame returned will contain all columns from the current frame and the right frame.  If a column name in the
    right frame already exists in the current frame, the column from the right frame will have a "_R" suffix.

    The order of columns after this method is called is not guaranteed.  It is recommended that you rename the columns
    to meaningful terms prior to using the join_cross method.

    Examples
    --------

    Start by creating two test frames to use with the cross join operation:

        >>> frame = tc.frame.create([[1],[2],[3]], [("id", int)])
        >>> frame.inspect()
        [#]  id
        =======
        [0]   1
        [1]   2
        [2]   3

        >>> right = tc.frame.create([["a"],["b"],["c"]], [("char", str)])
        >>> right.inspect()
        [#]  char
        =========
        [0]  a
        [1]  b
        [2]  c

    Perform a cross join on the frame with the right frame:

        >>> result = frame.join_cross(right)

    <hide>
        >>> result.sort(["id","char"])
    </hide>

        >>> result.inspect()
        [#]  id  char
        =============
        [0]   1  a
        [1]   1  b
        [2]   1  c
        [3]   2  a
        [4]   2  b
        [5]   2  c
        [6]   3  a
        [7]   3  b
        [8]   3  c

    Note that if the right frame has a column with the same column name as the current frame, the resulting frame
    will include a "_R" suffix in the column name from the right frame.  For example, if we cross join the frame with
    itself, it will result in a frame that has two columns: 'id' and 'id_R'.

        >>> self_cross_join = frame.join_cross(frame)

    <hide>
        >>> self_cross_join.sort(["id","id_R"])
    </hide>

        >>> self_cross_join.inspect()
        [#]  id  id_R
        =============
        [0]   1     1
        [1]   1     2
        [2]   1     3
        [3]   2     1
        [4]   2     2
        [5]   2     3
        [6]   3     1
        [7]   3     2
        [8]   3     3


    """

    from sparktk.frame.frame import Frame

    require_type(Frame, right, "right")

    return Frame(self._tc, self._scala.joinCross(right._scala))
Example #23
def export_to_json(self, path, count=0, offset=0, overwrite=False):
    """
    Write current frame to HDFS in Json format.

    Parameters
    ----------

    :param path: (str) The HDFS folder path where the files will be created.
    :param count: (Optional[int]) The number of records you want. Default (0), or a non-positive value, is the
                   whole frame.
    :param offset: (Optional[int]) The number of rows to skip before exporting to the file. Default is zero (0).
    :param overwrite: (Optional[bool]) Specify whether or not to overwrite the existing file, if one already
                      exists at the specified path.  If overwrite is set to False and the file already exists,
                      an exception is thrown.

    Example
    -------

    Start out by creating a frame and then exporting it to a json file.

        <hide>
        >>> from setup import get_sandbox_path
        >>> file_path = get_sandbox_path("export_example.json")
        </hide>
        >>> frame = tc.frame.create([[1, 2, 3], [4, 5, 6]])
        >>> frame.inspect()
        [#]  C0  C1  C2
        ===============
        [0]   1   2   3
        [1]   4   5   6

        >>> frame.export_to_json(file_path)

    Import the data from the json file that we just created, and then inspect the data in the frame.

        >>> import json
        >>> # function used for parsing json rows
        >>> def parse_json(row):
        ...     record = json.loads(row.records)
        ...     columns = record.values()
        ...     columns.reverse()
        ...     return columns

        >>> frame2 = tc.frame.import_json(file_path)
        <hide>
        >>> frame2.sort("records")
        </hide>
        >>> frame2.inspect()
        [#]  records
        =================================
        [0]  {"C0":"1","C1":"2","C2":"3"}
        [1]  {"C0":"4","C1":"5","C2":"6"}

    Map columns and parse json into columns:

        >>> frame2 = frame2.map_columns(parse_json, [('C0', int), ('C1', int), ('C2', int)])
        <hide>
        >>> frame2.sort("C0")
        </hide>
        >>> frame2.inspect()
        [#]  C0  C1  C2
        ===============
        [0]   1   2   3
        [1]   4   5   6

    We can also modify the data in the original frame, and then export to the json file again, using the 'overwrite'
    parameter to specify that we want to overwrite the existing file with the new data.

        >>> frame.add_columns(lambda row: row.C2 * 2, ("C3", int))
        <hide>
        >>> frame.sort("C0")
        </hide>
        >>> frame.inspect()
        [#]  C0  C1  C2  C3
        ===================
        [0]   1   2   3   6
        [1]   4   5   6  12

        >>> frame.export_to_json(file_path, overwrite=True)

    Again, import the data from the json file, and inspect the data in the frame.

        >>> frame3 = tc.frame.import_json(file_path)
        <hide>
        >>> frame3.sort("records")
        </hide>
        >>> frame3.inspect()
        [#]  records
        ===========================================
        [0]  {"C0":"1","C1":"2","C2":"3","C3":"6"}
        [1]  {"C0":"4","C1":"5","C2":"6","C3":"12"}

        >>> frame3 = frame3.map_columns(parse_json, [('C0', int), ('C1', int), ('C2', int), ('C3', int)])
        <hide>
        >>> frame3.sort("C0")
        </hide>
        >>> frame3.inspect()
        [#]  C0  C1  C2  C3
        ===================
        [0]  1   2   3    6
        [1]  4   5   6   12

    """

    require_type.non_empty_str(path, "path")
    require_type(int, count, "count")
    require_type(int, offset, "offset")
    require_type(bool, overwrite, "overwrite")

    self._scala.exportToJson(path, count, offset, overwrite)
Example #24
def export_to_hbase(self,
                    table_name,
                    key_column_name=None,
                    family_name="familyColumn",
                    overwrite=False):
    """
    Write current frame to HBase table.

    Table must exist in HBase.

    Parameters
    ----------

    :param table_name: (str) The name of the HBase table that will contain the exported frame
    :param key_column_name: (Optional[str]) The name of the column to be used as row key in hbase table
    :param family_name: (Optional[str]) The family name of the HBase table that will contain the exported frame
    :param overwrite: (Optional[bool]) Specify whether or not to modify an existing HBase table, if one already
                      exists with the same name.  When the table is modified, columns with the same name will be
                      overwritten, and columns with new names will be added to the table.  If overwrite is False
                      and a table already exists with the same name, an exception is thrown.

    Example
    -------

        >>> data = [[1, 0.2, -2, 5], [2, 0.4, -1, 6], [3, 0.6, 0, 7], [4, 0.8, 1, 8]]
        >>> schema = [('a', int), ('b', float),('c', int) ,('d', int)]
        >>> my_frame = tc.frame.create(data, schema)
    <skip>
        >>> my_frame.export_to_hbase("test_demo_hbase", family_name="test_family")
        <progress>
    </skip>

    Verify exported frame in hbase

    From bash shell

        $hbase shell

        hbase(main):001:0> list

    You should see test_demo_hbase table.

    Run hbase(main):001:0> scan 'test_demo_hbase' (to verify frame).

    Output:

        ROW     COLUMN+CELL
         0      column=test_family:a, timestamp=1464219662295, value=1
         0      column=test_family:b, timestamp=1464219662295, value=0.2
         0      column=test_family:c, timestamp=1464219662295, value=-2
         0      column=test_family:d, timestamp=1464219662295, value=5
         1      column=test_family:a, timestamp=1464219662295, value=2
         1      column=test_family:b, timestamp=1464219662295, value=0.4
         1      column=test_family:c, timestamp=1464219662295, value=-1
         1      column=test_family:d, timestamp=1464219662295, value=6
         2      column=test_family:a, timestamp=1464219662295, value=3
         2      column=test_family:b, timestamp=1464219662295, value=0.6
         2      column=test_family:c, timestamp=1464219662295, value=0
         2      column=test_family:d, timestamp=1464219662295, value=7
         3      column=test_family:a, timestamp=1464219662295, value=4
         3      column=test_family:b, timestamp=1464219662295, value=0.8
         3      column=test_family:c, timestamp=1464219662295, value=1
         3      column=test_family:d, timestamp=1464219662295, value=8
        4 row(s) in 0.1560 seconds

    An existing HBase table can also be modified using the 'overwrite' parameter.  To demonstrate this, we will modify
    the frame to add a column 'e', then export the data to HBase with the same table name, and set the overwrite
    parameter to True.

        >>> my_frame.add_columns(lambda row: row.d * 10, ("e",int))
        >>> my_frame.inspect()
        [#]  a  b    c   d  e
        ======================
        [0]  1  0.2  -2  5  50
        [1]  2  0.4  -1  6  60
        [2]  3  0.6   0  7  70
        [3]  4  0.8   1  8  80

    <skip>
        >>> my_frame.export_to_hbase("test_demo_hbase", family_name="test_family", overwrite=True)
    </skip>

    Run hbase(main):001:0> scan 'test_demo_hbase' (to verify updated data).

    Output:

        ROW     COLUMN+CELL
         0      column=test_family:a, timestamp=1486680202927, value=1
         0      column=test_family:b, timestamp=1486680202927, value=0.2
         0      column=test_family:c, timestamp=1486680202927, value=-2
         0      column=test_family:d, timestamp=1486680202927, value=5
         0      column=test_family:e, timestamp=1486680202927, value=50
         1      column=test_family:a, timestamp=1486680202928, value=2
         1      column=test_family:b, timestamp=1486680202928, value=0.4
         1      column=test_family:c, timestamp=1486680202928, value=-1
         1      column=test_family:d, timestamp=1486680202928, value=6
         1      column=test_family:e, timestamp=1486680202928, value=60
         2      column=test_family:a, timestamp=1486680202927, value=3
         2      column=test_family:b, timestamp=1486680202927, value=0.6
         2      column=test_family:c, timestamp=1486680202927, value=0
         2      column=test_family:d, timestamp=1486680202927, value=7
         2      column=test_family:e, timestamp=1486680202927, value=70
         3      column=test_family:a, timestamp=1486680202928, value=4
         3      column=test_family:b, timestamp=1486680202928, value=0.8
         3      column=test_family:c, timestamp=1486680202928, value=1
         3      column=test_family:d, timestamp=1486680202928, value=8
         3      column=test_family:e, timestamp=1486680202928, value=80
        4 row(s) in 0.0440 seconds

    """
    if not isinstance(table_name, basestring):
        raise ValueError(
            "Unsupported 'table_name' parameter type.  Expected string, but found %s."
            % type(table_name))

    if not isinstance(family_name, basestring):
        raise ValueError(
            "Unsupported 'family_name' parameter type.  Expected string, but found %s."
            % type(family_name))

    require_type(bool, overwrite, "overwrite")

    self._scala.exportToHbase(
        table_name, self._tc.jutils.convert.to_scala_option(key_column_name),
        family_name, overwrite)
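
For comparison, the two string checks above could also be written with the require_type helpers used elsewhere in these examples. A sketch only; the original code keeps its manual isinstance checks (and raises ValueError rather than the helper's error type):

require_type.non_empty_str(table_name, "table_name")
require_type.non_empty_str(family_name, "family_name")
require_type(bool, overwrite, "overwrite")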
Example #25
 def test_basic(self):
     require_type(int, 1, "a")
     require_type(str, "1", "a")
     require_type(list, [1, 2, 3], "a")
Example #27
def train(frame,
          observation_columns,
          label_column,
          num_trees = 1,
          impurity = "variance",
          max_depth = 4,
          max_bins = 100,
          min_instances_per_node = 1,
          sub_sampling_rate = 1.0,
          feature_subset_category = "auto",
          seed = None,
          categorical_features_info = None):
    """
    Creates a Random Forest Regressor Model by training on the given frame

    Parameters
    ----------

    :param frame: (Frame) A frame of training data
    :param observation_columns: (list(str)) Column(s) containing the observations
    :param label_column: (str) Column name containing the label for each observation
    :param num_trees: (int) Number of trees in the random forest. Default is 1
    :param impurity: (str) Criterion used for information gain calculation. Default value is "variance".
    :param max_depth: (int) Maximum depth of the tree. Default is 4
    :param max_bins: (int) Maximum number of bins used for splitting features.
    :param min_instances_per_node: (int) Minimum number of records each child node must have after a split.
    :param sub_sampling_rate: (double) Fraction between 0..1 of the training data used for learning each decision tree.
    :param feature_subset_category: (str) Subset of observation columns, i.e., features,
                                 to consider when looking for the best split.
                                 Supported values "auto","all","sqrt","log2","onethird".
                                 If "auto" is set, this is based on num_trees: if num_trees == 1, set to "all"
                                 ; if num_trees > 1, set to "sqrt".
    :param seed: (Optional(int)) Random seed for bootstrapping and choosing feature subsets. Default is a randomly chosen seed.
    :param categorical_features_info: (Optional(Dict(str:int))) Arity of categorical features. Entry (name-> k) indicates
                                      that feature 'name' is categorical with 'k' categories indexed from 0:{0,1,...,k-1}

    :return: (RandomForestRegressorModel) The trained random forest regressor model

    Notes
    -----
    Random Forest is a supervised ensemble learning algorithm used to perform regression. A Random Forest
    Regressor model is initialized, trained on columns of a frame, and used to predict the value of each
    observation in the frame. This model runs the Spark ML implementation of Random Forest. During training,
    the decision trees are trained in parallel. During prediction, the average over-all tree's predicted
    value is the predicted value of the random forest.

    """
    require_type(Frame, frame, 'frame')
    column_list = affirm_type.list_of_str(observation_columns, "observation_columns")
    require_type.non_empty_str(label_column, "label_column")
    require_type.non_negative_int(num_trees, "num_trees")
    require_type.non_empty_str(impurity, "impurity")
    require_type.non_negative_int(max_depth, "max_depth")
    require_type.non_negative_int(max_bins, "max_bins")
    require_type.non_negative_int(min_instances_per_node, "min_instances_per_node")
    require_type(float, sub_sampling_rate, "sub_sampling_rate")
    if sub_sampling_rate > 1 or sub_sampling_rate < 0:
        raise ValueError("'sub_sampling_rate' parameter must have a value between 0 and 1")
    require_type.non_empty_str(feature_subset_category, "feature_subset_category")

    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    seed = int(os.urandom(2).encode('hex'), 16) if seed is None else seed
    scala_model = _scala_obj.train(frame._scala,
                                   tc.jutils.convert.to_scala_list_string(column_list),
                                   label_column,
                                   num_trees,
                                   impurity,
                                   max_depth,
                                   max_bins,
                                   min_instances_per_node,
                                   sub_sampling_rate,
                                   feature_subset_category,
                                   seed,
                                   __get_categorical_features_info(tc, categorical_features_info))

    return RandomForestRegressorModel(tc, scala_model)
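
A hedged training call for the function above, assuming a frame with numeric feature columns and a numeric label (column names and values are illustrative):

model = train(frame,
              observation_columns=["x1", "x2"],
              label_column="y",
              num_trees=5,
              max_depth=4,
              sub_sampling_rate=1.0,
              seed=42)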
Example #28
def cross_validate(frame,
                   train_descriptors,
                   num_folds=3,
                   verbose=False,
                   tc=TkContext.implicit):
    """
    Computes k-fold cross validation on classification and regression models with the given frame and parameter values
    :param frame: The frame to perform cross-validation on
    :param train_descriptors: List of (model, parameter dictionary) tuples, where each parameter value is either a
            single value or a set of values given via grid_values
    :param num_folds: Number of folds to run the cross-validator on
    :param verbose: Flag indicating if the results of each fold are to be viewed. Default is set to False
    :param tc: spark-tk context (provided implicitly)
    :return: Summary of model's performance consisting of metrics of each combination of train_descriptor values per fold
            and averages across all folds

    Example
    -------

        >>> frame = tc.frame.create([[1,0],[2,0],[3,0],[4,0],[5,0],[6,1],[7,1],[8,1],[9,1],[10,1]],[("data", float),("label",int)])

        >>> frame.inspect()
        [#]  data  label
        ================
        [0]     1      0
        [1]     2      0
        [2]     3      0
        [3]     4      0
        [4]     5      0
        [5]     6      1
        [6]     7      1
        [7]     8      1
        [8]     9      1
        [9]    10      1

        >>> from sparktk.models import grid_values

        >>> result = tc.models.cross_validate(frame,
        ...                                   [(tc.models.classification.svm,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01}),
        ...                                    (tc.models.classification.logistic_regression,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01})],
        ...                                   num_folds=2,
        ...                                   verbose=True)

        <skip>
        >>> result
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              0              2
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        ******Averages: ******
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)

        >>> result.averages
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        </skip>
    """
    TkContext.validate(tc)
    arguments.require_type(Frame, frame, "frame")

    all_grid_search_results = []
    grid_search_results_accumulator = None
    for train_frame, test_frame in split_data(frame, num_folds, tc):
        scores = grid_search(train_frame, test_frame, train_descriptors, tc=tc)
        if grid_search_results_accumulator is None:
            grid_search_results_accumulator = scores
        else:
            grid_search_results_accumulator._accumulate_matching_points(
                scores.grid_points)
        all_grid_search_results.append(scores)

    # make the accumulator hold averages
    grid_search_results_accumulator._divide_metrics(num_folds)
    return CrossValidationResults(all_grid_search_results,
                                  grid_search_results_accumulator.copy(),
                                  verbose)
Example #29
0
def cross_validate(frame, train_descriptors, num_folds=3, verbose=False, tc=TkContext.implicit):
    """
    Computes k-fold cross-validation on the given frame for the given models and parameter values
    :param frame: The frame to perform cross-validation on
    :param train_descriptors: List of (model, parameters) tuples, where parameters is a dictionary mapping parameter
            names to either single values or grid_values of candidate values
    :param num_folds: Number of folds to run the cross-validator on
    :param verbose: Flag indicating whether the results of each fold should be displayed. Default is False
    :param tc: spark-tk context (provided implicitly)
    :return: Summary of the models' performance, consisting of metrics for each combination of train_descriptor values
            per fold, plus the averages across all folds

    Example
    -------

        >>> frame = tc.frame.create([[1,0],[2,0],[3,0],[4,0],[5,0],[6,1],[7,1],[8,1],[9,1],[10,1]],[("data", float),("label",int)])

        >>> frame.inspect()
        [#]  data  label
        ================
        [0]     1      0
        [1]     2      0
        [2]     3      0
        [3]     4      0
        [4]     5      0
        [5]     6      1
        [6]     7      1
        [7]     8      1
        [8]     9      1
        [9]    10      1

        >>> from sparktk.models import grid_values

        >>> result = tc.models.cross_validate(frame,
        ...                                   [(tc.models.classification.svm,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01}),
        ...                                    (tc.models.classification.logistic_regression,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01})],
        ...                                   num_folds=2,
        ...                                   verbose=True)

        <skip>
        >>> result
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              0              2
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        ******Averages: ******
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)

        >>> result.averages
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        </skip>
    """
    TkContext.validate(tc)
    arguments.require_type(Frame, frame, "frame")

    all_grid_search_results = []
    grid_search_results_accumulator = None
    for train_frame, test_frame in split_data(frame, num_folds, tc):
        scores = grid_search(train_frame, test_frame, train_descriptors, tc)
        if grid_search_results_accumulator is None:
            grid_search_results_accumulator = scores
        else:
            grid_search_results_accumulator._accumulate_matching_points(scores.grid_points)
        all_grid_search_results.append(scores)

    # make the accumulator hold averages
    grid_search_results_accumulator._divide_metrics(num_folds)
    return CrossValidateClassificationResults(all_grid_search_results,
                                              grid_search_results_accumulator.copy(),
                                              verbose)
Example #30
0
def grid_search(train_frame, test_frame, train_descriptors, tc=TkContext.implicit):
    """
    Implements grid search by training the specified models on every parameter combination from the descriptors and
    testing each trained model on the test frame
    :param train_frame: The frame to train the models on
    :param test_frame: The frame to test the models on
    :param train_descriptors: List of (model, parameters) tuples, where parameters is a dictionary mapping parameter
            names to either single values or grid_values of candidate values
    :param tc: spark-tk context passed implicitly
    :return: Summary of metrics for every combination in the grid, including the best-performing parameter combination

    Example
    -------

        >>> frame = tc.frame.create([[1,0],[2,0],[3,0],[4,0],[5,0],[6,1],[7,1],[8,1],[9,1],[10,1]],[("data", float),("label",int)])

        >>> frame.inspect()
        [#]  data  label
        ================
        [0]     1      0
        [1]     2      0
        [2]     3      0
        [3]     4      0
        [4]     5      0
        [5]     6      1
        [6]     7      1
        [7]     8      1
        [8]     9      1
        [9]    10      1

        >>> from sparktk.models import grid_values

        >>> grid_result = tc.models.grid_search(frame, frame,
        ...                                    [(tc.models.classification.svm,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01}),
        ...                                     (tc.models.classification.logistic_regression,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01})])

        >>> grid_result
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)

        >>> grid_result.find_best()
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)

        >>> grid_result.grid_points
        [GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0),
         GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0),
         GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0),
         GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)]

        >>> grid_result.grid_points[1]
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)

    """

    # validate input
    TkContext.validate(tc)
    descriptors = affirm_type.list_of_anything(train_descriptors, "train_descriptors")
    for i in xrange(len(descriptors)):
        item = descriptors[i]
        if not isinstance(item, TrainDescriptor):
            require_type(tuple, item, "item", "grid_search needs a list of items which are either of type TrainDescriptor or tuples of (model, train_kwargs)")
            if len(item) != 2:
                raise value_error("list requires tuples of len 2", item, "item in train_descriptors")
            if not hasattr(item[0], 'train'):
                raise value_error("first item in tuple needs to be a object with a 'train' function", item, "item in train_descriptors")
            descriptors[i] = TrainDescriptor(item[0], item[1])

    arguments.require_type(Frame, train_frame, "train_frame")
    arguments.require_type(Frame, test_frame, "test_frame")

    grid_points = []
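    # train a model for every expanded parameter combination and score it against the test frame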
    for descriptor in descriptors:
        train_method = getattr(descriptor.model_type, "train")
        list_of_kwargs = expand_kwarg_grids([descriptor.kwargs])
        for kwargs in list_of_kwargs:
            train_kwargs = dict(kwargs)
            train_kwargs['frame'] = train_frame
            validate_call(train_method, train_kwargs, ignore_self=True)
            model = descriptor.model_type.train(**train_kwargs)
            test_kwargs = dict(kwargs)
            test_kwargs['frame'] = test_frame
            test_kwargs = extract_call(model.test, test_kwargs, ignore_self=True)
            metrics = model.test(**test_kwargs)
            grid_points.append(GridPoint(descriptor=TrainDescriptor(descriptor.model_type, train_kwargs), metrics=metrics))
    return GridSearchResults(grid_points)
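The validation loop above accepts either plain (model, kwargs) tuples or pre-built TrainDescriptor objects in train_descriptors. The sketch below shows the two equivalent call styles; it assumes a spark-tk context tc, a train_frame, and a test_frame are already in scope, that TrainDescriptor is importable alongside grid_search (its module path is not shown here), and that the svm parameters are purely illustrative.

# Sketch only: assumes tc, train_frame, test_frame, and TrainDescriptor are in scope.
from sparktk.models import grid_values

# Style 1: a (model, kwargs) tuple; grid_search wraps it in a TrainDescriptor internally.
svm_as_tuple = (tc.models.classification.svm,
                {"observation_columns": "data",
                 "label_column": "label",
                 "num_iterations": grid_values(2, 10),
                 "step_size": 0.01})

# Style 2: an explicit TrainDescriptor; the validation loop passes it through unchanged.
svm_as_descriptor = TrainDescriptor(tc.models.classification.svm,
                                    {"observation_columns": "data",
                                     "label_column": "label",
                                     "num_iterations": grid_values(2, 10),
                                     "step_size": 0.01})

results = grid_search(train_frame, test_frame, [svm_as_tuple], tc=tc)
# results = grid_search(train_frame, test_frame, [svm_as_descriptor], tc=tc)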
Example #31
0
    def test_basic(self):
        require_type(int, 1, "a")
        require_type(str, "1", "a")
        require_type(list, [1, 2, 3], "a")
Example #32
0
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns
                   (unless a schema is provided), and not be included in the data.  The default value is false.
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.  The column names specified will override column names that are found in the header row.
    * None, where the schema is automatically inferred based on the data.  Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Load a frame from a csv file by specifying the path to the file, delimiter

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the
    data types are inferred based on the data).  Here, we will specify the column names, which will override the
    header from the csv file.

        >>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"]
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names)
        -etc-

        >>> frame.schema
        [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)]

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode),("b", unicode),("c",unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False)
        -etc-

        >>> print unicode(frame.get_inspect()).encode('utf-8')  # because this file is UTF-8 and this docstring is str
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []   # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." % type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            infer_schema = False   # if a custom schema is provided, don't waste time inferring the schema during load
            sparktk_schema.validate(schema)

    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        fields = []
        for column in schema:
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            column_name = column_names[i] if (i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the"
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The Spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
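The full-schema option (a list of (name, type) tuples) is the one that exercises the datetime handling above: a column declared with the sparktk datetime type is converted by cast_datetime into milliseconds since the epoch. A minimal sketch follows; the file path, column names, and the dtypes import path are assumptions for illustration only.

# Sketch only: the file path and column names are hypothetical.
from sparktk import dtypes  # assumed location of the sparktk datetime type

events_schema = [("event", str), ("occurred_at", dtypes.datetime)]

# Declaring dtypes.datetime in the schema is what triggers the cast_datetime
# mapping, so occurred_at arrives in the frame as ms since the epoch.
events = tc.frame.import_csv("../datasets/events.csv",
                             delimiter=",",
                             header=True,
                             schema=events_schema,
                             datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX")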
Example #33
0
def train(frame,
          source_column_name,
          dest_column_name,
          weight_column_name,
          max_steps=10,
          regularization=0.5,
          alpha=0.5,
          num_factors=3,
          use_implicit=False,
          num_user_blocks=2,
          num_item_blocks=3,
          checkpoint_iterations=10,
          target_rmse=0.05):
    """
    Create collaborative filtering model by training on given frame

    Parameters
    ----------

    :param frame: (Frame) The frame containing the data to train on
    :param source_column_name: (str) source column name.
    :param dest_column_name: (str) destination column name.
    :param weight_column_name: (str) weight column name.
    :param max_steps: (int) max number of super-steps (max iterations) before the algorithm terminates. Default = 10
    :param regularization: (float) value between 0 and 1
    :param alpha: (double) value between 0 and 1
    :param num_factors: (int) number of the desired factors (rank)
    :param use_implicit: (bool) use implicit preference
    :param num_user_blocks: (int) number of user blocks
    :param num_item_blocks: (int) number of item blocks
    :param checkpoint_iterations: (int) Number of iterations between checkpoints
    :param target_rmse: (double) target RMSE
    :return: (CollaborativeFilteringModel) A trained collaborative filtering model
    """
    from sparktk.frame.frame import Frame
    require_type(Frame, frame, 'frame')
    require_type.non_empty_str(source_column_name, "source_column_name")
    require_type.non_empty_str(dest_column_name, "dest_column_name")
    require_type.non_empty_str(weight_column_name, "weight_column_name")
    require_type.non_negative_int(max_steps, "max_steps")
    require_type(float, regularization, "regularization")
    if regularization > 1 or regularization < 0:
        raise ValueError("'regularization' parameter must have a value between 0 and 1")
    require_type(float, alpha, "alpha")
    if alpha > 1 or alpha < 0:
        raise ValueError("'alpha' parameter must have a value between 0 and 1")
    require_type.non_negative_int(num_factors, "num_factors")
    require_type(bool, use_implicit, "use_implicit")
    require_type.non_negative_int(num_user_blocks, "num_user_blocks")
    require_type.non_negative_int(num_item_blocks, "num_item_blocks")
    require_type.non_negative_int(checkpoint_iterations, "checkpoint_iterations")
    require_type(float, target_rmse, "target_rmse")
    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_model = _scala_obj.train(frame._scala,
                                   source_column_name,
                                   dest_column_name,
                                   weight_column_name,
                                   max_steps,
                                   regularization,
                                   alpha,
                                   num_factors,
                                   use_implicit,
                                   num_user_blocks,
                                   num_item_blocks,
                                   checkpoint_iterations,
                                   target_rmse)
    return CollaborativeFilteringModel(tc, scala_model)
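The train function above has no doctest example, so a minimal usage sketch follows. It assumes a spark-tk context tc is already in scope and that train refers to the function defined above; the ratings data and column names are purely illustrative.

# Sketch only: data values and column names are hypothetical.
ratings = tc.frame.create([[1, 101, 4.0],
                           [1, 102, 2.5],
                           [2, 101, 5.0],
                           [2, 103, 3.0]],
                          [("user", int), ("product", int), ("rating", float)])

# Train with the remaining parameters left at their defaults
# (max_steps=10, regularization=0.5, num_factors=3, ...).
model = train(ratings,
              source_column_name="user",
              dest_column_name="product",
              weight_column_name="rating")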
Example #34
0
def train(frame,
          source_column_name,
          dest_column_name,
          weight_column_name,
          max_steps=10,
          regularization=0.5,
          alpha=0.5,
          num_factors=3,
          use_implicit=False,
          num_user_blocks=2,
          num_item_blocks=3,
          checkpoint_iterations=10,
          target_rmse=0.05):
    """
    Create collaborative filtering model by training on given frame

    Parameters
    ----------

    :param frame: (Frame) The frame containing the data to train on
    :param source_column_name: (str) source column name.
    :param dest_column_name: (str) destination column name.
    :param weight_column_name: (str) weight column name.
    :param max_steps: (int) max number of super-steps (max iterations) before the algorithm terminates. Default = 10
    :param regularization: (float) value between 0 and 1
    :param alpha: (double) value between 0 and 1
    :param num_factors: (int) number of the desired factors (rank)
    :param use_implicit: (bool) use implicit preference
    :param num_user_blocks: (int) number of user blocks
    :param num_item_blocks: (int) number of item blocks
    :param checkpoint_iterations: (int) Number of iterations between checkpoints
    :param target_rmse: (double) target RMSE
    :return: (CollaborativeFilteringModel) A trained collaborative filtering model
    """
    from sparktk.frame.frame import Frame
    require_type(Frame, frame, 'frame')
    require_type.non_empty_str(source_column_name, "source_column_name")
    require_type.non_empty_str(dest_column_name, "dest_column_name")
    require_type.non_empty_str(weight_column_name, "weight_column_name")
    require_type.non_negative_int(max_steps, "max_steps")
    require_type(float, regularization, "regularization")
    if regularization > 1 or regularization < 0:
        raise ValueError(
            "'regularization' parameter must have a value between 0 and 1")
    require_type(float, alpha, "alpha")
    if alpha > 1 or alpha < 0:
        raise ValueError("'alpha' parameter must have a value between 0 and 1")
    require_type.non_negative_int(num_factors, "num_factors")
    require_type(bool, use_implicit, "use_implicit")
    require_type.non_negative_int(num_user_blocks, "num_user_blocks")
    require_type.non_negative_int(num_item_blocks, "num_item_blocks")
    require_type.non_negative_int(checkpoint_iterations,
                                  "checkpoint_iterations")
    require_type(float, target_rmse, "target_rmse")
    tc = frame._tc
    _scala_obj = get_scala_obj(tc)
    scala_model = _scala_obj.train(frame._scala, source_column_name,
                                   dest_column_name, weight_column_name,
                                   max_steps, regularization, alpha,
                                   num_factors, use_implicit, num_user_blocks,
                                   num_item_blocks, checkpoint_iterations,
                                   target_rmse)
    return CollaborativeFilteringModel(tc, scala_model)
Example #35
0
def import_csv(path, delimiter=",", header=False, schema=None, datetime_format="yyyy-MM-dd'T'HH:mm:ss.SSSX", tc=TkContext.implicit):
    """
    Creates a frame with data from a csv file.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (Optional[str]) A string which indicates the separation of data fields.  This is usually a
                      single character and could be a non-visible character, such as a tab. The default delimiter
                      is a comma (,).
    :param header: (Optional[bool]) Boolean value indicating if the first line of the file will be used to name columns
                   (unless a schema is provided), and not be included in the data.  The default value is false.
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.  The column names specified will override column names that are found in the header row.
    * None, where the schema is automatically inferred based on the data.  Columns are named based on the header, or will be named generically ("C0", "C1", "C2", etc).

    :param datetime_format: (str) String specifying how date/time columns are formatted, using the java.text.SimpleDateFormat
                        specified at https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Load a frame from a csv file by specifying the path to the file, delimiter

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv(file_path, "|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]     1  Portland              609456           583776  4.40%   Multnomah
        [1]     2  Salem                 160614           154637  3.87%   Marion
        [2]     3  Eugene                159190           156185  1.92%   Lane
        [3]     4  Gresham               109397           105594  3.60%   Multnomah
        [4]     5  Hillsboro              97368            91611  6.28%   Washington
        [5]     6  Beaverton              93542            89803  4.16%   Washington
        [6]    15  Grants Pass            35076            34533  1.57%   Josephine
        [7]    16  Oregon City            34622            31859  8.67%   Clackamas
        [8]    17  McMinnville            33131            32187  2.93%   Yamhill
        [9]    18  Redmond                27427            26215  4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'int'>), ('city', <type 'str'>), ('population_2013', <type 'int'>), ('population_2010', <type 'int'>), ('change', <type 'str'>), ('county', <type 'str'>)]

    The schema parameter can be used to specify a custom schema (column names and data types) or column names (and the
    data types are inferred based on the data).  Here, we will specify the column names, which will override the
    header from the csv file.

        >>> column_names = ["Rank", "City", "2013", "2010", "Percent_Change", "County"]
        >>> frame = tc.frame.import_csv(file_path, "|", header=True, schema=column_names)
        -etc-

        >>> frame.schema
        [('Rank', <type 'int'>), ('City', <type 'str'>), ('2013', <type 'int'>), ('2010', <type 'int'>), ('Percent_Change', <type 'str'>), ('County', <type 'str'>)]

        <hide>
        >>> file_path = "../datasets/unicode.csv"
        >>> schema = [("a", unicode),("b", unicode),("c",unicode)]
        >>> frame = tc.frame.import_csv(file_path, schema=schema, header=False)
        -etc-

        >>> frame.inspect()
        [#]  a  b  c
        ============
        [0]  à  ë  ñ
        [1]  ã  ê  ü

        </hide>

    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")
    require_type(str, datetime_format, "datetime_format")

    infer_schema = True
    column_names = []   # custom column names

    if schema is not None:
        if not isinstance(schema, list):
            raise TypeError("Unsupported type %s for schema parameter." % type(schema))
        elif all(isinstance(item, basestring) for item in schema):
            # schema is just column names
            column_names = schema
            schema = None
        else:
            infer_schema = False   # if a custom schema is provided, don't waste time inferring the schema during load
            sparktk_schema.validate(schema)

    header_str = str(header).lower()
    infer_schema_str = str(infer_schema).lower()
    pyspark_schema = None

    if schema is not None:
        fields = []
        for column in schema:
            if column[1] in dtypes._data_type_to_pyspark_type_table:
                fields.append(StructField(column[0], dtypes._data_type_to_pyspark_type_table[column[1]], True))
            else:
                raise TypeError("Unsupported type {0} in schema for column {1}.".format(column[1], column[0]))
        pyspark_schema = StructType(fields)

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
            delimiter=delimiter,
            header=header_str,
            dateformat=datetime_format,
            inferschema=infer_schema_str).load(path, schema=pyspark_schema)

    df_schema = []

    if schema is None:
        for i, column in enumerate(df.schema.fields):
            try:
                datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
            except ValueError:
                raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
            column_name = column_names[i] if (i < len(column_names)) else column.name
            df_schema.append((column_name, datatype))
    else:
        df_column_count = len(df.schema.fields)
        custom_column_count = len(schema)
        if (df_column_count != custom_column_count):
            raise ValueError("Bad schema value.  The number of columns in the custom schema ({0}) must match the"
                             "number of columns in the csv file data ({1}).".format(custom_column_count, df_column_count))
        df_schema = schema

    def cast_datetime(row):
        """
        The Spark data frame gives us datetime objects.  Convert them to long (ms since epoch) for our frame.
        """
        data = []
        for column_index in xrange(0, len(df_schema)):
            if df_schema[column_index][1] == dtypes.datetime and isinstance(row[column_index], datetime):
                data.append(long(dtypes.datetime_to_ms(row[column_index])))
            else:
                data.append(row[column_index])
        return data

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    if any(c[1] == dtypes.datetime for c in df_schema):
        # If any columns are date/time we must do this map
        rdd = df.rdd.map(cast_datetime)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)
Example #36
0
def import_csv_raw(path, delimiter=",", header=False, tc=TkContext.implicit):
    """
    Creates a frame by importing the data as strings from the specified csv file.  If the csv file has a header row,
    those values will be used as column names.  Otherwise, columns will be named generically, like 'C0', 'C1', 'C2', etc.

    Parameters
    ----------

    :param path: (str) Full path to the csv file
    :param delimiter: (str) A string which indicates the separation of data fields.  This is usually a single character
                      and could be a non-visible character, such as a tab. The default delimiter is a comma (,).
    :param header: (bool) Boolean value indicating if the first line of the file will be used to name columns, and not
                   be included in the data.  The default value is false.
    :return: (Frame) Frame that contains the data from the csv file

    Examples
    --------

    Import raw data from a csv file by specifying the path to the file, delimiter, and header option.  All data will
    be brought in the frame as strings, and columns will be named according to the header row, if there was one.

        >>> file_path = "../datasets/cities.csv"

        >>> frame = tc.frame.import_csv_raw(file_path, delimiter="|", header=True)
        -etc-

        >>> frame.inspect()
        [#]  rank  city         population_2013  population_2010  change  county
        ============================================================================
        [0]  1     Portland     609456           583776           4.40%   Multnomah
        [1]  2     Salem        160614           154637           3.87%   Marion
        [2]  3     Eugene       159190           156185           1.92%   Lane
        [3]  4     Gresham      109397           105594           3.60%   Multnomah
        [4]  5     Hillsboro    97368            91611            6.28%   Washington
        [5]  6     Beaverton    93542            89803            4.16%   Washington
        [6]  15    Grants Pass  35076            34533            1.57%   Josephine
        [7]  16    Oregon City  34622            31859            8.67%   Clackamas
        [8]  17    McMinnville  33131            32187            2.93%   Yamhill
        [9]  18    Redmond      27427            26215            4.62%   Deschutes

        >>> frame.schema
        [('rank', <type 'str'>), ('city', <type 'str'>), ('population_2013', <type 'str'>), ('population_2010', <type 'str'>), ('change', <type 'str'>), ('county', <type 'str'>)]


    """
    TkContext.validate(tc)
    require_type.non_empty_str(path, "path")
    require_type.non_empty_str(delimiter, "delimiter")
    require_type(bool, header, "header")

    df = tc.sql_context.read.format(
        "com.databricks.spark.csv.org.trustedanalytics.sparktk").options(
        delimiter=delimiter,
        header=str(header).lower(),
        inferschema="false").load(path, schema=None)

    df_schema = []

    for column in df.schema.fields:
        try:
            datatype = dtypes.dtypes.get_primitive_type_from_pyspark_type(type(column.dataType))
        except ValueError:
            raise TypeError("Unsupported data type ({0}) for column {1}.".format(str(column.dataType), column.name))
        df_schema.append((column.name, datatype))

    jrdd = tc.sc._jvm.org.trustedanalytics.sparktk.frame.internal.rdd.PythonJavaRdd.scalaToPython(df._jdf.rdd())
    rdd = RDD(jrdd, tc.sc)

    from sparktk.frame.frame import Frame  # circular dependency, so import late
    return Frame(tc, rdd, df_schema)