Example 1
def get_movie_graph(tc=_TkContext.implicit):
    _TkContext.validate(tc)
    global _movie_graph
    if _movie_graph is None:
        viewers = tc.frame.create(
            [['fred', 0], ['wilma', 0], ['pebbles', 1], ['betty', 0],
             ['barney', 0], ['bamm bamm', 1]],
            schema=[('id', str), ('kids', int)])

        titles = [
            'Croods', 'Jurassic Park', '2001', 'Ice Age', 'Land Before Time'
        ]

        movies = tc.frame.create([[t] for t in titles], schema=[('id', str)])

        vertices = viewers.copy()
        vertices.append(movies)

        edges = tc.frame.create(
            [['fred', 'Croods', 5], ['fred', 'Jurassic Park', 5],
             ['fred', '2001', 2], ['fred', 'Ice Age', 4],
             ['wilma', 'Jurassic Park', 3], ['wilma', '2001', 5],
             ['wilma', 'Ice Age', 4], ['pebbles', 'Croods', 4],
             ['pebbles', 'Land Before Time', 3], ['pebbles', 'Ice Age', 5],
             ['betty', 'Croods', 5], ['betty', 'Jurassic Park', 3],
             ['betty', 'Land Before Time', 4], ['betty', 'Ice Age', 3],
             ['barney', 'Croods', 5], ['barney', 'Jurassic Park', 5],
             ['barney', 'Land Before Time', 3], ['barney', 'Ice Age', 5],
             ['bamm bamm', 'Croods', 5], ['bamm bamm', 'Land Before Time', 3]],
            schema=['src', 'dst', 'rating'])

        _movie_graph = tc.graph.create(vertices, edges)

    return _movie_graph
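A brief usage note (not from the source): because the helper caches its result in the module-level _movie_graph variable, repeated calls return the same Graph object. A minimal sketch, assuming an active TkContext named tc is in scope:

    # Hedged usage sketch: relies on a module-level `_movie_graph = None` cache.
    graph_a = get_movie_graph(tc)   # builds the vertices and edges frames on first call
    graph_b = get_movie_graph(tc)   # returns the cached graph
    assert graph_a is graph_b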
Example 2
def tc(request):
    global global_tc
    with lock:
        if global_tc is None:
            from sparktk import TkContext
            from sparktk import create_sc
            from sparktk.tests import utils
            import daaltk
            #from sparktk.loggers import loggers
            #loggers.set("d", "sparktk.sparkconf")

            # Get path to sparktk jars from SPARKTK_HOME
            if 'SPARKTK_HOME' in os.environ:
                sparktk_dir = os.environ['SPARKTK_HOME']
            else:
                raise RuntimeError("SPARKTK_HOME must be defined.")

            sc = create_sc(
                other_libs=[daaltk],
                master='local[2]',
                sparktk_home=sparktk_dir,
                app_name="pytest-pyspark-local-testing",
                extra_conf_dict={"spark.hadoop.fs.default.name": "file:///"})
            request.addfinalizer(lambda: sc.stop())

            global_tc = TkContext(sc,
                                  other_libs=[daaltk],
                                  sparktk_home=sparktk_dir)
            global_tc.testing = utils
    return global_tc
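For context, a minimal sketch of the module-level scaffolding this fixture assumes; the names lock and global_tc and the pytest registration are inferred rather than shown in the source:

    import threading
    import pytest

    lock = threading.Lock()   # serializes one-time creation of the shared TkContext
    global_tc = None          # TkContext shared by every test in the session

    # the function above would typically be registered as a session-scoped fixture:
    # @pytest.fixture(scope="session")
    # def tc(request): ...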
Example 3
def get_cities_frame(tc=_TkContext.implicit):
    """Creates a small frame of city data"""
    _TkContext.validate(tc)

    global _cities_frame
    if _cities_frame is None:

        schema = zip('rank|city|population_2013|population_2010|change|county'.split('|'),
                     [int, str, int, int, str, str])
        data = [[field for field in line.split('|')] for line in """1|Portland|609456|583776|4.40%|Multnomah
2|Salem|160614|154637|3.87%|Marion
3|Eugene|159190|156185|1.92%|Lane
4|Gresham|109397|105594|3.60%|Multnomah
5|Hillsboro|97368|91611|6.28%|Washington
6|Beaverton|93542|89803|4.16%|Washington
15|Grants Pass|35076|34533|1.57%|Josephine
16|Oregon City|34622|31859|8.67%|Clackamas
17|McMinnville|33131|32187|2.93%|Yamhill
18|Redmond|27427|26215|4.62%|Deschutes
19|Tualatin|26879|26054|4.17%|Washington
20|West Linn|25992|25109|3.52%|Clackamas
7|Bend|81236|76639|6.00%|Deschutes
8|Medford|77677|74907|3.70%|Jackson
9|Springfield|60177|59403|1.30%|Lane
10|Corvallis|55298|54462|1.54%|Benton
11|Albany|51583|50158|2.84%|Linn
12|Tigard|50444|48035|5.02%|Washington
13|Lake Oswego|37610|36619|2.71%|Clackamas
14|Keizer|37064|36478|1.61%|Marion""".split('\n')]

        _cities_frame = tc.frame.create(data, schema)

    return _cities_frame
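For reference, the zip call above produces the following schema (in Python 2 zip returns a list; under Python 3 it would need to be wrapped in list() before being passed to tc.frame.create):

    [('rank', int), ('city', str), ('population_2013', int),
     ('population_2010', int), ('change', str), ('county', str)]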
Example 4
def get_cities_frame(tc=_TkContext.implicit):
    """Creates a small frame of city data"""
    _TkContext.validate(tc)

    global _cities_frame
    if _cities_frame is None:

        schema = zip(
            'rank|city|population_2013|population_2010|change|county'.split(
                '|'), [int, str, int, int, str, str])
        data = [[field for field in line.split('|')]
                for line in """1|Portland|609456|583776|4.40%|Multnomah
2|Salem|160614|154637|3.87%|Marion
3|Eugene|159190|156185|1.92%|Lane
4|Gresham|109397|105594|3.60%|Multnomah
5|Hillsboro|97368|91611|6.28%|Washington
6|Beaverton|93542|89803|4.16%|Washington
15|Grants Pass|35076|34533|1.57%|Josephine
16|Oregon City|34622|31859|8.67%|Clackamas
17|McMinnville|33131|32187|2.93%|Yamhill
18|Redmond|27427|26215|4.62%|Deschutes
19|Tualatin|26879|26054|4.17%|Washington
20|West Linn|25992|25109|3.52%|Clackamas
7|Bend|81236|76639|6.00%|Deschutes
8|Medford|77677|74907|3.70%|Jackson
9|Springfield|60177|59403|1.30%|Lane
10|Corvallis|55298|54462|1.54%|Benton
11|Albany|51583|50158|2.84%|Linn
12|Tigard|50444|48035|5.02%|Washington
13|Lake Oswego|37610|36619|2.71%|Clackamas
14|Keizer|37064|36478|1.61%|Marion""".split('\n')]

        _cities_frame = tc.frame.create(data, schema)

    return _cities_frame
Example 5
def train(ts,
          p,
          d,
          q,
          include_intercept=True,
          method="css-cgd",
          init_params=None,
          tc=TkContext.implicit):
    """
    Creates an Autoregressive Integrated Moving Average (ARIMA) model from the specified time series values.

    Given a time series, fits a non-seasonal Autoregressive Integrated Moving Average (ARIMA) model of
    order (p, d, q), where p is the number of autoregressive terms, d is the order of differencing, and q
    is the number of moving average error terms.  If include_intercept is True, the model is fitted with an intercept.

    Parameters
    ----------

    :param ts: (List[float]) Time series to which to fit an ARIMA(p, d, q) model.
    :param p: (int) Autoregressive order
    :param d: (int) Differencing order
    :param q: (int) Moving average order
    :param include_intercept: (Optional(boolean)) If True, the model is fit with an intercept.  Default is True.
    :param method: (Optional(string)) Objective function and optimization method.  Current options are:
                   'css-bobyqa' and 'css-cgd'.  Both optimize the log likelihood in terms of the conditional
                   sum of squares.  The first uses BOBYQA for optimization, while the second uses conjugate
                   gradient descent.  Default is 'css-cgd'.
    :param init_params: (Optional(List[float])) A list of user-provided initial parameters for optimization. If the
                        list is empty (default), it is initialized using the Hannan-Rissanen algorithm. If provided,
                        the parameter order should be: intercept term, AR parameters (in increasing order of lag),
                        then MA parameters (in increasing order of lag).
    :return: (ArimaModel) Trained ARIMA model
    """
    if not isinstance(ts, list):
        raise TypeError("'ts' parameter must be a list")
    if not isinstance(p, int):
        raise TypeError("'p' parameter must be an integer.")
    if not isinstance(d, int):
        raise TypeError("'d' parameter must be an integer.")
    if not isinstance(q, int):
        raise TypeError("'q' parameter must be an integer.")
    if not isinstance(include_intercept, bool):
        raise TypeError("'include_intercept' parameter must be a boolean")
    if not isinstance(method, basestring):
        raise TypeError("'method' parameter must be a string")
    if init_params is not None:
        if not isinstance(init_params, list):
            raise TypeError("'init_params' parameter must be a list")
    TkContext.validate(tc)

    _scala_obj = get_scala_obj(tc)
    scala_ts = tc.jutils.convert.to_scala_list_double(ts)
    scala_init_params = tc.jutils.convert.to_scala_option_list_double(
        init_params)
    scala_model = _scala_obj.train(scala_ts, p, d, q, include_intercept,
                                   method, scala_init_params)

    return ArimaModel(tc, scala_model)
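A hedged usage sketch (the sample values are made up, and the tc.models.timeseries.arima namespace is an assumption about where this trainer is exposed; the direct call relies on an implicit TkContext being in scope):

    ts = [12.88, 13.54, 13.8, 14.0, 13.78, 13.09, 13.1, 13.4]
    model = train(ts, p=1, d=0, q=1)   # tc resolved implicitly
    # or, through the context (assumed path):
    # model = tc.models.timeseries.arima.train(ts, 1, 0, 1)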
Example 6
def train(ts, p, d, q, include_intercept=True, method="css-cgd", init_params=None, tc=TkContext.implicit):
    """
    Creates an Autoregressive Integrated Moving Average (ARIMA) model from the specified time series values.

    Given a time series, fits a non-seasonal Autoregressive Integrated Moving Average (ARIMA) model of
    order (p, d, q), where p is the number of autoregressive terms, d is the order of differencing, and q
    is the number of moving average error terms.  If include_intercept is True, the model is fitted with an intercept.

    Parameters
    ----------

    :param ts: (List[float]) Time series to which to fit an ARIMA(p, d, q) model.
    :param p: (int) Autoregressive order
    :param d: (int) Differencing order
    :param q: (int) Moving average order
    :param include_intercept: (Optional(boolean)) If True, the model is fit with an intercept.  Default is True.
    :param method: (Optional(string)) Objective function and optimization method.  Current options are:
                   'css-bobyqa' and 'css-cgd'.  Both optimize the log likelihood in terms of the conditional
                   sum of squares.  The first uses BOBYQA for optimization, while the second uses conjugate
                   gradient descent.  Default is 'css-cgd'.
    :param init_params: (Optional(List[float])) A list of user-provided initial parameters for optimization. If the
                        list is empty (default), it is initialized using the Hannan-Rissanen algorithm. If provided,
                        the parameter order should be: intercept term, AR parameters (in increasing order of lag),
                        then MA parameters (in increasing order of lag).
    :return: (ArimaModel) Trained ARIMA model
    """
    if not isinstance(ts, list):
        raise TypeError("'ts' parameter must be a list")
    if not isinstance(p, int):
        raise TypeError("'p' parameter must be an integer.")
    if not isinstance(d, int):
        raise TypeError("'d' parameter must be an integer.")
    if not isinstance(q, int):
        raise TypeError("'q' parameter must be an integer.")
    if not isinstance(include_intercept, bool):
        raise TypeError("'include_intercept' parameter must be a boolean")
    if not isinstance(method, basestring):
        raise TypeError("'method' parameter must be a string")
    if init_params is not None:
        if not isinstance(init_params, list):
            raise TypeError("'init_params' parameter must be a list")
    TkContext.validate(tc)

    _scala_obj = get_scala_obj(tc)
    scala_ts = tc.jutils.convert.to_scala_list_double(ts)
    scala_init_params = tc.jutils.convert.to_scala_option_list_double(init_params)
    scala_model = _scala_obj.train(scala_ts, p, d, q, include_intercept, method, scala_init_params)

    return ArimaModel(tc, scala_model)
Example 7
def tc(request):
    global global_tc
    with lock:
        if global_tc is None:
            from sparktk import TkContext
            from sparktk import create_sc
            from sparktk.tests import utils
            #from sparktk.loggers import loggers
            #loggers.set("d", "sparktk.sparkconf")
            sc = create_sc(master='local[2]',
                           app_name="pytest-pyspark-local-testing",
                           extra_conf={"spark.hadoop.fs.default.name": "file:///"})
            request.addfinalizer(lambda: sc.stop())
            global_tc = TkContext(sc)
            global_tc.testing = utils
    return global_tc
Example 8
def tc(request):
    global global_tc
    with lock:
        if global_tc is None:
            from sparktk import TkContext
            from sparktk import create_sc
            from sparktk.tests import utils
            #from sparktk.loggers import loggers
            #loggers.set("d", "sparktk.sparkconf")
            sc = create_sc(master='local[2]',
                           app_name="pytest-pyspark-local-testing",
                           extra_conf_dict={"spark.hadoop.fs.default.name": "file:///"})
            request.addfinalizer(lambda: sc.stop())
            global_tc = TkContext(sc)
            global_tc.testing = utils
    return global_tc
Example 9
    def test_frame_basic(self):
        """Documentation test for classifiers"""

        # The general workflow: build a frame, then run some analytics on it.
        # First step: construct a frame to be uploaded, using plain Python
        # lists.
        # The following frame could represent an ordered list (such as
        # customer orders) with a value associated with each order.
        # We sort on the order column and then accumulate the order values;
        # cumulative sum gives the total up to and including a given order.
        # Create context
        tc = TkContext()

        # Create the frame using a list object
        frame = tc.frame.create(data=[[0, 100], [3, 20], [1, 25], [2, 90]],
                                schema=[("order", int), ("value", int)])
        print frame.inspect()

        # Sort on order, note this is a side effect based operation
        frame.sort('order')

        # calculate the cumulative sum
        frame.cumulative_sum('value')
        print frame.inspect()

        # Fetch the results, and validate they are what you would expect
        result = frame.take(frame.count())
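        # After sorting on 'order' the rows are (0, 100), (1, 25), (2, 90), (3, 20),
        # so the running sums are 100, 125, 215, 235.  assertItemsEqual ignores row
        # order, which is why the expected list below is not sorted on 'order'.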
        self.assertItemsEqual(
            result.data,
            [[0, 100, 100], [3, 20, 235], [1, 25, 125], [2, 90, 215]])
Example 10
def get_movie_graph(tc=_TkContext.implicit):
    _TkContext.validate(tc)
    global _movie_graph
    if _movie_graph is None:
        viewers = tc.frame.create(
            [["fred", 0], ["wilma", 0], ["pebbles", 1], ["betty", 0], ["barney", 0], ["bamm bamm", 1]],
            schema=[("id", str), ("kids", int)],
        )

        titles = ["Croods", "Jurassic Park", "2001", "Ice Age", "Land Before Time"]

        movies = tc.frame.create([[t] for t in titles], schema=[("id", str)])

        vertices = viewers.copy()
        vertices.append(movies)

        edges = tc.frame.create(
            [
                ["fred", "Croods", 5],
                ["fred", "Jurassic Park", 5],
                ["fred", "2001", 2],
                ["fred", "Ice Age", 4],
                ["wilma", "Jurassic Park", 3],
                ["wilma", "2001", 5],
                ["wilma", "Ice Age", 4],
                ["pebbles", "Croods", 4],
                ["pebbles", "Land Before Time", 3],
                ["pebbles", "Ice Age", 5],
                ["betty", "Croods", 5],
                ["betty", "Jurassic Park", 3],
                ["betty", "Land Before Time", 4],
                ["betty", "Ice Age", 3],
                ["barney", "Croods", 5],
                ["barney", "Jurassic Park", 5],
                ["barney", "Land Before Time", 3],
                ["barney", "Ice Age", 5],
                ["bamm bamm", "Croods", 5],
                ["bamm bamm", "Land Before Time", 3],
            ],
            schema=["src", "dst", "rating"],
        )

        _movie_graph = tc.graph.create(vertices, edges)

    return _movie_graph
Example 11
def get_movie_graph(tc=_TkContext.implicit):
    _TkContext.validate(tc)
    global _movie_graph
    if _movie_graph is None:
        viewers = tc.frame.create([['fred', 0],
                                   ['wilma', 0],
                                   ['pebbles', 1],
                                   ['betty', 0],
                                   ['barney', 0],
                                   ['bamm bamm', 1]],
                                  schema=[('id', str), ('kids', int)])

        titles = ['Croods', 'Jurassic Park', '2001', 'Ice Age', 'Land Before Time']

        movies = tc.frame.create([[t] for t in titles], schema=[('id', str)])

        vertices = viewers.copy()
        vertices.append(movies)

        edges = tc.frame.create([['fred','Croods',5],
                                 ['fred','Jurassic Park',5],
                                 ['fred','2001',2],
                                 ['fred','Ice Age',4],
                                 ['wilma','Jurassic Park',3],
                                 ['wilma','2001',5],
                                 ['wilma','Ice Age',4],
                                 ['pebbles','Croods',4],
                                 ['pebbles','Land Before Time',3],
                                 ['pebbles','Ice Age',5],
                                 ['betty','Croods',5],
                                 ['betty','Jurassic Park',3],
                                 ['betty','Land Before Time',4],
                                 ['betty','Ice Age',3],
                                 ['barney','Croods',5],
                                 ['barney','Jurassic Park',5],
                                 ['barney','Land Before Time',3],
                                 ['barney','Ice Age',5],
                                 ['bamm bamm','Croods',5],
                                 ['bamm bamm','Land Before Time',3]],
                                schema=['src', 'dst', 'rating'])

        _movie_graph = tc.graph.create(vertices, edges)

    return _movie_graph
Example 12
def tc(request):
    global global_tc
    with lock:
        if global_tc is None:
            from sparktk import TkContext
            from sparktk import create_sc
            #from sparktk.loggers import loggers
            #loggers.set("d", "sparktk.sparkconf")
            sc = create_sc(master='local[2]',
                           app_name="pytest-pyspark-local-testing")
            request.addfinalizer(lambda: sc.stop())
            global_tc = TkContext(sc)
    return global_tc
Example 13
def load(path, tc=TkContext.implicit):
    """load RandomForestRegressorModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, RandomForestRegressorModel)
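A hedged usage note for the load pattern shared by these helpers; the path string is a placeholder and the save() call is assumed to be how the model was originally persisted:

    # model.save("sandbox/rf_regressor")          # persist a trained model (placeholder path)
    restored = load("sandbox/rf_regressor")       # tc resolved implicitly
    # equivalently: restored = tc.load("sandbox/rf_regressor")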
Example 14
def load(path, tc=TkContext.implicit):
    """load LogisticRegressionModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, LogisticRegressionModel)
Example 15
def load(path, tc=TkContext.implicit):
    """load GaussianMixtureModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, GaussianMixtureModel)
Example 16
def load(path, tc=TkContext.implicit):
    """load KMeansModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, CollaborativeFilteringModel)
Example 17
    def test_model_class_doc(self):
        """Generate a naive bayes dataset, use sparktk to train a model and verify"""
        # Naive bayes is a machine learning algorithm
        # We can use it to classify some item with properties into a group probabilistically
        # The general work flow is to generate a dataset
        # then we calculate the coefficient table and probabilities
        # Finally we build a frame of the data and create a naive bayes model
        # then we test the result of the naive bayes model test

        # Generate naive bayes dataset
        numCoeffs = random.randint(2, 10)
        coefficients = []
        schema = []
        obsCols = []
        dataRows = []
        coeffTable = []
        # the number of rows of data we will generate
        numDiceRolls = random.randint(3, 30)

        # Generate the coefficient table and schema
        for index in range(0, numCoeffs):
            coefficients.append(random.uniform(0, 1))
            schema.append(("x" + str(index), int))
            obsCols.append("x" + str(index))
        schema.append(("x" + str(numCoeffs), int))

        # get all permutations of 0, 1 of length numCoeffs
        binaryPermutations = list(itertools.product(range(2),
                                                    repeat=numCoeffs))

        # now we compute the probability for each row
        # and add the probability for each row as a column to the table
        for element in binaryPermutations:
            product = 1
            element = list(element)
            for i in range(0, numCoeffs):
                if element[i] == 1:
                    product = coefficients[i] * product
                if element[i] == 0:
                    product = (1 - coefficients[i]) * product
            element.append(product)
            coeffTable.append(list(element))
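        # Example of the computation above: with numCoeffs = 2 and coefficients
        # [0.9, 0.2], the permutation [1, 0] gets probability 0.9 * (1 - 0.2) = 0.72,
        # so the row [1, 0, 0.72] is appended to coeffTable.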

        # Now we use the coefficient table to generate the actual data
        for row in coeffTable:
            probability = row[len(row) - 1]
            for n in range(0, numDiceRolls):
                # copy the row so each dice roll produces an independent data row
                newRow = list(row)
                randomResult = random.uniform(0, 1)
                if probability >= randomResult:
                    newRow[len(newRow) - 1] = 1
                else:
                    newRow[len(newRow) - 1] = 0
                dataRows.append(newRow)

        # Finally we create the frame and model
        # and check that it performs as we would expect
        # We create a sparktk context
        context = TkContext()
        # Then we create a frame from the data
        frame = context.frame.create(dataRows, schema=schema)
        # we train a naive bayes model
        # we give the model lots of information on both data and outcomes
        # in this way it learns which outcome to expect from data
        nb_model = context.models.classification.naive_bayes.train(
            frame, obsCols, "x" + str(numCoeffs - 1))
        # then we test the model
        # meaning we try to see how it behaves in predicting outcomes
        # from data that it has been trained to recognize patterns in
        predicted_frame = nb_model.predict(frame)
        result = nb_model.test(predicted_frame)

        # Lastly we check the result of the model test
        self.assertAlmostEqual(1, result.precision)
        self.assertAlmostEqual(1, result.accuracy)
        self.assertAlmostEqual(1, result.recall)
        self.assertAlmostEqual(1, result.f_measure)
Example 18
def load(path, tc=TkContext.implicit):
    """load LinearRegressionModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, LinearRegressionModel)
Example 19
def create(data, schema=None, validate_schema=False, tc=TkContext.implicit):
    """
    Creates a frame from the given data and schema.  If no schema data types are provided, the schema is inferred
    based on the data in the first 100 rows.

    If schema validation is enabled, all data is checked to ensure that it matches the schema.  If the data does
    not match the schema's data type, an attempt is made to cast the data to the proper data type.  When the data
    cannot be cast to the schema's data type, the item will be missing (None) in the frame.

    Parameters
    ----------

    :param data: (List of row data or RDD) Data source
    :param schema: (Optional(list[tuple(str, type)] or list[str])) Optionally specify a schema (list of tuples of
                   string column names and data type), column names (list of strings, and the column data types will
                   be inferred) or None (column data types will be inferred and column names will be numbered like C0,
                   C1, C2, etc).  Note that unless validate_schema is enabled, no attempt is made to check or convert
                   the data to the data type specified by the schema.  If the data provided does not match the schema
                   (and validate_schema is disabled), errors may be encountered when using certain frame operations.
    :param validate_schema: (Optional(bool)) When True, all data is checked to ensure that it matches the schema.
                            If the data does not match the schema's data type, an attempt is made to cast the data
                            to the proper data type.  When the data cannot be cast to the schema's data type, a
                            missing value (None) is inserted in its place.  It is recommended that validate_schema
                            be enabled, unless it is certain that all of the data matches the specified schema.
                            Defaults to False.
    :param tc: TkContext
    :return: (Frame) Frame loaded with the specified data


    Examples
    --------

    Create a frame with the specified data.

        >>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
        >>> frame = tc.frame.create(data)

    Since no schema is provided, the schema will be inferred.  Note that the data set had a mix of strings and
    integers in the third column.  The schema will use the most general data type from the data that it sees, so in
    this example, the column is treated as a float.

        >>> frame.schema
        [('C0', <type 'str'>), ('C1', <type 'int'>), ('C2', <type 'float'>)]

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30    8
        [1]  Jim       45  9.5
        [2]  Sue       25    7
        [3]  George    15    6
        [4]  Jennifer  18  8.5

    We could also enable schema validation, which checks the data against the schema.  If the data does not match the
    schema's data type, it attempts to cast the data to the proper data type.

        >>> frame = tc.frame.create(data, validate_schema=True)

    In this example with schema validation enabled, the integers in column C2 get casted to floats:

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30  8.0
        [1]  Jim       45  9.5
        [2]  Sue       25  7.0
        [3]  George    15  6.0
        [4]  Jennifer  18  8.5

    We could also provide a list of column names when creating the frame.  When a list of column names is provided,
    the data types for the schema are still inferred, but the columns in the schema are labeled with the specified names.

        >>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)

        >>> frame.schema
        [('name', <type 'str'>), ('age', <type 'int'>), ('shoe_size', <type 'float'>)]

        >>> frame.inspect()
        [#]  name      age  shoe_size
        =============================
        [0]  Bob        30        8.0
        [1]  Jim        45        9.5
        [2]  Sue        25        7.0
        [3]  George     15        6.0
        [4]  Jennifer   18        8.5

    Note that if a value cannot be parsed as the data type specified in the schema, it will show up as missing (None)
    when validate_schema is enabled.  For example, consider the following frame where columns are defined as integers,
    but the data specified has a string in the second row.

        >>> data = [[1, 2, 3], [4, "five", 6]]
        >>> schema = [("a", int), ("b", int), ("c", int)]

        >>> frame = tc.frame.create(data, schema, validate_schema = True)

        >>> frame.inspect()
        [#]  a  b     c
        ===============
        [0]  1     2  3
        [1]  4  None  6

    Note that the spot where the string was located has its value missing (None), since it couldn't be parsed as an
    integer.  If validate_schema is disabled, no attempt is made to parse the data to the data type specified by the
    schema, and further frame operations may fail due to the data type discrepancy.

    """
    TkContext.validate(tc)
    if data is None:
        data = []
    if not isinstance(data, list)\
            and not isinstance(data, (RDD, DataFrame))\
            and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.rdd.RDD)\
            and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.sql.DataFrame):
        raise TypeError(
            "Invalid data source. Expected the data parameter to be a 2-dimensional list (list of row data) or an RDD or DataFrame, but received: %s"
            % type(data))
    from sparktk.frame.frame import Frame
    return Frame(tc, data, schema, validate_schema)
Example 20
# ## Importing the sparktk and tap_catalog libraries gives you the capability of creating machine learning models, performing data wrangling, and publishing the model to the data catalog.

# In[ ]:

import sparktk
import tap_catalog
from sparktk import TkContext
from tap_catalog import DataCatalog

print "SparkTK installation path = %s" % (sparktk.__path__)

tc = TkContext()

# ## Reading in the data to train the model
# ## You must change the HDFS path to the path of the data file

# In[ ]:

ds = "hdfs://nameservice1/org/29ace093-e11f-4f0b-b254-3f8e973476e5/brokers/userspace/694b3da9-c21a-4063-bf16-e072ac47f881/30fc50da-f065-41d8-a510-77d0b7683a47/000000_1"
sc = [("label", float), ("feature1", float), ("feature2", float),
      ("feature3", float), ("feature4", float), ("feature5", float),
      ("feature6", float), ("feature7", float), ("feature8", float),
      ("feature9", float)]

frame = tc.frame.import_csv(ds, schema=sc)

frame.inspect()
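A hedged follow-up sketch to sanity-check the import (printed values depend on the data file behind the HDFS path):

    print frame.count()        # number of rows read from HDFS
    print frame.inspect(10)    # preview the first 10 rows against the schema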
Example 21
def create(data, schema=None, validate_schema=False, tc=TkContext.implicit):
    """
    Creates a frame from the given data and schema.  If no schema data types are provided, the schema is inferred
    based on the data in the first 100 rows.

    If schema validation is enabled, all data is checked to ensure that it matches the schema.  If the data does
    not match the schema's data type, an attempt is made to cast the data to the proper data type.  When the data
    cannot be cast to the schema's data type, the item will be missing (None) in the frame.

    Parameters
    ----------

    :param data: (List of row data or RDD) Data source
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying a schema.
                   Note that unless validate_schema is enabled, no attempt is made to check or convert the data to the
                   data type specified by the schema.  If the data provided does not match the schema (and
                   validate_schema is disabled), errors may be encountered when using certain frame operations.

    *  Provide the full schema for the frame as a list of tuples (string column name and data type)
    *  Provide the column names as a list of strings.  Column data types will be inferred, based on the data.
    *  None, where the schema is automatically inferred based on the data.  Columns will be named
    generically ("C0", "C1", "C2", etc).

    :param validate_schema: (Optional(bool)) When True, all data is checked to ensure that it matches the schema.
                            If the data does not match the schema's data type, an attempt is made to cast the data
                            to the proper data type.  When the data cannot be cast to the schema's data type, a
                            missing value (None) is inserted in its place.  It is recommended that validate_schema
                            be enabled, unless it is certain that all of the data matches the specified schema.
                            Defaults to False.
    :param tc: TkContext
    :return: (Frame) Frame loaded with the specified data


    Examples
    --------

    Create a frame with the specified data.

        >>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
        >>> frame = tc.frame.create(data)

    Since no schema is provided, the schema will be inferred.  Note that the data set had a mix of strings and
    integers in the third column.  The schema will use the most general data type from the data that it sees, so in
    this example, the column is treated as a float.

        >>> frame.schema
        [('C0', <type 'str'>), ('C1', <type 'int'>), ('C2', <type 'float'>)]

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30    8
        [1]  Jim       45  9.5
        [2]  Sue       25    7
        [3]  George    15    6
        [4]  Jennifer  18  8.5

    We could also enable schema validation, which checks the data against the schema.  If the data does not match the
    schema's data type, it attempts to cast the data to the proper data type.

        >>> frame = tc.frame.create(data, validate_schema=True)

    In this example with schema validation enabled, the integers in column C2 get casted to floats:

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30  8.0
        [1]  Jim       45  9.5
        [2]  Sue       25  7.0
        [3]  George    15  6.0
        [4]  Jennifer  18  8.5

    We could also provide a list of column names when creating the frame.  When a list of column names is provided,
    the data types for the schema are still inferred, but the columns in the schema are labeled with the specified names.

        >>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)

        >>> frame.schema
        [('name', <type 'str'>), ('age', <type 'int'>), ('shoe_size', <type 'float'>)]

        >>> frame.inspect()
        [#]  name      age  shoe_size
        =============================
        [0]  Bob        30        8.0
        [1]  Jim        45        9.5
        [2]  Sue        25        7.0
        [3]  George     15        6.0
        [4]  Jennifer   18        8.5

    Note that if a value cannot be parsed as the data type specified in the schema, it will show up as missing (None)
    when validate_schema is enabled.  For example, consider the following frame where columns are defined as integers,
    but the data specified has a string in the second row.

        >>> data = [[1, 2, 3], [4, "five", 6]]
        >>> schema = [("a", int), ("b", int), ("c", int)]

        >>> frame = tc.frame.create(data, schema, validate_schema = True)

        >>> frame.inspect()
        [#]  a  b     c
        ===============
        [0]  1     2  3
        [1]  4  None  6

    Note that the spot where the string was located has its value missing (None), since it couldn't be parsed as an
    integer.  If validate_schema is disabled, no attempt is made to parse the data to the data type specified by the
    schema, and further frame operations may fail due to the data type discrepancy.

    """
    TkContext.validate(tc)
    if data is None:
        data = []
    if not isinstance(data, list)\
            and not isinstance(data, (RDD, DataFrame))\
            and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.rdd.RDD)\
            and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.sql.DataFrame):
        raise TypeError("Invalid data source. Expected the data parameter to be a 2-dimensional list (list of row data) or an RDD or DataFrame, but received: %s" % type(data))
    from sparktk.frame.frame import Frame
    return Frame(tc, data, schema, validate_schema)
Example 22
def load(path, tc=TkContext.implicit):
    """load GaussianMixtureModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, GaussianMixtureModel)
Example 23
def load(path, tc=TkContext.implicit):
    """load RandomForestClassifierModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, RandomForestClassifierModel)
Example 24
def load(path, tc=TkContext.implicit):
    """load NaiveBayesModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, NaiveBayesModel)
Example 25
def load(path, tc=TkContext.implicit):
    """load CoxProportionalHazardsModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, CoxProportionalHazardsModel)
Example 26
def load(path, tc=TkContext.implicit):
    """load Dicom from given path"""
    TkContext.validate(tc)
    return tc.load(path, Dicom)
Example 27
def load(path, tc=TkContext.implicit):
    """load Frame from given path"""
    TkContext.validate(tc)
    return tc.load(path, Frame)
Example 28
def load(path, tc=TkContext.implicit):
    """load Dicom from given path"""
    TkContext.validate(tc)
    return tc.load(path, Dicom)
Example 29
def cross_validate(frame, train_descriptors, num_folds=3, verbose=False, tc=TkContext.implicit):
    """
    Computes k-fold cross validation on models with the given frame and parameter values
    :param frame: The frame to perform cross-validation on
    :param train_descriptors: Tuple of model and Dictionary of model parameters and their value/values as singleton
            values or a list of type grid_values
    :param num_folds: Number of folds to run the cross-validator on
    :param verbose: Flag indicating if the results of each fold are to be viewed. Default is set to False
    :param tc: spark-tk context (provided implicitly)
    :return: Summary of model's performance consisting of metrics of each combination of train_descriptor values per fold
            and averages across all folds

    Example
    -------

        >>> frame = tc.frame.create([[1,0],[2,0],[3,0],[4,0],[5,0],[6,1],[7,1],[8,1],[9,1],[10,1]],[("data", float),("label",int)])

        >>> frame.inspect()
        [#]  data  label
        ================
        [0]     1      0
        [1]     2      0
        [2]     3      0
        [3]     4      0
        [4]     5      0
        [5]     6      1
        [6]     7      1
        [7]     8      1
        [8]     9      1
        [9]    10      1

        >>> from sparktk.models import grid_values

        >>> result = tc.models.cross_validate(frame,
        ...                                   [(tc.models.classification.svm,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01}),
        ...                                    (tc.models.classification.logistic_regression,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01})],
        ...                                   num_folds=2,
        ...                                   verbose=True)

        <skip>
        >>> result
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              0              2
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        ******Averages: ******
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)

        >>> result.averages
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        </skip>
    """
    TkContext.validate(tc)
    arguments.require_type(Frame, frame, "frame")

    all_grid_search_results = []
    grid_search_results_accumulator = None
    for train_frame, test_frame in split_data(frame, num_folds, tc):
        scores = grid_search(train_frame, test_frame, train_descriptors, tc)
        if grid_search_results_accumulator is None:
            grid_search_results_accumulator = scores
        else:
            grid_search_results_accumulator._accumulate_matching_points(scores.grid_points)
        all_grid_search_results.append(scores)

    # make the accumulator hold averages
    grid_search_results_accumulator._divide_metrics(num_folds)
    return CrossValidateClassificationResults(all_grid_search_results,
                                              grid_search_results_accumulator.copy(),
                                              verbose)
Example 30
def load(path, tc=TkContext.implicit):
    """load Graph from given path"""
    TkContext.validate(tc)
    return tc.load(path, Graph)
Example 31
def cross_validate(frame,
                   train_descriptors,
                   num_folds=3,
                   verbose=False,
                   tc=TkContext.implicit):
    """
    Computes k-fold cross validation on classification and regression models with the given frame and parameter values
    :param frame: The frame to perform cross-validation on
    :param train_descriptors: Tuple of model and Dictionary of model parameters and their value/values as singleton
            values or a list of type grid_values
    :param num_folds: Number of folds to run the cross-validator on
    :param verbose: Flag indicating if the results of each fold are to be viewed. Default is set to False
    :param tc: spark-tk context (provided implicitly)
    :return: Summary of model's performance consisting of metrics of each combination of train_descriptor values per fold
            and averages across all folds

    Example
    -------

        >>> frame = tc.frame.create([[1,0],[2,0],[3,0],[4,0],[5,0],[6,1],[7,1],[8,1],[9,1],[10,1]],[("data", float),("label",int)])

        >>> frame.inspect()
        [#]  data  label
        ================
        [0]     1      0
        [1]     2      0
        [2]     3      0
        [3]     4      0
        [4]     5      0
        [5]     6      1
        [6]     7      1
        [7]     8      1
        [8]     9      1
        [9]    10      1

        >>> from sparktk.models import grid_values

        >>> result = tc.models.cross_validate(frame,
        ...                                   [(tc.models.classification.svm,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01}),
        ...                                    (tc.models.classification.logistic_regression,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01})],
        ...                                   num_folds=2,
        ...                                   verbose=True)

        <skip>
        >>> result
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              2              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              2              0
        Actual_Neg              0              2
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        ******Averages: ******
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)

        >>> result.averages
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)
        </skip>
    """
    TkContext.validate(tc)
    arguments.require_type(Frame, frame, "frame")

    all_grid_search_results = []
    grid_search_results_accumulator = None
    for train_frame, test_frame in split_data(frame, num_folds, tc):
        scores = grid_search(train_frame, test_frame, train_descriptors, tc=tc)
        if grid_search_results_accumulator is None:
            grid_search_results_accumulator = scores
        else:
            grid_search_results_accumulator._accumulate_matching_points(
                scores.grid_points)
        all_grid_search_results.append(scores)

    # make the accumulator hold averages
    grid_search_results_accumulator._divide_metrics(num_folds)
    return CrossValidationResults(all_grid_search_results,
                                  grid_search_results_accumulator.copy(),
                                  verbose)
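
The averaging above is performed by the private helpers _accumulate_matching_points and _divide_metrics: each fold's grid points are summed into an accumulator and the totals are divided by the number of folds. A minimal sketch of that idea, assuming plain dictionaries of numeric metrics (the accumulate/divide helpers below are illustrative stand-ins, not the sparktk implementation):

# Illustrative stand-ins for the private accumulate/divide helpers used above;
# the real helpers operate on GridPoint objects, but the arithmetic is the same idea.
def accumulate(totals, fold_scores):
    """Add one fold's metric values into the running totals, keyed by descriptor."""
    for descriptor, metrics in fold_scores.items():
        running = totals.setdefault(descriptor, dict.fromkeys(metrics, 0.0))
        for name in metrics:
            running[name] += metrics[name]
    return totals

def divide(totals, num_folds):
    """Turn accumulated totals into per-fold averages."""
    return dict((descriptor, dict((name, value / float(num_folds)) for name, value in metrics.items()))
                for descriptor, metrics in totals.items())

# usage sketch: two folds' metrics for one descriptor
totals = {}
accumulate(totals, {"svm(num_iterations=2)": {"accuracy": 0.5, "precision": 0.5, "recall": 1.0}})
accumulate(totals, {"svm(num_iterations=2)": {"accuracy": 1.0, "precision": 1.0, "recall": 1.0}})
print(divide(totals, 2))  # -> averages: accuracy 0.75, precision 0.75, recall 1.0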
Example n. 32
def load(path, tc=TkContext.implicit):
    """load Graph from given path"""
    TkContext.validate(tc)
    return tc.load(path, Graph)
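
A minimal usage sketch for the loader above; the path is hypothetical and assumes the graph was previously persisted with its save method:

# Illustrative only: "sandbox/my_graph" is a made-up path.
g = load("sandbox/my_graph", tc)   # delegates to tc.load, which returns a sparktk Graph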
Example n. 33
def load(path, tc=TkContext.implicit):
    """load Frame from given path"""
    TkContext.validate(tc)
    return tc.load(path, Frame)
Example n. 34
def load(path, tc=TkContext.implicit):
    """load PcaModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, PcaModel)
Example n. 35
def grid_search(train_frame, test_frame, train_descriptors, tc=TkContext.implicit):
    """
    Implements grid search by training each specified model on every combination of its parameter grid and testing it on the test frame.
    :param train_frame: The frame to train the models on
    :param test_frame: The frame to test the models on
    :param train_descriptors: List of (model, kwargs) tuples, where kwargs is a dictionary of model parameters whose
            values are either single values or grid_values lists to expand
    :param tc: spark-tk context, passed implicitly
    :return: Summary of metrics for every combination in the grid and the best performing parameter combination

    Example
    -------

        >>> frame = tc.frame.create([[1,0],[2,0],[3,0],[4,0],[5,0],[6,1],[7,1],[8,1],[9,1],[10,1]],[("data", float),("label",int)])

        >>> frame.inspect()
        [#]  data  label
        ================
        [0]     1      0
        [1]     2      0
        [2]     3      0
        [3]     4      0
        [4]     5      0
        [5]     6      1
        [6]     7      1
        [7]     8      1
        [8]     9      1
        [9]    10      1

        >>> from sparktk.models import grid_values

        >>> grid_result = tc.models.grid_search(frame, frame,
        ...                                    [(tc.models.classification.svm,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01}),
        ...                                     (tc.models.classification.logistic_regression,
        ...                                     {"observation_columns":"data",
        ...                                      "label_column":"label",
        ...                                      "num_iterations": grid_values(2, 10),
        ...                                      "step_size": 0.01})])

        >>> grid_result
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)

        >>> grid_result.find_best()
        GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)

        >>> grid_result.grid_points
        [GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0),
         GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0),
         GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0),
         GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 1.0
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              0              5
        f_measure        = 1.0
        precision        = 1.0
        recall           = 1.0)]

        >>> grid_result.grid_points[1]
        GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy         = 0.5
        confusion_matrix =             Predicted_Pos  Predicted_Neg
        Actual_Pos              5              0
        Actual_Neg              5              0
        f_measure        = 0.666666666667
        precision        = 0.5
        recall           = 1.0)

    """

    # validate input
    TkContext.validate(tc)
    descriptors = affirm_type.list_of_anything(train_descriptors, "train_descriptors")
    # normalize: any (model, train_kwargs) tuple is converted into a TrainDescriptor
    for i in xrange(len(descriptors)):
        item = descriptors[i]
        if not isinstance(item, TrainDescriptor):
            require_type(tuple, item, "item", "grid_search needs a list of items which are either of type TrainDescriptor or tuples of (model, train_kwargs)")
            if len(item) != 2:
                raise value_error("list requires tuples of length 2", item, "item in train_descriptors")
            if not hasattr(item[0], 'train'):
                raise value_error("first item in tuple needs to be an object with a 'train' function", item, "item in train_descriptors")
            descriptors[i] = TrainDescriptor(item[0], item[1])

    arguments.require_type(Frame, train_frame, "train_frame")
    arguments.require_type(Frame, test_frame, "test_frame")

    grid_points = []
    for descriptor in descriptors:
        train_method = getattr(descriptor.model_type, "train")
        list_of_kwargs = expand_kwarg_grids([descriptor.kwargs])
        for kwargs in list_of_kwargs:
            train_kwargs = dict(kwargs)
            train_kwargs['frame'] = train_frame
            validate_call(train_method, train_kwargs, ignore_self=True)
            model = descriptor.model_type.train(**train_kwargs)
            test_kwargs = dict(kwargs)
            test_kwargs['frame'] = test_frame
            test_kwargs = extract_call(model.test, test_kwargs, ignore_self=True)
            metrics = model.test(**test_kwargs)
            grid_points.append(GridPoint(descriptor=TrainDescriptor(descriptor.model_type, train_kwargs), metrics=metrics))
    return GridSearchResults(grid_points)
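
The heavy lifting above is the expansion done by expand_kwarg_grids: every grid_values entry in a kwargs dictionary is fanned out into the Cartesian product of concrete parameter combinations, and each resulting combination is trained and tested once. A minimal sketch of that expansion, assuming grid_values simply carries a tuple of candidate values (the GridValues class and expand function below are hypothetical stand-ins, not the sparktk implementation):

import itertools

class GridValues(object):
    """Stand-in for sparktk.models.grid_values: just holds the candidate values."""
    def __init__(self, *values):
        self.values = values

def expand(kwargs):
    """Expand a kwargs dict containing GridValues entries into a list of concrete kwargs dicts."""
    grid_keys = [k for k, v in kwargs.items() if isinstance(v, GridValues)]
    fixed = dict((k, v) for k, v in kwargs.items() if k not in grid_keys)
    combos = itertools.product(*(kwargs[k].values for k in grid_keys))
    return [dict(fixed, **dict(zip(grid_keys, combo))) for combo in combos]

# usage sketch: two gridded parameters -> 2 x 2 = 4 concrete kwargs dicts
print(expand({"step_size": 0.01,
              "num_iterations": GridValues(2, 10),
              "reg_param": GridValues(0.0, 0.1)}))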
Example n. 36
def load(path, tc=TkContext.implicit):
    """load KMeansModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, KMeansModel)
Example n. 37
def load(path, tc=TkContext.implicit):
    """load KMeansModel from given path"""
    TkContext.validate(tc)
    return tc.load(path, CollaborativeFilteringModel)