def get_movie_graph(tc=_TkContext.implicit):
    _TkContext.validate(tc)
    global _movie_graph
    if _movie_graph is None:
        viewers = tc.frame.create([['fred', 0],
                                   ['wilma', 0],
                                   ['pebbles', 1],
                                   ['betty', 0],
                                   ['barney', 0],
                                   ['bamm bamm', 1]],
                                  schema=[('id', str), ('kids', int)])
        titles = ['Croods', 'Jurassic Park', '2001', 'Ice Age', 'Land Before Time']
        movies = tc.frame.create([[t] for t in titles], schema=[('id', str)])
        vertices = viewers.copy()
        vertices.append(movies)
        edges = tc.frame.create([['fred', 'Croods', 5],
                                 ['fred', 'Jurassic Park', 5],
                                 ['fred', '2001', 2],
                                 ['fred', 'Ice Age', 4],
                                 ['wilma', 'Jurassic Park', 3],
                                 ['wilma', '2001', 5],
                                 ['wilma', 'Ice Age', 4],
                                 ['pebbles', 'Croods', 4],
                                 ['pebbles', 'Land Before Time', 3],
                                 ['pebbles', 'Ice Age', 5],
                                 ['betty', 'Croods', 5],
                                 ['betty', 'Jurassic Park', 3],
                                 ['betty', 'Land Before Time', 4],
                                 ['betty', 'Ice Age', 3],
                                 ['barney', 'Croods', 5],
                                 ['barney', 'Jurassic Park', 5],
                                 ['barney', 'Land Before Time', 3],
                                 ['barney', 'Ice Age', 5],
                                 ['bamm bamm', 'Croods', 5],
                                 ['bamm bamm', 'Land Before Time', 3]],
                                schema=['src', 'dst', 'rating'])
        _movie_graph = tc.graph.create(vertices, edges)
    return _movie_graph
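# A minimal, hypothetical usage sketch for the fixture above, assuming a live
# TkContext named tc is already in scope.  vertex_count()/edge_count() are
# assumed to be the sparktk Graph counting methods; treat this as illustrative
# rather than authoritative.
def demo_movie_graph(tc):
    graph = get_movie_graph(tc)
    # 11 vertices (6 viewers + 5 movies) connected by 20 rating edges
    print "vertices: %s, edges: %s" % (graph.vertex_count(), graph.edge_count())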
def tc(request):
    global global_tc
    with lock:
        if global_tc is None:
            import os
            from sparktk import TkContext
            from sparktk import create_sc
            from sparktk.tests import utils
            import daaltk
            #from sparktk.loggers import loggers
            #loggers.set("d", "sparktk.sparkconf")

            # Get the path to the sparktk jars from SPARKTK_HOME
            if 'SPARKTK_HOME' in os.environ:
                sparktk_dir = os.environ['SPARKTK_HOME']
            else:
                raise RuntimeError("SPARKTK_HOME must be defined.")

            sc = create_sc(other_libs=[daaltk],
                           master='local[2]',
                           sparktk_home=sparktk_dir,
                           app_name="pytest-pyspark-local-testing",
                           extra_conf_dict={"spark.hadoop.fs.default.name": "file:///"})
            request.addfinalizer(lambda: sc.stop())
            global_tc = TkContext(sc, other_libs=[daaltk], sparktk_home=sparktk_dir)
            global_tc.testing = utils
    return global_tc
def get_cities_frame(tc=_TkContext.implicit):
    """Creates a small frame of city data"""
    _TkContext.validate(tc)
    global _cities_frame
    if _cities_frame is None:
        schema = zip('rank|city|population_2013|population_2010|change|county'.split('|'),
                     [int, str, int, int, str, str])
        data = [[field for field in line.split('|')] for line in """1|Portland|609456|583776|4.40%|Multnomah
2|Salem|160614|154637|3.87%|Marion
3|Eugene|159190|156185|1.92%|Lane
4|Gresham|109397|105594|3.60%|Multnomah
5|Hillsboro|97368|91611|6.28%|Washington
6|Beaverton|93542|89803|4.16%|Washington
15|Grants Pass|35076|34533|1.57%|Josephine
16|Oregon City|34622|31859|8.67%|Clackamas
17|McMinnville|33131|32187|2.93%|Yamhill
18|Redmond|27427|26215|4.62%|Deschutes
19|Tualatin|26879|26054|4.17%|Washington
20|West Linn|25992|25109|3.52%|Clackamas
7|Bend|81236|76639|6.00%|Deschutes
8|Medford|77677|74907|3.70%|Jackson
9|Springfield|60177|59403|1.30%|Lane
10|Corvallis|55298|54462|1.54%|Benton
11|Albany|51583|50158|2.84%|Linn
12|Tigard|50444|48035|5.02%|Washington
13|Lake Oswego|37610|36619|2.71%|Clackamas
14|Keizer|37064|36478|1.61%|Marion""".split('\n')]
        _cities_frame = tc.frame.create(data, schema)
    return _cities_frame
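# A quick illustrative use of the fixture above (hypothetical, assuming a live
# TkContext named tc): peek at the first few rows and count the data set.
def demo_cities(tc):
    cities = get_cities_frame(tc)
    print cities.inspect(5)  # show the first 5 rows
    print cities.count()     # 20 Oregon cities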
def train(ts, p, d, q, include_intercept=True, method="css-cgd", init_params=None, tc=TkContext.implicit):
    """
    Creates an Autoregressive Integrated Moving Average (ARIMA) model from the specified time series values.

    Given a time series, fits a non-seasonal Autoregressive Integrated Moving Average (ARIMA) model of
    order (p, d, q), where p represents the autoregression terms, d represents the order of differencing,
    and q represents the moving average error terms.  If include_intercept is True, the model is fitted
    with an intercept.

    Parameters
    ----------

    :param ts: (List[float]) Time series to which to fit an ARIMA(p, d, q) model.
    :param p: (int) Autoregressive order
    :param d: (int) Differencing order
    :param q: (int) Moving average order
    :param include_intercept: (Optional(boolean)) If True, the model is fit with an intercept.
                              Default is True.
    :param method: (Optional(string)) Objective function and optimization method.  Current options are:
                   'css-bobyqa' and 'css-cgd'.  Both optimize the log likelihood in terms of the
                   conditional sum of squares.  The first uses BOBYQA for optimization, while the second
                   uses conjugate gradient descent.  Default is 'css-cgd'.
    :param init_params: (Optional(List[float])) A set of user provided initial parameters for optimization.
                        If the list is empty (default), initialized using the Hannan-Rissanen algorithm.
                        If provided, the order of the parameters should be: intercept term, AR parameters
                        (in increasing order of lag), MA parameters (in increasing order of lag).
    :return: (ArimaModel) Trained ARIMA model
    """
    if not isinstance(ts, list):
        raise TypeError("'ts' parameter must be a list")
    if not isinstance(p, int):
        raise TypeError("'p' parameter must be an integer.")
    if not isinstance(d, int):
        raise TypeError("'d' parameter must be an integer.")
    if not isinstance(q, int):
        raise TypeError("'q' parameter must be an integer.")
    if not isinstance(include_intercept, bool):
        raise TypeError("'include_intercept' parameter must be a boolean")
    if not isinstance(method, basestring):
        raise TypeError("'method' parameter must be a string")
    if init_params is not None:
        if not isinstance(init_params, list):
            raise TypeError("'init_params' parameter must be a list")
    TkContext.validate(tc)
    _scala_obj = get_scala_obj(tc)
    scala_ts = tc.jutils.convert.to_scala_list_double(ts)
    scala_init_params = tc.jutils.convert.to_scala_option_list_double(init_params)
    scala_model = _scala_obj.train(scala_ts, p, d, q, include_intercept, method, scala_init_params)
    return ArimaModel(tc, scala_model)
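# A short usage sketch for train() above, assuming a live TkContext named tc and
# the sparktk model path tc.models.timeseries.arima; the series values are
# arbitrary illustration data.
ts_values = [12.88969427, 13.54964408, 13.8432745, 12.13843611, 12.81156092, 14.2499628, 15.12102595]
arima_model = tc.models.timeseries.arima.train(ts_values, p=1, d=0, q=1)
# The fitted coefficients come back ordered: intercept term, AR terms, then MA terms.
print arima_model.coefficients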
def tc(request):
    global global_tc
    with lock:
        if global_tc is None:
            from sparktk import TkContext
            from sparktk import create_sc
            from sparktk.tests import utils
            #from sparktk.loggers import loggers
            #loggers.set("d", "sparktk.sparkconf")
            sc = create_sc(master='local[2]',
                           app_name="pytest-pyspark-local-testing",
                           extra_conf_dict={"spark.hadoop.fs.default.name": "file:///"})
            request.addfinalizer(lambda: sc.stop())
            global_tc = TkContext(sc)
            global_tc.testing = utils
    return global_tc
def test_frame_basic(self):
    """Documentation test for basic frame operations (sort and cumulative sum)"""
    # The general workflow is to build a frame, then run some analytics on it.
    #
    # First step: construct a frame.  Frames are created from plain Python
    # lists uploaded to the server.  The following frame could represent some
    # ordered list (such as customer orders) and a value associated with each
    # order.  The frame is sorted on the order column and the values are then
    # accumulated: cumulative sum finds the sum up to and including a given order.

    # Create the context
    tc = TkContext()

    # Create the frame using a list object
    frame = tc.frame.create(data=[[0, 100], [3, 20], [1, 25], [2, 90]],
                            schema=[("order", int), ("value", int)])
    print frame.inspect()

    # Sort on order; note this is a side-effect-based operation
    frame.sort('order')

    # Calculate the cumulative sum
    frame.cumulative_sum('value')
    print frame.inspect()

    # Fetch the results and validate that they are what we would expect
    result = frame.take(frame.count())
    self.assertItemsEqual(result.data,
                          [[0, 100, 100], [3, 20, 235], [1, 25, 125], [2, 90, 215]])
def tc(request):
    global global_tc
    with lock:
        if global_tc is None:
            from sparktk import TkContext
            from sparktk import create_sc
            #from sparktk.loggers import loggers
            #loggers.set("d", "sparktk.sparkconf")
            sc = create_sc(master='local[2]', app_name="pytest-pyspark-local-testing")
            request.addfinalizer(lambda: sc.stop())
            global_tc = TkContext(sc)
    return global_tc
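# A sketch of how a test consumes the fixture above: pytest injects the shared
# TkContext by parameter name.  The test body below is hypothetical.
def test_create_small_frame(tc):
    frame = tc.frame.create([[1, "a"], [2, "b"]], [("n", int), ("s", str)])
    assert frame.count() == 2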
def load(path, tc=TkContext.implicit): """load RandomForestRegressorModel from given path""" TkContext.validate(tc) return tc.load(path, RandomForestRegressorModel)
def load(path, tc=TkContext.implicit): """load LogisticRegressionModel from given path""" TkContext.validate(tc) return tc.load(path, LogisticRegressionModel)
def load(path, tc=TkContext.implicit): """load GaussianMixtureModel from given path""" TkContext.validate(tc) return tc.load(path, GaussianMixtureModel)
def load(path, tc=TkContext.implicit): """load KMeansModel from given path""" TkContext.validate(tc) return tc.load(path, CollaborativeFilteringModel)
def test_model_class_doc(self):
    """Generate a naive bayes dataset, use sparktk to train a model and verify"""
    # Naive bayes is a machine learning algorithm we can use to classify an
    # item with certain properties into a group, probabilistically.
    # The general work flow: generate a dataset, calculate the coefficient
    # table and probabilities, build a frame of the data and create a naive
    # bayes model, then check the result of the naive bayes model test.

    # Generate the naive bayes dataset
    numCoeffs = random.randint(2, 10)
    coefficients = []
    schema = []
    obsCols = []
    dataRows = []
    coeffTable = []
    # the number of rows of data we will generate per coefficient row
    numDiceRolls = random.randint(3, 30)

    # Generate the coefficient table and schema
    for index in range(0, numCoeffs):
        coefficients.append(random.uniform(0, 1))
        schema.append(("x" + str(index), int))
        obsCols.append("x" + str(index))
    schema.append(("x" + str(numCoeffs), int))

    # get all permutations of 0, 1 of length numCoeffs
    binaryPermutations = list(itertools.product(range(2), repeat=numCoeffs))

    # now we compute the probability for each row
    # and add the probability for each row as a column to the table
    for element in binaryPermutations:
        product = 1
        element = list(element)
        for i in range(0, numCoeffs):
            if element[i] == 1:
                product = coefficients[i] * product
            if element[i] == 0:
                product = (1 - coefficients[i]) * product
        element.append(product)
        coeffTable.append(list(element))

    # Now we use the coefficient table to generate the actual data
    for row in coeffTable:
        probability = row[len(row) - 1]
        for n in range(0, numDiceRolls):
            newRow = list(row)  # copy the row so each dice roll gets its own record
            randomResult = random.uniform(0, 1)
            if probability >= randomResult:
                newRow[len(newRow) - 1] = 1
            else:
                newRow[len(newRow) - 1] = 0
            dataRows.append(newRow)

    # Finally we create the frame and model,
    # and check that it performs as we would expect

    # We create a sparktk context
    context = TkContext()

    # Then we create a frame from the data
    frame = context.frame.create(dataRows, schema=schema)

    # We train a naive bayes model, giving it lots of information on both
    # data and outcomes; in this way it learns which outcome to expect from data
    nb_model = context.models.classification.naive_bayes.train(frame,
                                                               obsCols,
                                                               "x" + str(numCoeffs - 1))

    # Then we test the model, meaning we see how it behaves in predicting
    # outcomes from data that it has been trained to recognize patterns in
    predicted_frame = nb_model.predict(frame)
    result = nb_model.test(predicted_frame)

    # Lastly we check the result of the model test
    self.assertAlmostEqual(1, result.precision)
    self.assertAlmostEqual(1, result.accuracy)
    self.assertAlmostEqual(1, result.recall)
    self.assertAlmostEqual(1, result.f_measure)
def load(path, tc=TkContext.implicit): """load LinearRegressionModel from given path""" TkContext.validate(tc) return tc.load(path, LinearRegressionModel)
# See the License for the specific language governing permissions and
# limitations under the License.
#

# ## Importing the sparktk and tap_catalog libraries gives you the capability of creating machine learning models, performing data wrangling, and publishing the model to the data catalog.

# In[ ]:

import sparktk
import tap_catalog
from sparktk import TkContext
from tap_catalog import DataCatalog

print "SparkTK installation path = %s" % (sparktk.__path__)

tc = TkContext()


# ## Reading in the data to train the model
# ## You must change the hdfs path to the path of the data file

# In[ ]:

ds = "hdfs://nameservice1/org/29ace093-e11f-4f0b-b254-3f8e973476e5/brokers/userspace/694b3da9-c21a-4063-bf16-e072ac47f881/30fc50da-f065-41d8-a510-77d0b7683a47/000000_1"

schema = [("label", float),
          ("feature1", float),
          ("feature2", float),
          ("feature3", float),
          ("feature4", float),
          ("feature5", float),
          ("feature6", float),
          ("feature7", float),
          ("feature8", float),
          ("feature9", float)]

frame = tc.frame.import_csv(ds, schema=schema)
frame.inspect()
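# ## Training a model on the imported frame
# ## (a hedged sketch, not part of the original notebook: logistic regression is one of
# ## several sparktk classifiers that could be trained on this schema; the predict call
# ## signature is assumed to follow the sparktk model pattern)

# In[ ]:

features = ["feature%d" % i for i in range(1, 10)]
lr_model = tc.models.classification.logistic_regression.train(frame,
                                                              observation_columns=features,
                                                              label_column="label")
predicted_frame = lr_model.predict(frame)
predicted_frame.inspect()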
def create(data, schema=None, validate_schema=False, tc=TkContext.implicit):
    """
    Creates a frame from the given data and schema.  If no schema data types are provided, the schema is
    inferred based on the data in the first 100 rows.

    If schema validation is enabled, all data is checked to ensure that it matches the schema.  If the data
    does not match the schema's data type, it attempts to cast the data to the proper data type.  When the
    data cannot be cast to the schema's data type, the item will be missing (None) in the frame.

    Parameters
    ----------

    :param data: (List of row data or RDD) Data source
    :param schema: (Optional(list[tuple(str, type)] or list[str])) There are different options for specifying
                   a schema:

    * Provide the full schema for the frame as a list of tuples (string column name and data type)
    * Provide the column names as a list of strings.  Column data types will be inferred, based on the data.
    * None, where the schema is automatically inferred based on the data.  Columns will be named
      generically ("C0", "C1", "C2", etc).

    Note that unless validate_schema is enabled, no attempt is made to check or convert the data to the data
    type specified by the schema.  If the data provided does not match the schema (and validate_schema is
    disabled), errors may be encountered when using certain frame operations.

    :param validate_schema: (Optional(bool)) When True, all data is checked to ensure that it matches the
                            schema.  If the data does not match the schema's data type, it attempts to cast
                            the data to the proper data type.  When the data cannot be cast to the schema's
                            data type, a missing value (None) is inserted in its place.  It is recommended
                            that validate_schema is enabled, unless it is certain that all of the data
                            matches the specified schema.  Defaults to False.
    :param tc: TkContext
    :return: (Frame) Frame loaded with the specified data

    Examples
    --------

    Create a frame with the specified data.

        >>> data = [["Bob", 30, 8], ["Jim", 45, 9.5], ["Sue", 25, 7], ["George", 15, 6], ["Jennifer", 18, 8.5]]
        >>> frame = tc.frame.create(data)

    Since no schema is provided, the schema will be inferred.  Note that the data set had a mix of strings
    and integers in the third column.  The schema will use the most general data type from the data that it
    sees, so in this example, the column is treated as a float.

        >>> frame.schema
        [('C0', <type 'str'>), ('C1', <type 'int'>), ('C2', <type 'float'>)]

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30    8
        [1]  Jim       45  9.5
        [2]  Sue       25    7
        [3]  George    15    6
        [4]  Jennifer  18  8.5

    We could also enable schema validation, which checks the data against the schema.  If the data does not
    match the schema's data type, it attempts to cast the data to the proper data type.

        >>> frame = tc.frame.create(data, validate_schema=True)

    In this example with schema validation enabled, the integers in column C2 get casted to floats:

        >>> frame.inspect()
        [#]  C0        C1  C2
        ======================
        [0]  Bob       30  8.0
        [1]  Jim       45  9.5
        [2]  Sue       25  7.0
        [3]  George    15  6.0
        [4]  Jennifer  18  8.5

    We could also provide a list of column names when creating the frame.  When a list of column names is
    provided, the data types for the schema are still inferred, but the columns in the schema are labeled
    with the specified names.

        >>> frame = tc.frame.create(data, schema=["name", "age", "shoe_size"], validate_schema=True)

        >>> frame.schema
        [('name', <type 'str'>), ('age', <type 'int'>), ('shoe_size', <type 'float'>)]

        >>> frame.inspect()
        [#]  name      age  shoe_size
        =============================
        [0]  Bob        30        8.0
        [1]  Jim        45        9.5
        [2]  Sue        25        7.0
        [3]  George     15        6.0
        [4]  Jennifer   18        8.5

    Note that if a value cannot be parsed as the specified data type in the schema, it will show up as
    missing (None), if validate_schema is enabled.  For example, consider the following frame where columns
    are defined as integers, but the data specified has a string in the second row.

        >>> data = [[1, 2, 3], [4, "five", 6]]
        >>> schema = [("a", int), ("b", int), ("c", int)]

        >>> frame = tc.frame.create(data, schema, validate_schema=True)

        >>> frame.inspect()
        [#]  a  b     c
        ===============
        [0]  1     2  3
        [1]  4  None  6

    Note that the spot where the string was located now has its value missing (None), since it couldn't be
    parsed to an integer.  If validate_schema was disabled, no attempt is made to parse the data to the data
    type specified by the schema, and further frame operations may fail due to the data type discrepancy.

    """
    TkContext.validate(tc)
    if data is None:
        data = []
    if not isinstance(data, list) \
            and not isinstance(data, (RDD, DataFrame)) \
            and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.rdd.RDD) \
            and not tc._jutils.is_jvm_instance_of(data, tc.sc._jvm.org.apache.spark.sql.DataFrame):
        raise TypeError("Invalid data source.  Expected the data parameter to be a 2-dimensional list "
                        "(list of row data) or an RDD or DataFrame, but received: %s" % type(data))
    from sparktk.frame.frame import Frame
    return Frame(tc, data, schema, validate_schema)
def load(path, tc=TkContext.implicit): """load RandomForestClassifierModel from given path""" TkContext.validate(tc) return tc.load(path, RandomForestClassifierModel)
def load(path, tc=TkContext.implicit): """load NaiveBayesModel from given path""" TkContext.validate(tc) return tc.load(path, NaiveBayesModel)
def load(path, tc=TkContext.implicit): """load CoxProportionalHazardsModel from given path""" TkContext.validate(tc) return tc.load(path, CoxProportionalHazardsModel)
def load(path, tc=TkContext.implicit): """load Dicom from given path""" TkContext.validate(tc) return tc.load(path, Dicom)
def load(path, tc=TkContext.implicit): """load Frame from given path""" TkContext.validate(tc) return tc.load(path, Frame)
def load(path, tc=TkContext.implicit): """load Graph from given path""" TkContext.validate(tc) return tc.load(path, Graph)
def cross_validate(frame, train_descriptors, num_folds=3, verbose=False, tc=TkContext.implicit):
    """
    Computes k-fold cross validation on classification and regression models with the given frame and
    parameter values

    :param frame: The frame to perform cross-validation on
    :param train_descriptors: Tuple of model and Dictionary of model parameters and their value/values as
                              singleton values or a list of type grid_values
    :param num_folds: Number of folds to run the cross-validator on
    :param verbose: Flag indicating if the results of each fold are to be viewed.  Default is set to False
    :param tc: spark-tk context (provided implicitly)
    :return: Summary of the model's performance consisting of metrics for each combination of
             train_descriptor values per fold and averages across all folds

    Example
    -------

    >>> frame = tc.frame.create([[1,0],[2,0],[3,0],[4,0],[5,0],[6,1],[7,1],[8,1],[9,1],[10,1]],
    ...                         [("data", float),("label",int)])

    >>> frame.inspect()
    [#]  data  label
    ================
    [0]     1      0
    [1]     2      0
    [2]     3      0
    [3]     4      0
    [4]     5      0
    [5]     6      1
    [6]     7      1
    [7]     8      1
    [8]     9      1
    [9]    10      1

    >>> from sparktk.models import grid_values

    >>> result = tc.models.cross_validate(frame,
    ...                                   [(tc.models.classification.svm,
    ...                                     {"observation_columns":"data",
    ...                                      "label_column":"label",
    ...                                      "num_iterations": grid_values(2, 10),
    ...                                      "step_size": 0.01}),
    ...                                    (tc.models.classification.logistic_regression,
    ...                                     {"observation_columns":"data",
    ...                                      "label_column":"label",
    ...                                      "num_iterations": grid_values(2, 10),
    ...                                      "step_size": 0.01})],
    ...                                   num_folds=2,
    ...                                   verbose=True)

    <skip>
    >>> result
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 1.0
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 0              5
    f_measure = 1.0
    precision = 1.0
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 2              0
    Actual_Neg                 2              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 2              0
    Actual_Neg                 2              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 2              0
    Actual_Neg                 2              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 1.0
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 2              0
    Actual_Neg                 0              2
    f_measure = 1.0
    precision = 1.0
    recall = 1.0)
    ******Averages: ******
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 1.0
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 0              5
    f_measure = 1.0
    precision = 1.0
    recall = 1.0)

    >>> result.averages
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 1.0
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 0              5
    f_measure = 1.0
    precision = 1.0
    recall = 1.0)
    </skip>

    """
    TkContext.validate(tc)
    arguments.require_type(Frame, frame, "frame")
    all_grid_search_results = []
    grid_search_results_accumulator = None
    for train_frame, test_frame in split_data(frame, num_folds, tc):
        scores = grid_search(train_frame, test_frame, train_descriptors, tc=tc)
        if grid_search_results_accumulator is None:
            grid_search_results_accumulator = scores
        else:
            grid_search_results_accumulator._accumulate_matching_points(scores.grid_points)
        all_grid_search_results.append(scores)

    # make the accumulator hold averages
    grid_search_results_accumulator._divide_metrics(num_folds)
    return CrossValidationResults(all_grid_search_results,
                                  grid_search_results_accumulator.copy(),
                                  verbose)
def load(path, tc=TkContext.implicit): """load PcaModel from given path""" TkContext.validate(tc) return tc.load(path, PcaModel)
def grid_search(train_frame, test_frame, train_descriptors, tc=TkContext.implicit):
    """
    Implements grid search by training the specified model on all combinations of the descriptors and
    testing on the test frame

    :param train_frame: The frame to train the model on
    :param test_frame: The frame to test the model on
    :param train_descriptors: Tuple of model and Dictionary of model parameters and their value/values as
                              singleton values or a list of type grid_values
    :param tc: spark-tk context passed implicitly
    :return: Summary of metrics for different combinations of the grid and the best performing parameter
             combination

    Example
    -------

    >>> frame = tc.frame.create([[1,0],[2,0],[3,0],[4,0],[5,0],[6,1],[7,1],[8,1],[9,1],[10,1]],
    ...                         [("data", float),("label",int)])

    >>> frame.inspect()
    [#]  data  label
    ================
    [0]     1      0
    [1]     2      0
    [2]     3      0
    [3]     4      0
    [4]     5      0
    [5]     6      1
    [6]     7      1
    [7]     8      1
    [8]     9      1
    [9]    10      1

    >>> from sparktk.models import grid_values

    >>> grid_result = tc.models.grid_search(frame, frame,
    ...                                     [(tc.models.classification.svm,
    ...                                       {"observation_columns":"data",
    ...                                        "label_column":"label",
    ...                                        "num_iterations": grid_values(2, 10),
    ...                                        "step_size": 0.01}),
    ...                                      (tc.models.classification.logistic_regression,
    ...                                       {"observation_columns":"data",
    ...                                        "label_column":"label",
    ...                                        "num_iterations": grid_values(2, 10),
    ...                                        "step_size": 0.01})])

    >>> grid_result
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 1.0
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 0              5
    f_measure = 1.0
    precision = 1.0
    recall = 1.0)

    >>> grid_result.find_best()
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 1.0
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 0              5
    f_measure = 1.0
    precision = 1.0
    recall = 1.0)

    >>> grid_result.grid_points
    [GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0),
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0),
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 2, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0),
    GridPoint(descriptor=sparktk.models.classification.logistic_regression: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 1.0
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 0              5
    f_measure = 1.0
    precision = 1.0
    recall = 1.0)]

    >>> grid_result.grid_points[1]
    GridPoint(descriptor=sparktk.models.classification.svm: {'num_iterations': 10, 'step_size': 0.01, 'observation_columns': 'data', 'label_column': 'label'}, metrics=accuracy = 0.5
    confusion_matrix =             Predicted_Pos  Predicted_Neg
    Actual_Pos                 5              0
    Actual_Neg                 5              0
    f_measure = 0.666666666667
    precision = 0.5
    recall = 1.0)

    """
    # validate input
    TkContext.validate(tc)
    descriptors = affirm_type.list_of_anything(train_descriptors, "train_descriptors")
    for i in xrange(len(descriptors)):
        item = descriptors[i]
        if not isinstance(item, TrainDescriptor):
            require_type(tuple, item, "item", "grid_search needs a list of items which are either of type "
                                              "TrainDescriptor or tuples of (model, train_kwargs)")
            if len(item) != 2:
                raise value_error("list requires tuples of len 2", item, "item in train_descriptors")
            if not hasattr(item[0], 'train'):
                raise value_error("first item in tuple needs to be an object with a 'train' function",
                                  item, "item in train_descriptors")
            descriptors[i] = TrainDescriptor(item[0], item[1])

    arguments.require_type(Frame, train_frame, "frame")
    arguments.require_type(Frame, test_frame, "frame")

    grid_points = []
    for descriptor in descriptors:
        train_method = getattr(descriptor.model_type, "train")
        list_of_kwargs = expand_kwarg_grids([descriptor.kwargs])
        for kwargs in list_of_kwargs:
            train_kwargs = dict(kwargs)
            train_kwargs['frame'] = train_frame
            validate_call(train_method, train_kwargs, ignore_self=True)
            model = descriptor.model_type.train(**train_kwargs)
            test_kwargs = dict(kwargs)
            test_kwargs['frame'] = test_frame
            test_kwargs = extract_call(model.test, test_kwargs, ignore_self=True)
            metrics = model.test(**test_kwargs)
            grid_points.append(GridPoint(descriptor=TrainDescriptor(descriptor.model_type, train_kwargs),
                                         metrics=metrics))
    return GridSearchResults(grid_points)
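# grid_values (imported from sparktk.models in the examples above) is the marker
# that tells grid_search to expand one parameter into several train runs via
# expand_kwarg_grids.  A hedged sketch of the expansion idea:
from sparktk.models import grid_values
params = {"num_iterations": grid_values(2, 10), "step_size": 0.01}
# expand_kwarg_grids([params]) would yield two kwarg dicts, one train run each:
#   {"num_iterations": 2, "step_size": 0.01}
#   {"num_iterations": 10, "step_size": 0.01}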
def load(path, tc=TkContext.implicit): """load KMeansModel from given path""" TkContext.validate(tc) return tc.load(path, KMeansModel)