def KFold(data, n_folds=10):
    """
    Generate a K-Fold split of a data set as K (train, test) pairs.

    Each pair partitions the dataset: the test part is one contiguous
    section of rows and the train part is every remaining row. This can be
    useful for cross validation, where each fold is used as a held out
    dataset while training on the remaining data.

    Parameters
    ----------
    data: SFrame
        A non-empty SFrame.

    n_folds: int
        The number of folds to create. Must be at least 2.

    Notes
    -----
    This does not shuffle the data. Shuffling your data is a useful
    preprocessing step when doing cross validation.

    This is a generator, so the row-count check below only runs once
    iteration starts.

    Yields
    -------
    (train, test)
        The rows of ``data`` outside the current fold, followed by the rows
        inside it. Both are obtained by indexing ``data`` with a 0/1 mask.

    Raises
    ------
    ValueError
        If ``data`` has fewer rows than ``n_folds``.

    Examples
    --------
    >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
    >>> sf = tc.SFrame.read_csv(url)
    >>> folds = KFold(sf)
    """
    num_rows = data.num_rows()
    if num_rows < n_folds:
        raise ValueError(
            "Cannot create %d folds from a data set with only %d rows."
            % (n_folds, num_rows)
        )
    for st, end in _kfold_sections(data, n_folds):
        # Mark the rows of the held-out section with 1; the complement of
        # the mask then selects the training rows.
        idx = np.zeros(len(data))
        idx[st:end] = 1
        yield data[tc.SArray(1 - idx)], data[tc.SArray(idx)]
def test_exceptions(self):
    """
    Check that topic model creation accepts well-formed input and that
    create() and predict() raise on malformed input.
    """
    bag_of_words = turicreate.SArray([{'a': 5, 'b': 7}])
    wrapped_in_sframe = turicreate.SFrame({'bow': bag_of_words})
    empty_document = turicreate.SArray([{}])

    non_dict_column = turicreate.SFrame({'x': [0, 1, 2, 3]})
    two_dict_columns = turicreate.SFrame({'x': [{'0': 3}], 'y': [{'3': 5}]})
    missing_document = turicreate.SArray([{'a': 5, 'b': 3}, None, {'a': 10}])
    missing_count = turicreate.SArray([{'a': 5, 'b': None}, {'a': 3}])

    for dataset in (bag_of_words, wrapped_in_sframe, empty_document):
        model = topic_model.create(dataset)
        self.assertTrue(model is not None)

    # Test that create() throws on bad input
    create_failures = [
        (non_dict_column, Exception),
        (two_dict_columns, Exception),
        (missing_document, ToolkitError),
        (missing_count, ToolkitError),
    ]
    for dataset, expected_error in create_failures:
        with self.assertRaises(expected_error):
            topic_model.create(dataset)

    # Test that predict() throws on bad input
    trained = self.models[0]
    for dataset in (non_dict_column, two_dict_columns, missing_document):
        with self.assertRaises(Exception):
            trained.predict(dataset)
def test_rmse(self):
    """Compare evaluation.rmse against a hand-computed value."""
    targets = turicreate.SArray([1, 2, 1, 2])
    predictions = turicreate.SArray([3, -1, 1, 0])
    measured = turicreate.toolkits.evaluation.rmse(targets, predictions)

    # The per-row errors are 2, 3, 0 and 2, averaged over four rows.
    sum_of_squares = 2 * 2 + 3 * 3 + 0 + 2 * 2
    expected = (float(sum_of_squares) / 4) ** 0.5

    self.assertAlmostEqual(measured, expected)
def test_confusion_matrix(self):
    """Compare confusion_matrix counts against hand-counted values."""
    targets = turicreate.SArray([1, 1, 0, 1, 1, 0, 1])
    predictions = turicreate.SArray([0, 1, 0, 0, 1, 1, 0])

    matrix = turicreate.toolkits.evaluation.confusion_matrix(targets, predictions)
    # Sort so the counts come out in a fixed (predicted, target) order:
    # (0, 0), (0, 1), (1, 0), (1, 1).
    ordered = matrix.sort(["predicted_label", "target_label"])
    counts = ordered["count"]

    expected_counts = turicreate.SArray([1, 3, 1, 2])
    self.assertTrue((counts == expected_counts).all())
def test_missing_values(self):
    """
    Compare classification metrics computed on SArrays whose class-2
    entries were replaced by None against scikit-learn on the raw labels.
    """
    # Arrange
    labels, hard_preds = _generate_classes_and_scores(
        3, n=100, hard_predictions=True
    )
    masked_preds = [None if value == 2 else value for value in hard_preds]
    masked_labels = [None if value == 2 else value for value in labels]
    targets = turicreate.SArray(masked_labels)
    predictions = turicreate.SArray(masked_preds)

    evaluation = turicreate.toolkits.evaluation

    # Each entry pairs the scikit-learn reference with the turicreate metric
    # under test; the order is accuracy, precision, recall, f1, fbeta.
    comparisons = [
        (
            lambda: accuracy_score(labels, hard_preds),
            lambda: evaluation.accuracy(targets, predictions),
        ),
        (
            lambda: precision_score(labels, hard_preds, average="macro"),
            lambda: evaluation.precision(targets, predictions),
        ),
        (
            lambda: recall_score(labels, hard_preds, average="macro"),
            lambda: evaluation.recall(targets, predictions),
        ),
        (
            lambda: f1_score(labels, hard_preds, average="macro"),
            lambda: evaluation.f1_score(targets, predictions),
        ),
        (
            lambda: fbeta_score(labels, hard_preds, beta=2.0, average="macro"),
            lambda: evaluation.fbeta_score(targets, predictions, beta=2.0),
        ),
    ]

    # Act & Assert
    for reference, measured in comparisons:
        skl_score = reference()
        score = measured()
        self.assertAlmostEqual(skl_score, score)
def test_export_coreml(self):
    """
    Export the trained model to Core ML, check prediction shapes where
    Core ML prediction is available, then train and export a second,
    single-class model.
    """
    import os

    from PIL import Image
    import coremltools

    # mkstemp returns an already-open OS-level descriptor together with the
    # path. Only the path is needed, so close the descriptor instead of
    # leaking it for the rest of the test run.
    fd, filename = tempfile.mkstemp('bingo.mlmodel')
    os.close(fd)
    self.model.export_coreml(filename, include_non_maximum_suppression=False)

    coreml_model = coremltools.models.MLModel(filename)
    img = self.train[0:1][self.feature][0]
    img_fixed = tc.image_analysis.resize(img, 416, 416, 3)
    pil_img = Image.fromarray(img_fixed.pixel_data)
    # Predictions are only attempted on macOS 10.13 or newer.
    if _mac_ver() >= (10, 13):
        ret = coreml_model.predict({self.feature: pil_img}, usesCPUOnly=True)
        self.assertEqual(ret['coordinates'].shape[1], 4)
        self.assertEqual(ret['confidence'].shape[1], len(_CLASSES))
        self.assertEqual(ret['coordinates'].shape[0],
                         ret['confidence'].shape[0])

    # Also check if we can train a second model and export it (there could
    # be naming issues in mxnet)
    fd2, filename2 = tempfile.mkstemp('bingo2.mlmodel')
    os.close(fd2)
    # We also test at the same time if we can export a model with a single
    # class
    sf = tc.SFrame({
        'image': tc.SArray([self.train[self.feature][0]]),
        'label': tc.SArray([self.train[self.target][0]])
    })
    model2 = tc.one_shot_object_detector.create(sf, 'label', max_iterations=1)
    model2.export_coreml(filename2, include_non_maximum_suppression=False)
def test_bm25(self):
    """
    Check that the BM25 query accepts list, SArray and dict queries and
    only returns documents that contain query words.
    """
    bm25 = tc.text_analytics.bm25

    # Test input formats
    list_query = ['a', 'b', 'c']
    assert bm25(self.data, list_query) is not None

    sarray_query = tc.SArray(['a', 'b', 'c'])
    assert bm25(self.data, sarray_query) is not None

    dict_query = {'a': 5, 'b': 3, 'c': 1}
    assert bm25(self.data, dict_query) is not None

    # Only documents containing query words are included in result
    scored = bm25(self.data, dict_query)
    assert scored.num_rows() == 4

    # A missing document and a document without query words are both
    # expected to be left out of the result.
    corpus = tc.SArray([
        {'a': 5, 'b': 7, 'c': 10},
        {'a': 3, 'c': 1, 'd': 2},
        None,
        {'a': 1},
        {'f': 5},
    ])
    scored = bm25(corpus, dict_query)
    assert scored.num_rows() == 3
def get_backgrounds(self):
    """
    Return the background images as an SArray, downloading and extracting
    the archive into the data cache directory when needed.

    The extracted directory is verified before loading: it must contain
    exactly the files named in ``self.extracted_file_to_md5`` and each
    file's MD5 digest must match. If verification or loading fails for any
    reason, the directory is deleted, the archive is extracted again and
    the SArray is loaded from the fresh copy.

    Returns
    -------
    out : SArray
        The SArray loaded from ``self.destination_sarray_path``.
    """
    cache_dir = _get_cache_dir("data")

    # Download tar file, if not already downloaded, and get its path
    tarfile_path = _download_and_checksum_files(self.sarray_url_md5_pairs,
                                                cache_dir)[0]

    def _extract():
        # The context manager closes the archive handle once extraction ends.
        with _tarfile.open(tarfile_path) as backgrounds_tar:
            backgrounds_tar.extractall(cache_dir)

    # Extract SArray from tar file, if not already extracted
    if not _os.path.exists(self.destination_sarray_path):
        _extract()

    # Verify and load the extracted SArray
    try:
        # Check we extracted the files we expected
        expected_extracted_files = set(self.extracted_file_to_md5.keys())
        extracted_files = set(_os.listdir(self.destination_sarray_path))
        if expected_extracted_files != extracted_files:
            raise ValueError("Unexpected set of extracted background files.")

        # Check each of the files is what we expect. The digest is taken
        # over the file contents, read in chunks to bound memory use.
        for filename, expected_md5 in self.extracted_file_to_md5.items():
            full_path = _os.path.join(self.destination_sarray_path, filename)
            digest = hashlib.md5()
            with open(full_path, "rb") as extracted_file:
                for chunk in iter(lambda: extracted_file.read(1 << 20), b""):
                    digest.update(chunk)
            if digest.hexdigest() != expected_md5:
                raise ValueError(
                    "Checksum mismatch for extracted file %s." % filename)

        backgrounds = _tc.SArray(self.destination_sarray_path)
    except Exception:
        # delete the incompletely/incorrectly extracted tarball bits on disk
        if _os.path.exists(self.destination_sarray_path):
            _shutil.rmtree(self.destination_sarray_path)

        # and re-extract
        _extract()
        backgrounds = _tc.SArray(self.destination_sarray_path)

    return backgrounds
def _get_data(feature, target):
    """
    Build random image fixtures from a fixed random seed.

    Parameters
    ----------
    feature : str
        Name of the image column in the returned SFrames.
    target : str
        Name of the label column in the returned training SFrame.

    Returns
    -------
    (train, test, backgrounds)
        ``train`` holds 5 starter images with labels cycled from
        ``_CLASSES``, ``test`` holds 100 unlabeled images, and
        ``backgrounds`` is the first 5 images of ``test``.
    """
    from PIL import Image as _PIL_Image

    rng = np.random.RandomState(1234)
    formats = ["png", "jpeg", "raw"]
    num_examples = 100
    num_starter_images = 5

    def to_tc_image(pil_img, image_format="png"):
        # Encoded formats go through a temporary file on disk; the "raw"
        # format is built directly from the pixel buffer.
        if image_format != "raw":
            with tempfile.NamedTemporaryFile(
                    mode="w+b", suffix="." + image_format) as handle:
                pil_img.save(handle, format=image_format)
                return tc.Image(handle.name)

        pixels = np.array(pil_img)
        raw_format_enum = 2
        return tc.Image(
            _image_data=pixels.tobytes(),
            _width=pixels.shape[1],
            _height=pixels.shape[0],
            _channels=pixels.shape[2],
            _format_enum=raw_format_enum,
            _image_data_size=pixels.size,
        )

    def random_image():
        # Randomly determine image size (should handle large and small)
        shape = tuple(rng.randint(100, 1000, size=2)) + (3, )
        pixels = rng.randint(255, size=shape)
        pil_img = _PIL_Image.fromarray(pixels, mode="RGB")
        # Randomly select image format
        chosen_format = formats[rng.randint(len(formats))]
        return to_tc_image(pil_img, image_format=chosen_format)

    # The unlabeled images are drawn before the starter images so the
    # sequence of random draws stays the same from run to run.
    images = [random_image() for _ in range(num_examples)]

    starter_images = []
    starter_target = []
    for index in range(num_starter_images):
        starter_images.append(random_image())
        starter_target.append(_CLASSES[index % len(_CLASSES)])

    train = tc.SFrame({
        feature: tc.SArray(starter_images),
        target: tc.SArray(starter_target),
    })
    test = tc.SFrame({
        feature: tc.SArray(images),
    })
    backgrounds = test[feature].head(5)
    return train, test, backgrounds
def test_roc_curve_str(self):
    """Check that roc_curve on string targets yields rates within [0, 1]."""
    targets = turicreate.SArray(['a', 'b', 'a', 'b'])
    scores = turicreate.SArray([.1, .2, .3, .4])
    curve = turicreate.toolkits.evaluation.roc_curve(targets, scores)

    # The result is not inspected; the call only exercises unique() and
    # sort() on the curve columns.
    curve[['fpr', 'tpr']].unique().sort(['fpr', 'tpr'])

    for rate_name in ('tpr', 'fpr'):
        rate = curve[rate_name]
        self.assertTrue(all(rate >= 0) and all(rate <= 1))
def test_roc_curve_str(self):
    """Check that roc_curve on string targets yields rates within [0, 1]."""
    evaluation = turicreate.toolkits.evaluation
    labels = turicreate.SArray(["a", "b", "a", "b"])
    probabilities = turicreate.SArray([0.1, 0.2, 0.3, 0.4])
    roc = evaluation.roc_curve(labels, probabilities)

    # Not asserted on; kept so unique() and sort() are still exercised on
    # the curve columns.
    roc[["fpr", "tpr"]].unique().sort(["fpr", "tpr"])

    true_positive_rate = roc["tpr"]
    self.assertTrue(
        all(true_positive_rate >= 0) and all(true_positive_rate <= 1))

    false_positive_rate = roc["fpr"]
    self.assertTrue(
        all(false_positive_rate >= 0) and all(false_positive_rate <= 1))
def test_drop_words(self):
    """
    Check drop_words on invalid input, missing values, non-English text,
    and with default, custom-delimiter and stop-word settings.
    """
    drop_words = text_analytics.drop_words
    assert_equal = self.sframe_comparer._assert_sarray_equal

    ## Bogus input type
    numeric = tc.SArray([1, 2, 3])
    with self.assertRaises(RuntimeError):
        drop_words(numeric)

    # A missing value must not throw (just give warning and skip) and must
    # not segfault.
    with_missing = tc.SArray(["str", None])
    stop_words = text_analytics.stop_words()
    drop_words(with_missing, stop_words=stop_words)

    ## Other languages
    lowered_languages = [
        "this is someurl http someurl this is someurl http someurl",
        "中文 应该也 行 中文 应该也 行",
        "Сблъсъкът между Сблъсъкът между",
    ]
    cased_languages = [
        "This is someurl http someurl This is someurl http someurl",
        "中文 应该也 行 中文 应该也 行",
        "Сблъсъкът между Сблъсъкът между",
    ]

    result = drop_words(self.languages_double)
    self.assertEqual(result.dtype, str)
    assert_equal(result, lowered_languages)

    result = drop_words(self.languages_double, to_lower=False)
    self.assertEqual(result.dtype, str)
    assert_equal(result, cased_languages)

    ## Check that delimiters work properly by default and when modified.
    default_expected = [
        "this is some url http www someurl com this is some url http www someurl com",
        "should we yes we should should we yes we should",
    ]
    custom_delimiter_expected = [
        "this is some url http://www.someurl.com this is some url http://www.someurl.com",
        "should we yes we should. should we yes we should.",
    ]
    stop_word_expected = ["url http www someurl url http www someurl", ""]

    # All three variants are computed before any of them is checked.
    default_result = drop_words(self.punctuated_double)
    custom_delimiter_result = drop_words(
        self.punctuated_double, delimiters=["?", "!", ",", " "])
    stop_word_result = drop_words(
        self.punctuated_double, stop_words=text_analytics.stop_words())

    outcomes = [
        (default_result, default_expected),
        (custom_delimiter_result, custom_delimiter_expected),
        (stop_word_result, stop_word_expected),
    ]
    for actual, expected in outcomes:
        self.assertEqual(actual.dtype, str)
        assert_equal(actual, expected)
def test_invalid_data_set(self):
    """
    Check that text_classifier.create raises ToolkitError when the data
    contains a missing value.
    """
    # dtype is inferred as str for both columns
    with_missing = tc.SArray(['str', None])
    complete = tc.SArray(['str', 'str'])

    def create(frame, target_name, feature_name):
        return tc.text_classifier.create(frame,
                                         target=target_name,
                                         features=[feature_name],
                                         word_count_threshold=1)

    # target contains none
    frame = tc.SFrame({'a': with_missing, 'b': complete})
    with self.assertRaises(ToolkitError):
        create(frame, 'a', 'b')

    # feature contains none, Github #2402
    # NOTE(review): in this frame the None-bearing column is stored under
    # 'b', which is also the target below, so this looks like it repeats
    # the target case rather than the feature case -- confirm intent.
    frame = tc.SFrame({'b': with_missing, 'a': complete})
    with self.assertRaises(ToolkitError):
        create(frame, 'b', 'a')
def test_auc_multi_class_score(self):
    """
    Compare per-class and macro-averaged AUC for a three-class problem
    against scikit-learn one-vs-rest AUC values, for both integer and
    string targets.
    """
    # Arrange
    t, p = _generate_classes_and_scores(3, n=100, hard_predictions=False)
    class_ids = range(3)
    # One-vs-rest view of the problem: per-class score column and per-class
    # boolean membership.
    sk_p = {i: p[:, i] for i in class_ids}
    sk_t = {i: t == i for i in class_ids}

    targets = turicreate.SArray(t)
    predictions = turicreate.SArray(p)
    str_targets = targets.astype(str)
    auc = turicreate.toolkits.evaluation.auc

    # Act
    sk_score = {i: roc_auc_score(sk_t[i], sk_p[i]) for i in class_ids}

    # Act [Average = None]
    per_class = auc(targets, predictions, average=None)
    per_class_str = auc(str_targets, predictions, average=None)

    # Assert
    self.assertEqual(type(per_class), dict)
    self.assertEqual(set(per_class.keys()), {0, 1, 2})
    self.assertEqual(set(per_class_str.keys()), {"0", "1", "2"})

    # Note: Explicitly not putting it into a for loop for ease of
    # debugging when the tests fail.
    self.assertAlmostEqual(sk_score[0], per_class[0])
    self.assertAlmostEqual(sk_score[0], per_class_str['0'])
    self.assertAlmostEqual(sk_score[1], per_class[1])
    self.assertAlmostEqual(sk_score[1], per_class_str['1'])
    self.assertAlmostEqual(sk_score[2], per_class[2])
    self.assertAlmostEqual(sk_score[2], per_class_str['2'])

    # Act [Average = 'macro']
    macro = auc(targets, predictions, average='macro')
    macro_str = auc(str_targets, predictions, average='macro')

    # The macro average is the unweighted mean of the per-class scores.
    avg_score = 0.0
    for i in class_ids:
        avg_score += sk_score[i]
    avg_score /= 3.0

    self.assertAlmostEqual(avg_score, macro)
    self.assertAlmostEqual(avg_score, macro_str)
def test_logloss_clipping(self):
    """
    Check that log_loss does not evaluate to infinity when a probability is
    exactly 0.0 or 1.0, for multiclass and binary inputs.
    """
    cases = [
        # Multiclass, with a 0.0 probability in the first row.
        ([0, 1, 2, 0],
         [[0.9, 0.0, 0.1], [0.8, 0.1, 0.1], [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]),
        # Multiclass, with a 1.0 probability in the first row.
        ([0, 1, 2, 0],
         [[1.0, 0.0, 0.0], [0.8, 0.1, 0.1], [0.1, 0.1, 0.8], [0.1, 0.1, 0.8]]),
        # Binary, with a 0.0 probability.
        ([0, 1, 0, 0], [0.0, 0.9, 0.1, 0.1]),
        # Binary, with a 1.0 probability.
        ([0, 1, 0, 0], [0.1, 1.0, 0.1, 0.1]),
    ]
    for labels, probabilities in cases:
        y = turicreate.SArray(labels)
        yhat = turicreate.SArray(probabilities)
        loss = turicreate.toolkits.evaluation.log_loss(y, yhat)
        self.assertTrue(loss != inf)
def get_backgrounds(self):
    """
    Return the background images as an SArray.

    The archive is downloaded (and checksummed) into the data cache
    directory. Loading the SArray from ``self.destination_sarray_path`` is
    attempted first; if that fails, any partially extracted directory is
    removed, the archive is extracted again and the load is retried.

    Returns
    -------
    out : SArray
        The SArray loaded from ``self.destination_sarray_path``.
    """
    cache_dir = _get_cache_dir("data")
    tarfile_path = _download_and_checksum_files(self.sarray_url_md5_pairs,
                                                cache_dir)[0]

    try:
        backgrounds = _tc.SArray(self.destination_sarray_path)
    except Exception:
        # delete the incompletely extracted tarball bits on disk
        if _os.path.exists(self.destination_sarray_path):
            _shutil.rmtree(self.destination_sarray_path)

        # and re-extract; the archive is only opened when it is actually
        # needed, and the context manager closes its handle afterwards.
        with _tarfile.open(tarfile_path) as backgrounds_tar:
            backgrounds_tar.extractall(cache_dir)
        backgrounds = _tc.SArray(self.destination_sarray_path)

    return backgrounds
def test_integer_probabilities(self):
    """
    Check that log_loss, auc and roc_curve run without raising when the
    predictions are integers, for multiclass and binary inputs.
    """
    evaluation = turicreate.toolkits.evaluation
    metrics = (evaluation.log_loss, evaluation.auc, evaluation.roc_curve)

    # Multiclass: one integer vector per row.
    y = turicreate.SArray([0, 1, 2, 0])
    yhat = turicreate.SArray([[1, 0, 1], [1, 0, 1], [0, 1, 1], [0, 0, 1]])
    for metric in metrics:
        metric(y, yhat)

    # Binary: one integer per row.
    y = turicreate.SArray([0, 1, 0, 0])
    yhat = turicreate.SArray([0, 1, 0, 0])
    for metric in metrics:
        metric(y, yhat)
def test_list_and_dict_type(self):
    """
    Run the classifier check on the plain data and on a copy extended with
    random list-typed and dict-typed noise columns.
    """
    accuracy_threshold = 0.8
    simple_data = self.data
    simple_split = simple_data.random_split(0.8, seed=1)

    # make a more complicated dataset containing list and dictionary type
    # columns
    complex_data = copy.copy(simple_data)
    list_noise = [[random.gauss(0, 1) for _ in range(3)]
                  for _ in range(complex_data.num_rows())]
    complex_data['random_list_noise'] = tc.SArray(list_noise)
    dict_noise = [{'x0': random.gauss(0, 1)}
                  for _ in range(complex_data.num_rows())]
    complex_data['random_dict_noise'] = tc.SArray(dict_noise)
    complex_split = complex_data.random_split(0.8, seed=1)

    for train, test in (simple_split, complex_split):
        self._test_classifier_model(train, test, accuracy_threshold)
def setUpClass(self):
    """
    Set up (Run only once)

    Builds a random SFrame whose numeric columns are packed into a single
    array-typed "vec" feature, and fits a scikit-learn linear SVM on the
    same data to obtain reference coefficients.
    """
    np.random.seed(15)
    n, d = 100, 3

    frame = tc.SFrame()
    for _ in range(d):
        frame.add_column(tc.SArray(np.random.rand(n)), inplace=True)
    frame["target"] = np.random.randint(2, size=n)

    # Pack the numeric columns X1..Xd into one list column, then convert
    # that column to array type.
    frame["vec"] = frame.apply(
        lambda row: [row["X{}".format(i + 1)] for i in range(d)])
    frame["vec"] = frame["vec"].apply(lambda x: x, array.array)

    self.sf = frame
    self.target = "target"
    self.features = ["vec"]
    self.unpacked_features = ["vec[{}]".format(i) for i in range(d)]
    self.def_kwargs = _DEFAULT_SOLVER_OPTIONS

    ## Compute the correct answers with Scikit-Learn
    X_train = list(frame["vec"])
    y_train = list(frame[self.target])
    reference = svm.LinearSVC(C=1.0, loss="hinge")
    reference.fit(X_train, y_train)
    # Intercept first, then one coefficient per unpacked feature.
    self.coef = list(reference.intercept_) + list(reference.coef_[0])
def setUpClass(self):
    """
    Set up (Run only once)

    Builds a random SFrame whose numeric columns are packed into a single
    dictionary-typed 'dict' feature, and fits a scikit-learn linear SVM on
    the same values to obtain reference coefficients.
    """
    np.random.seed(15)
    n, d = 100, 3
    self.sf = tc.SFrame()
    for i in range(d):
        self.sf.add_column(tc.SArray(np.random.rand(n)), inplace=True)
    self.sf['target'] = np.random.randint(2, size=n)
    self.target = 'target'

    # Pack the numeric columns X1..Xd into one dictionary column keyed by
    # the zero-based column position.
    self.sf['dict'] = self.sf.apply(
        lambda row: {i: row['X{}'.format(i + 1)] for i in range(d)})
    self.features = ['dict']
    self.unpacked_features = ['dict[%s]' % i for i in range(d)]
    self.def_kwargs = _DEFAULT_SOLVER_OPTIONS

    ## Compute the correct answers with Scikit-Learn
    # Flatten each dictionary into a list ordered by key so every row has
    # the same column order.
    X_train = list(self.sf['dict'].apply(
        lambda x: [x[k] for k in sorted(x.keys())]))
    y_train = list(self.sf[self.target])
    # 'hinge' is the supported name for this loss; 'l1' is a deprecated
    # alias for it that newer scikit-learn releases no longer accept.
    sm_model = svm.LinearSVC(C=1.0, loss='hinge')
    sm_model.fit(X_train, y_train)
    # Intercept first, then one coefficient per unpacked feature.
    self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0])
def setUpClass(self):
    """
    Simulate a random binary-classification data set and record the
    default and test-specific solver options.
    """
    ## Simulate test data
    np.random.seed(10)
    n, d = 100, 10
    frame = tc.SFrame()
    for _ in range(d):
        frame.add_column(tc.SArray(np.random.randn(n)), inplace=True)
    labels = np.random.randint(2, size=n)

    ## Create the model
    frame['target'] = labels
    self.sf = frame
    self.target = 'target'
    self.solver = 'auto'

    # Default options: the shared solver defaults, with the entries below
    # added on top (they win on key collisions).
    self.def_kwargs = _DEFAULT_SOLVER_OPTIONS
    default_options = dict(self.def_kwargs.items())
    default_options.update({
        'solver': 'auto',
        'feature_rescaling': True,
        'class_weights': None,
        'penalty': 1.0
    })
    self.def_opts = default_options

    self.opts = self.def_opts.copy()
    self.opts['max_iterations'] = 500

    column_names = ['X{}'.format(i) for i in range(1, d + 1)]
    self.features = column_names
    # Kept as a separate list object with the same contents.
    self.unpacked_features = list(column_names)
def setUpClass(self):
    """
    Setup required for all tests that don't require a trained model.

    Builds a random binary-classification SFrame and fits a scikit-learn
    linear SVM on the same values to obtain reference coefficients.
    """
    np.random.seed(8)
    n, d = 100, 10
    self.sf = tc.SFrame()
    for i in range(d):
        self.sf.add_column(tc.SArray(np.random.randn(n)), inplace=True)
    target = np.random.randint(2, size=n)
    # Force both classes to be present regardless of the random draw.
    target[0] = 0
    target[1] = 1

    ## Create the model
    self.sf['target'] = target
    self.def_kwargs = _DEFAULT_SOLVER_OPTIONS
    self.solver = 'auto'
    self.target = 'target'
    self.features = ['X{}'.format(i) for i in range(1, d + 1)]

    ## Compute the correct answers with Scikit-Learn
    # A local name is used inside the lambda so it does not have to
    # reference ``self``.
    feature_names = self.features
    X_train = list(self.sf.apply(
        lambda row: [row[k] for k in feature_names]))
    y_train = list(self.sf[self.target])
    # 'hinge' is the supported name for this loss; 'l1' is a deprecated
    # alias for it that newer scikit-learn releases no longer accept.
    sm_model = svm.LinearSVC(C=1.0, loss='hinge')
    sm_model.fit(X_train, y_train)
    # Intercept first, then one coefficient per feature.
    self.coef = list(sm_model.intercept_) + list(sm_model.coef_[0])
def test_combination_gl_python_types(self):
    """
    Round-trip a list and a dict, each holding an SGraph, an SFrame and an
    SArray, through GLPickler and GLUnpickler.
    """
    vertices = [
        tc.Vertex(1, {'fluffy': 1}),
        tc.Vertex(2, {'fluffy': 1, 'woof': 1}),
        tc.Vertex(3, {}),
    ]
    graph = tc.SGraph().add_vertices(vertices)
    sarray = tc.SArray([1, 2, 3])
    sframe = tc.SFrame([1, 2, 3])

    # The same three objects in a list and in a dict keyed 0..2, so both
    # containers can be indexed the same way below.
    containers = [
        [graph, sframe, sarray],
        {0: graph, 1: sframe, 2: sarray},
    ]

    for original in containers:
        pickler = gl_pickle.GLPickler(self.filename)
        pickler.dump(original)
        pickler.close()

        unpickler = gl_pickle.GLUnpickler(self.filename)
        restored = unpickler.load()
        unpickler.close()

        original_graph, restored_graph = original[0], restored[0]
        assert_sframe_equal(original_graph.get_vertices(),
                            restored_graph.get_vertices())
        assert_sframe_equal(original_graph.get_edges(),
                            restored_graph.get_edges())
        assert_sframe_equal(original[1], restored[1])
        assert list(original[2]) == list(restored[2])
def test_custom_initial_centers(self):
    """
    Test that the user can pass hard-coded initial cluster centers, and
    that these are actually used to initialize the clusters.
    """

    def create_with(centers):
        return tc.kmeans.create(dataset=self.sf,
                                initial_centers=centers,
                                max_iterations=self.max_iter,
                                verbose=False)

    ## Empty initial centers
    with self.assertRaises(ValueError):
        create_with(tc.SFrame())

    ## Initial centers as an SArray of indices
    with self.assertRaises(TypeError):
        create_with(tc.SArray([1, 2, 3]))

    ## Initial centers with a schema that doesn't match the data
    mismatched_centers = make_clustering_data(n=10, d=self.dim-1, seed=43)
    with self.assertRaises(ValueError):
        create_with(mismatched_centers)

    ## Good initial centers
    centers = make_clustering_data(n=10, d=self.dim, seed=43)
    # exclude int feature because these *are* changed.
    feature_names = ['float0', 'float1', 'dict0']

    # With zero iterations the reported cluster centers are compared
    # directly against the centers passed in.
    model = tc.kmeans.create(self.sf,
                             features=feature_names,
                             initial_centers=centers,
                             max_iterations=0,
                             verbose=False)

    reported_centers = model.cluster_info
    assert_sframe_equal(centers[feature_names],
                        reported_centers[feature_names])
def test_predict(self):
    """
    Test that we can make predictions using the model.
    """
    docs = self.docs
    num_docs = len(docs)

    for model in self.models:
        # Default output: one integer per document.
        assignments = model.predict(docs)
        self.assertTrue(isinstance(assignments, turicreate.SArray))
        self.assertEqual(len(assignments), num_docs)
        self.assertEqual(assignments.dtype, int)

        # Probability output: each row should sum to 1.
        probabilities = model.predict(docs, output_type='probability')
        self.assertTrue(isinstance(probabilities, turicreate.SArray))
        self.assertTrue(len(probabilities) == num_docs)
        row_sums = probabilities.apply(lambda x: sum(x))
        deviations = row_sums.apply(lambda x: abs(x - 1))
        self.assertTrue((deviations < .000001).all())

        # Test predictions when docs have new words
        unseen = turicreate.SArray([{
            '-1,-1': 3.0,
            '0,4': 5.0,
            '0,3': 2.0
        }])
        unseen_predictions = model.predict(unseen)
        self.assertEqual(len(unseen_predictions), len(unseen))

        # Test additional burnin. Ideally we could show that things
        # converge as you increase burnin.
        without_burnin = model.predict(docs,
                                       output_type='probability',
                                       num_burnin=0)
        self.assertEqual(len(without_burnin), num_docs)
def test_pickling_sarray_types(self):
    """
    Round-trip integer, float and string SArrays through GLPickler and
    GLUnpickler and compare their contents.
    """
    samples = (
        [1, 2, 3],
        [1.0, 2.0, 3.5],
        ["foo", "bar"],
    )
    originals = [tc.SArray(values) for values in samples]

    for original in originals:
        pickler = gl_pickle.GLPickler(self.filename)
        pickler.dump(original)
        pickler.close()

        unpickler = gl_pickle.GLUnpickler(self.filename)
        restored = unpickler.load()
        unpickler.close()

        assert list(original) == list(restored), \
            "Failed pickling in %s (Got back %s)" % (original, restored)
def test_grouped_precision_recall(self):
    """
    Check the type, columns, user ids and row count returned by
    precision_recall_by_user for recommendations from a small
    item-similarity model.
    """
    observations = turicreate.SFrame()
    observations['user_id'] = ["a", "b", "b", "c", "c", "c"]
    observations['item_id'] = ['x', 'x', 'y', 'v', 'w', 'z']
    observations['rating'] = [0, 1, 2, 3, 4, 5]
    model = turicreate.recommender.item_similarity_recommender.create(
        observations)
    recommendations = model.recommend()

    held_out = turicreate.SFrame()
    held_out['user_id'] = ['a', 'b']
    held_out['item_id'] = ['v', 'z']
    held_out['rating'] = [7, 8]

    precision_recall_by_user = \
        turicreate.recommender.util.precision_recall_by_user

    # Single cutoff: expect one row per user.
    report = precision_recall_by_user(held_out, recommendations, cutoffs=[3])
    self.assertEqual(type(report), turicreate.SFrame)
    expected_columns = ['user_id', 'cutoff', 'precision', 'recall', 'count']
    self.assertEqual(report.column_names(), expected_columns)
    expected_users = list(turicreate.SArray(['a', 'b', 'c']))
    self.assertEqual(list(report['user_id']), expected_users)

    # Three cutoffs for three users: expect nine rows.
    report = precision_recall_by_user(held_out,
                                      recommendations,
                                      cutoffs=[5, 10, 15])
    self.assertEqual(report.num_rows(), 9)
def setUpClass(self):
    """
    Simulate a random binary-classification data set and record the
    default and test-specific solver options.
    """
    ## Simulate test data
    np.random.seed(10)
    n, d = 100, 10
    data = tc.SFrame()
    for _ in range(d):
        data.add_column(tc.SArray(np.random.randn(n)), inplace=True)
    target_values = np.random.randint(2, size=n)

    ## Create the model
    data["target"] = target_values
    self.sf = data
    self.def_kwargs = _DEFAULT_SOLVER_OPTIONS

    # Start from the shared solver defaults, then layer the entries below
    # on top (they win on key collisions).
    overrides = {
        "solver": "auto",
        "feature_rescaling": True,
        "class_weights": None,
        "penalty": 1.0,
    }
    combined = dict(self.def_kwargs.items())
    combined.update(overrides)
    self.def_opts = combined

    self.solver = "auto"
    self.opts = self.def_opts.copy()
    self.opts["max_iterations"] = 500

    names = ["X{}".format(index) for index in range(1, d + 1)]
    self.features = names
    # Kept as a separate list object with the same contents.
    self.unpacked_features = list(names)
    self.target = "target"
def test_cat(self):
    """
    Train a boosted trees classifier on string labels and check that the
    confusion matrix reports exactly those labels.
    """
    import numpy as np

    # Arrange
    np.random.seed(8)
    n, d = 1000, 100
    frame = tc.SFrame()
    for _ in range(d):
        frame.add_column(tc.SArray(np.random.rand(n)), inplace=True)
    binary_labels = np.random.randint(2, size=n)

    # Turn the 0/1 labels into the strings "cat-0" and "cat-1".
    frame["target"] = binary_labels
    frame["target"] = frame["target"].astype(str)
    frame["target"] = "cat-" + frame["target"]

    model = tc.boosted_trees_classifier.create(frame, "target")

    # Act
    evaluation = model.evaluate(frame)

    # Assert
    reported_labels = evaluation["confusion_matrix"]["target_label"].unique()
    self.assertEqual(["cat-0", "cat-1"], sorted(list(reported_labels)))
def test_invalid_data_set(self):
    """
    Check that text_classifier.create raises ToolkitError when the data
    contains a missing value.
    """
    # dtype is inferred as str for both columns
    has_none = tc.SArray(["str", None])
    no_none = tc.SArray(["str", "str"])

    # Each scenario is (column layout, target column, feature column).
    scenarios = [
        # target contains none
        ({"a": has_none, "b": no_none}, "a", "b"),
        # feature contains none, Github #2402
        # NOTE(review): here the None-bearing column is stored under "b",
        # which is also the target, so this looks like it repeats the
        # target case rather than the feature case -- confirm intent.
        ({"b": has_none, "a": no_none}, "b", "a"),
    ]
    for columns, target_name, feature_name in scenarios:
        sf = tc.SFrame(columns)
        with self.assertRaises(ToolkitError):
            tc.text_classifier.create(
                sf,
                target=target_name,
                features=[feature_name],
                word_count_threshold=1,
            )