def test_input_mutations(self):
    """
    Verify that create() leaves its inputs untouched.

    'batch_size' may legitimately be changed by the model, and integer
    columns are cast to float internally, but the caller's own objects
    must remain unmodified.
    """
    # Snapshot every argument before calling create().
    sf_snapshot = copy.copy(self.sf)
    verbose_snapshot = copy.copy(self.verbose)
    num_clusters_snapshot = copy.copy(self.K)
    max_iter_snapshot = copy.copy(self.max_iter)
    feature_names = copy.copy(self.sf.column_names())

    # Fit a model using the snapshots as inputs.
    tc.kmeans.create(sf_snapshot, features=feature_names,
                     num_clusters=num_clusters_snapshot,
                     max_iterations=max_iter_snapshot,
                     verbose=verbose_snapshot)

    # Each snapshot must still match the original attribute.
    assert_sframe_equal(sf_snapshot, self.sf)
    self.assertEqual(verbose_snapshot, self.verbose)
    self.assertEqual(num_clusters_snapshot, self.K)
    self.assertEqual(max_iter_snapshot, self.max_iter)
    self.assertEqual(feature_names, self.sf.column_names())
def test_custom_initial_centers(self):
    """
    Check that user-supplied initial cluster centers are validated and,
    when valid, actually used to initialize the clusters.
    """
    # An empty SFrame of centers is rejected.
    with self.assertRaises(ValueError):
        tc.kmeans.create(dataset=self.sf, initial_centers=tc.SFrame(),
                         max_iterations=self.max_iter, verbose=False)

    # An SArray of indices is the wrong container type for centers.
    with self.assertRaises(TypeError):
        tc.kmeans.create(dataset=self.sf,
                         initial_centers=tc.SArray([1, 2, 3]),
                         max_iterations=self.max_iter, verbose=False)

    # Centers whose schema does not match the data are rejected.
    bad_centers = make_clustering_data(n=10, d=self.dim-1, seed=43)
    with self.assertRaises(ValueError):
        tc.kmeans.create(dataset=self.sf, initial_centers=bad_centers,
                         max_iterations=self.max_iter, verbose=False)

    # Valid centers: with zero iterations the model's cluster_info should
    # echo the provided centers. Integer features are excluded because
    # those *are* changed by the model.
    good_centers = make_clustering_data(n=10, d=self.dim, seed=43)
    feature_subset = ['float0', 'float1', 'dict0']
    model = tc.kmeans.create(self.sf, features=feature_subset,
                             initial_centers=good_centers,
                             max_iterations=0, verbose=False)
    assert_sframe_equal(good_centers[feature_subset],
                        model.cluster_info[feature_subset])
def test_pickling_sgraph_types(self):
    """Round-trip SGraphs through GLPickler and compare vertices/edges."""
    # A graph with vertex attributes only.
    graph_with_vertex_attrs = tc.SGraph().add_vertices([
        tc.Vertex(0, {'fluffy': 1}),
        tc.Vertex(1, {'fluffy': 1, 'woof': 1}),
        tc.Vertex(2, {}),
    ])

    # A graph with plain vertices and attributed edges.
    graph_with_edges = tc.SGraph()
    graph_with_edges = graph_with_edges.add_vertices(
        [tc.Vertex(i) for i in (0, 1, 2)])
    graph_with_edges = graph_with_edges.add_edges([
        tc.Edge(0, 1, attr={'relationship': 'dislikes'}),
        tc.Edge(1, 2, attr={'relationship': 'likes'}),
        tc.Edge(1, 0, attr={'relationship': 'likes'}),
    ])

    for graph in (graph_with_vertex_attrs, graph_with_edges):
        pickler = gl_pickle.GLPickler(self.filename)
        pickler.dump(graph)
        pickler.close()

        unpickler = gl_pickle.GLUnpickler(self.filename)
        restored = unpickler.load()
        unpickler.close()

        # Vertices and edges must survive the round trip unchanged.
        assert_sframe_equal(graph.get_vertices(), restored.get_vertices())
        assert_sframe_equal(graph.get_edges(), restored.get_edges())
def test_predictions(self):
    """
    Validate predict() on held-out data by comparing against a nearest
    neighbors search over the cluster centers. This also implicitly
    confirms integer features are cast to float inside predict().
    """
    train_data = self.sf[:-10]
    new_data = self.sf[-10:]
    kmeans = tc.kmeans.create(train_data, num_clusters=3, verbose=False)
    train_snapshot = copy.copy(train_data)

    # Internal consistency: each training point's predicted cluster must
    # be its assigned cluster, and the input SFrame must not be mutated.
    train_labels = kmeans.predict(train_data)
    assert_sframe_equal(train_data, train_snapshot)
    self.assertTrue((train_labels == kmeans.cluster_id['cluster_id']).all())

    # Same internal-consistency check for the prediction distances.
    train_dists = kmeans.predict(train_data, output_type='distance')
    assert_allclose(train_dists, kmeans.cluster_id['distance'], rtol=1e-6)

    # Gather the model's predictions on new data into one SFrame.
    predicted = tc.SFrame({
        'cluster_id': kmeans.predict(new_data, output_type='cluster_id'),
        'distance': kmeans.predict(new_data, output_type='distance'),
    })
    predicted = predicted.add_row_number('row_id')

    # Cast integer columns to float so the query schema matches the
    # float-typed centers stored in the nearest neighbors model.
    for name, col_type in zip(new_data.column_names(),
                              new_data.column_types()):
        if col_type is int:
            new_data[name] = new_data[name].astype(float)

    knn = tc.nearest_neighbors.create(kmeans.cluster_info,
                                      features=kmeans.features,
                                      distance='euclidean',
                                      method='ball_tree')
    neighbors = knn.query(new_data, k=1, radius=None)

    # Predicted cluster ids and distances must agree with the nearest
    # center found by the neighbors search.
    assert_sframe_equal(predicted[['row_id', 'cluster_id']],
                        neighbors[['query_label', 'reference_label']],
                        check_column_names=False)
    assert_allclose(predicted['distance'], neighbors['distance'], rtol=1e-6)
def test_pickling_sframe_types(self):
    """Round-trip SFrames of int, float, and string columns."""
    frames = [
        tc.SFrame([1, 2, 3]),
        tc.SFrame([1.0, 2.0, 3.5]),
        tc.SFrame(["foo", "bar"]),
    ]
    for original in frames:
        pickler = gl_pickle.GLPickler(self.filename)
        pickler.dump(original)
        pickler.close()

        unpickler = gl_pickle.GLUnpickler(self.filename)
        restored = unpickler.load()
        unpickler.close()

        # The unpickled frame must equal the one that was dumped.
        assert_sframe_equal(original, restored)
def test_relative_path(self):
    """
    GLPickler must accept a relative path and round-trip an SFrame
    through it.
    """
    # Arrange
    sf1 = tc.SFrame(range(10))
    relative_path = 'tmp/%s' % self.filename

    try:
        # Act
        pickler = gl_pickle.GLPickler(relative_path)
        pickler.dump(sf1)
        pickler.close()

        unpickler = gl_pickle.GLUnpickler(relative_path)
        sf2 = unpickler.load()
        unpickler.close()

        # Assert
        assert_sframe_equal(sf1, sf2)
    finally:
        # Clean up even when the assertion (or any earlier step) fails;
        # otherwise the leftover pickle directory leaks into later runs.
        # ignore_errors covers the case where the path was never created.
        shutil.rmtree(relative_path, ignore_errors=True)
def test_input_mutations(self):
    """create() must not modify any of its arguments."""
    # Snapshots of every input, taken before model creation.
    sf_snapshot = copy.copy(self.sf)
    distance_snapshot = copy.deepcopy(self.distance)
    radius_snapshot = copy.deepcopy(self.radius)
    min_core_snapshot = copy.deepcopy(self.min_core_neighbors)

    tc.dbscan.create(self.sf,
                     distance=self.distance,
                     radius=self.radius,
                     min_core_neighbors=self.min_core_neighbors,
                     verbose=False)

    # The originals must still equal their snapshots.
    assert_sframe_equal(self.sf, sf_snapshot)
    self.assertEqual(self.distance, distance_snapshot)
    self.assertEqual(self.radius, radius_snapshot)
    self.assertEqual(self.min_core_neighbors, min_core_snapshot)
def test_input_mutations(self):
    """create() must leave the caller's inputs unchanged."""
    # Copies of the key inputs, taken before model creation.
    data_copy = self.sf[:]
    distance_copy = copy.deepcopy(self.distance)
    verbose_copy = self.verbose

    # Build a model from the copies.
    tc.nearest_neighbor_classifier.create(data_copy, target='class',
                                          distance=distance_copy,
                                          verbose=self.verbose)

    # The copies must still equal the originals.
    assert_sframe_equal(data_copy, self.sf)
    self.assertEqual(distance_copy, self.distance)
    self.assertEqual(verbose_copy, self.verbose)
def test_combination_gl_python_types(self):
    """Pickle Python containers mixing SGraph, SFrame, and SArray."""
    graph = tc.SGraph().add_vertices([
        tc.Vertex(1, {'fluffy': 1}),
        tc.Vertex(2, {'fluffy': 1, 'woof': 1}),
        tc.Vertex(3, {}),
    ])
    sarray = tc.SArray([1, 2, 3])
    sframe = tc.SFrame([1, 2, 3])

    # A list and a dict holding the same three objects; both containers
    # index identically with keys 0, 1, 2.
    containers = [
        [graph, sframe, sarray],
        {0: graph, 1: sframe, 2: sarray},
    ]
    for container in containers:
        pickler = gl_pickle.GLPickler(self.filename)
        pickler.dump(container)
        pickler.close()

        unpickler = gl_pickle.GLUnpickler(self.filename)
        restored = unpickler.load()
        unpickler.close()

        # Each element must survive the round trip unchanged.
        assert_sframe_equal(container[0].get_vertices(),
                            restored[0].get_vertices())
        assert_sframe_equal(container[0].get_edges(),
                            restored[0].get_edges())
        assert_sframe_equal(container[1], restored[1])
        assert list(container[2]) == list(restored[2])