def test_multiclass_classification_metrics_001(tc):
    print "create frame"
    rows = [["red", "red"], ["blue", "green"], ["green", "green"],
            ["green", "green"], ["orange", "orange"], ["red", "orange"]]
    schema = [('labels', str), ('predictions', str)]
    frame = tc.to_frame(rows, schema)
    assert frame.row_count == 6, "frame should have 6 rows"
    assert frame.column_names == ['labels', 'predictions']
    print "compute multiclass_classification_metrics()"
    cm = frame.multiclass_classification_metrics('labels', 'predictions', 1)
    # the expected metric values are rounded, so compare with a small tolerance
    assert abs(cm.f_measure - 0.6) < 1e-9, "computed f_measure for this model should be equal to 0.6"
    assert abs(cm.recall - 0.666666666667) < 1e-9, "computed recall for this model should be equal to 0.666666666667"
    assert abs(cm.accuracy - 0.666666666667) < 1e-9, "computed accuracy for this model should be equal to 0.666666666667"
    assert abs(cm.precision - 0.638888888889) < 1e-9, "computed precision for this model should be equal to 0.638888888889"
    confusion_matrix = cm.confusion_matrix.values.tolist()
    assert confusion_matrix == [[1, 0, 0], [2, 0, 0], [0, 1, 0], [0, 1, 1]], \
        "computed confusion_matrix for this model should be equal to [[1, 0, 0], [2, 0, 0], [0, 1, 0], [0, 1, 1]]"

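# Illustrative cross-check (plain Python, no sparktk; helper name made up) of the
# constants expected in test_multiclass_classification_metrics_001 above. It assumes
# the multiclass metrics are support-weighted averages of per-class precision,
# recall and f1 -- an assumption, but one that reproduces the hard-coded values
# exactly for this data.
def check_multiclass_metrics_arithmetic_001():
    labels      = ["red", "blue", "green", "green", "orange", "red"]
    predictions = ["red", "green", "green", "green", "orange", "orange"]
    n = float(len(labels))
    accuracy = sum(l == p for l, p in zip(labels, predictions)) / n
    precision = recall = f_measure = 0.0
    for c in sorted(set(labels)):
        tp = sum(l == c and p == c for l, p in zip(labels, predictions))
        predicted = sum(p == c for p in predictions)
        actual = sum(l == c for l in labels)
        prec = float(tp) / predicted if predicted else 0.0
        rec = float(tp) / actual if actual else 0.0
        f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
        weight = actual / n                 # weight each class by its support
        precision += weight * prec
        recall += weight * rec
        f_measure += weight * f1
    assert abs(accuracy - 0.666666666667) < 1e-9
    assert abs(recall - 0.666666666667) < 1e-9
    assert abs(precision - 0.638888888889) < 1e-9
    assert abs(f_measure - 0.6) < 1e-9
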
def test_kmeans_save_load(tc):
    frame = tc.to_frame([[2, "ab"], [1, "cd"], [7, "ef"], [1, "gh"], [9, "ij"],
                         [2, "kl"], [0, "mn"], [6, "op"], [5, "qr"]],
                        [("data", float), ("name", str)])
    model = kmeans.train(frame, ["data"], 3, seed=5)
    assert model.k == 3
    assert model.columns == [u'data']
    assert model.scalings is None
    sizes = model.compute_sizes(frame)
    assert sizes == [4, 1, 4]
    centroids = model.centroids
    model.save("sandbox/km1")
    restored = tc.load("sandbox/km1")
    assert restored.centroids == centroids
    restored_sizes = restored.compute_sizes(frame)
    assert restored_sizes == sizes

def test_multiclass_classification_metrics_002(tc):
    print "create frame"
    rows = [[0.0, 0.0], [None, 0.0], [0.0, 0.0],
            [1.5, 1.5], [1.0, 1.0], [1.5, None]]
    schema = [('labels', float32), ('predictions', float32)]
    frame = tc.to_frame(rows, schema)
    assert frame.row_count == 6, "frame should have 6 rows"
    assert frame.column_names == ['labels', 'predictions']
    print "compute multiclass_classification_metrics()"
    cm = frame.multiclass_classification_metrics('labels', 'predictions', 1)
    # the expected metric values are rounded, so compare with a small tolerance
    assert abs(cm.f_measure - 0.627777777778) < 1e-9, "computed f_measure for this model should be equal to 0.627777777778"
    assert abs(cm.recall - 0.666666666667) < 1e-9, "computed recall for this model should be equal to 0.666666666667"
    assert abs(cm.accuracy - 0.666666666667) < 1e-9, "computed accuracy for this model should be equal to 0.666666666667"
    assert abs(cm.precision - 0.805555555556) < 1e-9, "computed precision for this model should be equal to 0.805555555556"
    confusion_matrix = cm.confusion_matrix.values.tolist()
    assert confusion_matrix == [[2, 0, 0], [0, 1, 0], [1, 1, 1]], \
        "computed confusion_matrix for this model should be equal to [[2, 0, 0], [0, 1, 0], [1, 1, 1]]"

def test_back_and_forth_py_scala(tc):
    # python
    f = tc.to_frame([[1, "one"], [2, "two"], [3, "three"], [4, "four"], [5, "five"],
                     [6, "six"], [7, "seven"], [8, "eight"], [9, "nine"], [10, "ten"]],
                    [("a", int32), ("b", str)])
    # python
    f.add_columns(lambda row: row.a + 4, ("c", int))
    # scala
    f.bin_column("a", [5, 8, 10.0, 30.0, 50, 80])
    # python
    f.filter(lambda row: row.a > 5)
    results = str(f.inspect())
    expected = """[#]  a   b      c   a_binned
============================
[0]   6  six    10         0
[1]   7  seven  11         0
[2]   8  eight  12         1
[3]   9  nine   13         1
[4]  10  ten    14         2"""
    assert(results == expected)

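# Illustrative sketch (helper name made up, not the sparktk implementation) of the
# binning convention the expected output above implies: with cutoffs
# [5, 8, 10.0, 30.0, 50, 80], values 6 and 7 land in bin 0, 8 and 9 in bin 1,
# and 10 in bin 2, i.e. each cutoff starts a new bin (lower bound inclusive,
# upper bound exclusive) for the values used in this test.
def assign_bin_sketch(value, cutoffs):
    import bisect
    return bisect.bisect_right(cutoffs, value) - 1

# e.g. assign_bin_sketch(8, [5, 8, 10.0, 30.0, 50, 80]) == 1, matching row [2] above
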
def test_binary_classification_metrics_001(tc):
    print "create frame"
    rows = [["red", "red"], ["blue", "green"], ["green", "green"], ["green", "green"]]
    schema = [('labels', str), ('predictions', str)]
    frame = tc.to_frame(rows, schema)
    assert frame.row_count == 4, "frame should have 4 rows"
    assert frame.column_names == ['labels', 'predictions']
    print "compute binary_classification_metrics()"
    cm = frame.binary_classification_metrics('labels', 'predictions', 'green', 1)
    assert cm.f_measure == 0.0, "computed f_measure for this model should be equal to 0.0"
    assert cm.recall == 0.0, "computed recall for this model should be equal to 0.0"
    assert cm.accuracy == 0.5, "computed accuracy for this model should be equal to 0.5"
    assert cm.precision == 0.0, "computed precision for this model should be equal to 0.0"
    confusion_matrix = cm.confusion_matrix.values.tolist()
    assert confusion_matrix == [[0, 2], [0, 2]], \
        "computed confusion_matrix for this model should be equal to [[0, 2], [0, 2]]"

def test_binary_classification_metrics_002(tc):
    print "create frame"
    rows = [[0.0, 0.0], [1.5, 0.0], [0.0, 0.0], [1.5, 1.5]]
    schema = [('labels', float32), ('predictions', float32)]
    frame = tc.to_frame(rows, schema)
    assert frame.row_count == 4, "frame should have 4 rows"
    assert frame.column_names == ['labels', 'predictions']
    print "compute binary_classification_metrics()"
    cm = frame.binary_classification_metrics('labels', 'predictions', 1.5, 1)
    assert cm.f_measure == 0.66666666666666663, "computed f_measure for this model should be equal to 0.66666666666666663"
    assert cm.recall == 0.5, "computed recall for this model should be equal to 0.5"
    assert cm.accuracy == 0.75, "computed accuracy for this model should be equal to 0.75"
    assert cm.precision == 1.0, "computed precision for this model should be equal to 1.0"
    confusion_matrix = cm.confusion_matrix.values.tolist()
    assert confusion_matrix == [[1, 1], [0, 2]], \
        "computed confusion_matrix for this model should be equal to [[1, 1], [0, 2]]"

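# Illustrative cross-check (plain Python, no sparktk; helper name made up) of the
# constants expected in test_binary_classification_metrics_002 above. It assumes the
# confusion matrix is laid out as [[TP, FN], [FP, TN]], which is consistent with the
# [[1, 1], [0, 2]] expected for this data with positive label 1.5.
def check_binary_metrics_arithmetic_002():
    tp, fn, fp, tn = 1, 1, 0, 2
    precision = float(tp) / (tp + fp)                            # 1.0
    recall = float(tp) / (tp + fn)                               # 0.5
    accuracy = float(tp + tn) / (tp + fn + fp + tn)              # 0.75
    f_measure = 2 * precision * recall / (precision + recall)    # 2/3
    assert precision == 1.0
    assert recall == 0.5
    assert accuracy == 0.75
    assert abs(f_measure - 0.66666666666666663) < 1e-12
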
def est_np(tc):
    # We can't use numpy numeric types and go successfully to Scala RDDs --the unpickler gets a constructor error:
    # Caused by: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
    # todo: get this test working!
    # when it works, go back to dtypes and enable the np types
    import numpy as np
    f = tc.to_frame([[np.int32(1), "one"], [np.int32(2), "two"]],
                    [("a", int), ("b", str)])  # schema intentionally int, not np.int32
    #print f.inspect()
    # force to_scala
    f.bin_column("a", [5, 8, 10.0, 30.0, 50, 80])

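# Hypothetical workaround sketch for the numpy issue est_np describes: cast numpy
# scalars to native Python types before building the frame, so nothing numpy-specific
# has to be pickled over to the Scala side. to_native_rows is a made-up helper name,
# not part of the library.
def to_native_rows(rows):
    import numpy as np
    def to_native(value):
        # np.generic is the base class of numpy scalar types such as np.int32
        return value.item() if isinstance(value, np.generic) else value
    return [[to_native(value) for value in row] for row in rows]

# e.g. tc.to_frame(to_native_rows([[np.int32(1), "one"]]), [("a", int), ("b", str)])
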
def test_row_count(tc):
    # create frame
    f = tc.to_frame([[item] for item in range(0, 10)], [("a", int)])
    # check row count (python)
    assert(f._is_python == True)
    assert(f.row_count == 10)
    # to scala
    f._scala
    # check row count (scala)
    assert(f._is_python == False)
    assert(f.row_count == 10)

def test_save_load(tc):
    path = get_sandbox_path("briton1")
    rm(path)
    frame1 = tc.to_frame([[2, "ab"], [1.0, "cd"], [7.4, "ef"], [1.0, "gh"], [9.0, "ij"],
                          [2.0, "kl"], [0, "mn"], [6.0, "op"], [5.0, "qr"]],
                         [("data", float), ("name", str)])
    frame1_inspect = frame1.inspect()
    frame1.save(path)
    frame2 = tc.load(path)
    frame2_inspect = frame2.inspect()
    # the loaded frame should render the same and have the same schema
    assert str(frame1_inspect) == str(frame2_inspect)
    assert str(frame1.schema) == str(frame2.schema)

def test_bin(tc):
    f = tc.to_frame([[1, "one"], [2, "two"], [3, "three"], [4, "four"], [5, "five"],
                     [6, "six"], [7, "seven"], [8, "eight"], [9, "nine"], [10, "ten"]],
                    [("a", int), ("b", str)])
    f.bin_column("a", [5, 8, 10.0, 30.0, 50, 80])  #, bin_column_name="super_fred")

def test_kmeans(tc):
    frame = tc.to_frame([[2, "ab"], [1, "cd"], [7, "ef"], [1, "gh"], [9, "ij"],
                         [2, "kl"], [0, "mn"], [6, "op"], [5, "qr"]],
                        [("data", float), ("name", str)])
    model = kmeans.train(frame, ["data"], 3, seed=5)
    assert model.k == 3
    sizes = model.compute_sizes(frame)
    assert sizes == [4, 1, 4]
    wsse = model.compute_wsse(frame)
    assert wsse == 9.75
    model.predict(frame)
    frame_inspect = str(frame.inspect())
    assert (frame_inspect == """[#]  data  name  cluster
========================
[0]   2.0  ab          0
[1]   1.0  cd          0
[2]   7.0  ef          1
[3]   1.0  gh          0
[4]   9.0  ij          1
[5]   2.0  kl          0
[6]   0.0  mn          2
[7]   6.0  op          1
[8]   5.0  qr          1""")
    model.add_distance_columns(frame)
    #print frame.inspect()
    frame_inspect = str(frame.inspect())
    assert (frame_inspect == """[#]  data  name  cluster  distance0  distance1  distance2
=========================================================
[0]   2.0  ab          0       0.25    22.5625        4.0
[1]   1.0  cd          0       0.25    33.0625        1.0
[2]   7.0  ef          1      30.25     0.0625       49.0
[3]   1.0  gh          0       0.25    33.0625        1.0
[4]   9.0  ij          1      56.25     5.0625       81.0
[5]   2.0  kl          0       0.25    22.5625        4.0
[6]   0.0  mn          2       2.25    45.5625        0.0
[7]   6.0  op          1      20.25     0.5625       36.0
[8]   5.0  qr          1      12.25     3.0625       25.0""")

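# Illustrative cross-check (plain Python, no sparktk; helper name made up) that the
# wsse of 9.75 expected in test_kmeans equals the sum of each point's squared distance
# to its assigned centroid, assuming the distanceN columns above hold squared
# distances (which the values are consistent with).
def check_kmeans_wsse_arithmetic():
    # (cluster, [distance0, distance1, distance2]) pairs from the expected inspect output
    rows = [(0, [0.25, 22.5625, 4.0]),
            (0, [0.25, 33.0625, 1.0]),
            (1, [30.25, 0.0625, 49.0]),
            (0, [0.25, 33.0625, 1.0]),
            (1, [56.25, 5.0625, 81.0]),
            (0, [0.25, 22.5625, 4.0]),
            (2, [2.25, 45.5625, 0.0]),
            (1, [20.25, 0.5625, 36.0]),
            (1, [12.25, 3.0625, 25.0])]
    wsse = sum(distances[cluster] for cluster, distances in rows)
    assert wsse == 9.75
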
def test_smoke_take(tc):
    f = tc.to_frame([[1, "one"], [2, "two"], [3, "three"]])
    t = f.take(2)
    assert t.data == [[1, 'one'], [2, 'two']]