def test_multiclass_classification_metrics_001(tc):
    print "create frame"
    rows = [["red", "red"], ["blue", "green"], ["green", "green"],
            ["green", "green"], ["orange", "orange"], ["red", "orange"]]
    schema = [('labels', str), ('predictions', str)]
    frame = tc.to_frame(rows, schema)
    assert frame.row_count == 6, "frame should have 6 rows"
    assert frame.column_names == ['labels', 'predictions']
    print "compute multiclass_classification_metrics()"
    cm = frame.multiclass_classification_metrics('labels', 'predictions', 1)
    # the expected metric values are rounded, so compare with a small tolerance
    assert abs(cm.f_measure - 0.6) < 1e-9, "computed f_measure for this model should be equal to 0.6"
    assert abs(cm.recall - 0.666666666667) < 1e-9, "computed recall for this model should be equal to 0.666666666667"
    assert abs(cm.accuracy - 0.666666666667) < 1e-9, "computed accuracy for this model should be equal to 0.666666666667"
    assert abs(cm.precision - 0.638888888889) < 1e-9, "computed precision for this model should be equal to 0.638888888889"
    confusion_matrix = cm.confusion_matrix.values.tolist()
    assert confusion_matrix == [[1, 0, 0], [2, 0, 0], [0, 1, 0], [0, 1, 1]], \
        "computed confusion_matrix for this model should be equal to [[1, 0, 0], [2, 0, 0], [0, 1, 0], [0, 1, 1]]"

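# Illustrative cross-check (plain Python, no sparktk; helper name made up) of the
# constants expected in test_multiclass_classification_metrics_001 above. It assumes
# the multiclass metrics are support-weighted averages of per-class precision,
# recall and f1 -- an assumption, but one that reproduces the hard-coded values
# exactly for this data.
def check_multiclass_metrics_arithmetic_001():
    labels      = ["red", "blue", "green", "green", "orange", "red"]
    predictions = ["red", "green", "green", "green", "orange", "orange"]
    n = float(len(labels))
    accuracy = sum(l == p for l, p in zip(labels, predictions)) / n
    precision = recall = f_measure = 0.0
    for c in sorted(set(labels)):
        tp = sum(l == c and p == c for l, p in zip(labels, predictions))
        predicted = sum(p == c for p in predictions)
        actual = sum(l == c for l in labels)
        prec = float(tp) / predicted if predicted else 0.0
        rec = float(tp) / actual if actual else 0.0
        f1 = 2 * prec * rec / (prec + rec) if prec + rec else 0.0
        weight = actual / n                 # weight each class by its support
        precision += weight * prec
        recall += weight * rec
        f_measure += weight * f1
    assert abs(accuracy - 0.666666666667) < 1e-9
    assert abs(recall - 0.666666666667) < 1e-9
    assert abs(precision - 0.638888888889) < 1e-9
    assert abs(f_measure - 0.6) < 1e-9
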
def test_kmeans_save_load(tc):
    frame = tc.to_frame([[2, "ab"], [1, "cd"], [7, "ef"], [1, "gh"], [9, "ij"],
                         [2, "kl"], [0, "mn"], [6, "op"], [5, "qr"]],
                        [("data", float), ("name", str)])
    model = kmeans.train(frame, ["data"], 3, seed=5)
    assert model.k == 3
    assert model.columns == [u'data']
    assert model.scalings is None
    sizes = model.compute_sizes(frame)
    assert sizes == [4, 1, 4]
    centroids = model.centroids
    model.save("sandbox/km1")
    restored = tc.load("sandbox/km1")
    assert restored.centroids == centroids
    restored_sizes = restored.compute_sizes(frame)
    assert restored_sizes == sizes

def test_multiclass_classification_metrics_002(tc):
    print "create frame"
    rows = [[0.0, 0.0], [None, 0.0], [0.0, 0.0],
            [1.5, 1.5], [1.0, 1.0], [1.5, None]]
    schema = [('labels', float32), ('predictions', float32)]
    frame = tc.to_frame(rows, schema)
    assert frame.row_count == 6, "frame should have 6 rows"
    assert frame.column_names == ['labels', 'predictions']
    print "compute multiclass_classification_metrics()"
    cm = frame.multiclass_classification_metrics('labels', 'predictions', 1)
    # the expected metric values are rounded, so compare with a small tolerance
    assert abs(cm.f_measure - 0.627777777778) < 1e-9, "computed f_measure for this model should be equal to 0.627777777778"
    assert abs(cm.recall - 0.666666666667) < 1e-9, "computed recall for this model should be equal to 0.666666666667"
    assert abs(cm.accuracy - 0.666666666667) < 1e-9, "computed accuracy for this model should be equal to 0.666666666667"
    assert abs(cm.precision - 0.805555555556) < 1e-9, "computed precision for this model should be equal to 0.805555555556"
    confusion_matrix = cm.confusion_matrix.values.tolist()
    assert confusion_matrix == [[2, 0, 0], [0, 1, 0], [1, 1, 1]], \
        "computed confusion_matrix for this model should be equal to [[2, 0, 0], [0, 1, 0], [1, 1, 1]]"

def test_back_and_forth_py_scala(tc):
    # python
    f = tc.to_frame([[1, "one"], [2, "two"], [3, "three"], [4, "four"], [5, "five"],
                     [6, "six"], [7, "seven"], [8, "eight"], [9, "nine"], [10, "ten"]],
                    [("a", int32), ("b", str)])
    # python
    f.add_columns(lambda row: row.a + 4, ("c", int))
    # scala
    f.bin_column("a", [5, 8, 10.0, 30.0, 50, 80])
    # python
    f.filter(lambda row: row.a > 5)
    results = str(f.inspect())
    expected = """[#]  a   b      c   a_binned
============================
[0]   6  six    10         0
[1]   7  seven  11         0
[2]   8  eight  12         1
[3]   9  nine   13         1
[4]  10  ten    14         2"""
    assert(results == expected)

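# Illustrative sketch (helper name made up, not the sparktk implementation) of the
# binning convention the expected output above implies: with cutoffs
# [5, 8, 10.0, 30.0, 50, 80], values 6 and 7 land in bin 0, 8 and 9 in bin 1,
# and 10 in bin 2, i.e. each cutoff starts a new bin (lower bound inclusive,
# upper bound exclusive) for the values used in this test.
def assign_bin_sketch(value, cutoffs):
    import bisect
    return bisect.bisect_right(cutoffs, value) - 1

# e.g. assign_bin_sketch(8, [5, 8, 10.0, 30.0, 50, 80]) == 1, matching row [2] above
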
def test_binary_classification_metrics_001(tc):
    print "create frame"
    rows = [["red", "red"], ["blue", "green"], ["green", "green"], ["green", "green"]]
    schema = [('labels', str), ('predictions', str)]
    frame = tc.to_frame(rows, schema)
    assert frame.row_count == 4, "frame should have 4 rows"
    assert frame.column_names == ['labels', 'predictions']
    print "compute binary_classification_metrics()"
    cm = frame.binary_classification_metrics('labels', 'predictions', 'green', 1)
    assert cm.f_measure == 0.0, "computed f_measure for this model should be equal to 0.0"
    assert cm.recall == 0.0, "computed recall for this model should be equal to 0.0"
    assert cm.accuracy == 0.5, "computed accuracy for this model should be equal to 0.5"
    assert cm.precision == 0.0, "computed precision for this model should be equal to 0.0"
    confusion_matrix = cm.confusion_matrix.values.tolist()
    assert confusion_matrix == [[0, 2], [0, 2]], \
        "computed confusion_matrix for this model should be equal to [[0, 2], [0, 2]]"

def test_binary_classification_metrics_002(tc):
    print "create frame"
    rows = [[0.0, 0.0], [1.5, 0.0], [0.0, 0.0], [1.5, 1.5]]
    schema = [('labels', float32), ('predictions', float32)]
    frame = tc.to_frame(rows, schema)
    assert frame.row_count == 4, "frame should have 4 rows"
    assert frame.column_names == ['labels', 'predictions']
    print "compute binary_classification_metrics()"
    cm = frame.binary_classification_metrics('labels', 'predictions', 1.5, 1)
    assert cm.f_measure == 0.66666666666666663, "computed f_measure for this model should be equal to 0.66666666666666663"
    assert cm.recall == 0.5, "computed recall for this model should be equal to 0.5"
    assert cm.accuracy == 0.75, "computed accuracy for this model should be equal to 0.75"
    assert cm.precision == 1.0, "computed precision for this model should be equal to 1.0"
    confusion_matrix = cm.confusion_matrix.values.tolist()
    assert confusion_matrix == [[1, 1], [0, 2]], \
        "computed confusion_matrix for this model should be equal to [[1, 1], [0, 2]]"

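# Illustrative cross-check (plain Python, no sparktk; helper name made up) of the
# constants expected in test_binary_classification_metrics_002 above. It assumes the
# confusion matrix is laid out as [[TP, FN], [FP, TN]], which is consistent with the
# [[1, 1], [0, 2]] expected for this data with positive label 1.5.
def check_binary_metrics_arithmetic_002():
    tp, fn, fp, tn = 1, 1, 0, 2
    precision = float(tp) / (tp + fp)                            # 1.0
    recall = float(tp) / (tp + fn)                               # 0.5
    accuracy = float(tp + tn) / (tp + fn + fp + tn)              # 0.75
    f_measure = 2 * precision * recall / (precision + recall)    # 2/3
    assert precision == 1.0
    assert recall == 0.5
    assert accuracy == 0.75
    assert abs(f_measure - 0.66666666666666663) < 1e-12
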
def est_np(tc):
    # We can't use numpy numeric types and go successfully to Scala RDDs --the unpickler gets a constructor error:
    # Caused by: net.razorvine.pickle.PickleException: expected zero arguments for construction of ClassDict (for numpy.dtype)
    # todo: get this test working!
    # when it works, go back to dtypes and enable the np types
    import numpy as np
    f = tc.to_frame([[np.int32(1), "one"], [np.int32(2), "two"]],
                    [("a", int), ("b", str)])  # schema intentionally int, not np.int32
    #print f.inspect()
    # force to_scala
    f.bin_column("a", [5, 8, 10.0, 30.0, 50, 80])

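# Hypothetical workaround sketch for the numpy issue est_np describes: cast numpy
# scalars to native Python types before building the frame, so nothing numpy-specific
# has to be pickled over to the Scala side. to_native_rows is a made-up helper name,
# not part of the library.
def to_native_rows(rows):
    import numpy as np
    def to_native(value):
        # np.generic is the base class of numpy scalar types such as np.int32
        return value.item() if isinstance(value, np.generic) else value
    return [[to_native(value) for value in row] for row in rows]

# e.g. tc.to_frame(to_native_rows([[np.int32(1), "one"]]), [("a", int), ("b", str)])
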
def test_row_count(tc):
    # create frame
    f = tc.to_frame([[item] for item in range(0, 10)], [("a", int)])
    # check row count (python)
    assert(f._is_python == True)
    assert(f.row_count == 10)
    # to scala
    f._scala
    # check row count (scala)
    assert(f._is_python == False)
    assert(f.row_count == 10)

def test_save_load(tc):
    path = get_sandbox_path("briton1")
    rm(path)
    frame1 = tc.to_frame([[2, "ab"], [1.0, "cd"], [7.4, "ef"], [1.0, "gh"], [9.0, "ij"],
                          [2.0, "kl"], [0, "mn"], [6.0, "op"], [5.0, "qr"]],
                         [("data", float), ("name", str)])
    frame1_inspect = frame1.inspect()
    frame1.save(path)
    frame2 = tc.load(path)
    frame2_inspect = frame2.inspect()
    # the loaded frame should render the same and have the same schema
    assert str(frame1_inspect) == str(frame2_inspect)
    assert str(frame1.schema) == str(frame2.schema)

def test_bin(tc):
    f = tc.to_frame([[1, "one"], [2, "two"], [3, "three"], [4, "four"], [5, "five"],
                     [6, "six"], [7, "seven"], [8, "eight"], [9, "nine"], [10, "ten"]],
                    [("a", int), ("b", str)])
    f.bin_column("a", [5, 8, 10.0, 30.0, 50, 80])  #, bin_column_name="super_fred")

def test_kmeans(tc):
    frame = tc.to_frame([[2, "ab"], [1, "cd"], [7, "ef"], [1, "gh"], [9, "ij"],
                         [2, "kl"], [0, "mn"], [6, "op"], [5, "qr"]],
                        [("data", float), ("name", str)])
    model = kmeans.train(frame, ["data"], 3, seed=5)
    assert model.k == 3
    sizes = model.compute_sizes(frame)
    assert sizes == [4, 1, 4]
    wsse = model.compute_wsse(frame)
    assert wsse == 9.75
    model.predict(frame)
    frame_inspect = str(frame.inspect())
    assert (frame_inspect == """[#]  data  name  cluster
========================
[0]   2.0  ab          0
[1]   1.0  cd          0
[2]   7.0  ef          1
[3]   1.0  gh          0
[4]   9.0  ij          1
[5]   2.0  kl          0
[6]   0.0  mn          2
[7]   6.0  op          1
[8]   5.0  qr          1""")
    model.add_distance_columns(frame)
    #print frame.inspect()
    frame_inspect = str(frame.inspect())
    assert (frame_inspect == """[#]  data  name  cluster  distance0  distance1  distance2
=========================================================
[0]   2.0  ab          0       0.25    22.5625        4.0
[1]   1.0  cd          0       0.25    33.0625        1.0
[2]   7.0  ef          1      30.25     0.0625       49.0
[3]   1.0  gh          0       0.25    33.0625        1.0
[4]   9.0  ij          1      56.25     5.0625       81.0
[5]   2.0  kl          0       0.25    22.5625        4.0
[6]   0.0  mn          2       2.25    45.5625        0.0
[7]   6.0  op          1      20.25     0.5625       36.0
[8]   5.0  qr          1      12.25     3.0625       25.0""")

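# Illustrative cross-check (plain Python, no sparktk; helper name made up) that the
# wsse of 9.75 expected in test_kmeans equals the sum of each point's squared distance
# to its assigned centroid, assuming the distanceN columns above hold squared
# distances (which the values are consistent with).
def check_kmeans_wsse_arithmetic():
    # (cluster, [distance0, distance1, distance2]) pairs from the expected inspect output
    rows = [(0, [0.25, 22.5625, 4.0]),
            (0, [0.25, 33.0625, 1.0]),
            (1, [30.25, 0.0625, 49.0]),
            (0, [0.25, 33.0625, 1.0]),
            (1, [56.25, 5.0625, 81.0]),
            (0, [0.25, 22.5625, 4.0]),
            (2, [2.25, 45.5625, 0.0]),
            (1, [20.25, 0.5625, 36.0]),
            (1, [12.25, 3.0625, 25.0])]
    wsse = sum(distances[cluster] for cluster, distances in rows)
    assert wsse == 9.75
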
def test_smoke_take(tc):
    f = tc.to_frame([[1, "one"], [2, "two"], [3, "three"]])
    t = f.take(2)
    assert t.data == [[1, 'one'], [2, 'two']]