Beispiel #1
0
    def test_generic_drop_with_list(self):
        """ta.drop() accepts a mixed list of a frame proxy and a frame name."""
        name_a = str(uuid.uuid1()).replace('-', '_')
        proxy_a = ta.Frame(name=name_a)
        name_b = str(uuid.uuid1()).replace('-', '_')
        ta.Frame(name=name_b)

        # Mixed list: one proxy object, one plain name string
        targets = [proxy_a, name_b]

        # Both frames must exist on the server before the drop
        for name in (name_a, name_b):
            self.assertTrue(
                name in ta.get_frame_names(),
                name + " should exist in the list of frame names")

        self.assertEqual(
            2, ta.drop(targets),
            "drop() should have deleted the 2 items from the list")

        # Neither frame should remain after the drop
        for name in (name_a, name_b):
            self.assertFalse(
                name in ta.get_frame_names(),
                name + " should not be in the list of frame names")
Beispiel #2
0
    def test_frame_drop(self):
        print "define csv file"
        csv = ta.CsvFile("/datasets/classification-compute.csv",
                         schema=[('a', str), ('b', ta.int32),
                                 ('labels', ta.int32),
                                 ('predictions', ta.int32)],
                         delimiter=',',
                         skip_header_lines=1)

        print "create frame"
        frame = ta.Frame(csv, name="test_frame_drop")

        print "dropping frame by entity"
        ta.drop_frames(frame)
        frames = ta.get_frame_names()
        self.assertFalse("test_frame_drop" in frames,
                         "test_frame_drop should not exist in list of frames")

        frame = ta.Frame(csv, name="test_frame_drop")

        print "dropping frame by name"
        self.assertEqual(1, ta.drop_frames("test_frame_drop"),
                         "drop_frames() should have deleted one frame")
        self.assertFalse("test_frame_drop" in frames,
                         "test_frame_drop should not exist in list of frames")
Beispiel #3
0
    def test_arx_with_lag(self):
        """
        Trains an ARX model with y and x max lags of 2, checks the fitted
        coefficients against previously captured values, then predicts on a
        test frame and verifies the output schema and predicted_y column.
        """
        print "define csv file"
        schema = [("y", ta.float64), ("visitors", ta.float64),
                  ("wkends", ta.float64), ("seasonality", ta.float64),
                  ("incidentRate", ta.float64), ("holidayFlag", ta.float64),
                  ("postHolidayFlag", ta.float64), ("mintemp", ta.float64)]
        csv = ta.CsvFile("/datasets/arx_train.csv",
                         schema=schema,
                         skip_header_lines=1)

        print "create training frame"
        train_frame = ta.Frame(csv)

        print "Initializing a ArxModel object"
        arx = ta.ArxModel()

        print "Training the model on the Frame with yMaxLag = 2 and xMaxLag = 2"
        # Positional args after the x columns: yMaxLag=2, xMaxLag=2, then a
        # boolean flag (presumably no-intercept/drop flag -- TODO confirm
        # against the ArxModel.train signature).
        coefficients = arx.train(
            train_frame, "y",
            ["visitors", "wkends", "seasonality", "incidentRate", "mintemp"],
            2, 2, True)
        # Exact-equality regression check against known-good coefficients
        self.assertEqual(coefficients['coefficients'], [
            -0.033117384191517614, -0.06529674497484411,
            -3.328096129192338e-08, -1.4422196518869838e-08,
            -2.8970459135396235e-06, 2.0984826788508606e-06, 504.6479199133054,
            995.00122376607, 3.56120683505247e-08, -5.406341176251538e-08,
            -7.47887430442836e-08, 7.306703786303277e-08,
            2.3924223466200682e-08, 2.2165130696795696e-06, 15238.142787722905,
            2.061070059690899e-08, 1.3089764633101732e-07
        ])

        print "create test frame"
        csv = ta.CsvFile("/datasets/arx_test.csv",
                         schema=schema,
                         skip_header_lines=1)
        test_frame = ta.Frame(csv)

        print "Predicting on the Frame"
        p = arx.predict(
            test_frame, "y",
            ["visitors", "wkends", "seasonality", "incidentRate", "mintemp"])
        # predict appends a predicted_y column to the original schema
        self.assertEqual(p.column_names, [
            "y", "visitors", "wkends", "seasonality", "incidentRate",
            "holidayFlag", "postHolidayFlag", "mintemp", "predicted_y"
        ])

        # First two rows are None because yMaxLag=2 consumes two rows of history
        # (inferred from the lag setting -- TODO confirm).
        expected_results = [[None], [None], [101.99999649931183],
                            [98.00000211077416], [111.999996872938],
                            [99.00000347596028], [99.00000489674761],
                            [86.9999967418149], [103.00000106651471],
                            [114.99999387693828], [100.99999426757434],
                            [124.99999322753226], [116.99999537263702],
                            [109.00000298901594], [110.99999768325104],
                            [104.99999176999377]]

        self.assertEqual(expected_results, p.take(p.row_count, 0,
                                                  "predicted_y"))
Beispiel #4
0
    def test_arimax_air_quality(self):
        """
        Trains an ARIMAX model on the UCI air-quality dataset and checks the
        first 20 (actual, predicted) pairs on a test frame against
        previously captured values.
        """
        # Data from Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
        print "Define csv file"
        schema = [("Date", str), ("Time", str), ("CO_GT", ta.float64),
                  ("PT08_S1_CO", ta.float64), ("NMHC_GT", ta.float64),
                  ("C6H6_GT", ta.float64), ("PT08_S2_NMHC", ta.float64),
                  ("NOx_GT", ta.float64), ("PT08_S3_NOx", ta.float64),
                  ("NO2_GT", ta.float64), ("PT08_S4_NO2", ta.float64),
                  ("PT08_S5_O3", ta.float64), ("T", ta.float64),
                  ("RH", ta.float64), ("AH", ta.float64)]
        csv = ta.CsvFile("/datasets/arimax_train.csv",
                         schema=schema,
                         skip_header_lines=1)

        print "Create training frame"
        train_frame = ta.Frame(csv)

        print "Initializing a ArimaxModel object"
        arimax = ta.ArimaxModel()

        print "Training the model on the Frame"
        # Positional args after the x columns are presumably the ARIMA
        # (p, d, q) orders plus an xMaxLag and two boolean flags -- TODO
        # confirm against the ArimaxModel.train signature.
        arimax.train(train_frame, "CO_GT", ["C6H6_GT", "PT08_S2_NMHC", "T"], 1,
                     1, 1, 1, True, False)

        print "Create test frame"
        csv2 = ta.CsvFile("/datasets/arimax_test.csv",
                          schema=schema,
                          skip_header_lines=1)
        test_frame = ta.Frame(csv2)

        print "Predicting on the Frame"
        p = arimax.predict(test_frame, "CO_GT",
                           ["C6H6_GT", "PT08_S2_NMHC", "T"])

        # [actual CO_GT, predicted_y] pairs; -200.0 is the dataset's
        # missing-value marker for CO_GT.
        expected_results = [[3.9, 3.1384052036036163], [3.7, 2.2096085801345],
                            [6.6,
                             3.052618296503863], [4.4, 2.1495532900204375],
                            [3.5, 2.929771168550256], [5.4, 2.155756454454324],
                            [2.7,
                             2.8784218519015745], [1.9, 2.1528352219380147],
                            [1.6,
                             2.7830795782099473], [1.7, 2.1096269282113664],
                            [-200.0, 2.8628707912495215],
                            [1.0,
                             2.0471200633069278], [1.2, 2.7726186606363887],
                            [1.5,
                             2.0820391788568395], [2.7, 2.9878888229516978],
                            [3.7,
                             2.3182512709816443], [3.2, 3.211283519783637],
                            [4.1, 2.5541133101407363],
                            [3.6, 3.268861636132588], [2.8, 2.467897319671856]]

        self.assertEqual(expected_results,
                         p.take(20, columns=["CO_GT", "predicted_y"]))
Beispiel #5
0
    def test_append_frame(self):
        """Appending a frame with extra columns yields the combined schema."""
        src_frame = ta.Frame(self.csv1)
        self.assertEqual(src_frame.row_count, 20)
        self.assertEqual(src_frame.column_names,
                         [col for col, _ in self.schema1])

        dest_frame = ta.Frame(self.csv2)
        self.assertEqual(dest_frame.row_count, 10)
        self.assertEqual(dest_frame.column_names,
                         [col for col, _ in self.schema2])

        src_frame.append(dest_frame)
        self.assertEqual(src_frame.row_count, 30)
        self.assertEqual(src_frame.column_names,
                         [col for col, _ in self.combined_schema])
Beispiel #6
0
    def test_max_air_quality(self):
        # Data from Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.
        print "Define csv file"
        schema = [("Date", str), ("Time", str), ("CO_GT", ta.float64),
                  ("PT08_S1_CO", ta.float64), ("NMHC_GT", ta.float64),
                  ("C6H6_GT", ta.float64), ("PT08_S2_NMHC", ta.float64),
                  ("NOx_GT", ta.float64), ("PT08_S3_NOx", ta.float64),
                  ("NO2_GT", ta.float64), ("PT08_S4_NO2", ta.float64),
                  ("PT08_S5_O3", ta.float64), ("T", ta.float64),
                  ("RH", ta.float64), ("AH", ta.float64)]
        csv = ta.CsvFile("/datasets/arimax_train.csv",
                         schema=schema,
                         skip_header_lines=1)

        print "Create training frame"
        train_frame = ta.Frame(csv)

        print "Initializing a MaxModel object"
        max = ta.MaxModel()

        print "Training the model on the Frame"
        max.train(train_frame, "CO_GT", ["C6H6_GT", "PT08_S2_NMHC", "T"], 3, 0,
                  True, False)

        print "Create test frame"
        csv2 = ta.CsvFile("/datasets/arimax_test.csv",
                          schema=schema,
                          skip_header_lines=1)
        test_frame = ta.Frame(csv2)

        print "Predicting on the Frame"
        p = max.predict(test_frame, "CO_GT", ["C6H6_GT", "PT08_S2_NMHC", "T"])

        expected_results = [[3.9, 0.40372259585936543],
                            [3.7,
                             6.6634901462882725], [6.6, 5.981442062684975],
                            [4.4, 5.35837518115529], [3.5, 5.026072844339458],
                            [5.4, 4.569157131217689], [2.7, 4.029165833891962],
                            [1.9,
                             3.9460902496880044], [1.6, 3.779939081280088],
                            [1.7, 3.655325704974152],
                            [-200.0, 3.2399477839543613],
                            [1.0,
                             2.9076454471385293], [1.2, 3.4476367444642566],
                            [1.5,
                             2.9907210313424875], [2.7, 2.6168809024246764],
                            [3.7, 2.6999564866286345],
                            [3.2, 3.987628041789983], [4.1, 5.150686220645396],
                            [3.6, 6.479895567908723], [2.8, 7.642953746764134]]

        self.assertEqual(expected_results,
                         p.take(20, columns=["CO_GT", "predicted_y"]))
    def test_duplicate_frame_rename(self):
        """
        Renaming a frame to a name already taken by another frame, a graph,
        or a model must raise, and the rename must not delete or rename any
        of the existing entities.
        """
        frame_name1 = str(uuid.uuid1()).replace('-', '_')
        frame_name2 = str(uuid.uuid1()).replace('-', '_')
        graph_name = str(uuid.uuid1()).replace('-', '_')
        model_name = str(uuid.uuid1()).replace('-', '_')

        # Create frames, graph, and model to test with
        frame1 = ta.Frame(name=frame_name1)
        frame2 = ta.Frame(name=frame_name2)
        ta.Graph(name=graph_name)
        ta.KMeansModel(name=model_name)

        # After creating frames, check that frames with each name exists on the server
        self.assertTrue(frame_name1 in ta.get_frame_names(),
                        frame_name1 + " should exist in list of frames")
        self.assertTrue(frame_name2 in ta.get_frame_names(),
                        frame_name2 + " should exist in list of frames")

        # Try to rename frame2 to have the same name as frame1 (we expect an exception here)
        with self.assertRaises(Exception):
            frame2.name = frame_name1

        # Both frame names should still exist on the server
        self.assertTrue(frame_name1 in ta.get_frame_names(),
                        frame_name1 + " should still exist in list of frames")
        self.assertTrue(frame_name2 in ta.get_frame_names(),
                        frame_name2 + " should still exist in list of frames")

        # Try to rename frame1 to have the same name as the graph (we expect an exception here)
        with self.assertRaises(Exception):
            frame1.name = graph_name

        # frame1 and the graph should still exist on the server
        self.assertTrue(
            frame_name1 in ta.get_frame_names(),
            frame_name1 + " should still exist in the list of frames")
        self.assertTrue(
            graph_name in ta.get_graph_names(),
            graph_name + " should still exist in the list of graphs")

        # Try to rename frame1 to have the same name as the model (we expect an exception here)
        with self.assertRaises(Exception):
            frame1.name = model_name

        # frame1 and the model should still exist on the server
        self.assertTrue(
            frame_name1 in ta.get_frame_names(),
            frame_name1 + " should still exist in the list of frames")
        self.assertTrue(
            model_name in ta.get_model_names(),
            model_name + " should still exist in the list of models")
Beispiel #8
0
    def test_arx_no_lags(self):
        """
        Trains an ARX model with zero y and x lags (plain exogenous
        regression), then predicts on a test frame and checks the output
        schema and predicted_y values against previously captured results.
        """
        print "define csv file"
        schema = [("y", ta.float64), ("visitors", ta.float64),
                  ("wkends", ta.float64), ("seasonality", ta.float64),
                  ("incidentRate", ta.float64), ("holidayFlag", ta.float64),
                  ("postHolidayFlag", ta.float64), ("mintemp", ta.float64)]
        csv = ta.CsvFile("/datasets/arx_train.csv",
                         schema=schema,
                         skip_header_lines=1)

        print "create training frame"
        train_frame = ta.Frame(csv)

        print "Initializing a ArxModel object"
        arx = ta.ArxModel()

        print "Training the model on the Frame"
        # yMaxLag=0, xMaxLag=0, plus a boolean flag (presumably
        # no-intercept/drop flag -- TODO confirm against the train signature)
        arx.train(train_frame, "y", [
            "visitors", "wkends", "seasonality", "incidentRate", "holidayFlag",
            "postHolidayFlag", "mintemp"
        ], 0, 0, True)

        print "create test frame"
        csv = ta.CsvFile("/datasets/arx_test.csv",
                         schema=schema,
                         skip_header_lines=1)
        test_frame = ta.Frame(csv)

        print "Predicting on the Frame"
        p = arx.predict(test_frame, "y", [
            "visitors", "wkends", "seasonality", "incidentRate", "holidayFlag",
            "postHolidayFlag", "mintemp"
        ])
        # predict appends a predicted_y column to the original schema
        self.assertEqual(p.column_names, [
            "y", "visitors", "wkends", "seasonality", "incidentRate",
            "holidayFlag", "postHolidayFlag", "mintemp", "predicted_y"
        ])

        # With no lags there are no leading None rows (contrast with the
        # lagged ARX test, which loses rows to history).
        expected_results = [[99.99999234330198], [98.00000220169095],
                            [101.99999803760333], [98.00000071010813],
                            [111.99999886664024], [99.00000373787175],
                            [99.00000353440495], [86.99999823659364],
                            [103.00000236184275], [114.99999178843603],
                            [100.9999939917012], [124.99999319338036],
                            [116.9999989603231], [109.00000481908955],
                            [110.99999666776476], [104.99999266331749]]

        self.assertEqual(expected_results, p.take(p.row_count, 0,
                                                  "predicted_y"))
    def test_gc_drop_stale_and_finalize(self):
        """
        Exercises the garbage-collection admin commands: drop_stale() must
        only drop unnamed entities older than the given age, and
        finalize_dropped() must only finalize entities already DROPPED.
        Named frames (and their error frames) must survive both.
        """
        csv = ta.CsvFile("/datasets/dates.csv",
                         schema=[('start', ta.datetime), ('id', int),
                                 ('stop', ta.datetime), ('color', str)],
                         delimiter=',')
        f2_name = "dates_two"
        # Clean up any leftover frame from a previous run
        if f2_name in ta.get_frame_names():
            ta.drop_frames(f2_name)

        f1 = ta.Frame(csv)  # unnamed -> eligible for stale collection
        f1e = f1.get_error_frame()
        self.assertIsNotNone(f1e)
        self.assertIsNone(f1e.name)
        f2 = ta.Frame(csv, name=f2_name)  # named -> must never be collected
        f2e = f2.get_error_frame()
        self.assertIsNotNone(f2e)
        self.assertIsNone(f2e.name)

        admin.drop_stale(
        )  # first, normal drop_stale; nothing should change because these frames aren't old enough
        self.assertEqual("ACTIVE", f1.status)
        self.assertEqual("ACTIVE", f1e.status)
        self.assertEqual("ACTIVE", f2.status)
        self.assertEqual("ACTIVE", f2e.status)

        admin.finalize_dropped(
        )  # nothing is dropped yet, so nothing should be finalized
        self.assertEqual("ACTIVE", f1.status)
        self.assertEqual("ACTIVE", f1e.status)
        self.assertEqual("ACTIVE", f2.status)
        self.assertEqual("ACTIVE", f2e.status)

        admin.drop_stale(
            "1ms"
        )  # now drop with a very tiny age, so the unnamed f1 (and its error frame) should get dropped
        self.assertEqual("DROPPED", f1.status)
        self.assertEqual("DROPPED", f1e.status)
        self.assertEqual("ACTIVE", f2.status)
        self.assertEqual("ACTIVE", f2e.status)

        admin.finalize_dropped(
        )  # only f1 and f1e are dropped, so only they should be finalized
        self.assertEqual("FINALIZED", f1.status)
        self.assertEqual("FINALIZED", f1e.status)
        self.assertEqual("ACTIVE", f2.status)
        self.assertEqual("ACTIVE", f2e.status)
Beispiel #10
0
    def test_add_columns_and_copy_where(self):
        """
        Tests UDFs for add_columns and copy(where), and uses the vector type

        Changes the 2 population strings to a vector, and then uses the vector
        to compute the change, and then copy out all the incorrect ones
        """
        # `csv` is a module-level fixture (oregon-cities data) -- the
        # population columns are comma-grouped strings like "1,234".
        frame = ta.Frame(csv)
        self.assertEquals(frame.row_count, 20, "frame should have 20 rows")
        # UDF strips the thousands separators via translate({ord(','): None})
        # (unicode-style translate table) and packs both populations into a
        # 2-element vector column.
        frame.add_columns(
            lambda row: [
                float(row['pop_2010'].translate({ord(','): None})),
                float(row['population_2013'].translate({ord(','): None}))
            ], ("vpops", ta.vector(2)))
        self.assertEquals(frame.row_count, 20, "frame should have 20 rows")
        self.assertEquals(frame.column_names, [
            'rank', 'city', 'population_2013', 'pop_2010', 'change', 'county',
            'vpops'
        ])
        # Recompute the percentage change from the vector elements
        frame.add_columns(
            lambda row: (row.vpops[1] - row.vpops[0]) / row.vpops[0],
            ("comp_change", ta.float64))
        #print frame.inspect(20)
        # Copy out rows whose published 'change' string disagrees with the
        # recomputed value formatted the same way ("%.2f%%" of the rounded pct)
        bad_cities = frame.copy(columns=['city', 'change', 'comp_change'],
                                where=lambda row: row.change != "%.2f%%" %
                                round(100 * row.comp_change, 2))
        self.assertEquals(bad_cities.column_names,
                          ['city', 'change', 'comp_change'])
        self.assertEquals(bad_cities.row_count, 1)
        #print bad_cities.inspect()
        # Round to 5 places so the float comparison below is exact
        row = bad_cities.take(1)[0]
        row[2] = round(row[2], 5)
        self.assertEquals(row, [u'Tualatin', u'4.17%', 0.03167
                                ])  # should just be one bad one, Tualatin
Beispiel #11
0
    def test_adf_column_types(self):
        """
        Tests the Augmented Dickey-Fuller test with different column types.

        Fix: the original raised RuntimeError inside the try block when no
        error occurred, but the broad ``except Exception`` then caught that
        very RuntimeError and failed on an unrelated message check.  Using
        assertRaises keeps the intent explicit and the failure readable.
        """
        data = [[1, "a", 1.5], [2, "b", 18.5], [4, "c", 22.1], [5, "d", 19.0],
                [7, "e", 25.6], [8, "f", 36.75]]
        schema = [("int_column", ta.int32), ("str_column", str),
                  ("float_column", ta.float32)]
        frame = ta.Frame(ta.UploadRows(data, schema))

        # string column should have an error
        with self.assertRaises(Exception) as context:
            frame.timeseries_augmented_dickey_fuller_test("str_column", 0)
        self.assertIn("Column str_column was not numerical",
                      str(context.exception))

        # Numerical columns should not have an error
        self.assertIsNotNone(
            frame.timeseries_augmented_dickey_fuller_test("int_column", 0))
        self.assertIsNotNone(
            frame.timeseries_augmented_dickey_fuller_test("float_column", 0))
Beispiel #12
0
    def test_bpt_invalid_column(self):
        """
        Tests the Breusch-Pagan test with non-numerical data, and expects an
        error for a string y column and for a string x column.

        Fix: the original raised RuntimeError inside each try block when no
        error occurred, but the broad ``except Exception`` then caught that
        very RuntimeError and failed on an unrelated message check.  Using
        assertRaises keeps the intent explicit and the failure readable.
        """
        data = [[1, "a", 1.5], [2, "b", 18.5], [4, "c", 22.1], [5, "d", 19.0],
                [7, "e", 25.6], [8, "f", 36.75]]
        schema = [("int_column", ta.int32), ("str_column", str),
                  ("float_column", ta.float32)]
        frame = ta.Frame(ta.UploadRows(data, schema))

        # y column with strings should be rejected
        with self.assertRaises(Exception) as context:
            frame.timeseries_breusch_pagan_test("str_column",
                                                ["int_column", "float_column"])
        self.assertIn("Column str_column was not numerical",
                      str(context.exception))

        # an x column with strings should be rejected
        with self.assertRaises(Exception) as context:
            frame.timeseries_breusch_pagan_test("float_column",
                                                ["int_column", "str_column"])
        self.assertIn("Column str_column was not numerical",
                      str(context.exception))

        # numerical data should not have an error
        self.assertIsNotNone(
            frame.timeseries_breusch_pagan_test("float_column",
                                                ["int_column"]))
    def test_lasso(self):

        print "create frame"
        frame = ta.Frame(ta.CsvFile("/datasets/lasso_lpsa.csv", schema=[
            ('y', ta.float64),
            ('x1', ta.float64),
            ('x2', ta.float64),
            ('x3', ta.float64),
            ('x4', ta.float64),
            ('x5', ta.float64),
            ('x6', ta.float64),
            ('x7', ta.float64),
            ('x8', ta.float64)], delimiter=' '))

        model = ta.LassoModel()
        model.train(frame, 'y', ['x1','x2','x3','x4','x5','x6','x7','x8'])

        #print repr(train_output)

        predicted_frame = model.predict(frame)
        print predicted_frame.inspect(20, columns=['y', 'predicted_value'])

        test_metrics = model.test(predicted_frame, 'predicted_value')

        print str(test_metrics)
Beispiel #14
0
    def testLinearRegression(self):
        print "define csv file"
        csv = ta.CsvFile("/datasets/linear_regression_8_columns.csv",
                         schema=[("y", ta.float64), ("1", ta.float64),
                                 ("2", ta.float64), ("3", ta.float64),
                                 ("4", ta.float64), ("5", ta.float64),
                                 ("6", ta.float64), ("7", ta.float64),
                                 ("8", ta.float64), ("9", ta.float64),
                                 ("10", ta.float64)])

        print "create frame"
        frame = ta.Frame(csv, 'LinearRegressionSampleFrame')

        print "Initializing a LinearRegressionModel object"
        model = ta.LinearRegressionModel(name='myLinearRegressionModel')

        print "Training the model on the Frame"
        model.train(frame, 'y',
                    ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'])

        output = model.predict(frame)
        self.assertEqual(output.column_names, [
            'y', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
            'predicted_value'
        ])
Beispiel #15
0
 def test_append_new_columns(self):
     """Appending a csv with extra columns widens the frame to the combined schema."""
     frame = ta.Frame(self.csv1)
     self.assertEqual(frame.row_count, 20)
     self.assertEqual(frame.column_names,
                      [col for col, _ in self.schema1])
     frame.append(self.csv2)
     self.assertEqual(frame.row_count, 30)
     self.assertEqual(frame.column_names,
                      [col for col, _ in self.combined_schema])
Beispiel #16
0
 def test_append_same_schema(self):
     """Appending a csv with an identical schema doubles the rows, keeps columns."""
     frame = ta.Frame(self.csv1)
     self.assertEqual(frame.row_count, 20)
     self.assertEqual(frame.column_names,
                      [col for col, _ in self.schema1])
     frame.append(self.csv1)
     self.assertEqual(frame.row_count, 40)
     self.assertEqual(frame.column_names,
                      [col for col, _ in self.schema1])
Beispiel #17
0
    def test_flatten_column_with_differing_size_vectors(self):
        """Flattening vectors of unequal length pads the shorter one with 0.0."""
        rows = [[1, [1, 2, 3], [8, 7]], [2, [4, 5, 6], [6, 5]],
                [3, [7, 8, 9], [4, 3]], [4, [10, 11, 12], [2, 1]]]
        schema = [('a', ta.int32), ('b', ta.vector(3)), ('c', ta.vector(2))]
        test_frame = ta.Frame(ta.UploadRows(rows, schema))

        test_frame.flatten_columns(['b', 'c'])

        # Each 3-element 'b' vector yields 3 rows; the 2-element 'c' vector
        # is padded with 0.0 on the third row of each group.
        expected_data = [[1, 1.0, 8.0], [1, 2.0, 7.0], [1, 3.0, 0.0],
                         [2, 4.0, 6.0], [2, 5.0, 5.0], [2, 6.0, 0.0],
                         [3, 7.0, 4.0], [3, 8.0, 3.0], [3, 9.0, 0.0],
                         [4, 10.0, 2.0], [4, 11.0, 1.0], [4, 12.0, 0.0]]

        self.assertEqual(test_frame.row_count, 12)
        self.assertEqual(test_frame.take(test_frame.row_count), expected_data)
Beispiel #18
0
 def test_filter(self):
     """Filter the cities frame down to Washington county and check the survivors."""
     frame = ta.Frame(csv)
     self.assertEquals(frame.row_count, 20, "frame should have 20 rows")
     frame.filter(lambda row: row.county == "Washington")
     self.assertEquals(frame.row_count, 4, "frame should have 4 rows after filtering")
     city_rows = frame.take(frame.row_count, columns="city")
     city_names = sorted(str(r[0]) for r in city_rows)
     self.assertEquals(city_names, ["Beaverton", "Hillsboro", "Tigard", "Tualatin"])
Beispiel #19
0
    def test_flatten_columns_with_strings_and_vectors_with_default_delimiter(self):
        """Flatten mixed string/vector columns using the default ',' delimiter."""
        rows = [[1, "1,2", [1, 2], "a,b"], [2, "3,4", [3, 4], "c,d"],
                [3, "5,6", [5, 6], "e,f"], [4, "7,8", [7, 8], "g,h"]]
        schema = [('a', ta.int32), ('b', str), ('c', ta.vector(2)),
                  ('d', str)]
        test_frame = ta.Frame(ta.UploadRows(rows, schema))

        # there are only 2 string columns.  giving 3 delimiters should give an exception.
        with self.assertRaises(Exception):
            test_frame.flatten_columns(['b', 'c', 'd'], [',', ',', ','])

        test_frame.flatten_columns(['b', 'c', 'd'])

        # Each source row fans out into two rows; vector elements become floats.
        expected_data = [[1, "1", 1.0, "a"], [1, "2", 2.0, "b"],
                         [2, "3", 3.0, "c"], [2, "4", 4.0, "d"],
                         [3, "5", 5.0, "e"], [3, "6", 6.0, "f"],
                         [4, "7", 7.0, "g"], [4, "8", 8.0, "h"]]

        self.assertEqual(test_frame.row_count, 8)
        self.assertEqual(test_frame.take(test_frame.row_count), expected_data)
    def test_principal_components(self):
        print "define csv file"
        schema = [("1", ta.float64), ("2", ta.float64), ("3", ta.float64),
                  ("4", ta.float64), ("5", ta.float64), ("6", ta.float64),
                  ("7", ta.float64), ("8", ta.float64), ("9", ta.float64),
                  ("10", ta.float64), ("11", ta.float64)]
        train_file = ta.CsvFile("/datasets/pca_10rows.csv", schema=schema)
        print "creating the frame"
        train_frame = ta.Frame(train_file)

        print "initializing the naivebayes model"
        p = ta.PrincipalComponentsModel()

        print "training the model on the frame"
        p.train(train_frame,
                ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], 9)

        print "predicting the class using the model and the frame"
        output = p.predict(train_frame, c=5, t_square_index=True)
        output_frame = output['output_frame']

        self.assertEqual(output_frame.column_names, [
            '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', 'p_1',
            'p_2', 'p_3', 'p_4', 'p_5'
        ])
Beispiel #21
0
    def testSvm(self):
        """
        Trains and predicts with RandomForest classifier and regressor models
        and checks the predicted output column names.

        NOTE(review): the method name says "Svm" but the body exercises
        RandomForest models -- likely a copy-paste leftover; confirm before
        renaming (the test-runner discovers tests by name).  This method also
        uses the ``ia`` alias where sibling tests use ``ta``.
        """
        print "define csv file"
        csv = ia.CsvFile("/datasets/RandomForest.csv",
                         schema=[('Class', int), ('Dim_1', ia.float64),
                                 ('Dim_2', ia.float64)])

        print "create frame"
        frame = ia.Frame(csv)

        print "Initializing the classifier model object"
        classifier = ia.RandomForestClassifierModel()

        print "Training the model on the Frame"
        classifier.train(frame, 'Class', ['Dim_1', 'Dim_2'], num_classes=2)

        print "Predicting on the Frame"
        output = classifier.predict(frame)

        # classifier predict appends predicted_class
        self.assertEqual(output.column_names,
                         ['Class', 'Dim_1', 'Dim_2', 'predicted_class'])

        print "Initializing the classifier model object"
        regressor = ia.RandomForestRegressorModel()

        print "Training the model on the Frame"
        regressor.train(frame, 'Class', ['Dim_1', 'Dim_2'])

        print "Predicting on the Frame"
        regressor_output = regressor.predict(frame)

        # regressor predict appends predicted_value
        self.assertEqual(regressor_output.column_names,
                         ['Class', 'Dim_1', 'Dim_2', 'predicted_value'])
    def test_category_summary_threshold(self):
        print "create frame"
        frame = ta.Frame(self.csv)

        print "compute category summary"
        cm = frame.categorical_summary(('source', {'threshold': 0.5}))

        expected_result = {
            u'categorical_summary': [{
                u'column':
                u'source',
                u'levels': [{
                    u'percentage': 0.0,
                    u'frequency': 0,
                    u'level': u'Missing'
                }, {
                    u'percentage': 1.0,
                    u'frequency': 28,
                    u'level': u'Other'
                }]
            }]
        }
        self.assertEquals(
            cm, expected_result,
            "test_category_summary_threshold expected_result %s got %s" %
            (expected_result, cm))
    def test_category_summary_topk(self):
        print "create frame"
        frame = ta.Frame(self.csv)

        print "compute category summary"
        cm = frame.categorical_summary(('source', {'top_k': 2}))

        expected_result = {
            u'categorical_summary': [{
                u'column':
                u'source',
                u'levels': [{
                    u'percentage': 0.32142857142857145,
                    u'frequency': 9,
                    u'level': u'thing'
                }, {
                    u'percentage': 0.32142857142857145,
                    u'frequency': 9,
                    u'level': u'abstraction'
                }, {
                    u'percentage': 0.0,
                    u'frequency': 0,
                    u'level': u'Missing'
                }, {
                    u'percentage': 0.35714285714285715,
                    u'frequency': 10,
                    u'level': u'Other'
                }]
            }]
        }

        self.assertEquals(
            cm, expected_result,
            "test_category_summary_topk expected_result %s got %s" %
            (expected_result, cm))
Beispiel #24
0
    def test_page_rank(self):
        """
        tests page_rank, +piggyback last_read_date testing

        Fix: replaced the deprecated Python 2 ``dict.has_key()`` (removed in
        Python 3) with the ``in`` membership operator.
        """
        graph_data = "/datasets/page_rank_test_data.csv"
        schema = [("followed", ta.int32), ("follows", ta.int32)]
        frame = ta.Frame(ta.CsvFile(graph_data, schema))

        graph = ta.Graph()
        t0 = graph.last_read_date
        graph.define_vertex_type("node")
        graph.vertices["node"].add_vertices(frame, "follows")
        t1 = graph.last_read_date
        self.assertLess(t0, t1)  # make sure the last_read_date is updating

        graph.vertices["node"].add_vertices(frame, "followed")

        graph.define_edge_type("e1", "node", "node", directed=True)
        graph.edges["e1"].add_edges(frame, "follows", "followed")
        t2 = graph.last_read_date
        self.assertLess(t1, t2)  # make sure the last_read_date is updating
        result = graph.graphx_pagerank(output_property="PageRank",
                                       max_iterations=2,
                                       convergence_tolerance=0.001)
        t3 = graph.last_read_date
        self.assertLess(t2, t3)  # make sure the last_read_date is updating

        vertex_dict = result['vertex_dictionary']
        edge_dict = result['edge_dictionary']

        # pagerank must have added a PageRank property to both schemas
        self.assertTrue('PageRank' in dict(vertex_dict['node'].schema))

        self.assertTrue('PageRank' in dict(edge_dict['e1'].schema))

        t4 = graph.last_read_date
        self.assertEqual(
            t3, t4)  # metadata access should not have updated the date
    def setUp(self):
        """Build a small bipartite graph from the Oregon cities dataset.

        Vertices: 'city' and 'population_2013'; edges: undirected 'rank'
        links carrying the rank column. Exposes self.frame, self.graph,
        and self.vertex_frame (the 'city' vertex frame) to the tests.
        """
        # Pipe-delimited source with one header row to skip.
        source = ta.CsvFile("/datasets/oregon-cities.csv",
                            schema=[('rank', ta.int32), ('city', str),
                                    ('population_2013', str), ('pop_2010', str),
                                    ('change', str), ('county', str)],
                            delimiter='|',
                            skip_header_lines=1)
        self.frame = ta.Frame(source)

        self.graph = ta.Graph()
        for vertex_label in ('city', 'population_2013'):
            self.graph.define_vertex_type(vertex_label)
        self.graph.define_edge_type('rank', 'city', 'population_2013',
                                    directed=False)

        # Each vertex type is populated from the column of the same name.
        for column in ('city', 'population_2013'):
            self.graph.vertices[column].add_vertices(self.frame, column)
        self.graph.edges['rank'].add_edges(self.frame, 'city',
                                           'population_2013', ['rank'],
                                           create_missing_vertices=False)

        self.vertex_frame = self.graph.vertices['city']
Beispiel #26
0
    def setUp(self):
        print "define csv file"
        csv = ta.CsvFile("/datasets/flattenable.csv", schema= [('number', ta.int32),
                                                             ('abc', str),
                                                             ('food', str)], delimiter=',')

        print "create frame"
        self.frame = ta.Frame(csv)
Beispiel #27
0
 def setUp(self):
     """Create the shared movie-ratings frame used by the tests in this case."""
     print "define csv file"
     # Comma-delimited (CsvFile default); both the CsvFile and the Frame are
     # kept on self so individual tests can reuse them.
     self.csv = ta.CsvFile("/datasets/movie.csv",
                           schema=[('user', ta.int32), ('vertex_type', str),
                                   ('movie', ta.int32),
                                   ('rating', ta.int32), ('splits', str)])
     print "creating frame"
     self.frame = ta.Frame(self.csv)
Beispiel #28
0
    def test_graph(self):
        """End-to-end graph construction: CSV -> frame -> vertices -> edges.

        Builds a user/movie bipartite graph from the movie ratings dataset
        and checks row/column counts after each construction step.
        """
        print "define csv file"
        csv = ta.CsvFile("/datasets/movie.csv", schema= [('user', ta.int32),
                                            ('vertex_type', str),
                                            ('movie', ta.int32),
                                            ('rating', ta.int32),
                                            ('splits', str)])

        print "creating frame"
        frame = ta.Frame(csv)

        # TODO: add asserts verifying inspect is working
        print
        print frame.inspect(20)
        print
        self.assertEquals(frame.row_count, 20, "frame should have 20 rows")
        #self.assertEqual(frame.column_names, ['', '', '', '', ''])
        self.assertEquals(len(frame.column_names), 5, "frame should have 5 columns")

        print "create graph"
        graph = ta.Graph()

        self.assertIsNotNone(graph.uri)

        print "define vertices and edges"
        graph.define_vertex_type('movies')
        graph.define_vertex_type('users')
        graph.define_edge_type('ratings', 'users', 'movies', directed=True)
        # Newly defined vertex/edge frames must start empty.
        self.assertEquals(graph.vertices['users'].row_count, 0, "making sure newly defined vertex frame does not have rows")
        self.assertEquals(graph.vertices['movies'].row_count, 0, "making sure newly defined vertex frame does not have rows")
        self.assertEquals(graph.edges['ratings'].row_count, 0, "making sure newly defined edge frame does not have rows")
        #self.assertEquals(graph.vertex_count, 0, "no vertices expected yet")
        #self.assertEquals(graph.edge_count, 0, "no edges expected yet")

        print "add_vertices() users"
        # Empty list means no extra columns are copied onto the vertices.
        graph.vertices['users'].add_vertices( frame, 'user', [])

        # TODO: add asserts verifying inspect is working
        print
        print graph.vertices['users'].inspect(20)
        print
        # 13 distinct users in the 20-row dataset.
        self.assertEquals(graph.vertices['users'].row_count, 13)
        self.assertEquals(len(graph.vertices['users'].column_names), 3)
        #self.assertEquals(graph.vertices['users'].row_count, graph.vertex_count, "row count of user vertices should be same as vertex count on graph")

        print "add_vertices() movies"
        graph.vertices['movies'].add_vertices( frame, 'movie', [])
        # Adding movies must not disturb the users vertex frame.
        self.assertEquals(graph.vertices['users'].row_count, 13)
        self.assertEquals(graph.vertices['movies'].row_count, 11)
        self.assertEquals(len(graph.vertices['users'].column_names), 3)
        self.assertEquals(len(graph.vertices['movies'].column_names), 3)
        #self.assertEquals(graph.vertex_count, 24, "vertex_count should be the total number of users and movies")

        print "add_edges()"
        # Vertices were fully populated above, so missing-vertex creation is off.
        graph.edges['ratings'].add_edges(frame, 'user', 'movie', ['rating'], create_missing_vertices=False)
        self.assertEquals(len(graph.edges['ratings'].column_names), 5)
        self.assertEquals(graph.edges['ratings'].row_count, 20, "expected 20 rating edges")
Beispiel #29
0
    def setUp(self):
        print "define csv file"
        csv = ta.CsvFile("/datasets/dates.csv",
                         schema=[('start', ta.datetime), ('id', int),
                                 ('stop', ta.datetime), ('color', str)],
                         delimiter=',')

        print "create frame"
        self.frame = ta.Frame(csv)
Beispiel #30
0
    def setUp(self):
        """Build the movie frame; schema is kept on self for reuse by tests."""
        # there's already a "splits" column in this data set, but for testing purposes, it doesn't affect anything
        print "define csv file"
        self.schema = [('user', ta.int32), ('vertex_type', str),
                       ('movie', ta.int32), ('rating', ta.int32),
                       ('splits', str)]
        self.csv = ta.CsvFile("/datasets/movie.csv", self.schema)

        print "creating frame"
        self.frame = ta.Frame(self.csv)