def test_generic_drop_with_list(self):
    """Drop two frames in one ta.drop() call, mixing a proxy object and a name string."""
    name_a = str(uuid.uuid1()).replace('-', '_')
    proxy_a = ta.Frame(name=name_a)
    name_b = str(uuid.uuid1()).replace('-', '_')
    ta.Frame(name=name_b)
    # One entry is a frame proxy object, the other is a plain frame name
    items_to_drop = [proxy_a, name_b]
    # Both frames must exist on the server before the drop
    for name in (name_a, name_b):
        self.assertTrue(
            name in ta.get_frame_names(),
            name + " should exist in the list of frame names")
    self.assertEqual(
        2, ta.drop(items_to_drop),
        "drop() should have deleted the 2 items from the list")
    # Neither frame should remain afterwards
    for name in (name_a, name_b):
        self.assertFalse(
            name in ta.get_frame_names(),
            name + " should not be in the list of frame names")
def test_frame_drop(self): print "define csv file" csv = ta.CsvFile("/datasets/classification-compute.csv", schema=[('a', str), ('b', ta.int32), ('labels', ta.int32), ('predictions', ta.int32)], delimiter=',', skip_header_lines=1) print "create frame" frame = ta.Frame(csv, name="test_frame_drop") print "dropping frame by entity" ta.drop_frames(frame) frames = ta.get_frame_names() self.assertFalse("test_frame_drop" in frames, "test_frame_drop should not exist in list of frames") frame = ta.Frame(csv, name="test_frame_drop") print "dropping frame by name" self.assertEqual(1, ta.drop_frames("test_frame_drop"), "drop_frames() should have deleted one frame") self.assertFalse("test_frame_drop" in frames, "test_frame_drop should not exist in list of frames")
def test_arx_with_lag(self):
    """Train an ARX model with yMaxLag=2 and xMaxLag=2, then predict on a test set.

    Checks the exact trained coefficients and the predicted_y column values
    (the first two predictions are None because of the lag window).
    """
    print "define csv file"
    schema = [("y", ta.float64), ("visitors", ta.float64),
              ("wkends", ta.float64), ("seasonality", ta.float64),
              ("incidentRate", ta.float64), ("holidayFlag", ta.float64),
              ("postHolidayFlag", ta.float64), ("mintemp", ta.float64)]
    csv = ta.CsvFile("/datasets/arx_train.csv", schema=schema,
                     skip_header_lines=1)
    print "create training frame"
    train_frame = ta.Frame(csv)
    print "Initializing a ArxModel object"
    arx = ta.ArxModel()
    print "Training the model on the Frame with yMaxLag = 2 and xMaxLag = 2"
    # Trailing args: yMaxLag=2, xMaxLag=2, noIntercept/flag=True
    coefficients = arx.train(
        train_frame, "y",
        ["visitors", "wkends", "seasonality", "incidentRate", "mintemp"],
        2, 2, True)
    # Exact coefficient values pinned for regression detection
    self.assertEqual(coefficients['coefficients'], [
        -0.033117384191517614, -0.06529674497484411,
        -3.328096129192338e-08, -1.4422196518869838e-08,
        -2.8970459135396235e-06, 2.0984826788508606e-06,
        504.6479199133054, 995.00122376607, 3.56120683505247e-08,
        -5.406341176251538e-08, -7.47887430442836e-08,
        7.306703786303277e-08, 2.3924223466200682e-08,
        2.2165130696795696e-06, 15238.142787722905,
        2.061070059690899e-08, 1.3089764633101732e-07
    ])
    print "create test frame"
    csv = ta.CsvFile("/datasets/arx_test.csv", schema=schema,
                     skip_header_lines=1)
    test_frame = ta.Frame(csv)
    print "Predicting on the Frame"
    p = arx.predict(
        test_frame, "y",
        ["visitors", "wkends", "seasonality", "incidentRate", "mintemp"])
    self.assertEqual(p.column_names, [
        "y", "visitors", "wkends", "seasonality", "incidentRate",
        "holidayFlag", "postHolidayFlag", "mintemp", "predicted_y"
    ])
    # First two rows are None: the lag of 2 consumes them
    expected_results = [[None], [None], [101.99999649931183],
                        [98.00000211077416], [111.999996872938],
                        [99.00000347596028], [99.00000489674761],
                        [86.9999967418149], [103.00000106651471],
                        [114.99999387693828], [100.99999426757434],
                        [124.99999322753226], [116.99999537263702],
                        [109.00000298901594], [110.99999768325104],
                        [104.99999176999377]]
    self.assertEqual(expected_results,
                     p.take(p.row_count, 0, "predicted_y"))
def test_arimax_air_quality(self):
    """Train an ARIMAX(1,1,1) model on air-quality data and check predictions.

    Data from Lichman, M. (2013). UCI Machine Learning Repository
    [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
    School of Information and Computer Science.
    """
    print "Define csv file"
    schema = [("Date", str), ("Time", str), ("CO_GT", ta.float64),
              ("PT08_S1_CO", ta.float64), ("NMHC_GT", ta.float64),
              ("C6H6_GT", ta.float64), ("PT08_S2_NMHC", ta.float64),
              ("NOx_GT", ta.float64), ("PT08_S3_NOx", ta.float64),
              ("NO2_GT", ta.float64), ("PT08_S4_NO2", ta.float64),
              ("PT08_S5_O3", ta.float64), ("T", ta.float64),
              ("RH", ta.float64), ("AH", ta.float64)]
    csv = ta.CsvFile("/datasets/arimax_train.csv", schema=schema,
                     skip_header_lines=1)
    print "Create training frame"
    train_frame = ta.Frame(csv)
    print "Initializing a ArimaxModel object"
    arimax = ta.ArimaxModel()
    print "Training the model on the Frame"
    # Positional args after the x columns: p=1, d=1, q=1, xregMaxLag=1,
    # then two boolean flags -- assumed (includeOriginalX, includeIntercept);
    # TODO confirm against the ArimaxModel.train signature
    arimax.train(train_frame, "CO_GT", ["C6H6_GT", "PT08_S2_NMHC", "T"],
                 1, 1, 1, 1, True, False)
    print "Create test frame"
    csv2 = ta.CsvFile("/datasets/arimax_test.csv", schema=schema,
                      skip_header_lines=1)
    test_frame = ta.Frame(csv2)
    print "Predicting on the Frame"
    p = arimax.predict(test_frame, "CO_GT", ["C6H6_GT", "PT08_S2_NMHC", "T"])
    # Each row pairs the observed CO_GT with the model's predicted_y
    # (-200.0 is the dataset's missing-value marker)
    expected_results = [[3.9, 3.1384052036036163], [3.7, 2.2096085801345],
                        [6.6, 3.052618296503863], [4.4, 2.1495532900204375],
                        [3.5, 2.929771168550256], [5.4, 2.155756454454324],
                        [2.7, 2.8784218519015745], [1.9, 2.1528352219380147],
                        [1.6, 2.7830795782099473], [1.7, 2.1096269282113664],
                        [-200.0, 2.8628707912495215], [1.0, 2.0471200633069278],
                        [1.2, 2.7726186606363887], [1.5, 2.0820391788568395],
                        [2.7, 2.9878888229516978], [3.7, 2.3182512709816443],
                        [3.2, 3.211283519783637], [4.1, 2.5541133101407363],
                        [3.6, 3.268861636132588], [2.8, 2.467897319671856]]
    self.assertEqual(expected_results,
                     p.take(20, columns=["CO_GT", "predicted_y"]))
def test_append_frame(self):
    """Append one frame to another and verify row count and unioned columns."""
    base = ta.Frame(self.csv1)
    self.assertEqual(20, base.row_count)
    self.assertEqual([col for col, _ in self.schema1], base.column_names)
    extra = ta.Frame(self.csv2)
    self.assertEqual(10, extra.row_count)
    self.assertEqual([col for col, _ in self.schema2], extra.column_names)
    base.append(extra)
    # 20 + 10 rows; columns are the union described by combined_schema
    self.assertEqual(30, base.row_count)
    self.assertEqual([col for col, _ in self.combined_schema],
                     base.column_names)
def test_max_air_quality(self): # Data from Lichman, M. (2013). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. print "Define csv file" schema = [("Date", str), ("Time", str), ("CO_GT", ta.float64), ("PT08_S1_CO", ta.float64), ("NMHC_GT", ta.float64), ("C6H6_GT", ta.float64), ("PT08_S2_NMHC", ta.float64), ("NOx_GT", ta.float64), ("PT08_S3_NOx", ta.float64), ("NO2_GT", ta.float64), ("PT08_S4_NO2", ta.float64), ("PT08_S5_O3", ta.float64), ("T", ta.float64), ("RH", ta.float64), ("AH", ta.float64)] csv = ta.CsvFile("/datasets/arimax_train.csv", schema=schema, skip_header_lines=1) print "Create training frame" train_frame = ta.Frame(csv) print "Initializing a MaxModel object" max = ta.MaxModel() print "Training the model on the Frame" max.train(train_frame, "CO_GT", ["C6H6_GT", "PT08_S2_NMHC", "T"], 3, 0, True, False) print "Create test frame" csv2 = ta.CsvFile("/datasets/arimax_test.csv", schema=schema, skip_header_lines=1) test_frame = ta.Frame(csv2) print "Predicting on the Frame" p = max.predict(test_frame, "CO_GT", ["C6H6_GT", "PT08_S2_NMHC", "T"]) expected_results = [[3.9, 0.40372259585936543], [3.7, 6.6634901462882725], [6.6, 5.981442062684975], [4.4, 5.35837518115529], [3.5, 5.026072844339458], [5.4, 4.569157131217689], [2.7, 4.029165833891962], [1.9, 3.9460902496880044], [1.6, 3.779939081280088], [1.7, 3.655325704974152], [-200.0, 3.2399477839543613], [1.0, 2.9076454471385293], [1.2, 3.4476367444642566], [1.5, 2.9907210313424875], [2.7, 2.6168809024246764], [3.7, 2.6999564866286345], [3.2, 3.987628041789983], [4.1, 5.150686220645396], [3.6, 6.479895567908723], [2.8, 7.642953746764134]] self.assertEqual(expected_results, p.take(20, columns=["CO_GT", "predicted_y"]))
def test_duplicate_frame_rename(self):
    """Renaming a frame to a name already used by another frame, a graph,
    or a model must raise, and must leave every entity intact on the server."""
    frame_name1 = str(uuid.uuid1()).replace('-', '_')
    frame_name2 = str(uuid.uuid1()).replace('-', '_')
    graph_name = str(uuid.uuid1()).replace('-', '_')
    model_name = str(uuid.uuid1()).replace('-', '_')
    # Create frames, graph, and model to test with
    frame1 = ta.Frame(name=frame_name1)
    frame2 = ta.Frame(name=frame_name2)
    ta.Graph(name=graph_name)
    ta.KMeansModel(name=model_name)
    # After creating frames, check that frames with each name exists on the server
    self.assertTrue(frame_name1 in ta.get_frame_names(),
                    frame_name1 + " should exist in list of frames")
    self.assertTrue(frame_name2 in ta.get_frame_names(),
                    frame_name2 + " should exist in list of frames")
    # Try to rename frame2 to have the same name as frame1 (we expect an exception here)
    with self.assertRaises(Exception):
        frame2.name = frame_name1
    # Both frame names should still exist on the server
    self.assertTrue(frame_name1 in ta.get_frame_names(),
                    frame_name1 + " should still exist in list of frames")
    self.assertTrue(frame_name2 in ta.get_frame_names(),
                    frame_name2 + " should still exist in list of frames")
    # Try to rename frame1 to have the same name as the graph (we expect an exception here)
    # -- names are apparently unique across entity types, not just per type
    with self.assertRaises(Exception):
        frame1.name = graph_name
    # frame1 and the graph should still exist on the server
    self.assertTrue(
        frame_name1 in ta.get_frame_names(),
        frame_name1 + " should still exist in the list of frames")
    self.assertTrue(
        graph_name in ta.get_graph_names(),
        graph_name + " should still exist in the list of graphs")
    # Try to rename frame1 to have the same name as the model (we expect an exception here)
    with self.assertRaises(Exception):
        frame1.name = model_name
    # frame1 and the model should still exist on the server
    self.assertTrue(
        frame_name1 in ta.get_frame_names(),
        frame_name1 + " should still exist in the list of frames")
    self.assertTrue(
        model_name in ta.get_model_names(),
        model_name + " should still exist in the list of models")
def test_arx_no_lags(self):
    """Train an ARX model with yMaxLag=0 and xMaxLag=0 and verify that every
    test row gets a prediction (no rows consumed by a lag window)."""
    print "define csv file"
    schema = [("y", ta.float64), ("visitors", ta.float64),
              ("wkends", ta.float64), ("seasonality", ta.float64),
              ("incidentRate", ta.float64), ("holidayFlag", ta.float64),
              ("postHolidayFlag", ta.float64), ("mintemp", ta.float64)]
    csv = ta.CsvFile("/datasets/arx_train.csv", schema=schema,
                     skip_header_lines=1)
    print "create training frame"
    train_frame = ta.Frame(csv)
    print "Initializing a ArxModel object"
    arx = ta.ArxModel()
    print "Training the model on the Frame"
    # yMaxLag=0, xMaxLag=0: a pure regression with no autoregressive terms
    arx.train(train_frame, "y", [
        "visitors", "wkends", "seasonality", "incidentRate",
        "holidayFlag", "postHolidayFlag", "mintemp"
    ], 0, 0, True)
    print "create test frame"
    csv = ta.CsvFile("/datasets/arx_test.csv", schema=schema,
                     skip_header_lines=1)
    test_frame = ta.Frame(csv)
    print "Predicting on the Frame"
    p = arx.predict(test_frame, "y", [
        "visitors", "wkends", "seasonality", "incidentRate",
        "holidayFlag", "postHolidayFlag", "mintemp"
    ])
    self.assertEqual(p.column_names, [
        "y", "visitors", "wkends", "seasonality", "incidentRate",
        "holidayFlag", "postHolidayFlag", "mintemp", "predicted_y"
    ])
    # No lag, so unlike the lagged test there are no leading None rows
    expected_results = [[99.99999234330198], [98.00000220169095],
                        [101.99999803760333], [98.00000071010813],
                        [111.99999886664024], [99.00000373787175],
                        [99.00000353440495], [86.99999823659364],
                        [103.00000236184275], [114.99999178843603],
                        [100.9999939917012], [124.99999319338036],
                        [116.9999989603231], [109.00000481908955],
                        [110.99999666776476], [104.99999266331749]]
    self.assertEqual(expected_results,
                     p.take(p.row_count, 0, "predicted_y"))
def test_gc_drop_stale_and_finalize(self):
    """Exercise the garbage-collection admin calls: drop_stale() only drops
    unnamed frames older than the given age, and finalize_dropped() only
    finalizes frames already in DROPPED state.

    f1 is unnamed (eligible for stale-drop); f2 is named and must survive.
    Each frame also gets an (unnamed) error frame that follows its parent.
    """
    csv = ta.CsvFile("/datasets/dates.csv",
                     schema=[('start', ta.datetime), ('id', int),
                             ('stop', ta.datetime), ('color', str)],
                     delimiter=',')
    f2_name = "dates_two"
    # clean up any leftover frame from a previous run
    if f2_name in ta.get_frame_names():
        ta.drop_frames(f2_name)
    f1 = ta.Frame(csv)
    f1e = f1.get_error_frame()
    self.assertIsNotNone(f1e)
    self.assertIsNone(f1e.name)  # error frames are unnamed
    f2 = ta.Frame(csv, name=f2_name)
    f2e = f2.get_error_frame()
    self.assertIsNotNone(f2e)
    self.assertIsNone(f2e.name)
    admin.drop_stale(
    )  # first, normal drop_stale, nothing should change because these frames aren't old enough
    self.assertEqual("ACTIVE", f1.status)
    self.assertEqual("ACTIVE", f1e.status)
    self.assertEqual("ACTIVE", f2.status)
    self.assertEqual("ACTIVE", f2e.status)
    # print "f1.status=%s, f2.status=%s" % (f1.status, f2.status)
    admin.finalize_dropped(
    )  # nothing is dropped, so nothing should be finalized
    self.assertEqual("ACTIVE", f1.status)
    self.assertEqual("ACTIVE", f1e.status)
    self.assertEqual("ACTIVE", f2.status)
    self.assertEqual("ACTIVE", f2e.status)
    admin.drop_stale(
        "1ms"
    )  # now drop with very tiny age, so non-named f1 should get dropped
    self.assertEqual("DROPPED", f1.status)
    self.assertEqual("DROPPED", f1e.status)
    self.assertEqual("ACTIVE", f2.status)  # named frame survives
    self.assertEqual("ACTIVE", f2e.status)
    # print "f1.status=%s, f2.status=%s" % (f1.status, f2.status)
    admin.finalize_dropped(
    )  # only f1 and f1e are dropped, so only they should be finalized
    self.assertEqual("FINALIZED", f1.status)
    self.assertEqual("FINALIZED", f1e.status)
    self.assertEqual("ACTIVE", f2.status)
    self.assertEqual("ACTIVE", f2e.status)
def test_add_columns_and_copy_where(self):
    """
    Tests UDFs for add_columns and copy(where), and uses the vector type

    Changes the 2 population strings to a vector, and then uses the vector
    to compute the change, and then copy out all the incorrect ones
    """
    frame = ta.Frame(csv)
    self.assertEquals(frame.row_count, 20, "frame should have 20 rows")
    # Strip thousands separators and parse the two population columns
    # into a single 2-element vector column.
    # NOTE(review): dict-based str.translate only works on unicode strings
    # in Python 2 -- assumes the row values arrive as unicode; verify.
    frame.add_columns(
        lambda row: [
            float(row['pop_2010'].translate({ord(','): None})),
            float(row['population_2013'].translate({ord(','): None}))
        ], ("vpops", ta.vector(2)))
    self.assertEquals(frame.row_count, 20, "frame should have 20 rows")
    self.assertEquals(frame.column_names, [
        'rank', 'city', 'population_2013', 'pop_2010', 'change',
        'county', 'vpops'
    ])
    # Recompute the fractional population change from the vector
    frame.add_columns(
        lambda row: (row.vpops[1] - row.vpops[0]) / row.vpops[0],
        ("comp_change", ta.float64))
    #print frame.inspect(20)
    # Copy out rows where the csv's 'change' string disagrees with the
    # recomputed percentage (formatted the same way for comparison)
    bad_cities = frame.copy(columns=['city', 'change', 'comp_change'],
                            where=lambda row: row.change != "%.2f%%" %
                            round(100 * row.comp_change, 2))
    self.assertEquals(bad_cities.column_names,
                      ['city', 'change', 'comp_change'])
    self.assertEquals(bad_cities.row_count, 1)
    #print bad_cities.inspect()
    row = bad_cities.take(1)[0]
    # round the float so the equality check is stable
    row[2] = round(row[2], 5)
    self.assertEquals(row, [u'Tualatin', u'4.17%', 0.03167
                            ])  # should just be one bad one, Tualatin
def test_adf_column_types(self):
    """
    Tests the Augmented Dickey-Fuller test with different column types

    Fix: the sentinel RuntimeError was raised inside the try block, so it
    was swallowed by ``except Exception`` and the failure showed up as a
    confusing message assertion instead. It now lives in the else clause,
    which only runs when the server call did NOT raise.
    """
    data = [[1, "a", 1.5], [2, "b", 18.5], [4, "c", 22.1], [5, "d", 19.0],
            [7, "e", 25.6], [8, "f", 36.75]]
    schema = [("int_column", ta.int32), ("str_column", str),
              ("float_column", ta.float32)]
    frame = ta.Frame(ta.UploadRows(data, schema))
    try:
        # string column should have an error
        frame.timeseries_augmented_dickey_fuller_test("str_column", 0)
    except Exception as e:
        assert ("Column str_column was not numerical" in e.message)
    else:
        # Raised outside the try so the except above cannot swallow it
        raise RuntimeError(
            "Expected error since the str_column is not numerical.")
    # Numerical columns should not have an error
    self.assertNotEqual(
        frame.timeseries_augmented_dickey_fuller_test("int_column", 0),
        None)
    self.assertNotEqual(
        frame.timeseries_augmented_dickey_fuller_test("float_column", 0),
        None)
def test_bpt_invalid_column(self):
    """
    Tests the Breusch-Pagan test with non-numerical data, and expects an error

    Fix: the sentinel RuntimeErrors were raised inside the try blocks, so
    they were swallowed by ``except Exception`` and surfaced only as a
    confusing message assertion. They now live in else clauses, which run
    only when the server call did NOT raise.
    """
    data = [[1, "a", 1.5], [2, "b", 18.5], [4, "c", 22.1], [5, "d", 19.0],
            [7, "e", 25.6], [8, "f", 36.75]]
    schema = [("int_column", ta.int32), ("str_column", str),
              ("float_column", ta.float32)]
    frame = ta.Frame(ta.UploadRows(data, schema))
    try:
        # string y column must be rejected
        frame.timeseries_breusch_pagan_test("str_column",
                                            ["int_column", "float_column"])
    except Exception as e:
        assert ("Column str_column was not numerical" in e.message)
    else:
        raise RuntimeError(
            "Expected error since the y column specified has strings")
    try:
        # string x column must be rejected too
        frame.timeseries_breusch_pagan_test("float_column",
                                            ["int_column", "str_column"])
    except Exception as e:
        assert ("Column str_column was not numerical" in e.message)
    else:
        raise RuntimeError(
            "Expected error since one of the x columns specified has strings."
        )
    # numerical data should not have an error
    self.assertNotEqual(
        frame.timeseries_breusch_pagan_test("float_column", ["int_column"]),
        None)
def test_lasso(self):
    """Smoke test: train a Lasso model on the lpsa data, predict, and print
    test metrics. No assertions beyond successful execution."""
    print "create frame"
    frame = ta.Frame(ta.CsvFile("/datasets/lasso_lpsa.csv", schema=[
        ('y', ta.float64), ('x1', ta.float64), ('x2', ta.float64),
        ('x3', ta.float64), ('x4', ta.float64), ('x5', ta.float64),
        ('x6', ta.float64), ('x7', ta.float64), ('x8', ta.float64)],
        delimiter=' '))
    model = ta.LassoModel()
    model.train(frame, 'y', ['x1','x2','x3','x4','x5','x6','x7','x8'])
    #print repr(train_output)
    predicted_frame = model.predict(frame)
    print predicted_frame.inspect(20, columns=['y', 'predicted_value'])
    # NOTE(review): test() is given 'predicted_value' as the label column
    # rather than 'y' -- presumably intentional for this smoke test, but
    # verify against the LassoModel.test signature.
    test_metrics = model.test(predicted_frame, 'predicted_value')
    print str(test_metrics)
def testLinearRegression(self): print "define csv file" csv = ta.CsvFile("/datasets/linear_regression_8_columns.csv", schema=[("y", ta.float64), ("1", ta.float64), ("2", ta.float64), ("3", ta.float64), ("4", ta.float64), ("5", ta.float64), ("6", ta.float64), ("7", ta.float64), ("8", ta.float64), ("9", ta.float64), ("10", ta.float64)]) print "create frame" frame = ta.Frame(csv, 'LinearRegressionSampleFrame') print "Initializing a LinearRegressionModel object" model = ta.LinearRegressionModel(name='myLinearRegressionModel') print "Training the model on the Frame" model.train(frame, 'y', ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']) output = model.predict(frame) self.assertEqual(output.column_names, [ 'y', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', 'predicted_value' ])
def test_append_new_columns(self):
    """Appending a csv with extra columns adds rows and widens the schema."""
    frame = ta.Frame(self.csv1)
    self.assertEqual(20, frame.row_count)
    self.assertEqual([col for col, _ in self.schema1], frame.column_names)
    frame.append(self.csv2)
    # 10 more rows; column set grows to the combined schema
    self.assertEqual(30, frame.row_count)
    self.assertEqual([col for col, _ in self.combined_schema],
                     frame.column_names)
def test_append_same_schema(self):
    """Appending a csv with an identical schema doubles rows, keeps columns."""
    frame = ta.Frame(self.csv1)
    self.assertEqual(20, frame.row_count)
    self.assertEqual([col for col, _ in self.schema1], frame.column_names)
    frame.append(self.csv1)
    # rows double; schema is unchanged
    self.assertEqual(40, frame.row_count)
    self.assertEqual([col for col, _ in self.schema1], frame.column_names)
def test_flatten_column_with_differing_size_vectors(self):
    """Flattening a 3-vector alongside a 2-vector zero-pads the shorter one."""
    rows = [[1, [1, 2, 3], [8, 7]],
            [2, [4, 5, 6], [6, 5]],
            [3, [7, 8, 9], [4, 3]],
            [4, [10, 11, 12], [2, 1]]]
    layout = [('a', ta.int32), ('b', ta.vector(3)), ('c', ta.vector(2))]
    test_frame = ta.Frame(ta.UploadRows(rows, layout))
    test_frame.flatten_columns(['b', 'c'])
    # every source row becomes 3 rows; the 2-element 'c' vector contributes
    # 0.0 for the third entry
    expected = [
        [1, 1.0, 8.0], [1, 2.0, 7.0], [1, 3.0, 0.0],
        [2, 4.0, 6.0], [2, 5.0, 5.0], [2, 6.0, 0.0],
        [3, 7.0, 4.0], [3, 8.0, 3.0], [3, 9.0, 0.0],
        [4, 10.0, 2.0], [4, 11.0, 1.0], [4, 12.0, 0.0],
    ]
    self.assertEqual(12, test_frame.row_count)
    self.assertEqual(expected, test_frame.take(test_frame.row_count))
def test_filter(self):
    """filter() keeps only Washington-county rows; check the surviving cities."""
    frame = ta.Frame(csv)
    self.assertEquals(frame.row_count, 20, "frame should have 20 rows")
    frame.filter(lambda row: row.county == "Washington")
    self.assertEquals(frame.row_count, 4,
                      "frame should have 4 rows after filtering")
    survivors = frame.take(frame.row_count, columns="city")
    names = sorted(str(entry[0]) for entry in survivors)
    self.assertEquals(names,
                      ["Beaverton", "Hillsboro", "Tigard", "Tualatin"])
def test_flatten_columns_with_strings_and_vectors_with_default_delimiter(self):
    """Flatten two comma-delimited string columns plus a vector column using
    the default delimiter; also check that over-supplying delimiters fails."""
    rows = [[1, "1,2", [1, 2], "a,b"],
            [2, "3,4", [3, 4], "c,d"],
            [3, "5,6", [5, 6], "e,f"],
            [4, "7,8", [7, 8], "g,h"]]
    layout = [('a', ta.int32), ('b', str), ('c', ta.vector(2)), ('d', str)]
    test_frame = ta.Frame(ta.UploadRows(rows, layout))
    # only 2 of the 3 columns are strings, so 3 delimiters is an error
    with self.assertRaises(Exception):
        test_frame.flatten_columns(['b', 'c', 'd'], [',', ',', ','])
    test_frame.flatten_columns(['b', 'c', 'd'])
    # each source row splits into two rows
    expected = [
        [1, "1", 1.0, "a"], [1, "2", 2.0, "b"],
        [2, "3", 3.0, "c"], [2, "4", 4.0, "d"],
        [3, "5", 5.0, "e"], [3, "6", 6.0, "f"],
        [4, "7", 7.0, "g"], [4, "8", 8.0, "h"],
    ]
    self.assertEqual(8, test_frame.row_count)
    self.assertEqual(expected, test_frame.take(test_frame.row_count))
def test_principal_components(self):
    """Train PCA with 9 components on an 11-column frame, then predict with
    c=5 and verify 5 principal-component columns are appended."""
    print "define csv file"
    schema = [("1", ta.float64), ("2", ta.float64), ("3", ta.float64),
              ("4", ta.float64), ("5", ta.float64), ("6", ta.float64),
              ("7", ta.float64), ("8", ta.float64), ("9", ta.float64),
              ("10", ta.float64), ("11", ta.float64)]
    train_file = ta.CsvFile("/datasets/pca_10rows.csv", schema=schema)
    print "creating the frame"
    train_frame = ta.Frame(train_file)
    # NOTE(review): message says "naivebayes" but this is a PCA model --
    # runtime string left untouched here; fix the wording separately.
    print "initializing the naivebayes model"
    p = ta.PrincipalComponentsModel()
    print "training the model on the frame"
    # train with 9 principal components over all 11 columns
    p.train(train_frame,
            ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"], 9)
    print "predicting the class using the model and the frame"
    # c=5: project onto the first 5 components; also compute t-square index
    output = p.predict(train_frame, c=5, t_square_index=True)
    output_frame = output['output_frame']
    self.assertEqual(output_frame.column_names, [
        '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
        'p_1', 'p_2', 'p_3', 'p_4', 'p_5'
    ])
def testSvm(self):
    """Train and predict with RandomForest classifier and regressor models.

    NOTE(review): the method name says Svm but the body exercises
    RandomForestClassifierModel/RandomForestRegressorModel; renaming would
    change test discovery, so it is only flagged here.
    """
    print "define csv file"
    csv = ia.CsvFile("/datasets/RandomForest.csv",
                     schema=[('Class', int), ('Dim_1', ia.float64),
                             ('Dim_2', ia.float64)])
    print "create frame"
    frame = ia.Frame(csv)
    print "Initializing the classifier model object"
    classifier = ia.RandomForestClassifierModel()
    print "Training the model on the Frame"
    classifier.train(frame, 'Class', ['Dim_1', 'Dim_2'], num_classes=2)
    print "Predicting on the Frame"
    output = classifier.predict(frame)
    # classifier appends 'predicted_class'
    self.assertEqual(output.column_names,
                     ['Class', 'Dim_1', 'Dim_2', 'predicted_class'])
    print "Initializing the classifier model object"
    regressor = ia.RandomForestRegressorModel()
    print "Training the model on the Frame"
    regressor.train(frame, 'Class', ['Dim_1', 'Dim_2'])
    print "Predicting on the Frame"
    regressor_output = regressor.predict(frame)
    # regressor appends 'predicted_value'
    self.assertEqual(regressor_output.column_names,
                     ['Class', 'Dim_1', 'Dim_2', 'predicted_value'])
def test_category_summary_threshold(self): print "create frame" frame = ta.Frame(self.csv) print "compute category summary" cm = frame.categorical_summary(('source', {'threshold': 0.5})) expected_result = { u'categorical_summary': [{ u'column': u'source', u'levels': [{ u'percentage': 0.0, u'frequency': 0, u'level': u'Missing' }, { u'percentage': 1.0, u'frequency': 28, u'level': u'Other' }] }] } self.assertEquals( cm, expected_result, "test_category_summary_threshold expected_result %s got %s" % (expected_result, cm))
def test_category_summary_topk(self):
    """categorical_summary with top_k=2 reports the two most frequent levels
    plus the synthetic 'Missing' and 'Other' buckets."""
    print "create frame"
    frame = ta.Frame(self.csv)
    print "compute category summary"
    cm = frame.categorical_summary(('source', {'top_k': 2}))
    # 9/28 each for the two top levels, 10/28 collapsed into 'Other'
    expected_result = {
        u'categorical_summary': [{
            u'column': u'source',
            u'levels': [{
                u'percentage': 0.32142857142857145,
                u'frequency': 9,
                u'level': u'thing'
            }, {
                u'percentage': 0.32142857142857145,
                u'frequency': 9,
                u'level': u'abstraction'
            }, {
                u'percentage': 0.0,
                u'frequency': 0,
                u'level': u'Missing'
            }, {
                u'percentage': 0.35714285714285715,
                u'frequency': 10,
                u'level': u'Other'
            }]
        }]
    }
    self.assertEquals(
        cm, expected_result,
        "test_category_summary_topk expected_result %s got %s" %
        (expected_result, cm))
def test_page_rank(self):
    """tests page_rank, +piggyback last_read_date testing

    Fix: replaced the deprecated dict.has_key() (removed in Python 3)
    with the 'in' operator; behavior is identical.
    """
    graph_data = "/datasets/page_rank_test_data.csv"
    schema = [("followed", ta.int32), ("follows", ta.int32)]
    frame = ta.Frame(ta.CsvFile(graph_data, schema))
    graph = ta.Graph()
    t0 = graph.last_read_date
    graph.define_vertex_type("node")
    graph.vertices["node"].add_vertices(frame, "follows")
    t1 = graph.last_read_date
    self.assertLess(t0, t1)  # make sure the last_read_date is updating
    graph.vertices["node"].add_vertices(frame, "followed")
    graph.define_edge_type("e1", "node", "node", directed=True)
    graph.edges["e1"].add_edges(frame, "follows", "followed")
    t2 = graph.last_read_date
    self.assertLess(t1, t2)  # make sure the last_read_date is updating
    result = graph.graphx_pagerank(output_property="PageRank",
                                   max_iterations=2,
                                   convergence_tolerance=0.001)
    t3 = graph.last_read_date
    self.assertLess(t2, t3)  # make sure the last_read_date is updating
    vertex_dict = result['vertex_dictionary']
    edge_dict = result['edge_dictionary']
    # membership test via 'in' instead of the deprecated has_key()
    self.assertTrue('PageRank' in dict(vertex_dict['node'].schema))
    self.assertTrue('PageRank' in dict(edge_dict['e1'].schema))
    t4 = graph.last_read_date
    self.assertEqual(
        t3, t4)  # metadata access should not have updated the date
def setUp(self):
    """Build a city/population graph fixture from the Oregon cities csv."""
    cities = ta.CsvFile("/datasets/oregon-cities.csv",
                        schema=[('rank', ta.int32), ('city', str),
                                ('population_2013', str), ('pop_2010', str),
                                ('change', str), ('county', str)],
                        delimiter='|', skip_header_lines=1)
    self.frame = ta.Frame(cities)
    g = ta.Graph()
    # two vertex types joined by an undirected 'rank' edge
    g.define_vertex_type('city')
    g.define_vertex_type('population_2013')
    g.define_edge_type('rank', 'city', 'population_2013', directed=False)
    g.vertices['city'].add_vertices(self.frame, 'city')
    g.vertices['population_2013'].add_vertices(self.frame,
                                               'population_2013')
    g.edges['rank'].add_edges(self.frame, 'city', 'population_2013',
                              ['rank'], create_missing_vertices=False)
    self.graph = g
    self.vertex_frame = g.vertices['city']
def setUp(self): print "define csv file" csv = ta.CsvFile("/datasets/flattenable.csv", schema= [('number', ta.int32), ('abc', str), ('food', str)], delimiter=',') print "create frame" self.frame = ta.Frame(csv)
def setUp(self): print "define csv file" self.csv = ta.CsvFile("/datasets/movie.csv", schema=[('user', ta.int32), ('vertex_type', str), ('movie', ta.int32), ('rating', ta.int32), ('splits', str)]) print "creating frame" self.frame = ta.Frame(self.csv)
def test_graph(self):
    """End-to-end graph build: load the movie frame, define user/movie
    vertex types and a ratings edge type, add vertices and edges, and
    verify the resulting row and column counts at each step."""
    print "define csv file"
    csv = ta.CsvFile("/datasets/movie.csv",
                     schema= [('user', ta.int32), ('vertex_type', str),
                              ('movie', ta.int32), ('rating', ta.int32),
                              ('splits', str)])
    print "creating frame"
    frame = ta.Frame(csv)
    # TODO: add asserts verifying inspect is working
    print
    print frame.inspect(20)
    print
    self.assertEquals(frame.row_count, 20, "frame should have 20 rows")
    #self.assertEqual(frame.column_names, ['', '', '', '', ''])
    self.assertEquals(len(frame.column_names), 5,
                      "frame should have 5 columns")
    print "create graph"
    graph = ta.Graph()
    self.assertIsNotNone(graph.uri)
    print "define vertices and edges"
    graph.define_vertex_type('movies')
    graph.define_vertex_type('users')
    graph.define_edge_type('ratings', 'users', 'movies', directed=True)
    # newly defined vertex/edge frames must start empty
    self.assertEquals(graph.vertices['users'].row_count, 0,
                      "making sure newly defined vertex frame does not have rows")
    self.assertEquals(graph.vertices['movies'].row_count, 0,
                      "making sure newly defined vertex frame does not have rows")
    self.assertEquals(graph.edges['ratings'].row_count, 0,
                      "making sure newly defined edge frame does not have rows")
    #self.assertEquals(graph.vertex_count, 0, "no vertices expected yet")
    #self.assertEquals(graph.edge_count, 0, "no edges expected yet")
    print "add_vertices() users"
    graph.vertices['users'].add_vertices(frame, 'user', [])
    # TODO: add asserts verifying inspect is working
    print
    print graph.vertices['users'].inspect(20)
    print
    # 13 distinct users in the 20-row movie data
    self.assertEquals(graph.vertices['users'].row_count, 13)
    self.assertEquals(len(graph.vertices['users'].column_names), 3)
    #self.assertEquals(graph.vertices['users'].row_count, graph.vertex_count, "row count of user vertices should be same as vertex count on graph")
    print "add_vertices() movies"
    graph.vertices['movies'].add_vertices(frame, 'movie', [])
    # adding movies must not disturb the users vertex frame
    self.assertEquals(graph.vertices['users'].row_count, 13)
    self.assertEquals(graph.vertices['movies'].row_count, 11)
    self.assertEquals(len(graph.vertices['users'].column_names), 3)
    self.assertEquals(len(graph.vertices['movies'].column_names), 3)
    #self.assertEquals(graph.vertex_count, 24, "vertex_count should be the total number of users and movies")
    print "add_edges()"
    graph.edges['ratings'].add_edges(frame, 'user', 'movie', ['rating'],
                                     create_missing_vertices=False)
    self.assertEquals(len(graph.edges['ratings'].column_names), 5)
    self.assertEquals(graph.edges['ratings'].row_count, 20,
                      "expected 20 rating edges")
def setUp(self): print "define csv file" csv = ta.CsvFile("/datasets/dates.csv", schema=[('start', ta.datetime), ('id', int), ('stop', ta.datetime), ('color', str)], delimiter=',') print "create frame" self.frame = ta.Frame(csv)
def setUp(self): # there's already a "splits" column in this data set, but for testing purposes, it doesn't affect anything print "define csv file" self.schema = [('user', ta.int32), ('vertex_type', str), ('movie', ta.int32), ('rating', ta.int32), ('splits', str)] self.csv = ta.CsvFile("/datasets/movie.csv", self.schema) print "creating frame" self.frame = ta.Frame(self.csv)