def test_arx_with_lag(self):
    """Train an ArxModel with lags and check its coefficients and predictions.

    Trains on /datasets/arx_train.csv (the progress message identifies the
    two ``2`` arguments as yMaxLag and xMaxLag), compares the returned
    coefficients exactly, then predicts on /datasets/arx_test.csv and
    checks both the output column names and the ``predicted_y`` values.
    """
    print("define csv file")
    all_columns = ("y", "visitors", "wkends", "seasonality", "incidentRate",
                   "holidayFlag", "postHolidayFlag", "mintemp")
    schema = [(column, ta.float64) for column in all_columns]
    x_columns = ["visitors", "wkends", "seasonality", "incidentRate",
                 "mintemp"]
    train_csv = ta.CsvFile("/datasets/arx_train.csv",
                           schema=schema,
                           skip_header_lines=1)

    print("create training frame")
    train_frame = ta.Frame(train_csv)

    print("Initializing a ArxModel object")
    arx = ta.ArxModel()

    print("Training the model on the Frame with yMaxLag = 2 and xMaxLag = 2")
    coefficients = arx.train(train_frame, "y", list(x_columns), 2, 2, True)
    expected_coefficients = [
        -0.033117384191517614,
        -0.06529674497484411,
        -3.328096129192338e-08,
        -1.4422196518869838e-08,
        -2.8970459135396235e-06,
        2.0984826788508606e-06,
        504.6479199133054,
        995.00122376607,
        3.56120683505247e-08,
        -5.406341176251538e-08,
        -7.47887430442836e-08,
        7.306703786303277e-08,
        2.3924223466200682e-08,
        2.2165130696795696e-06,
        15238.142787722905,
        2.061070059690899e-08,
        1.3089764633101732e-07,
    ]
    self.assertEqual(coefficients['coefficients'], expected_coefficients)

    print("create test frame")
    test_csv = ta.CsvFile("/datasets/arx_test.csv",
                          schema=schema,
                          skip_header_lines=1)
    test_frame = ta.Frame(test_csv)

    print("Predicting on the Frame")
    p = arx.predict(test_frame, "y", list(x_columns))
    self.assertEqual(p.column_names, list(all_columns) + ["predicted_y"])

    # One single-element row per test row; the first two rows are None.
    predicted_values = [
        None,
        None,
        101.99999649931183,
        98.00000211077416,
        111.999996872938,
        99.00000347596028,
        99.00000489674761,
        86.9999967418149,
        103.00000106651471,
        114.99999387693828,
        100.99999426757434,
        124.99999322753226,
        116.99999537263702,
        109.00000298901594,
        110.99999768325104,
        104.99999176999377,
    ]
    expected_results = [[value] for value in predicted_values]
    self.assertEqual(expected_results,
                     p.take(p.row_count, 0, "predicted_y"))
def test_arimax_air_quality(self):
    """Train an ArimaxModel on the air-quality data and check 20 predictions.

    Data from Lichman, M. (2013). UCI Machine Learning Repository
    [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
    School of Information and Computer Science.
    """
    print("Define csv file")
    float_columns = ("CO_GT", "PT08_S1_CO", "NMHC_GT", "C6H6_GT",
                     "PT08_S2_NMHC", "NOx_GT", "PT08_S3_NOx", "NO2_GT",
                     "PT08_S4_NO2", "PT08_S5_O3", "T", "RH", "AH")
    schema = [("Date", str), ("Time", str)]
    schema += [(column, ta.float64) for column in float_columns]
    x_columns = ["C6H6_GT", "PT08_S2_NMHC", "T"]
    train_csv = ta.CsvFile("/datasets/arimax_train.csv",
                           schema=schema,
                           skip_header_lines=1)

    print("Create training frame")
    train_frame = ta.Frame(train_csv)

    print("Initializing a ArimaxModel object")
    arimax = ta.ArimaxModel()

    print("Training the model on the Frame")
    arimax.train(train_frame, "CO_GT", list(x_columns),
                 1, 1, 1, 1, True, False)

    print("Create test frame")
    test_csv = ta.CsvFile("/datasets/arimax_test.csv",
                          schema=schema,
                          skip_header_lines=1)
    test_frame = ta.Frame(test_csv)

    print("Predicting on the Frame")
    p = arimax.predict(test_frame, "CO_GT", list(x_columns))

    # Each row is [CO_GT, predicted_y] for the first 20 rows.
    expected_results = [
        [3.9, 3.1384052036036163],
        [3.7, 2.2096085801345],
        [6.6, 3.052618296503863],
        [4.4, 2.1495532900204375],
        [3.5, 2.929771168550256],
        [5.4, 2.155756454454324],
        [2.7, 2.8784218519015745],
        [1.9, 2.1528352219380147],
        [1.6, 2.7830795782099473],
        [1.7, 2.1096269282113664],
        [-200.0, 2.8628707912495215],
        [1.0, 2.0471200633069278],
        [1.2, 2.7726186606363887],
        [1.5, 2.0820391788568395],
        [2.7, 2.9878888229516978],
        [3.7, 2.3182512709816443],
        [3.2, 3.211283519783637],
        [4.1, 2.5541133101407363],
        [3.6, 3.268861636132588],
        [2.8, 2.467897319671856],
    ]
    self.assertEqual(expected_results,
                     p.take(20, columns=["CO_GT", "predicted_y"]))
def test_max_air_quality(self):
    """Train a MaxModel on the air-quality data and check 20 predictions.

    Data from Lichman, M. (2013). UCI Machine Learning Repository
    [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California,
    School of Information and Computer Science.
    """
    print("Define csv file")
    float_columns = ("CO_GT", "PT08_S1_CO", "NMHC_GT", "C6H6_GT",
                     "PT08_S2_NMHC", "NOx_GT", "PT08_S3_NOx", "NO2_GT",
                     "PT08_S4_NO2", "PT08_S5_O3", "T", "RH", "AH")
    schema = [("Date", str), ("Time", str)]
    schema += [(column, ta.float64) for column in float_columns]
    x_columns = ["C6H6_GT", "PT08_S2_NMHC", "T"]
    train_csv = ta.CsvFile("/datasets/arimax_train.csv",
                           schema=schema,
                           skip_header_lines=1)

    print("Create training frame")
    train_frame = ta.Frame(train_csv)

    print("Initializing a MaxModel object")
    # Named max_model so the builtin max() is not shadowed.
    max_model = ta.MaxModel()

    print("Training the model on the Frame")
    max_model.train(train_frame, "CO_GT", list(x_columns), 3, 0, True, False)

    print("Create test frame")
    test_csv = ta.CsvFile("/datasets/arimax_test.csv",
                          schema=schema,
                          skip_header_lines=1)
    test_frame = ta.Frame(test_csv)

    print("Predicting on the Frame")
    p = max_model.predict(test_frame, "CO_GT", list(x_columns))

    # Each row is [CO_GT, predicted_y] for the first 20 rows.
    expected_results = [
        [3.9, 0.40372259585936543],
        [3.7, 6.6634901462882725],
        [6.6, 5.981442062684975],
        [4.4, 5.35837518115529],
        [3.5, 5.026072844339458],
        [5.4, 4.569157131217689],
        [2.7, 4.029165833891962],
        [1.9, 3.9460902496880044],
        [1.6, 3.779939081280088],
        [1.7, 3.655325704974152],
        [-200.0, 3.2399477839543613],
        [1.0, 2.9076454471385293],
        [1.2, 3.4476367444642566],
        [1.5, 2.9907210313424875],
        [2.7, 2.6168809024246764],
        [3.7, 2.6999564866286345],
        [3.2, 3.987628041789983],
        [4.1, 5.150686220645396],
        [3.6, 6.479895567908723],
        [2.8, 7.642953746764134],
    ]
    self.assertEqual(expected_results,
                     p.take(20, columns=["CO_GT", "predicted_y"]))
def test_arx_no_lags(self):
    """Train an ArxModel with both lag arguments 0 and check predictions.

    Uses all seven non-``y`` columns as x columns, then verifies the
    output column names and the ``predicted_y`` value of every test row.
    """
    print("define csv file")
    x_columns = ["visitors", "wkends", "seasonality", "incidentRate",
                 "holidayFlag", "postHolidayFlag", "mintemp"]
    all_columns = ["y"] + x_columns
    schema = [(column, ta.float64) for column in all_columns]
    train_csv = ta.CsvFile("/datasets/arx_train.csv",
                           schema=schema,
                           skip_header_lines=1)

    print("create training frame")
    train_frame = ta.Frame(train_csv)

    print("Initializing a ArxModel object")
    arx = ta.ArxModel()

    print("Training the model on the Frame")
    arx.train(train_frame, "y", list(x_columns), 0, 0, True)

    print("create test frame")
    test_csv = ta.CsvFile("/datasets/arx_test.csv",
                          schema=schema,
                          skip_header_lines=1)
    test_frame = ta.Frame(test_csv)

    print("Predicting on the Frame")
    p = arx.predict(test_frame, "y", list(x_columns))
    self.assertEqual(p.column_names, all_columns + ["predicted_y"])

    # One single-element row per test row.
    predicted_values = [
        99.99999234330198,
        98.00000220169095,
        101.99999803760333,
        98.00000071010813,
        111.99999886664024,
        99.00000373787175,
        99.00000353440495,
        86.99999823659364,
        103.00000236184275,
        114.99999178843603,
        100.9999939917012,
        124.99999319338036,
        116.9999989603231,
        109.00000481908955,
        110.99999666776476,
        104.99999266331749,
    ]
    expected_results = [[value] for value in predicted_values]
    self.assertEqual(expected_results,
                     p.take(p.row_count, 0, "predicted_y"))
def setUp(self):
    """Define two schemas, their concatenation, and a CsvFile for each."""
    self.schema1 = [
        ('rank', ta.int32),
        ('city', str),
        ('population_2013', str),
        ('pop_2010', str),
        ('change', str),
        ('county', str),
    ]
    self.schema2 = [
        ('number', ta.int32),
        ('abc', str),
        ('food', str),
    ]
    # A new list holding schema1's columns followed by schema2's.
    self.combined_schema = self.schema1 + self.schema2

    self.csv1 = ta.CsvFile("/datasets/oregon-cities.csv",
                           schema=self.schema1,
                           delimiter='|')
    self.csv2 = ta.CsvFile("/datasets/flattenable.csv",
                           schema=self.schema2,
                           delimiter=',')
def setUp(self):
    """Build a frame from oregon-cities.csv and a graph derived from it.

    The graph gets 'city' and 'population_2013' vertex types joined by an
    undirected 'rank' edge type; ``self.vertex_frame`` is the 'city'
    vertex frame.
    """
    cities_schema = [
        ('rank', ta.int32),
        ('city', str),
        ('population_2013', str),
        ('pop_2010', str),
        ('change', str),
        ('county', str),
    ]
    cities_csv = ta.CsvFile("/datasets/oregon-cities.csv",
                            schema=cities_schema,
                            delimiter='|',
                            skip_header_lines=1)
    self.frame = ta.Frame(cities_csv)

    self.graph = ta.Graph()
    self.graph.define_vertex_type('city')
    self.graph.define_vertex_type('population_2013')
    self.graph.define_edge_type('rank', 'city', 'population_2013',
                                directed=False)

    self.graph.vertices['city'].add_vertices(self.frame, 'city')
    self.graph.vertices['population_2013'].add_vertices(self.frame,
                                                        'population_2013')
    self.graph.edges['rank'].add_edges(self.frame,
                                       'city',
                                       'population_2013',
                                       ['rank'],
                                       create_missing_vertices=False)

    self.vertex_frame = self.graph.vertices['city']
def test_frame_drop(self):
    """Drop a named frame by entity and then by name.

    After each drop the list of frame names is fetched again from the
    server and checked to no longer contain "test_frame_drop". The
    drop-by-name call is also expected to report one deleted frame.
    """
    print("define csv file")
    csv = ta.CsvFile("/datasets/classification-compute.csv",
                     schema=[('a', str),
                             ('b', ta.int32),
                             ('labels', ta.int32),
                             ('predictions', ta.int32)],
                     delimiter=',',
                     skip_header_lines=1)

    print("create frame")
    frame = ta.Frame(csv, name="test_frame_drop")

    print("dropping frame by entity")
    ta.drop_frames(frame)
    frames = ta.get_frame_names()
    self.assertFalse("test_frame_drop" in frames,
                     "test_frame_drop should not exist in list of frames")

    frame = ta.Frame(csv, name="test_frame_drop")

    print("dropping frame by name")
    self.assertEqual(1, ta.drop_frames("test_frame_drop"),
                     "drop_frames() should have deleted one frame")
    # Re-read the names: the earlier list was taken before the frame was
    # re-created, so checking it again would prove nothing about this drop.
    frames = ta.get_frame_names()
    self.assertFalse("test_frame_drop" in frames,
                     "test_frame_drop should not exist in list of frames")
def test_lasso(self):
    """Train, predict and test a LassoModel on lasso_lpsa.csv.

    Nothing is asserted; the inspected predictions and the test metrics
    are only printed.
    """
    print("create frame")
    feature_columns = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']
    schema = [(column, ta.float64) for column in ['y'] + feature_columns]
    lasso_csv = ta.CsvFile("/datasets/lasso_lpsa.csv",
                           schema=schema,
                           delimiter=' ')
    frame = ta.Frame(lasso_csv)

    model = ta.LassoModel()
    model.train(frame, 'y', feature_columns)

    predicted_frame = model.predict(frame)
    print(predicted_frame.inspect(20, columns=['y', 'predicted_value']))

    test_metrics = model.test(predicted_frame, 'predicted_value')
    print(str(test_metrics))
def test_page_rank(self):
    """Test graphx_pagerank and piggyback checks of graph.last_read_date.

    ``last_read_date`` is sampled after each group of graph operations
    and must strictly increase, except for the final pair of samples:
    reading the result schemas is metadata access and must leave the
    date unchanged.
    """
    graph_data = "/datasets/page_rank_test_data.csv"
    schema = [("followed", ta.int32), ("follows", ta.int32)]
    frame = ta.Frame(ta.CsvFile(graph_data, schema))

    graph = ta.Graph()
    t0 = graph.last_read_date
    graph.define_vertex_type("node")
    graph.vertices["node"].add_vertices(frame, "follows")
    t1 = graph.last_read_date
    self.assertLess(t0, t1)  # make sure the last_read_date is updating

    graph.vertices["node"].add_vertices(frame, "followed")
    graph.define_edge_type("e1", "node", "node", directed=True)
    graph.edges["e1"].add_edges(frame, "follows", "followed")
    t2 = graph.last_read_date
    self.assertLess(t1, t2)  # make sure the last_read_date is updating

    result = graph.graphx_pagerank(output_property="PageRank",
                                   max_iterations=2,
                                   convergence_tolerance=0.001)
    t3 = graph.last_read_date
    self.assertLess(t2, t3)  # make sure the last_read_date is updating

    vertex_dict = result['vertex_dictionary']
    edge_dict = result['edge_dictionary']

    # `in` replaces dict.has_key(), which no longer exists in Python 3;
    # assertIn also reports the available keys when the check fails.
    self.assertIn('PageRank', dict(vertex_dict['node'].schema))
    self.assertIn('PageRank', dict(edge_dict['e1'].schema))

    t4 = graph.last_read_date
    # metadata access should not have updated the date
    self.assertEqual(t3, t4)
def test_principal_components(self):
    """Train a PrincipalComponentsModel and check the predicted columns.

    Trains on the eleven columns of pca_10rows.csv (third ``train``
    argument 9), predicts with ``c=5`` and ``t_square_index=True``, and
    expects the output frame to hold the eleven input columns followed
    by 'p_1' .. 'p_5'.
    """
    print("define csv file")
    columns = [str(index) for index in range(1, 12)]  # "1" .. "11"
    schema = [(column, ta.float64) for column in columns]
    train_file = ta.CsvFile("/datasets/pca_10rows.csv", schema=schema)

    print("creating the frame")
    train_frame = ta.Frame(train_file)

    # The two messages below used to be copy-pasted from the naive Bayes
    # test ("naivebayes model", "predicting the class"), which was
    # misleading in the test log.
    print("initializing the principal components model")
    p = ta.PrincipalComponentsModel()

    print("training the model on the frame")
    p.train(train_frame, list(columns), 9)

    print("predicting on the frame using the model")
    output = p.predict(train_frame, c=5, t_square_index=True)
    output_frame = output['output_frame']

    expected_columns = columns + ['p_%d' % index for index in range(1, 6)]
    self.assertEqual(output_frame.column_names, expected_columns)
def testSvm(self):
    """Train and predict with the random forest classifier and regressor.

    NOTE(review): despite the method name, the body exercises
    RandomForestClassifierModel and RandomForestRegressorModel, not an
    SVM. The name is kept so test selection by name keeps working.

    Both models are trained on the same frame; each prediction must add
    exactly one column ('predicted_class' / 'predicted_value') after the
    three input columns.
    """
    print("define csv file")
    csv = ia.CsvFile("/datasets/RandomForest.csv",
                     schema=[('Class', int),
                             ('Dim_1', ia.float64),
                             ('Dim_2', ia.float64)])

    print("create frame")
    frame = ia.Frame(csv)

    print("Initializing the classifier model object")
    classifier = ia.RandomForestClassifierModel()

    print("Training the model on the Frame")
    classifier.train(frame, 'Class', ['Dim_1', 'Dim_2'], num_classes=2)

    print("Predicting on the Frame")
    output = classifier.predict(frame)
    self.assertEqual(output.column_names,
                     ['Class', 'Dim_1', 'Dim_2', 'predicted_class'])

    # This message used to say "classifier" here too (copy-paste).
    print("Initializing the regressor model object")
    regressor = ia.RandomForestRegressorModel()

    print("Training the model on the Frame")
    regressor.train(frame, 'Class', ['Dim_1', 'Dim_2'])

    print("Predicting on the Frame")
    regressor_output = regressor.predict(frame)
    self.assertEqual(regressor_output.column_names,
                     ['Class', 'Dim_1', 'Dim_2', 'predicted_value'])
def testLinearRegression(self):
    """Train a LinearRegressionModel and check the predicted frame's columns.

    The prediction is expected to append a single 'predicted_value'
    column after 'y' and the ten feature columns.
    """
    print("define csv file")
    feature_columns = [str(index) for index in range(1, 11)]  # '1' .. '10'
    schema = [("y", ta.float64)]
    schema += [(column, ta.float64) for column in feature_columns]
    csv = ta.CsvFile("/datasets/linear_regression_8_columns.csv",
                     schema=schema)

    print("create frame")
    frame = ta.Frame(csv, 'LinearRegressionSampleFrame')

    print("Initializing a LinearRegressionModel object")
    model = ta.LinearRegressionModel(name='myLinearRegressionModel')

    print("Training the model on the Frame")
    model.train(frame, 'y', list(feature_columns))

    output = model.predict(frame)
    self.assertEqual(output.column_names,
                     ['y'] + feature_columns + ['predicted_value'])
def setUp(self):
    """Create ``self.csv`` for movie.csv and load it into ``self.frame``."""
    print("define csv file")
    movie_schema = [
        ('user', ta.int32),
        ('vertex_type', str),
        ('movie', ta.int32),
        ('rating', ta.int32),
        ('splits', str),
    ]
    self.csv = ta.CsvFile("/datasets/movie.csv", schema=movie_schema)

    print("creating frame")
    self.frame = ta.Frame(self.csv)
def setUp(self):
    """Load flattenable.csv (comma-delimited) into ``self.frame``."""
    print("define csv file")
    flattenable_schema = [
        ('number', ta.int32),
        ('abc', str),
        ('food', str),
    ]
    source = ta.CsvFile("/datasets/flattenable.csv",
                        schema=flattenable_schema,
                        delimiter=',')

    print("create frame")
    self.frame = ta.Frame(source)
def setUp(self):
    """Load dates.csv (comma-delimited) into ``self.frame``."""
    print("define csv file")
    dates_schema = [
        ('start', ta.datetime),
        ('id', int),
        ('stop', ta.datetime),
        ('color', str),
    ]
    source = ta.CsvFile("/datasets/dates.csv",
                        schema=dates_schema,
                        delimiter=',')

    print("create frame")
    self.frame = ta.Frame(source)
def test_graph(self):
    """Build a users/movies/ratings graph from movie.csv and check counts.

    Verifies the frame's size, that freshly defined vertex and edge
    frames are empty, and the row and column counts after adding user
    vertices, movie vertices and rating edges.
    """
    print("define csv file")
    movie_schema = [
        ('user', ta.int32),
        ('vertex_type', str),
        ('movie', ta.int32),
        ('rating', ta.int32),
        ('splits', str),
    ]
    movie_csv = ta.CsvFile("/datasets/movie.csv", schema=movie_schema)

    print("creating frame")
    frame = ta.Frame(movie_csv)

    # TODO: add asserts verifying inspect is working
    print("")
    print(frame.inspect(20))
    print("")

    self.assertEqual(frame.row_count, 20, "frame should have 20 rows")
    self.assertEqual(len(frame.column_names), 5,
                     "frame should have 5 columns")

    print("create graph")
    graph = ta.Graph()
    self.assertIsNotNone(graph.uri)

    print("define vertices and edges")
    graph.define_vertex_type('movies')
    graph.define_vertex_type('users')
    graph.define_edge_type('ratings', 'users', 'movies', directed=True)

    self.assertEqual(
        graph.vertices['users'].row_count, 0,
        "making sure newly defined vertex frame does not have rows")
    self.assertEqual(
        graph.vertices['movies'].row_count, 0,
        "making sure newly defined vertex frame does not have rows")
    self.assertEqual(
        graph.edges['ratings'].row_count, 0,
        "making sure newly defined edge frame does not have rows")
    # The original's graph.vertex_count / graph.edge_count assertions
    # were commented out and are not performed anywhere in this test.

    print("add_vertices() users")
    graph.vertices['users'].add_vertices(frame, 'user', [])

    # TODO: add asserts verifying inspect is working
    print("")
    print(graph.vertices['users'].inspect(20))
    print("")

    self.assertEqual(graph.vertices['users'].row_count, 13)
    self.assertEqual(len(graph.vertices['users'].column_names), 3)

    print("add_vertices() movies")
    graph.vertices['movies'].add_vertices(frame, 'movie', [])

    self.assertEqual(graph.vertices['users'].row_count, 13)
    self.assertEqual(graph.vertices['movies'].row_count, 11)
    self.assertEqual(len(graph.vertices['users'].column_names), 3)
    self.assertEqual(len(graph.vertices['movies'].column_names), 3)

    print("add_edges()")
    graph.edges['ratings'].add_edges(frame,
                                     'user',
                                     'movie',
                                     ['rating'],
                                     create_missing_vertices=False)

    self.assertEqual(len(graph.edges['ratings'].column_names), 5)
    self.assertEqual(graph.edges['ratings'].row_count, 20,
                     "expected 20 rating edges")
def setUp(self):
    """Store the movie.csv schema, CsvFile and frame on the test instance.

    There's already a "splits" column in this data set, but for testing
    purposes it doesn't affect anything.
    """
    print("define csv file")
    self.schema = [
        ('user', ta.int32),
        ('vertex_type', str),
        ('movie', ta.int32),
        ('rating', ta.int32),
        ('splits', str),
    ]
    self.csv = ta.CsvFile("/datasets/movie.csv", self.schema)

    print("creating frame")
    self.frame = ta.Frame(self.csv)
def setUp(self):
    """Load oregon-cities.csv (pipe-delimited) into ``self.frame``."""
    print("define csv file")
    cities_schema = [
        ('rank', ta.int32),
        ('city', str),
        ('population_2013', str),
        ('pop_2010', str),
        ('change', str),
        ('county', str),
    ]
    source = ta.CsvFile("/datasets/oregon-cities.csv",
                        schema=cities_schema,
                        delimiter='|')

    print("create frame")
    self.frame = ta.Frame(source)
def test_frame_loading_multiple_files_with_wildcard(self):
    """Load every file matching movie-part*.csv into one frame.

    The resulting frame must have 20 rows, 5 columns and a non-negative
    size on disk.
    """
    movie_schema = [
        ('user', ta.int32),
        ('vertex_type', str),
        ('movie', ta.int32),
        ('rating', ta.int32),
        ('splits', str),
    ]
    wildcard_csv = ta.CsvFile("/datasets/movie-part*.csv",
                              schema=movie_schema)
    frame = ta.Frame(wildcard_csv)

    self.assertEqual(frame.row_count, 20, "frame should have 20 rows")
    self.assertEqual(len(frame.column_names), 5,
                     "frame should have 5 columns")
    self.assertGreaterEqual(frame._size_on_disk, 0,
                            "frame size on disk should be non-negative")
def testKMeans(self):
    """Train a named KMeansModel on the 'data' column of KMeansTestFile.csv.

    Nothing is asserted; the test passes if training does not raise.
    """
    print("define csv file")
    kmeans_csv = ta.CsvFile("/datasets/KMeansTestFile.csv",
                            schema=[('data', ta.float64), ('name', str)],
                            skip_header_lines=1)

    print("create frame")
    frame = ta.Frame(kmeans_csv)

    print("Initializing a KMeansModel object")
    model = ta.KMeansModel(name='myKMeansModel')

    print("Training the model on the Frame")
    # NOTE(review): [2.0] is presumably the per-column scaling - confirm
    # against the KMeansModel.train signature.
    model.train(frame, ['data'], [2.0])
def test_load_csv_with_missing_values(self):
    """Load a CSV with missing values and expect None in the empty cells."""
    schema = [
        ('a', ta.int32),
        ('b', ta.int64),
        ('c', ta.float32),
        ('d', ta.float64),
    ]
    # NOTE(review): unlike most paths in these tests, this one has no
    # leading slash - confirm that is intentional.
    missing_csv = ta.CsvFile("datasets/missing_values.csv", schema=schema)
    frame = ta.Frame(missing_csv)

    # Check row count
    self.assertEqual(6, frame.row_count)

    # Check expected values
    expected_value = [
        [1, 2, None, 3.5],
        [4, None, None, 7.0],
        [None, None, None, None],
        [None, 75, 4.5, None],
        [10, 20, 30.0, 40.5],
        [None, None, None, None],
    ]
    self.assertEqual(expected_value, frame.take(frame.row_count))
def test_gc_drop_stale_and_finalize(self):
    """Check frame status transitions across drop_stale / finalize_dropped.

    An unnamed and a named frame are created, each with an unnamed
    error frame. After every admin call the test checks the status of
    the unnamed frame and its error frame, and of the named frame and
    its error frame.
    """
    dates_csv = ta.CsvFile("/datasets/dates.csv",
                           schema=[('start', ta.datetime),
                                   ('id', int),
                                   ('stop', ta.datetime),
                                   ('color', str)],
                           delimiter=',')

    f2_name = "dates_two"
    if f2_name in ta.get_frame_names():
        ta.drop_frames(f2_name)

    unnamed = ta.Frame(dates_csv)
    unnamed_errors = unnamed.get_error_frame()
    self.assertIsNotNone(unnamed_errors)
    self.assertIsNone(unnamed_errors.name)

    named = ta.Frame(dates_csv, name=f2_name)
    named_errors = named.get_error_frame()
    self.assertIsNotNone(named_errors)
    self.assertIsNone(named_errors.name)

    def expect_statuses(unnamed_status, named_status):
        # Same four checks, in the same order, after every admin call.
        self.assertEqual(unnamed_status, unnamed.status)
        self.assertEqual(unnamed_status, unnamed_errors.status)
        self.assertEqual(named_status, named.status)
        self.assertEqual(named_status, named_errors.status)

    # first, normal drop_stale, nothing should change because these
    # frames aren't old enough
    admin.drop_stale()
    expect_statuses("ACTIVE", "ACTIVE")

    # nothing is dropped, so nothing should be finalized
    admin.finalize_dropped()
    expect_statuses("ACTIVE", "ACTIVE")

    # now drop with a very tiny age, so the unnamed frame should get
    # dropped
    admin.drop_stale("1ms")
    expect_statuses("DROPPED", "ACTIVE")

    # only the unnamed frame and its error frame are dropped, so only
    # they should be finalized
    admin.finalize_dropped()
    expect_statuses("FINALIZED", "ACTIVE")
def test_copy_empty(self):
    """Copy a frame built from empty.csv; both frames must have 0 rows."""
    cities_schema = [
        ('rank', ta.int32),
        ('city', str),
        ('population_2013', str),
        ('pop_2010', str),
        ('change', str),
        ('county', str),
    ]
    empty_csv = ta.CsvFile("/datasets/empty.csv",
                           schema=cities_schema,
                           delimiter='|')

    print("create frame")
    original = ta.Frame(empty_csv)
    original.inspect()

    print("copy frame")
    duplicate = original.copy()
    duplicate.inspect()

    self.assertEqual(original.row_count, 0)
    self.assertEqual(duplicate.row_count, 0)
def test_column_median(self):
    """Check column_median on column 'b', unweighted and weighted by 'labels'."""
    print("define csv file")
    classification_schema = [
        ('a', str),
        ('b', ta.int32),
        ('labels', ta.int32),
        ('predictions', ta.int32),
    ]
    source = ta.CsvFile("/datasets/classification-compute.csv",
                        schema=classification_schema,
                        delimiter=',',
                        skip_header_lines=1)

    print("create frame")
    frame = ta.Frame(source)

    print("compute column median()")
    column_median_b = frame.column_median(data_column='b')
    self.assertEqual(
        column_median_b, 1,
        "computed column median for column b should be equal to 1")

    column_median_b_weighted = frame.column_median(data_column='b',
                                                   weights_column='labels')
    self.assertEqual(
        column_median_b_weighted, 0,
        "computed column median for column b with weights column labels should be equal to 0")
def test_triangle_count(self):
    """Run graphx_triangle_count and check the output property exists.

    Builds a directed "edge" graph over "node" vertices taken from both
    endpoint columns of triangle_count_small.csv, then expects the 'node'
    frame of the result to have a 'triangle' column in its schema.
    """
    graph_data = "/datasets/triangle_count_small.csv"
    schema = [('from_node', str),
              ('to_node', str),
              ('max_k', ta.int64),
              ('cc', ta.int64)]
    frame = ta.Frame(ta.CsvFile(graph_data, schema))

    graph = ta.Graph()
    graph.define_vertex_type("node")
    graph.vertices["node"].add_vertices(frame, "from_node", ["max_k", "cc"])
    graph.vertices["node"].add_vertices(frame, "to_node", ["max_k", "cc"])
    graph.define_edge_type("edge", "node", "node", directed=True)
    graph.edges["edge"].add_edges(frame, "from_node", "to_node")

    result = graph.graphx_triangle_count(output_property="triangle")
    frame_result = result['node']

    # `in` replaces dict.has_key(), which no longer exists in Python 3;
    # assertIn also reports the available keys when the check fails.
    self.assertIn('triangle', dict(frame_result.schema))
def test_kclique(self):
    """Run kclique_percolation and check 'source' is in the vertex dictionary.

    Both the "source" and "target" columns are added as vertices of the
    single "source" vertex type.
    """
    print("define csv file")
    # NOTE(review): unlike most paths in these tests, this one has no
    # leading slash - confirm that is intentional.
    noun_graph_data = "datasets/noun_graph_small.csv"
    schema = [("source", str), ("target", str)]
    noun_words_frame = ta.Frame(ta.CsvFile(noun_graph_data, schema))

    graph = ta.Graph()
    graph.define_vertex_type("source")
    graph.vertices["source"].add_vertices(noun_words_frame, "source")
    graph.vertices["source"].add_vertices(noun_words_frame, "target")
    graph.define_edge_type("edge", "source", "source", False)
    graph.edges["edge"].add_edges(noun_words_frame, "source", "target")

    output = graph.kclique_percolation(
        clique_size=3,
        community_property_label="community")
    output_dictionary = output['vertex_dictionary']

    self.assertTrue('source' in output_dictionary)
def test_copy_001(self):
    """Copy the oregon-cities frame and compare it with the original.

    The copy must have the same row count as the original and a
    different ``_id``.
    """
    print("define csv file")
    cities_schema = [
        ('rank', ta.int32),
        ('city', str),
        ('population_2013', str),
        ('pop_2010', str),
        ('change', str),
        ('county', str),
    ]
    source = ta.CsvFile("/datasets/oregon-cities.csv",
                        schema=cities_schema,
                        delimiter='|')

    print("create frame")
    frame = ta.Frame(source)
    self.assertEqual(frame.row_count, 20, "frame should have 20 rows")
    self.assertEqual(frame.column_names,
                     ['rank', 'city', 'population_2013', 'pop_2010',
                      'change', 'county'])

    print("copy()")
    copied_frame = frame.copy()
    self.assertEqual(copied_frame.row_count, 20,
                     "copy should have same number of rows as original")
    self.assertNotEqual(frame._id, copied_frame._id,
                        "copy should have a different id from the original")
def testSvm(self):
    """Train a named SvmModel and check the predicted frame's columns.

    The prediction is expected to append 'predicted_label' after the
    'data' and 'label' input columns.
    """
    print("define csv file")
    svm_csv = ta.CsvFile("/datasets/SvmTestFile.csv",
                         schema=[('data', ta.float64), ('label', str)],
                         skip_header_lines=1)

    print("create frame")
    frame = ta.Frame(svm_csv)

    print("Initializing a SvmModel object")
    model = ta.SvmModel(name='mySvmModel')

    print("Training the model on the Frame")
    model.train(frame, 'label', ['data'])

    print("Predicting on the Frame")
    predicted = model.predict(frame)
    self.assertEqual(predicted.column_names,
                     ['data', 'label', 'predicted_label'])
def test_frame_rename(self):
    """Rename a frame via its ``name`` attribute and check the name listing.

    The new name must be absent from ta.get_frame_names() before the
    rename and present afterwards.
    """
    print("define csv file")
    classification_schema = [
        ('a', str),
        ('b', ta.int32),
        ('labels', ta.int32),
        ('predictions', ta.int32),
    ]
    source = ta.CsvFile("/datasets/classification-compute.csv",
                        schema=classification_schema,
                        delimiter=',',
                        skip_header_lines=1)

    print("create frame")
    frame = ta.Frame(source, name="test_frame_rename")

    new_name = "test_frame_new_name"
    self.assertFalse(new_name in ta.get_frame_names(),
                     "test_frame_new_name should not exist in list of frames")

    print("renaming frame")
    frame.name = new_name
    self.assertTrue(new_name in ta.get_frame_names(),
                    "test_frame_new_name should exist in list of frames")
def test_naive_bayes(self):
    """Train a NaiveBayesModel and check the predicted frame's columns.

    The model is trained and then predicts on the same frame; the output
    is expected to append 'predicted_class' after the four input columns.
    """
    print("define csv file")
    input_columns = ["Class", "Dim_1", "Dim_2", "Dim_3"]
    schema = [(column, ta.int32) for column in input_columns]
    train_file = ta.CsvFile("/datasets/naivebayes_spark_data.csv",
                            schema=schema)

    print("creating the frame")
    train_frame = ta.Frame(train_file)

    print("initializing the naivebayes model")
    model = ta.NaiveBayesModel()

    print("training the model on the frame")
    model.train(train_frame, 'Class', ['Dim_1', 'Dim_2', 'Dim_3'])

    print("predicting the class using the model and the frame")
    output = model.predict(train_frame)
    self.assertEqual(
        output.column_names,
        ['Class', 'Dim_1', 'Dim_2', 'Dim_3', 'predicted_class'])