def test_save_load(self):
    """Round-trip a graph through binary, CSV, and JSON formats."""
    graph = SGraph().add_vertices(self.vertices, 'vid').add_edges(
        self.edges, 'src_id', 'dst_id')
    expected_summary = {'num_vertices': 4, 'num_edges': 3}
    expected_fields = {'__id', '__src_id', '__dst_id', 'color', 'vec', 'weight'}

    # Round-trip through the native binary format.
    with util.TempDirectory() as path:
        graph.save(path)
        reloaded = load_graph(path, 'binary')
        self.assertEqual(reloaded.summary(), expected_summary)
        self.assertItemsEqual(reloaded.get_fields(), expected_fields)

    # Round-trip through CSV: vertices and edges are written as two files.
    with util.TempDirectory() as path:
        graph.save(path, format='csv')
        vertex_frame = SFrame.read_csv(path + "/vertices.csv")
        edge_frame = SFrame.read_csv(path + "/edges.csv")
        rebuilt = SGraph().add_edges(
            edge_frame, '__src_id', '__dst_id').add_vertices(
            vertex_frame, '__id')
        self.assertEqual(rebuilt.summary(), expected_summary)
        self.assertItemsEqual(rebuilt.get_fields(), expected_fields)

    # A '.json' filename makes save() emit JSON with vertex/edge keys.
    with tempfile.NamedTemporaryFile(suffix='.json') as tmp:
        graph.save(tmp.name)
        with open(tmp.name, 'r') as handle:
            parsed = json.loads(handle.read())
        self.assertTrue("vertices" in parsed)
        self.assertTrue("edges" in parsed)
def test_missing_value_vids(self):
    """A None vertex/edge id must raise once the graph is materialized."""
    vframe = SFrame()
    vframe['vid'] = [1, 2, 3, None]
    eframe = SFrame()
    eframe['src'] = [1, 2, 3, None]
    eframe['dst'] = [4, 4, 4, 4]

    # summary() forces materialization, which is where the error surfaces.
    self.assertRaises(
        RuntimeError,
        lambda: SGraph().add_vertices(vframe, 'vid').summary())
    self.assertRaises(
        RuntimeError,
        lambda: SGraph().add_edges(eframe, 'src', 'dst').summary())
    self.assertRaises(
        RuntimeError,
        lambda: SGraph().add_edges(eframe, 'dst', 'src').summary())
def test_select_query_with_same_vertex_edge_field(self):
    """A field name shared by vertices and edges appears once per side."""
    g = SGraph(SFrame({'__id': range(10)}),
               SFrame({'__src_id': range(10), '__dst_id': range(1, 11)}))

    # 'weight' exists on both vertices and edges; 'v'/'e' on one side each.
    g.vertices['weight'] = 0
    g.vertices['v'] = 0
    g.edges['weight'] = 0
    g.edges['e'] = 0
    self.assertItemsEqual(
        g.get_fields(),
        ['v', 'e', 'weight', 'weight', '__id', '__src_id', '__dst_id'])

    # Selecting the shared field keeps it on both sides plus the id fields.
    selected = g.select_fields('weight')
    self.assertItemsEqual(
        selected.get_fields(),
        ['weight', 'weight', '__id', '__src_id', '__dst_id'])
def test_sframe_le_append_skip_row_bug_is_fixed(self):
    """
    Regression test for an SFrame lazy-evaluation bug.

    It lives here because the repro requires SGraph: with a lazy_append
    in the plan, the logical filter mishandled skip_rows, leaving the
    edge SFrame in a bad state until materialized. Keep this test until
    a cleaner repro exists.
    """
    n = 12  # smallest n that reproduces the le_append bug

    # Build a path graph with edges i -> i + 1.
    edge_data = SFrame({'src': range(n), 'dst': range(1, n + 1)})
    g = SGraph().add_edges(edge_data, 'src', 'dst')

    lazy_sf = g.get_edges()
    materialized_sf = g.get_edges()
    materialized_sf.__materialize__()

    # Filtering the lazy and the materialized frames must agree.
    assert_frame_equal(
        lazy_sf[lazy_sf['__dst_id'] == n].to_dataframe(),
        materialized_sf[materialized_sf['__dst_id'] == n].to_dataframe())
def _load_sframes(self):
    """
    Load the train/test rating SFrames for every folder of a movielens
    dataset.

    Only acts when self.id is 'movielens' or 'movielens_all'. Each
    folder's '::'-delimited rating files are parsed, pruned down to the
    (user_id, item_id, rating) columns, and attached to the folder as
    folder.train_sframe / folder.test_sframe.
    """
    from graphlab.data_structures.sframe import SFrame

    def read_ratings(path):
        # The raw files are '::'-delimited with no header. Only columns
        # X1 (user), X3 (item) and X5 (rating) are kept; X2/X4/X6/X7
        # are discarded.
        sf = SFrame.read_csv(
            url=path, delimiter='::', header=False,
            column_type_hints=[int, str, int, str, float, str, int])
        sf.remove_columns(column_names=['X2', 'X4', 'X6', 'X7'])
        sf.rename({'X1': 'user_id', 'X3': 'item_id', 'X5': 'rating'})
        return sf

    if self.id == 'movielens' or self.id == 'movielens_all':
        for folder in self.folders:
            # Train and test files share the exact same format, so the
            # previously-duplicated parsing is factored into read_ratings.
            folder.train_sframe = read_ratings(folder.train_file)
            folder.test_sframe = read_ratings(folder.test_file)
def _process_movies(self, filename):
    """
    Parse the '='-delimited movie file into an SFrame of per-genre
    indicator columns, one row per movie, keyed by 'item_id'.
    """
    from graphlab.data_structures.sarray import SArray
    from graphlab.data_structures.sframe import SFrame

    sf = SFrame.read_csv(url=filename, delimiter='=', header=False,
                         column_type_hints=[int, str, str])
    sf.remove_columns(column_names=['X2'])
    sf.rename({'X1': 'item_id', })

    # One {genre: 1} dict per movie, from the '|'-separated genre list.
    genre_dicts = [{genre: 1 for genre in row['X3'].split('|')}
                   for row in sf]
    sf.add_column(SArray(genre_dicts), name='item_genre')
    sf.remove_columns(column_names=['X3', ])

    # Unpack the dicts into one column per genre.
    movie_sframe = sf.unpack('item_genre', column_name_prefix='')

    # Movies lacking a genre get 0 instead of a missing value.
    for column in movie_sframe.column_names():
        movie_sframe = movie_sframe.fillna(column, 0)
    return movie_sframe
def test_graph_constructor(self):
    """SGraph(vertices, edges) rebuilds an equivalent graph; bad field
    specs raise ValueError."""
    base = SGraph().add_vertices(self.vertices, 'vid').add_edges(
        self.edges, 'src_id', 'dst_id')
    copied = SGraph(base.vertices, base.edges)
    # Flipping src and dst fields is a legal construction.
    flipped = SGraph(base.vertices, base.edges,
                     src_field="__dst_id", dst_field="__src_id")

    def sorted_vertices(graph):
        # Canonical row order for frame comparison.
        return graph.vertices.to_dataframe().sort('__id').reset_index(drop=True)

    def sorted_edges(graph):
        return graph.edges.to_dataframe().sort(
            ['__src_id', '__dst_id']).reset_index(drop=True)

    assert_frame_equal(sorted_vertices(base), sorted_vertices(copied))
    assert_frame_equal(sorted_edges(base), sorted_edges(copied))

    # Raw frames lack the reserved '__id'/'__src_id'/'__dst_id' columns,
    # so constructions with missing or inconsistent field specs raise.
    self.assertRaises(
        ValueError,
        lambda: SGraph(SFrame(self.vertices), SFrame(self.edges)))
    self.assertRaises(
        ValueError,
        lambda: SGraph(SFrame(self.vertices), SFrame(self.edges),
                       'vid', '__src_id', '__dst_id'))
    self.assertRaises(
        ValueError,
        lambda: SGraph(SFrame(self.vertices), SFrame(self.edges),
                       vid_field=None, src_field='src_id',
                       dst_field='dst_id'))
def test_save_load(self):
    # Round-trips a graph through binary, CSV, and JSON save formats.
    # NOTE(review): a near-identical test_save_load appears earlier in this
    # source; if both live in the same TestCase class, this later definition
    # shadows the earlier one and only this one runs — confirm and dedupe.
    g = SGraph().add_vertices(self.vertices, 'vid').add_edges(self.edges, 'src_id', 'dst_id')

    # Binary round-trip: save to a temp dir, reload, compare summary/fields.
    with util.TempDirectory() as f:
        g.save(f)
        g2 = load_graph(f, 'binary')
        self.assertEqual(g2.summary(), {'num_vertices': 4, 'num_edges': 3})
        self.assertItemsEqual(g2.get_fields(), {'__id', '__src_id', '__dst_id', 'color', 'vec', 'weight'})

    # CSV round-trip: vertices and edges are written as two separate files
    # that can be read back and reassembled into an equivalent graph.
    with util.TempDirectory() as f:
        g.save(f, format='csv')
        vertices = SFrame.read_csv(f + "/vertices.csv")
        edges = SFrame.read_csv(f + "/edges.csv")
        g2 = SGraph().add_edges(edges, '__src_id', '__dst_id').add_vertices(vertices, '__id')
        self.assertEqual(g2.summary(), {'num_vertices': 4, 'num_edges': 3})
        self.assertItemsEqual(g2.get_fields(), {'__id', '__src_id', '__dst_id', 'color', 'vec', 'weight'})

    # JSON: a '.json' suffix on the filename selects JSON output with
    # top-level "vertices" and "edges" keys.
    with tempfile.NamedTemporaryFile(suffix='.json') as f:
        g.save(f.name)
        with open(f.name, 'r') as f2:
            data = f2.read()
            g2 = json.loads(data)
            self.assertTrue("vertices" in g2)
            self.assertTrue("edges" in g2)
def get(self, field):
    """
    Return the value for the queried field.

    Get the value of a given field. The list of all queryable fields is
    documented in the beginning of the model class.

    Each of these fields can be queried in one of two ways:

    >>> out = m['graph']      # m is a trained graph analytics model
    >>> out = m.get('graph')  # equivalent to previous line

    Parameters
    ----------
    field : string
        Name of the field to be retrieved.

    Returns
    -------
    out : value
        The current value of the requested field.

    Raises
    ------
    KeyError
        If `field` is not one of list_fields().

    See Also
    --------
    list_fields

    Examples
    --------
    >>> g = m.get('graph')
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.get')

    if field in self.list_fields():
        obj = self.__proxy__.get(field)
        # Wrap raw proxy objects in their public-facing types. isinstance
        # (rather than the exact `type(obj) ==` comparison used before)
        # is the idiomatic check and also covers proxy subclasses.
        if isinstance(obj, UnityGraphProxy):
            return SGraph(_proxy=obj)
        elif isinstance(obj, UnitySFrameProxy):
            return SFrame(_proxy=obj)
        else:
            return obj
    else:
        raise KeyError('Key \"%s\" not in model. Available fields are %s.' % (field, ', '.join(self.list_fields())))
def rating_prediction_switch(self, dataset, dataset_switch, model_manager, force): from graphlab.data_structures.sframe import SFrame from graphlab.data_structures.sarray import SArray import os for folder in dataset.folders: rating_prediction_file = self._get_rating_prediction_file(dataset_switch, folder) class_prediction_file = self._get_class_prediction_file(dataset_switch, folder) if os.path.exists(rating_prediction_file) and not force: print "Model " + self.id + " in " + dataset_switch.id + " " + folder.id + " already tested." continue cf_predictions = model_manager.get_predictions(dataset, folder) if self.id == 'best': test_sframe = folder.test_sframe target = test_sframe.select_column(key = 'rating') rating_predictions = map(lambda t, *p: self._get_best_prediction(t, *p), target, *cf_predictions) rating_array = SArray(rating_predictions) rating_array.save(filename = rating_prediction_file) else: sf = SFrame.read_csv(class_prediction_file, header = True, quote_char = '"', column_type_hints = [int, str]) switch_predictions = sf.select_column(key = 'x') index_switch_predictions = model_manager.get_index_model(switch_predictions) rating_predictions = map(lambda t, *p: self._get_switch_prediction(t, *p), index_switch_predictions, *cf_predictions) rating_array = SArray(rating_predictions) rating_array.save(filename = rating_prediction_file)
def set_edges_empty(g):
    """Replace the edge frame of *g* with a brand-new empty SFrame."""
    empty_frame = SFrame()
    g.edges = empty_frame
def set_vertices_empty(g):
    """Replace the vertex frame of *g* with a brand-new empty SFrame."""
    empty_frame = SFrame()
    g.vertices = empty_frame
def __repr__(self):
    """
    Emits a brief summary of all the statistics as a string.

    Each section (basic stats, frequent items, quantiles, dict key/value
    summaries, element summary) is best-effort: statistics that the
    underlying element type does not support are silently skipped.
    """
    # (method name on self, printed label, whether the value is exact)
    fields = [['size', 'Length', 'Yes'],
              ['min', 'Min', 'Yes'],
              ['max', 'Max', 'Yes'],
              ['mean', 'Mean', 'Yes'],
              ['sum', 'Sum', 'Yes'],
              ['var', 'Variance', 'Yes'],
              ['std', 'Standard Deviation', 'Yes'],
              ['num_undefined', '# Missing Values', 'Yes', ],
              ['num_unique', '# unique values', 'No']]

    s = '\n'
    result = []
    for field in fields:
        try:
            method_to_call = getattr(self, field[0])
            result.append([field[1], str(method_to_call()), field[2]])
        except Exception:
            # Deliberate best-effort: skip unsupported statistics.
            # (Was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            pass
    sf = SArray(result).unpack(column_name_prefix="")
    sf.rename({'0': 'item', '1': 'value', '2': 'is exact'})
    s += sf.__str__(footer=False)
    s += "\n"

    s += "\nMost frequent items:\n"
    frequent = self.frequent_items()
    sorted_freq = sorted(frequent.iteritems(), key=operator.itemgetter(1), reverse=True)
    if len(sorted_freq) == 0:
        s += " -- All elements appear with less than 0.01% frequency -- \n"
    else:
        # Show at most the top 10, one column per item.
        sorted_freq = sorted_freq[:10]
        sf = SFrame()
        sf.add_column(SArray(['count']), 'value')
        for elem in sorted_freq:
            sf.add_column(SArray([elem[1]]), str(elem[0]))
        s += sf.__str__(footer=False) + "\n"
    s += "\n"

    try:
        # print quantiles; the quantile(0) probe raises (and skips the
        # section) when quantiles are unsupported for this element type.
        t = self.quantile(0)
        s += "Quantiles: \n"
        sf = SFrame()
        for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00]:
            sf.add_column(SArray([self.quantile(q)]), str(int(q * 100)) + '%')
        s += sf.__str__(footer=False) + "\n"
    except Exception:
        pass

    try:
        # Only dict-typed SArrays support key/value sub-summaries.
        t_k = self.dict_key_summary()
        t_v = self.dict_value_summary()
        s += "\n******** Dictionary Element Key Summary ********\n"
        s += t_k.__repr__()
        s += "\n******** Dictionary Element Value Summary ********\n"
        s += t_v.__repr__() + '\n'
    except Exception:
        pass

    try:
        t_k = self.element_summary()
        s += "\n******** Element Summary ********\n"
        s += t_k.__repr__() + '\n'
    except Exception:
        pass

    return s.expandtabs(8)
def get_evaluation(self, dataset, folder, evaluation_type = 'item'):
    """Load the stored evaluation SFrame and return its 'rmse' column."""
    from graphlab.data_structures.sframe import SFrame
    path = self._get_evaluation_file(dataset, folder, evaluation_type)
    return SFrame(path).select_column(key='rmse')