Example #1
    def test_save_load(self):
        g = SGraph().add_vertices(self.vertices,
                                  'vid').add_edges(self.edges, 'src_id',
                                                   'dst_id')
        with util.TempDirectory() as f:
            g.save(f)
            g2 = load_graph(f, 'binary')
            self.assertEqual(g2.summary(), {'num_vertices': 4, 'num_edges': 3})
            self.assertItemsEqual(
                g2.get_fields(),
                {'__id', '__src_id', '__dst_id', 'color', 'vec', 'weight'})

        with util.TempDirectory() as f:
            g.save(f, format='csv')
            vertices = SFrame.read_csv(f + "/vertices.csv")
            edges = SFrame.read_csv(f + "/edges.csv")
            g2 = SGraph().add_edges(edges, '__src_id',
                                    '__dst_id').add_vertices(vertices, '__id')
            self.assertEqual(g2.summary(), {'num_vertices': 4, 'num_edges': 3})
            self.assertItemsEqual(
                g2.get_fields(),
                {'__id', '__src_id', '__dst_id', 'color', 'vec', 'weight'})

        with tempfile.NamedTemporaryFile(suffix='.json') as f:
            g.save(f.name)
            with open(f.name, 'r') as f2:
                data = f2.read()
                g2 = json.loads(data)
            self.assertTrue("vertices" in g2)
            self.assertTrue("edges" in g2)
Example #2
 def test_missing_value_vids(self):
     vertices = SFrame()
     vertices['vid'] = [1, 2, 3, None]
     edges = SFrame()
     edges['src'] = [1, 2, 3, None]
     edges['dst'] = [4, 4, 4, 4]
     self.assertRaises(
         RuntimeError,
         lambda: SGraph().add_vertices(vertices, 'vid').summary())
     self.assertRaises(
         RuntimeError,
         lambda: SGraph().add_edges(edges, 'src', 'dst').summary())
     self.assertRaises(
         RuntimeError,
         lambda: SGraph().add_edges(edges, 'dst', 'src').summary())
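If the intent were to tolerate such rows instead of failing, a hedged cleanup sketch (not part of the original test) would drop them first with SFrame.dropna:

     # Hypothetical cleanup: drop rows with missing identifiers before building the graph.
     clean_vertices = vertices.dropna('vid')
     clean_edges = edges.dropna(['src', 'dst'])
     g = SGraph().add_vertices(clean_vertices, 'vid').add_edges(clean_edges, 'src', 'dst')
     g.summary()  # no longer raises RuntimeError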
Example #3
 def test_select_query_with_same_vertex_edge_field(self):
     vertices = SFrame({'__id': range(10)})
     edges = SFrame({'__src_id': range(10), '__dst_id': range(1, 11)})
     g = SGraph(vertices, edges)
     g.vertices['weight'] = 0
     g.vertices['v'] = 0
     g.edges['weight'] = 0
     g.edges['e'] = 0
     self.assertItemsEqual(
         g.get_fields(),
         ['v', 'e', 'weight', 'weight', '__id', '__src_id', '__dst_id'])
     g2 = g.select_fields('weight')
     self.assertItemsEqual(
         g2.get_fields(),
         ['weight', 'weight', '__id', '__src_id', '__dst_id'])
Example #4
    def test_sframe_le_append_skip_row_bug_is_fixed(self):
        """
        This test is actually for SFrame lazy evaluation.
        The reason it is here is because the repro can only be done in SGraph.

        The bug appears when the SFrame has lazy_append and when passing through
        the logical filter, skip_rows is not done correctly. So the edge_sframe
        is in a bad state when not materialized.

        This unit test stays here to ensure the bug is fixed until we can find
        a more clean repro.
        """
        n = 12  # smallest n to repro the le_append bug

        # A graph with edge i -> i + 1
        g = SGraph().add_edges(
            SFrame({
                'src': range(n),
                'dst': range(1, n + 1)
            }), 'src', 'dst')

        lazy_sf = g.get_edges()
        materialized_sf = g.get_edges()
        materialized_sf.__materialize__()
        assert_frame_equal(
            lazy_sf[lazy_sf['__dst_id'] == n].to_dataframe(),
            materialized_sf[materialized_sf['__dst_id'] == n].to_dataframe())
Example #5
    def _load_sframes(self):
        from graphlab.data_structures.sframe import SFrame

        if self.id == 'movielens' or self.id == 'movielens_all':
            for folder in self.folders:
                train_sframe = SFrame.read_csv(url=folder.train_file, delimiter='::', header=False,
                                               column_type_hints=[int, str, int, str, float, str, int])
                train_sframe.remove_columns(column_names=['X2', 'X4', 'X6', 'X7'])
                train_sframe.rename({'X1': 'user_id', 'X3': 'item_id', 'X5': 'rating'})
                folder.train_sframe = train_sframe

                test_sframe = SFrame.read_csv(url=folder.test_file, delimiter='::', header=False,
                                              column_type_hints=[int, str, int, str, float, str, int])
                test_sframe.remove_columns(column_names=['X2', 'X4', 'X6', 'X7'])
                test_sframe.rename({'X1': 'user_id', 'X3': 'item_id', 'X5': 'rating'})
                folder.test_sframe = test_sframe
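A hedged aside on the dropped X2/X4/X6/X7 columns above: the seven column_type_hints suggest that the MovieLens '::' separator ends up being split on the single character ':', leaving empty in-between columns. The snippet below illustrates that inference; it is not part of the original loader.

    # Hypothetical illustration: splitting a MovieLens-style line on ':' yields
    # seven fields, with X2, X4 and X6 empty, which would be why only
    # X1 (user), X3 (item) and X5 (rating) are kept above.
    sample = '1::1193::5::978300760'
    print(sample.split(':'))  # ['1', '', '1193', '', '5', '', '978300760']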
Example #6
    def _process_movies(self, filename):
        from graphlab.data_structures.sarray import SArray
        from graphlab.data_structures.sframe import SFrame
        
        sframe = SFrame.read_csv(url=filename, delimiter='=', header=False,
                                 column_type_hints=[int, str, str])

        sframe.remove_columns(column_names=['X2'])
        sframe.rename({'X1': 'item_id'})
        
        movies = []
        for movie in sframe:
            m = {genre:1 for genre in movie['X3'].split('|')}
            movies.append(m)
        
        sa = SArray(movies)
        sframe.add_column(sa, name = 'item_genre')
       
        sframe.remove_columns(column_names = ['X3', ])
        movie_sframe = sframe.unpack('item_genre', column_name_prefix = '')
        
        # Fill with 0 the genre columns for movies that do not have that genre
        column_names = movie_sframe.column_names()
        for c in column_names:
            movie_sframe = movie_sframe.fillna(c, 0)

        return movie_sframe
Example #7
 def test_graph_constructor(self):
     g = SGraph().add_vertices(self.vertices,
                               'vid').add_edges(self.edges, 'src_id',
                                                'dst_id')
     g2 = SGraph(g.vertices, g.edges)
     g3 = SGraph(g.vertices,
                 g.edges,
                 src_field="__dst_id",
                 dst_field="__src_id")  #flip around src and dst
     assert_frame_equal(
         g.vertices.to_dataframe().sort('__id').reset_index(drop=True),
         g2.vertices.to_dataframe().sort('__id').reset_index(drop=True))
     assert_frame_equal(
         g.edges.to_dataframe().sort(['__src_id',
                                      '__dst_id']).reset_index(drop=True),
         g2.edges.to_dataframe().sort(['__src_id',
                                       '__dst_id']).reset_index(drop=True))
     self.assertRaises(
         ValueError,
         lambda: SGraph(SFrame(self.vertices), SFrame(self.edges)))
     self.assertRaises(
         ValueError, lambda: SGraph(SFrame(self.vertices), SFrame(
             self.edges), 'vid', '__src_id', '__dst_id'))
     self.assertRaises(
         ValueError, lambda: SGraph(SFrame(self.vertices),
                                    SFrame(self.edges),
                                    vid_field=None,
                                    src_field='src_id',
                                    dst_field='dst_id'))
Example #8
    def test_save_load(self):
        g = SGraph().add_vertices(self.vertices, 'vid').add_edges(self.edges, 'src_id', 'dst_id')
        with util.TempDirectory() as f:
            g.save(f)
            g2 = load_graph(f, 'binary')
            self.assertEqual(g2.summary(), {'num_vertices': 4, 'num_edges': 3})
            self.assertItemsEqual(g2.get_fields(), {'__id', '__src_id', '__dst_id', 'color', 'vec', 'weight'})

        with util.TempDirectory() as f:
            g.save(f, format='csv')
            vertices = SFrame.read_csv(f + "/vertices.csv")
            edges = SFrame.read_csv(f + "/edges.csv")
            g2 = SGraph().add_edges(edges, '__src_id', '__dst_id').add_vertices(vertices, '__id')
            self.assertEqual(g2.summary(), {'num_vertices': 4, 'num_edges': 3})
            self.assertItemsEqual(g2.get_fields(), {'__id', '__src_id', '__dst_id', 'color', 'vec', 'weight'})

        with tempfile.NamedTemporaryFile(suffix='.json') as f:
            g.save(f.name)
            with open(f.name, 'r') as f2:
                data = f2.read()
                g2 = json.loads(data)
            self.assertTrue("vertices" in g2)
            self.assertTrue("edges" in g2)
Example #9
    def get(self, field):
        """
        Return the value for the queried field.

        Get the value of a given field. The list of all queryable fields is
        documented in the beginning of the model class.

        Each of these fields can be queried in one of two ways:

        >>> out = m['graph']      # m is a trained graph analytics model
        >>> out = m.get('graph')  # equivalent to previous line

        Parameters
        ----------
        field : string
            Name of the field to be retrieved.

        Returns
        -------
        out : value
            The current value of the requested field.

        See Also
        --------
        list_fields

        Examples
        --------
        >>> g = m.get('graph')
        """
        _mt._get_metric_tracker().track('toolkit.graph_analytics.get')

        if field in self.list_fields():
            obj = self.__proxy__.get(field)
            if type(obj) == UnityGraphProxy:
                return SGraph(_proxy=obj)
            elif type(obj) == UnitySFrameProxy:
                return SFrame(_proxy=obj)
            else:
                return obj
        else:
            raise KeyError(
                'Key \"%s\" not in model. Available fields are %s.' %
                (field, ', '.join(self.list_fields())))
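A small hedged usage sketch for the accessor above; m is assumed to be any trained graph analytics model, as in the docstring:

    # Hypothetical usage: dump every queryable field of a trained model m.
    for field in m.list_fields():
        print field, '->', m.get(field)   # equivalent to m[field]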
Example #10
 def rating_prediction_switch(self, dataset, dataset_switch, model_manager, force):
     from graphlab.data_structures.sframe import SFrame
     from graphlab.data_structures.sarray import SArray
     import os
     
     for folder in dataset.folders:
         rating_prediction_file  = self._get_rating_prediction_file(dataset_switch, folder)
         class_prediction_file   = self._get_class_prediction_file(dataset_switch, folder)
         
         if os.path.exists(rating_prediction_file) and not force:
             print "Model " + self.id + " in " + dataset_switch.id + " " + folder.id + " already tested."
             continue 
         
         cf_predictions      = model_manager.get_predictions(dataset, folder)
         
         if self.id == 'best':
             test_sframe = folder.test_sframe
             target      = test_sframe.select_column(key = 'rating')
             
             rating_predictions  = map(lambda t, *p: self._get_best_prediction(t, *p), target, *cf_predictions)
             rating_array        = SArray(rating_predictions)
             rating_array.save(filename = rating_prediction_file)
             
         else:
             
             sf                  = SFrame.read_csv(class_prediction_file, header = True, quote_char = '"', 
                                                   column_type_hints = [int, str])
             switch_predictions  = sf.select_column(key = 'x') 
             
             index_switch_predictions = model_manager.get_index_model(switch_predictions)
             
             rating_predictions  = map(lambda t, *p: self._get_switch_prediction(t, *p), 
                                       index_switch_predictions, *cf_predictions)
             
             rating_array = SArray(rating_predictions)
             rating_array.save(filename = rating_prediction_file)
Example #11
 def set_edges_empty(g):
     g.edges = SFrame()
Example #12
 def set_vertices_empty(g):
     g.vertices = SFrame()
Example #13
    def __repr__(self):
        """
      Emits a brief summary of all the statistics as a string.
      """
        fields = [['size', 'Length', 'Yes'], ['min', 'Min', 'Yes'],
                  ['max', 'Max', 'Yes'], ['mean', 'Mean', 'Yes'],
                  ['sum', 'Sum', 'Yes'], ['var', 'Variance', 'Yes'],
                  ['std', 'Standard Deviation', 'Yes'],
                  [
                      'num_undefined',
                      '# Missing Values',
                      'Yes',
                  ], ['num_unique', '# unique values', 'No']]

        s = '\n'
        result = []
        for field in fields:
            try:
                method_to_call = getattr(self, field[0])
                result.append([field[1], str(method_to_call()), field[2]])
            except:
                pass
        sf = SArray(result).unpack(column_name_prefix="")
        sf.rename({'0': 'item', '1': 'value', '2': 'is exact'})
        s += sf.__str__(footer=False)
        s += "\n"

        s += "\nMost frequent items:\n"
        frequent = self.frequent_items()
        sorted_freq = sorted(frequent.iteritems(),
                             key=operator.itemgetter(1),
                             reverse=True)
        if len(sorted_freq) == 0:
            s += " -- All elements appear with less than 0.01% frequency -- \n"
        else:
            sorted_freq = sorted_freq[:10]
            sf = SFrame()
            sf.add_column(SArray(['count']), 'value')
            for elem in sorted_freq:
                sf.add_column(SArray([elem[1]]), str(elem[0]))
            s += sf.__str__(footer=False) + "\n"
        s += "\n"

        try:
            # print quantiles
            t = self.quantile(0)
            s += "Quantiles: \n"
            sf = SFrame()
            for q in [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.00]:
                sf.add_column(SArray([self.quantile(q)]),
                              str(int(q * 100)) + '%')
            s += sf.__str__(footer=False) + "\n"
        except:
            pass

        try:
            t_k = self.dict_key_summary()
            t_v = self.dict_value_summary()
            s += "\n******** Dictionary Element Key Summary ********\n"
            s += t_k.__repr__()
            s += "\n******** Dictionary Element Value Summary ********\n"
            s += t_v.__repr__() + '\n'
        except:
            pass

        try:
            t_k = self.element_summary()
            s += "\n******** Element Summary ********\n"
            s += t_k.__repr__() + '\n'
        except:
            pass

        return s.expandtabs(8)
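The method above reads like the summary printer of a Sketch object. Assuming it belongs to the Sketch returned by SArray.sketch_summary() in GraphLab Create, a hedged usage sketch would be:

    # Hypothetical usage, assuming this __repr__ belongs to the Sketch class.
    sa = SArray([1, 2, 2, 3, None])
    sketch = sa.sketch_summary()
    print(sketch)  # invokes the __repr__ above and prints the statistics tables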
Example #14
 def get_evaluation(self, dataset, folder, evaluation_type = 'item'):
     from graphlab.data_structures.sframe import SFrame
     
     evaluation_file = self._get_evaluation_file(dataset, folder, evaluation_type)
     evaluation_sframe = SFrame(evaluation_file)
     return evaluation_sframe.select_column(key = 'rmse')