def __init__(self,
                 features=None,
                 excluded_features=None,
                 max_categories=None,
                 output_column_name='encoded_features'):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(max_categories, [int, type(None)])
        _raise_error_if_not_of_type(output_column_name, [str])

        # Set up options
        opts = {
            'max_categories': max_categories,
            'output_column_name': output_column_name,
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._OneHotEncoder()
        proxy.init_transformer(opts)
        super(OneHotEncoder, self).__init__(proxy, self.__class__)
    def __init__(self, features=None, excluded_features=None,
            max_categories=None, output_column_name = 'encoded_features'):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
                                        features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(max_categories, [int, _NoneType])
        _raise_error_if_not_of_type(output_column_name, [str])

        # Set up options
        opts = {
          'max_categories': max_categories,
          'output_column_name': output_column_name,
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._OneHotEncoder()
        proxy.init_transformer(opts)
        super(OneHotEncoder, self).__init__(proxy, self.__class__)
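Every snippet on this page leans on the same small validator, `_raise_error_if_not_of_type`. The library's own implementation is not reproduced here; a minimal sketch of what such a helper typically does, with names chosen to match the call sites above, is:

def _raise_error_if_not_of_type(arg, expected_types, arg_name='argument'):
    # Accept either a single type or a list/tuple of allowed types.
    if not isinstance(expected_types, (list, tuple)):
        expected_types = [expected_types]
    if not any(isinstance(arg, t) for t in expected_types):
        allowed = ', '.join(t.__name__ for t in expected_types)
        raise TypeError("%s must be of type %s." % (arg_name, allowed))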
Example 3
    def __init__(self,
                 features=None,
                 excluded_features=None,
                 num_bits=18,
                 output_column_name='hashed_features'):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(num_bits, [int])
        _raise_error_if_not_of_type(output_column_name, [str])

        # Set up options
        opts = {
            'num_bits': num_bits,
            'output_column_name': output_column_name,
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._FeatureHasher()
        proxy.init_transformer(opts)
        super(FeatureHasher, self).__init__(proxy, self.__class__)
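A minimal usage sketch for constructors like the two above, assuming the transformers are exposed under graphlab.feature_engineering with the usual fit/transform interface (the column names are illustrative):

import graphlab as gl

sf = gl.SFrame({'category': ['a', 'b', 'a', 'c'],
                'word_counts': [{'hi': 1}, {'bye': 2}, {'hi': 3}, {'yo': 1}]})

# One-hot encode a categorical column.
encoder = gl.feature_engineering.OneHotEncoder(features=['category'],
                                               max_categories=10)
encoded = encoder.fit_transform(sf)

# Hash a sparse dictionary column into an 18-bit feature space.
hasher = gl.feature_engineering.FeatureHasher(features=['word_counts'],
                                              num_bits=18)
hashed = hasher.fit_transform(sf)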
Example 4
    def query(self,
              query,
              num_results=10,
              expansion_k=5,
              expansion_epsilon=0.1,
              expansion_near_match_weight=.5):
        """
        Search for text.

        Parameters
        ----------
        query : str
            A string of text.

        num_results : int
            The number of results to return.

        expansion_k : int
          Maximum number of nearest words to include from query token.

        expansion_epsilon : float
          Maximum distance to allow between query token and nearby word when
          doing query expansion. Must be between 0 and 1.

        expansion_near_match_weight : float
          Multiplier to use on BM25 scores for documents indexed via an
          approximate match with a given token. This will be used for each of
          the `expansion_k` words that are considered an approximate match.
          Must be between 0 and 1.

        Returns
        -------
        out : SFrame
          The rows of the original SFrame along with a `score` column
          which contains the BM25 score between this query and the row.

        Examples
        --------

        >>> import graphlab as gl
        >>> sf = gl.SFrame({'text': ['Hello my friend', 'I love this burrito']})
        >>> s = gl.search.create(sf, features=['text'])
        >>> s.query('burrito')

        """
        if _sys.version_info.major == 2:
            _raise_error_if_not_of_type(query, [str, unicode])
        else:
            _raise_error_if_not_of_type(query, [str])
        q = query.split(' ')
        results = self.__proxy__.query_index(
            q,
            expansion_k=expansion_k,
            expansion_epsilon=expansion_epsilon,
            expansion_near_match_weight=expansion_near_match_weight)
        results = self.__proxy__.join_query_result(results,
                                                   method='default',
                                                   num_results=num_results)

        return results
Example 5
    def __init__(self,
                 features=None,
                 excluded_features=None,
                 output_column_prefix=None,
                 verbose=True):

        self._setup()

        _features, _exclude = process_features(features, excluded_features)

        #Type check
        _raise_error_if_not_of_type(output_column_prefix, [str, NoneType])
        _raise_error_if_not_of_type(verbose, [bool])

        state = {}
        state['output_column_prefix'] = output_column_prefix
        state['features'] = _features
        state['excluded_features'] = _exclude
        state['fitted'] = False
        state['verbose'] = verbose

        if _exclude:
            self._exclude = True
            self._features = _exclude
        else:
            self._exclude = False
            self._features = _features

        self.__proxy__.update(state)
    def __init__(self, features=None, excluded_features=None, threshold=1,
                 output_category_name=None, output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
                                        features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(threshold, [int, _NoneType])

        # Set up options
        opts = {
          'threshold': threshold,
          'output_category_name': output_category_name,
          'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._CountThresholder()
        proxy.init_transformer(opts)
        super(CountThresholder, self).__init__(proxy, self.__class__)
Example 7
    def __init__(self, features=None, excluded_features=None,
                 separator = ".", none_tag = "__none__",
                 output_column_prefix = None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])
        if output_column_prefix is None:
            output_column_prefix = ''
        
        opts = {
            'separator' : separator,
            'none_tag' : none_tag,
            'output_column_prefix' : output_column_prefix
            }
            
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features
            
        # Initialize object
        proxy = _gl.extensions._TransformToFlatDictionary()
        proxy.init_transformer(opts)
        super(TransformToFlatDictionary, self).__init__(proxy, self.__class__)
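The separator and none_tag options control how nested dictionary values are flattened into single-level keys. A plain-Python illustration of that flattening convention (not the extension's actual implementation):

def flatten(d, separator='.', none_tag='__none__', prefix=''):
    # Flatten {'a': {'b': 1, 'c': None}} into {'a.b': 1, 'a.c': '__none__'}.
    flat = {}
    for key, value in d.items():
        name = prefix + str(key)
        if isinstance(value, dict):
            flat.update(flatten(value, separator, none_tag, name + separator))
        else:
            flat[name] = none_tag if value is None else value
    return flat

print(flatten({'a': {'b': 1, 'c': None}}))  # {'a.b': 1, 'a.c': '__none__'}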
    def __init__(self, features=None, excluded_features=None, num_bits=18,
                                        output_column_name='hashed_features'):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
                                        features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(num_bits, [int])
        _raise_error_if_not_of_type(output_column_name, [str])

        # Set up options
        opts = {
            'num_bits': num_bits,
            'output_column_name': output_column_name,
            }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._FeatureHasher()
        proxy.init_transformer(opts)
        super(FeatureHasher, self).__init__(proxy, self.__class__)
    def get_prediction_score(self, node_id):
        """
        Return the prediction score if the node is a leaf, or None if it is
        an intermediate node.

        Parameters
        ----------
        node_id : int
            The id of the node for which to get the prediction value.

        Returns
        -------
        out : float or None
            The prediction value if the node is a leaf, and None otherwise.

        Examples
        --------
        .. sourcecode:: python

            >>> tree.get_prediction_score(120)  # Leaf node
            0.251092

            >>> tree.get_prediction_score(120)  # Not a leaf node
            None

        """
        _raise_error_if_not_of_type(node_id, [int,long], "node_id")
        _numeric_param_check_range("node_id", node_id, 0, self.num_nodes - 1)
        node = self.nodes[node_id]
        return None if node.is_leaf == False else node.value
Example 10
def _validate_job_create_args(function, name, environment):
    """ Validate the arguments for job.create and map_job.create
    """
    __LOGGER__.info("Validating job.")
    _raise_error_if_not_of_type(environment,
               [type(None), str, _environment._Environment],  'environment')
    _raise_error_if_not_of_type(name,
               [type(None), str], 'name')

    if name is not None and not _job_name_checker.match(name):
        raise ValueError('Job name can only contain digits, characters, "-" and "_".')

    # Setup the env
    if not environment:
        try:
            environment = _gl.deploy.environments['async']
        except KeyError:
            __LOGGER__.info("Creating a LocalAsync environment called 'async'.")
            try:
                environment = _environment.LocalAsync('async')
            except KeyError:
                environment = _gl.deploy.environments['async']
    else:
        if isinstance(environment, str):
            __LOGGER__.debug("Loading environment: %s" % environment)
            environment = _gl.deploy.environments[environment]

    # Clone to prevent the user's environment from reflecting changes.
    return function, name, environment
Example 11
def _validate_job_create_args(function, name, environment):
    """ Validate the arguments for job.create and map_job.create
    """
    __LOGGER__.info("Validating job.")
    _raise_error_if_not_of_type(environment,
                                [type(None), str, _environment._Environment],
                                'environment')
    _raise_error_if_not_of_type(name, [type(None), str], 'name')

    if name is not None and not _job_name_checker.match(name):
        raise ValueError(
            'Job name can only contain digits, characters, "-" and "_".')

    # Setup the env
    if not environment:
        try:
            environment = _gl.deploy.environments['async']
        except KeyError:
            __LOGGER__.info(
                "Creating a LocalAsync environment called 'async'.")
            try:
                environment = _environment.LocalAsync('async')
            except KeyError:
                environment = _gl.deploy.environments['async']
    else:
        if isinstance(environment, str):
            __LOGGER__.debug("Loading environment: %s" % environment)
            environment = _gl.deploy.environments[environment]

    # Clone to prevent the user's environment from reflecting changes.
    return function, name, environment
    def __init__(self,
                 features=None,
                 excluded_features=None,
                 strategy='logarithmic',
                 num_bins=10,
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(num_bins, [int])
        _raise_error_if_not_of_type(strategy, [str])

        # Set up options
        opts = {
            'strategy': strategy,
            'num_bins': num_bins,
            'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._FeatureBinner()
        proxy.init_transformer(opts)
        super(FeatureBinner, self).__init__(proxy, self.__class__)
    def __init__(self,
                 features=None,
                 excluded_features=None,
                 threshold=1,
                 output_category_name=None,
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(threshold, [int, type(None)])

        # Set up options
        opts = {
            'threshold': threshold,
            'output_category_name': output_category_name,
            'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._CountThresholder()
        proxy.init_transformer(opts)
        super(CountThresholder, self).__init__(proxy, self.__class__)
    def __init__(self, features=None, excluded_features=None,
                 strategy='logarithmic', num_bins=10,
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(num_bins, [int])
        _raise_error_if_not_of_type(strategy, [str])

        # Set up options
        opts = {
          'strategy': strategy,
          'num_bins': num_bins,
          'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._FeatureBinner()
        proxy.init_transformer(opts)
        super(FeatureBinner, self).__init__(proxy, self.__class__)
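A rough sketch of what a 'logarithmic' binning strategy usually means: bin edges grow by a constant factor rather than a constant width. The extension's exact edge placement is not shown here, so treat the helper below as an assumption-laden illustration:

import math

def logarithmic_bin(value, num_bins=10):
    # Values below 1 land in bin 0; 1-9 in bin 1; 10-99 in bin 2; and so on,
    # capped at the last bin.
    if value < 1:
        return 0
    return min(int(math.floor(math.log10(value))) + 1, num_bins - 1)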
Example 15
    def __init__(self,
                 features=None,
                 excluded_features=None,
                 output_column_prefix=None,
                 transform_function=lambda x: x,
                 transform_function_name="none"):

        self._setup()

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        #Type check
        _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

        state = {}
        state['output_column_prefix'] = output_column_prefix
        state['features'] = _features
        state['excluded_features'] = _exclude
        state['fitted'] = False
        state['transform_function'] = transform_function
        state['transform_function_name'] = transform_function_name

        if _exclude:
            self._exclude = True
            self._features = _exclude
        else:
            self._exclude = False
            self._features = _features

        self.__proxy__.update(state)
    def fit(self, data):
        """
        Fits a transformer using the SFrame `data`. The `fit` phase does not
        train a deep learning model; it only checks that the trained model
        is compatible with the data provided. If the `auto` model is chosen,
        the fit phase chooses the right model to extract features from.

        Parameters
        ----------
        data : SFrame
            The data used to fit the transformer.

        Returns
        -------
        self (A fitted object)

        See Also
        --------
        transform, fit_transform

        Examples
        --------

        # Create data.
        >>> import graphlab as gl

        # Import data from MNIST
        >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')

        # Create a DeepFeatureExtractorObject
        >>> extractor = gl.feature_engineering.DeepFeatureExtractor(
                                                    features = 'image')

        # Fit the encoder for a given dataset.
        >>> extractor = extractor.fit(data)

        # Return the model used for the deep feature extraction.
        >>> extractor['model']
        """
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit')

        # Check that the column is in the SFrame.
        _raise_error_if_not_of_type(data, [_SFrame])
        _raise_error_if_column_exists(data, self._state["features"])

        # Make sure the output column_name exists.
        count = 1
        old_output_column_name = self._state["output_column_name"]
        output_column_name = old_output_column_name
        while output_column_name in data.column_names():
            output_column_name = "%s.%s" % (old_output_column_name, count)
            count = count + 1
        self._state["output_column_name"]  = output_column_name

        if data[self._state["features"]].dtype() != _Image:
            raise ToolkitError(
               "Feature `%s` must be of type Image." % self._state["features"])

        return self
    def item_history_count(self, item_id):
        '''
        Returns the number of interactions for the given item.
        '''
        _raise_error_if_not_of_type(item_id, self._allowed_item_types,
                                    'item_id')

        item = _gl.SFrame({self.item_id_column: [item_id]})
        return item.join(self._item_counts, on=self.item_id_column)
    def user_history_count(self, user_id):
        '''
        Returns the number of interactions for the given user.
        '''
        _raise_error_if_not_of_type(user_id, self._allowed_user_types,
                                    'user_id')

        user = _gl.SFrame({self.user_id_column: [user_id]})
        return user.join(self._user_counts, on=self.user_id_column)
    def user_details(self, user_id):
        '''
        Returns the row of the user table given a user id.
        '''
        _raise_error_if_not_of_type(user_id, self._allowed_user_types,
                                    'user_id')

        user = _gl.SFrame({self.user_id_column: [user_id]})
        return user.join(self._users, on=self.user_id_column)
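These three lookup helpers all use the same trick: wrap the id in a one-row SFrame and join it against a precomputed table, so an unknown id simply yields an empty result. A standalone sketch of the pattern with plain SFrames:

import graphlab as gl

user_counts = gl.SFrame({'user_id': ['u1', 'u2'], 'count': [12, 3]})

def lookup(table, key_column, key):
    # One-row probe joined against the table; unknown keys produce 0 rows.
    probe = gl.SFrame({key_column: [key]})
    return probe.join(table, on=key_column)

print(lookup(user_counts, 'user_id', 'u1'))   # one matching row
print(lookup(user_counts, 'user_id', 'zzz'))  # empty SFrame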
Example 20
    def _set_one_input(self,
                       name='input',
                       value=None,
                       from_task=None,
                       delete=False):
        """
        Set/Update an input for this Task.

        Parameters
        ----------

        name : str
            Name for this input. This will be how the code refers to this
            input at runtime. Default is 'input'.

        value : obj (supported by GL Pickle)
            Value for the object referred to using 'name'.

        from_task : Task|str

            Dependent Task to set as input, specifying the tuple with: (Task,
            output_name). Tasks can be referred to either by name or by
            reference. The output_name needs to be a string.

            For example, if the following is specified:

            >>> task._set_one_input(name='in', from_task='dep')

            then an input named 'in' will be defined on this Task, which
            has a dependency on the output of the Task named 'dep'.


        delete : bool, optional
            If delete is set to True, the named input is removed.
        """

        _raise_error_if_not_of_type(name, str, "name")
        _raise_error_if_not_of_type(from_task, [type(None), Task], "from_task")

        # Delete the input.
        if delete is True and name in self._data['inputs']:
            del self._data['inputs'][name]
            return self

        # Early binding: Set the input
        if from_task is None:
            self._data['inputs'][name] = value
            self._set_dirty_bit()
            return self

        # Late binding: Set an input from a task.
        elif isinstance(from_task, Task):
            task = from_task
            self._data['inputs'][name] = task
            self._set_dirty_bit()
            return self
    def set_inputs(self, names):
        """
        Set input(s) for this Task.

        Inputs can be any object that can be pickled using GL-Pickle but cannot
        come from the output of another task. For that, use the
        set_inputs_from_task function.

        Parameters
        ----------
        names : list [str] | dict [str, obj]
            If a dict is provided, then each key is considered a name for an
            input in this Task, and each value is considered the definition of the
            input.

            When a list is provided,  then each entry is considered a name for
            an input in this Task, and the value for that slot is set to None.

        Returns
        -------
        self : Task

        See Also
        --------
        set_output

        Examples
        --------
        To define only input names for a task, use a list of strings:


        >>> # For late binding
        >>> t1 = graphlab.deploy._task.Task(my_func, 'set_inputs_ex1')
        >>> t1.set_inputs(['one', 'two', 'three'])

        >>> # For early binding
        >>> t3 = graphlab.deploy._task.Task(my_func, 'set_inputs_ex3')
        >>> t3.set_inputs({
        ...     'b' : 'set_inputs_ex2',
        ...     'c' : 'foo',
        ...     'd' : ('foo', 'bar')})
        """
        if names is None:
            raise TypeError('Names are required while binding two tasks.')
        _raise_error_if_not_of_type(names, [list, dict], 'names')

        if isinstance(names, list):
            for name in set(names):
                self._set_one_input(name=name, delete=False)
        elif isinstance(names, dict):
            for key, value in names.items():
                self._set_one_input(name=key, value=value, delete=False)
        return self
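A short sketch contrasting the two binding styles described in the docstrings above, using the same Task construction the examples show (my_func and the task names are illustrative):

import graphlab

def my_func(x):
    return x * 2

producer = graphlab.deploy._task.Task(my_func, 'producer')
consumer = graphlab.deploy._task.Task(my_func, 'consumer')

# Early binding: the input is a concrete value, available immediately.
consumer._set_one_input(name='x', value=42)

# Late binding: the input is filled from another task's output at run time.
consumer._set_one_input(name='x', from_task=producer)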
    def _set_one_input(self, name='input', value=None, from_task=None, delete=False):
        """
        Set/Update an input for this Task.

        Parameters
        ----------

        name : str
            Name for this input. This will be how the code refers to this
            input at runtime. Default is 'input'.

        value : obj (supported by GL Pickle)
            Value for the object referred to using 'name'.

        from_task : Task|str

            Dependent Task to set as input, specifying the tuple with: (Task,
            output_name). Tasks can be referred to either by name or by
            reference. The output_name needs to be a string.

            For example, if the following is specified:

            >>> task._set_one_input(name='in', from_task='dep')

            then an input named 'in' will be defined on this Task, which
            has a dependency on the output of the Task named 'dep'.


        delete : bool, optional
            If delete is set to True, the named input is removed.
        """

        _raise_error_if_not_of_type(name, str, "name")
        _raise_error_if_not_of_type(from_task,
                       [type(None), Task], "from_task")

        # Delete the input.
        if delete is True and name in self._data['inputs']:
            del self._data['inputs'][name]
            return self

        # Early binding: Set the input
        if from_task is None:
            self._data['inputs'][name] = value
            self._set_dirty_bit()
            return self

        # Late binding: Set an input from a task.
        elif isinstance(from_task, Task):
            task = from_task
            self._data['inputs'][name] = task
            self._set_dirty_bit()
            return self
Example 23
    def set_inputs(self, names):
        """
        Set input(s) for this Task.

        Inputs can be any object that can be pickled using GL-Pickle but cannot
        come from the output of another task. For that, use the
        set_inputs_from_task function.

        Parameters
        ----------
        names : list [str] | dict [str, obj]
            If a dict is provided, then each key is considered a name for an
            input in this Task, and each value is considered the definition of the
            input.

            When a list is provided,  then each entry is considered a name for
            an input in this Task, and the value for that slot is set to None.

        Returns
        -------
        self : Task

        See Also
        --------
        set_output

        Examples
        --------
        To define only input names for a task, use a list of strings:


        >>> # For late binding
        >>> t1 = graphlab.deploy._task.Task(my_func, 'set_inputs_ex1')
        >>> t1.set_inputs(['one', 'two', 'three'])

        >>> # For early binding
        >>> t3 = graphlab.deploy._task.Task(my_func, 'set_inputs_ex3')
        >>> t3.set_inputs({
        ...     'b' : 'set_inputs_ex2',
        ...     'c' : 'foo',
        ...     'd' : ('foo', 'bar')})
        """
        if names is None:
            raise TypeError('Names are required while binding two tasks.')
        _raise_error_if_not_of_type(names, [list, dict], 'names')

        if isinstance(names, list):
            for name in set(names):
                self._set_one_input(name=name, delete=False)
        elif isinstance(names, dict):
            for key, value in names.iteritems():
                self._set_one_input(name=key, value=value, delete=False)
        return self
    def __init__(self, func, name=None, description=None):
        """
        Create a new Task specifying its name and optionally a description.
        """

        # Must be a function
        _raise_error_if_not_function(func, "func")

        # Set the name
        name = func.__name__ if not name else name
        _raise_error_if_not_of_type(name, str, "name")

        self.name = name
        self._data = dict()
        self._data['code'] = None
        self._data['codestr'] = None
        self._data['inputs'] = dict()
        self._data['output'] = None
        self._data['packages'] = set()
        self._data['description'] = ''
        self._modified_since_last_saved = None

        if description is not None:
            self.set_description(description)

        # Inspect the function.
        specs = _inspect.getargspec(func)
        varargs = specs.varargs
        defaults = _copy.copy(specs.defaults)
        args = _copy.copy(specs.args)

        # Set the code to function arguments + *args + **kwargs
        self.set_code(func)

        # Set the inputs
        all_args = _copy.copy(args)
        if varargs:
            all_args.append(varargs)
        self.set_inputs(all_args)

        # Bind default values
        if defaults:
            for index, arg in enumerate(args[-len(defaults):]):
                self.set_inputs({arg : defaults[index]})

        # Set required packages
        if _sys.version_info.major == 3:
            func_dict = func.__dict__
        else:
            func_dict = func.func_dict
        if 'required_packages' in func_dict:
            self.set_required_packages(func_dict['required_packages'])
    def fit(self, data):
        """
        Fits a transformer using the SFrame `data`. The `fit` phase does not
        train a deep learning model; it only checks that the trained model
        is compatible with the data provided. If the `auto` model is chosen,
        the fit phase chooses the right model to extract features from.

        Parameters
        ----------
        data : SFrame
            The data used to fit the transformer.

        Returns
        -------
        self (A fitted object)

        See Also
        --------
        transform, fit_transform

        Examples
        --------

        # Create data.
        >>> import graphlab as gl

        # Import data from MNIST
        >>> data = gl.SFrame('https://static.turi.com/datasets/mnist/sframe/train6k')

        # Create a DeepFeatureExtractorObject
        >>> extractor = gl.feature_engineering.DeepFeatureExtractor(features = 'image')

        # Fit the encoder for a given dataset.
        >>> extractor = extractor.fit(data)

        # Return the model used for the deep feature extraction.
        >>> extractor['model']
        """
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit')

        # Check that the column is in the SFrame.
        _raise_error_if_not_of_type(data, [_SFrame])

        for feature in self._state["features"]:
            _raise_error_if_column_exists(data, feature)
            if data[feature].dtype() != _Image:
                raise ToolkitError("Feature `%s` must be of type Image." %
                                   feature)

        return self
    def __init__(self, name, session_aware=True):
        """
        Constructor for base Environment, should not be instantiated directly.
        """
        if not name:
            raise TypeError("Name is required when creating an Environment.")
        _raise_error_if_not_of_type(name, [str, unicode], 'name')

        self._session = _gl.deploy._default_session
        self.name = name
        self._env_type = type(self).__name__
        self._modified_since_last_saved = None

        if session_aware:
            self._session.register(self)
Example 27
    def __init__(self, func, name=None, description=None):
        """
        Create a new Task specifying its name and optionally a description.
        """

        # Must be a function
        _raise_error_if_not_function(func, "func")

        # Set the name
        name = func.__name__ if not name else name
        _raise_error_if_not_of_type(name, str, "name")

        self.name = name
        self._data = dict()
        self._data['code'] = None
        self._data['codestr'] = None
        self._data['inputs'] = dict()
        self._data['output'] = None
        self._data['packages'] = set()
        self._data['description'] = ''
        self._modified_since_last_saved = None

        if description is not None:
            self.set_description(description)

        # Inspect the function.
        specs = _inspect.getargspec(func)
        varargs = specs.varargs
        defaults = _copy.copy(specs.defaults)
        args = _copy.copy(specs.args)

        # Set the code to function arguments + *args + **kwargs
        self.set_code(func)

        # Set the inputs
        all_args = _copy.copy(args)
        if varargs:
            all_args.append(varargs)
        self.set_inputs(all_args)

        # Bind default values
        if defaults:
            for index, arg in enumerate(args[-len(defaults):]):
                self.set_inputs({arg: defaults[index]})

        # Set required packages
        if 'required_packages' in func.func_dict:
            self.set_required_packages(func.func_dict['required_packages'])
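The default-binding loop above works because getargspec pairs defaults with the trailing positional arguments. A standalone illustration of that pairing (my_func is hypothetical; on Python 3.11+ use inspect.getfullargspec, since getargspec was removed):

import inspect

def my_func(a, b, c=3, d=4):
    return a + b + c + d

specs = inspect.getargspec(my_func)
args, defaults = specs.args, specs.defaults          # ['a', 'b', 'c', 'd'], (3, 4)

bound = {}
for index, arg in enumerate(args[-len(defaults):]):  # trailing args: ['c', 'd']
    bound[arg] = defaults[index]

print(bound)  # {'c': 3, 'd': 4}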
    def users_in_common(self, item_a, item_b, num_results=None):
        """
        Get data on the users in common between two items.

        Parameters
        ----------

        item_a : The id of one item.

        item_b : The id of the other item.

        num_results : int, optional
            The number of users in common to return.

        Returns
        -------
        out : dict
            A dictionary with the number of users who interacted with each
            item ('item_a_count' and 'item_b_count'), the number of users in
            common ('in_common_count'), and an SFrame with details of those
            users ('in_common_users').
        """
        _raise_error_if_not_of_type(item_a, self._allowed_item_types,
                                    'item_a')
        _raise_error_if_not_of_type(item_b, self._allowed_item_types,
                                    'item_b')

        item_a_history = self.item_history(item_a)
        item_b_history = self.item_history(item_b)

        item_a_users = item_a_history[self.user_id_column].unique()
        item_b_users = item_b_history[self.user_id_column].unique()
        users_in_common = set(item_a_users).intersection(set(item_b_users))

        users = _gl.SFrame({self.user_id_column: list(users_in_common)})
        users = users.join(self._users, on=self.user_id_column)
        if num_results is not None:
            users = users.head(num_results)
        result = {
            'item_a_count': len(item_a_users),
            'item_b_count': len(item_b_users),
            'in_common_count': len(users_in_common),
            'in_common_users': users
        }
        return result
def process_features(features, exclude):
    """
    Parameters
    ----------
    features : list[str] | str | None, optional
        Column names of features to be transformed. If None, all columns
        are selected. If string, that column is transformed. If list of strings,
        this list of column names is selected.

    exclude : list[str] | str | None, optional
        Column names of features to be ignored in transformation. Can be string
        or list of strings. Either 'exclude' or 'features' can be passed, but
        not both.

    Returns
    -------
    (features, exclude) that are processed.

    """

    # Check types
    _raise_error_if_not_of_type(features, [NoneType, str, list], 'features')
    _raise_error_if_not_of_type(exclude, [NoneType, str, list], 'exclude')

    # Make a copy of the parameters.
    _features = _copy.copy(features)
    _exclude = _copy.copy(exclude)

    # Check that 'features' and 'exclude' are not both set, and that
    # 'features' is not an empty list.
    if _features and _exclude:
        raise ValueError(
            "The parameters 'features' and 'exclude' cannot both be set."
            " Please set one or the other.")
    if _features == [] and not _exclude:
        raise ValueError("Features cannot be an empty list.")

    # Allow a single string (promote it to a list).
    _features = [_features] if type(_features) == str else _features
    _exclude = [_exclude] if type(_exclude) == str else _exclude

    # Type check each feature/exclude
    if _features:
        for f in _features:
            _raise_error_if_not_of_type(f, str, "Feature names")
    if _exclude:
        for e in _exclude:
            _raise_error_if_not_of_type(e, str, "Excluded feature names")

    if _exclude is not None and _features is not None:
        feature_set = set(_features)
        for col_name in _exclude:
            if col_name in feature_set:
                raise ValueError(
                    "'%s' appears in both features and excluded_features." %
                    col_name)

    return _features, _exclude
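A few doctest-style calls illustrating the contract above; they assume process_features and its helpers are in scope:

>>> process_features('col_a', None)
(['col_a'], None)
>>> process_features(None, 'skip_me')
(None, ['skip_me'])
>>> process_features(['a'], ['b'])
Traceback (most recent call last):
    ...
ValueError: The parameters 'features' and 'exclude' cannot both be set. Please set one or the other.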
Example 30
    def set_description(self, description):
        """
        Set the description for this Task.

        Parameters
        ----------
        description : str
            A description for the Task.

        Returns
        -------
        self : Task
        """
        _raise_error_if_not_of_type(description, str, "description")
        self._data['description'] = description
        self._set_dirty_bit()
        return self
    def user_history(self, user_id, num_results=None):
        """
        Returns the observation data relevant to the provided user.
        """
        _raise_error_if_not_of_type(user_id, self._allowed_user_types,
                                    'user_id')

        try:
            result = self._observations_by_user.get_group(user_id)
        except RuntimeError as e:
            result = self._empty_observation_data
        if self.item_name_column:
            result = result.join(self._items, on=self.item_id_column)

        if num_results is not None:
            result = result.head(num_results)
        return result
    def set_name(self, name):
        """
        Set the name of the Task, which must be unique.

        Parameters
        ----------
        name : str
            Name of the Task.

        Returns
        -------
        self : Task
        """

        _raise_error_if_not_of_type(name, str, "name")
        self.name = str(name)
        self._set_dirty_bit()
        return self
    def __init__(self, name, session_aware=True):
        """
        Constructor for base Environment, should not be instantiated directly.
        """
        if not name:
            raise TypeError("Name is required when creating an Environment.")
        if _sys.version_info.major == 3:
            _raise_error_if_not_of_type(name, [str], 'name')
        else:
            _raise_error_if_not_of_type(name, [str, unicode], 'name')

        self._session = _gl.deploy._default_session
        self.name = name
        self._env_type = type(self).__name__
        self._modified_since_last_saved = None

        if session_aware:
            self._session.register(self)
Example 36
    def __init__(self, steps):
        """
        Parameters
        ----------
        steps: list[Transformer] | list[tuple(name, Transformer)]

            List of Transformers or (name, Transformer) tuples. These are
            chained in the order in which they are provided in the list.

        """
        # Basic type checking.
        _raise_error_if_not_of_type(steps, [list])

        # Split into (name, transformer) pairs. If the name is not present
        # then use the index as name.
        transformers = []
        index = 0
        for step in steps:
            if isinstance(step, tuple):
                name, tr = step
            else:
                tr = step
                name = index

            if isinstance(tr, list):
                tr = TransformerChain(tr)
            if not issubclass(tr.__class__, _TransformerBase):
                raise TypeError(
                    "Each step in the chain must be a Transformer.")
            transformers.append((name, tr))
            index = index + 1

        # Save into a dictionary for lookups by name and index.
        self._state = {}
        self._state["steps"] = steps
        self._state["steps_by_name"] = {}
        index = 0
        for name, tr in transformers:
            self._state["steps_by_name"][name] = tr
            index = index + 1

        # The transformers as (name, obj) tuple (used here for fitting
        # and transforming).
        self._transformers = transformers
Example 37
    def __init__(self, steps):
        """
        Parameters
        ----------
        steps: list[Transformer] | list[tuple(name, Transformer)]

            List of Transformers or (name, Transformer) tuples. These are
            chained in the order in which they are provided in the list.

        """
        # Basic type checking.
        _raise_error_if_not_of_type(steps, [list])

        # Split into (name, transformer) pairs. If the name is not present
        # then use the index as name.
        transformers = []
        index = 0
        for step in steps:
            if isinstance(step, tuple):
                name, tr = step
            else:
                tr = step
                name = index

            if isinstance(tr, list):
                tr = TransformerChain(tr)
            if not issubclass(tr.__class__, _TransformerBase):
                raise TypeError("Each step in the chain must be a Transformer.")
            transformers.append((name, tr))
            index = index + 1

        # Save into a dictionary for lookups by name and index.
        self._state = {}
        self._state["steps"] = steps
        self._state["steps_by_name"] = {}
        index = 0
        for name, tr in transformers:
            self._state["steps_by_name"][name] = tr
            index = index + 1

        # The transformers as (name, obj) tuple (used here for fitting
        # and transforming).
        self._transformers = transformers
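A usage sketch for chaining, assuming the transformers shown earlier are importable from graphlab.feature_engineering and that TransformerChain accepts both bare transformers and (name, transformer) tuples as described:

import graphlab as gl
from graphlab.feature_engineering import (OneHotEncoder, FeatureBinner,
                                          TransformerChain)

chain = TransformerChain([
    ('one_hot', OneHotEncoder(features=['category'])),
    FeatureBinner(features=['value'], num_bins=5),   # unnamed step, addressed by index
])

sf = gl.SFrame({'category': ['a', 'b', 'a'], 'value': [1.0, 10.0, 100.0]})
out = chain.fit_transform(sf)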
    def __init__(self, reference_features=None, feature="feature", verbose=False):

        # Process and make a copy of the reference_features
        _reference_features, _exclude = _internal_utils.process_features(reference_features, None)

        # Type checking
        _raise_error_if_not_of_type(feature, [str])

        # Set up options
        opts = {
          'reference_features': reference_features,
          'feature': feature,
          'verbose': verbose
        }
        opts['reference_features'] = _reference_features

        # Initialize object
        proxy = _gl.extensions._CategoricalImputer()
        proxy.init_transformer(opts)
        super(CategoricalImputer, self).__init__(proxy, self.__class__)
Example 39
    def delete_inputs(self, names):
        """
        Delete input(s) for this Task.

        Parameters
        ----------
        names : list [str]

            When a list is provided,  then each entry is considered a name for
            an input in this Task, and is hence removed.

        Returns
        -------
        self : Task

        See Also
        --------
        delete_output

        Examples
        --------
        To delete inputs from a task, pass a list of their names:


        >>> # For late binding
        >>> t1 = graphlab.deploy._task.Task(my_func, 'set_inputs_ex1')
        >>> t1.delete_inputs(['one', 'two', 'three'])

        """
        if names is None:
            return self
        _raise_error_if_not_of_type(names, [list], 'names')

        for name in set(names):
            self._set_one_input(name=name, delete=True)

        return self
    def item_details(self, item_id):
        """
        Obtain data for a given item.

        Parameters
        ----------
        item_id : int, str
            The id of the desired item.

        Returns
        -------
        out : SFrame
            Data for the desired item. If no row has the desired item_id, then
            an empty SFrame is returned.

        """
        _raise_error_if_not_of_type(item_id, self._allowed_item_types,
                                    'item_id')

        item = _gl.SFrame({self.item_id_column: [item_id]})
        return item.join(self._items, on=self.item_id_column)
Example 42
    def fit(self, dataset):
        """
        Fits a transformer using the SFrame `dataset`.

        Parameters
        ----------
        dataset : SFrame
            The data used to fit the transformer.

        Returns
        -------
        self (A fitted object)

        See Also
        --------
        transform, fit_transform
        """
        _mt._get_metric_tracker().track(self.__class__.__module__ + '.fit')

        _raise_error_if_not_of_type(dataset, [_SFrame])

        fitted_state = {}
        feature_columns = get_column_names(dataset, self._exclude,
                                           self._features)
        feature_columns = select_valid_features(dataset, feature_columns,
                                                [str, list])
        fitted_state['features'] = feature_columns
        validate_feature_columns(dataset.column_names(), feature_columns)

        fitted_state['col_type_map'] = {
            col_name: col_type
            for (col_name, col_type
                 ) in zip(dataset.column_names(), dataset.column_types())
        }

        fitted_state['fitted'] = True

        self.__proxy__.update(fitted_state)

        return self
Example 43
    def __init__(
        self,
        features=None,
        excluded_features=None,
        min_document_frequency=0.0,
        max_document_frequency=1.0,
        output_column_prefix=None,
    ):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(min_document_frequency, [float, int])
        _raise_error_if_not_of_type(max_document_frequency, [float, int])
        _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

        # Set up options
        opts = {
            "min_document_frequency": min_document_frequency,
            "max_document_frequency": max_document_frequency,
            "output_column_prefix": output_column_prefix,
        }
        if _exclude:
            opts["exclude"] = True
            opts["features"] = _exclude
        else:
            opts["exclude"] = False
            opts["features"] = _features

        # Initialize object
        proxy = _gl.extensions._TFIDF()
        proxy.init_transformer(opts)
        super(TFIDF, self).__init__(proxy, self.__class__)
    def __init__(self, features, model='auto', output_column_prefix=None):
        """
        Parameters
        ----------
        """
        _raise_error_if_not_of_type(features, [str, list, type(None)])
        _raise_error_if_not_of_type(model, [str, _NeuralNetClassifier])
        _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

        if isinstance(features, str):
            features = [features]

        # Set the model.
        self._state = {}
        self._state["features"] = features
        if not output_column_prefix:
            output_column_prefix = "deep_features"
        self._state["output_column_prefix"] = output_column_prefix

        self._state['model'] = model
        if self._state["model"] == 'auto':
            model_path = \
    "https://static.turi.com/products/graphlab-create/resources/models/python2.7/imagenet_model_iter45"
            import graphlab as gl
            self._state['model'] = gl.load_model(model_path)
        if type(self._state['model']) is not _NeuralNetClassifier:
            raise ValueError(
                "Model parameters must be of type NeuralNetClassifier " +
                "or string literal 'auto'")
    def __init__(self, feature, model = 'auto', output_column_name=None):
        """
        Parameters
        ----------
        """
        _raise_error_if_not_of_type(feature, [str])
        _raise_error_if_not_of_type(model, [str, _NeuralNetClassifier])
        _raise_error_if_not_of_type(output_column_name, [str, _NoneType])

        # Set the model.
        self._state = {}
        self._state["features"] = feature
        if not output_column_name:
            self._state["output_column_name"] = "deep_features_%s" % feature
        else:
            self._state["output_column_name"] = output_column_name
        self._state['model'] = model
        if self._state["model"] == 'auto':
            model_path = \
    "http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45"
            import graphlab as gl
            self._state['model'] = gl.load_model(model_path)
        if type(self._state['model']) is not _NeuralNetClassifier:
            raise ValueError("Model parameters must be of type NeuralNetClassifier " +
                "or string literal 'auto'")
    def __init__(self, features=None, excluded_features=None,
                 min_document_frequency=0.0,
                 max_document_frequency=1.0,
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(min_document_frequency, [float, int])
        _raise_error_if_not_of_type(max_document_frequency, [float, int])
        _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

        # Set up options
        opts = {
          'min_document_frequency': min_document_frequency,
          'max_document_frequency': max_document_frequency,
          'output_column_prefix' : output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._TFIDF()
        proxy.init_transformer(opts)
        super(TFIDF, self).__init__(proxy, self.__class__)
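For reference, the quantity these TFIDF constructors compute follows the usual term-frequency times inverse-document-frequency convention; the extension's exact smoothing is not shown here, so the helper below is only a sketch:

import math

def tf_idf(term_count, num_docs, doc_frequency):
    # term_count: occurrences of the term in this document.
    # doc_frequency: number of documents in the corpus containing the term.
    return term_count * math.log(float(num_docs) / (1 + doc_frequency))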
    def from_model(cls, model, tree_id = 0):
        import graphlab as _gl
        from graphlab.toolkits import _supervised_learning as _sl
        import json as _json

        _raise_error_if_not_of_type(tree_id, [int,long], "tree_id")
        _numeric_param_check_range("tree_id", tree_id, 0, model.num_trees - 1)

        tree = DecisionTree()
        nodes = {}
        tree_str = _gl.extensions._xgboost_get_tree(model.__proxy__, tree_id)
        metadata_mapping = _gl.extensions._get_metadata_mapping(model.__proxy__)
        trees_json = _json.loads(tree_str)

        # Parse the tree from the JSON.
        tree._make_tree(trees_json, metadata_mapping)
        tree.root_id = 0

        # Keep track of the attributes.
        for key in {"num_examples", "num_features", "num_unpacked_features",
                "max_depth"}:
            setattr(tree, key, model[key])
        return tree
def process_features(features, exclude):
    """
    Parameters
    ----------
    features : list[str] | str | None, optional
        Column names of features to be transformed. If None, all columns
        are selected. If string, that column is transformed. If list of strings,
        this list of column names is selected.

    exclude : list[str] | str | None, optional
        Column names of features to be ignored in transformation. Can be string
        or list of strings. Either 'exclude' or 'features' can be passed, but
        not both.

    Returns
    -------
    (features, exclude) that are processed.

    """
    # Make a copy of the parameters.
    _features = _copy.copy(features)
    _exclude = _copy.copy(exclude)

    # Check that 'features' and 'exclude' are not both set, and that
    # 'features' is not an empty list.
    if _features and _exclude:
        raise ValueError("The parameters 'features' and 'exclude' cannot both be set."
                " Please set one or the other.")
    if _features == [] and not _exclude:
        raise ValueError("Features cannot be an empty list.")

    # Check types
    _raise_error_if_not_of_type(_features, [NoneType, str, list], 'features')
    _raise_error_if_not_of_type(_exclude, [NoneType, str, list], 'exclude')

    # Allow a single string (promote it to a list).
    _features = [_features] if type(_features) == str else _features
    _exclude = [_exclude] if type(_exclude) == str else _exclude

    # Type check each feature/exclude
    if _features:
        for f in _features:
            _raise_error_if_not_of_type(f, str, "Feature names")
    if _exclude:
        for e in _exclude:
            _raise_error_if_not_of_type(e, str, "Excluded feature names")

    return _features, _exclude
Example 49
    def __init__(self, feature, min_document_frequency=0.0, max_document_frequency=1.0, output_column_name=None):

        # Type checking
        _raise_error_if_not_of_type(feature, [str])
        _raise_error_if_not_of_type(min_document_frequency, [float, int])
        _raise_error_if_not_of_type(max_document_frequency, [float, int])
        _raise_error_if_not_of_type(output_column_name, [str, _NoneType])

        # Set up options
        opts = {
            "features": [feature],
            "min_document_frequency": min_document_frequency,
            "max_document_frequency": max_document_frequency,
            "output_column_name": output_column_name,
        }

        # Initialize object
        proxy = _gl.extensions._TFIDF()
        proxy.init_transformer(opts)
        super(TFIDF, self).__init__(proxy, self.__class__)
Example 50
    def __init__(self, feature, query, k1 = 1.5, b = 0.75, min_document_frequency = 0.0,
                 max_document_frequency=1.0, output_column_name=None):

        # Convert query to list if necessary
        if isinstance(query, _gl.SArray):
            query = list(query)
        if isinstance(query, set):
            query = list(query)

        # Type checking
        _raise_error_if_not_of_type(feature, [str])
        for q in query:
            _raise_error_if_not_of_type(q, [str]) # query must be list of strings
        _raise_error_if_not_of_type(k1, [float, int])
        _raise_error_if_not_of_type(b, [float, int])
        _raise_error_if_not_of_type(min_document_frequency, [float, int])
        _raise_error_if_not_of_type(max_document_frequency, [float, int])
        _raise_error_if_not_of_type(output_column_name, [str, _NoneType])

        # Set up options
        opts = {
          'features': [feature],
          'query': query,
          'k1': k1,
          'b': b,
          'min_document_frequency': min_document_frequency,
          'max_document_frequency': max_document_frequency,
          'output_column_name' : output_column_name
        }

        # Initialize object
        proxy = _gl.extensions._BM25()
        proxy.init_transformer(opts)
        super(BM25, self).__init__(proxy, self.__class__)
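The score assembled from these options follows the standard Okapi BM25 formula; a compact, self-contained sketch (the extension's internals may differ in smoothing details):

import math

def bm25_score(query_terms, doc_terms, num_docs, doc_freq, avg_doc_len,
               k1=1.5, b=0.75):
    # doc_freq maps a term to the number of documents that contain it.
    doc_len = len(doc_terms)
    score = 0.0
    for q in query_terms:
        f = doc_terms.count(q)  # term frequency in this document
        if f == 0:
            continue
        idf = math.log((num_docs - doc_freq.get(q, 0) + 0.5) /
                       (doc_freq.get(q, 0) + 0.5))
        score += idf * f * (k1 + 1) / (f + k1 * (1 - b + b * doc_len / avg_doc_len))
    return score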
def create(dataset, item, features=None, min_support=1, max_patterns=100,
           min_length=1):
    """
    Create a :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner` to
    extract the set of frequently occurring items in an event-series.

    Parameters
    ----------

    dataset : SFrame
        Dataset for training the model.

    item: string
        Name of the column containing the item. The values in this column must
        be of string or integer type.

    features : list[string], optional
        Names of the columns containing features. 'None' (the default) indicates
        that all columns except the item column should be used as features.

        The feature columns are the ones that together identify a unique
        transaction ID for the item.

    min_support : int, optional
        The minimum number of times that a pattern must occur in order for it
        to be considered `frequent`.

    max_patterns : int, optional
        The maximum number of frequent patterns to be mined.

    min_length : int, optional
        The minimum size (number of elements in the set) of each pattern being
        mined.

    Returns
    -------
    out : FrequentPatternMiner
        A trained model of type
        :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner`.

    Notes
    -----
    Frequent closed itemsets are mined using the `top-k FP growth` algorithm.
    Mining continues until the top max_patterns closed itemsets of size at least
    min_length and support at least min_support are found.

    See Also
    --------
    FrequentPatternMiner

    References
    ----------

    - Wikipedia, `Association rule learning
      <https://en.wikipedia.org/wiki/Association_rule_learning>`_
    - Han, Jiawei, et al. "Mining top-k frequent closed patterns without minimum
      support." Proceedings of the 2002 IEEE International Conference on Data
      Mining (ICDM), 2002.
    - Wang, Jianyong, et al. "TFP: An efficient algorithm for mining top-k
      frequent closed itemsets." IEEE Transactions on Knowledge and Data
      Engineering 17.5 (2005): 652-663.

    Examples
    --------

    .. sourcecode:: python

        >>> import graphlab as gl
        >>> bakery_sf = gl.SFrame("http://s3.amazonaws.com/dato-datasets/bakery.sf")
        >>> bakery_sf
        Data:
        +---------+-------------+-------+----------+----------+-----------------+
        | Receipt |   SaleDate  | EmpId | StoreNum | Quantity |       Item      |
        +---------+-------------+-------+----------+----------+-----------------+
        |    1    | 12-JAN-2000 |   20  |    20    |    1     |  GanacheCookie  |
        |    1    | 12-JAN-2000 |   20  |    20    |    5     |     ApplePie    |
        |    2    | 15-JAN-2000 |   35  |    10    |    1     |   CoffeeEclair  |
        |    2    | 15-JAN-2000 |   35  |    10    |    3     |     ApplePie    |
        |    2    | 15-JAN-2000 |   35  |    10    |    4     |   AlmondTwist   |
        |    2    | 15-JAN-2000 |   35  |    10    |    3     |    HotCoffee    |
        |    3    |  8-JAN-2000 |   13  |    13    |    5     |    OperaCake    |
        |    3    |  8-JAN-2000 |   13  |    13    |    3     |   OrangeJuice   |
        |    3    |  8-JAN-2000 |   13  |    13    |    3     | CheeseCroissant |
        |    4    | 24-JAN-2000 |   16  |    16    |    1     |   TruffleCake   |
        +---------+-------------+-------+----------+----------+-----------------+
        [266209 rows x 6 columns]

        >>> model = gl.frequent_pattern_mining.create(bakery_sf, 'Item',
        ...              features=['Receipt'], min_length=4, max_patterns=500)
        Model fields
        ------------
        Min support                   : 1
        Max patterns                  : 500
        Min pattern length            : 4

        Most frequent patterns
        ----------------------
        ['CoffeeEclair', 'HotCoffee', 'AlmondTwist', 'ApplePie']: 1704
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'RaspberryCookie']: 1565
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'GreenTea']: 1290
        ['LemonLemonade', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1289
        ['LemonLemonade', 'LemonCookie', 'RaspberryCookie', 'GreenTea']: 1279
        ['LemonCookie', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1279
        ['AppleTart', 'AppleDanish', 'AppleCroissant', 'CherrySoda']: 1253
        ['LemonLemonade', 'LemonCookie', 'RaspberryLemonade', 'RaspberryCookie', 'GreenTea']: 1221
        ['CherryTart', 'ApricotDanish', 'OperaCake', 'ApricotTart']: 61
        ['CherryTart', 'ApricotDanish', 'OperaCake', 'RaspberryLemonade']: 55
    """
    _mt._get_metric_tracker().track('toolkit.frequent_pattern_mining.create')

    # Type checking.
    _raise_error_if_not_sframe(dataset, "dataset")
    _raise_error_if_not_of_type(item, str, "item")
    _raise_error_if_not_of_type(features, [list, _types.NoneType], "features")
    _raise_error_if_not_of_type(min_support, [int, float], "min_support")
    _raise_error_if_not_of_type(max_patterns, [int, float], "max_patterns")
    _raise_error_if_not_of_type(min_length, [int, float], "min_length")

    # Value checking.
    column_names = dataset.column_names()

    # If features is None, then use all other column names than item
    if features is None:
        features = column_names
        features.remove(item)

    # Call the C++ create function.
    proxy = _gl.extensions._pattern_mining_create(
            dataset, item, features, min_support, max_patterns, min_length)
    return FrequentPatternMiner(proxy)
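
# Hedged sketch: the raw support of a single candidate pattern can be
# cross-checked with a plain SFrame aggregation (assumes the bakery_sf layout
# from the docstring above, one row per Receipt/Item pair):
#
#     >>> receipts = bakery_sf.groupby('Receipt',
#     ...                              {'items': gl.aggregate.CONCAT('Item')})
#     >>> pattern = set(['CoffeeEclair', 'HotCoffee', 'AlmondTwist', 'ApplePie'])
#     >>> support = receipts['items'].apply(
#     ...     lambda items: 1 if pattern.issubset(set(items)) else 0).sum()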
def create(data, row_label=None, features=None, feature_model='auto',
           method='lsh', verbose=True):
    """
    Create a similarity search model, which can be used to quickly retrieve
    items similar to a query observation. In the case of images, this model
    automatically performs the appropriate feature engineering steps. NOTE:
    If you are using a CPU for the creation step with feature_model='auto',
    creation time may take a while. This is because extracting features for
    images on a CPU is expensive. With a GPU, one can expect large speedups.

    .. warning::

        The similarity search toolkit is currently in beta, and feedback is
        welcome! Please send comments to [email protected].

    Parameters
    ----------
    data : SFrame
        The SFrame that represents the training data for the model, including at
        least one column of images.

    row_label : str, optional
        Name of the SFrame column with row id's. If 'row_label' is not
        specified, row numbers are used to identify reference dataset rows when
        the model is queried.

    features : str, optional
        The name of an image column in the input 'data' SFrame.

    feature_model : 'auto' | A model of type NeuralNetClassifier, optional
        A trained model for extracting features from raw data objects. By
        default ('auto'), we choose an appropriate model from our set of
        pre-trained models. See
        :class:`~graphlab.toolkits.feature_engineering.DeepFeatureExtractor` for
        more information.

    method : {'lsh', 'brute_force'}, optional
        The method used for nearest neighbor search. The 'lsh' option uses
        locality-sensitive hashing to find approximate results more quickly.

    verbose : bool, optional
        If True, print verbose output during model creation.

    Returns
    -------
    out : SimilaritySearchModel

    See Also
    --------
    SimilaritySearchModel
    graphlab.toolkits.nearest_neighbors
    graphlab.toolkits.feature_engineering

    Notes
    -----
    The similarity search toolkit currently uses cosine distance to score the
    similarity between each query and the candidate results (a short sketch of
    this distance follows this function).

    Examples
    --------
    First, split data into reference and query.

    >>> import graphlab as gl

    >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
    >>> reference, query = data.random_split(0.8)

    Build neuralnet feature extractor for images:

    >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

    Construct SimilaritySearchModel:

    >>> model = gl.similarity_search.create(reference, features= 'image',
    ...                                     feature_model=nn_model)

    Find the most similar items in the reference set for each item in the query
    set:

    >>> model.search(query)
    """

    _mt._get_metric_tracker().track(__name__ + '.create')

    _raise_error_if_not_of_type(data, [_SFrame])
    _raise_error_if_not_of_type(features, [str])
    _raise_error_if_column_exists(data, features)

    if data[features].dtype() != _Image:
        raise _ToolkitError("Feature `%s` must be of type Image" \
                % features)

    return SimilaritySearchModel(data, row_label=row_label, feature=features,
            feature_model=feature_model, method=method, verbose=verbose)
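
# The Notes section above says candidates are scored by cosine distance. A
# minimal, toolkit-independent sketch of that distance on two extracted
# feature vectors (numpy is assumed here and is not used by the code above):
#
#     >>> import numpy as np
#     >>> def cosine_distance(a, b):
#     ...     return 1.0 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
#     >>> cosine_distance(np.array([1.0, 0.0]), np.array([1.0, 1.0]))  # ~0.2929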
    def search(self, data, row_label=None, k=5):
        """
        Search for the nearest neighbors from the reference set for each element
        of the query set. The query SFrame must include columns with the same
        names as the row_label and feature columns used to create the
        SimilaritySearchModel.

        Parameters
        ----------
        data : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored.

        row_label : string, optional
            Name of the query SFrame column with row id's. If 'row_label' is not
            specified, row numbers are used to identify query dataset rows in
            the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation. The default is 5 neighbors.

        Returns
        -------
        out : SFrame
            An SFrame containing the nearest neighbors for each query row.

        Examples
        --------
        First, split data into reference and query:

        >>> import graphlab as gl
        >>> data = gl.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train6k')
        >>> reference, query = data.random_split(0.8)

        Build a neural net feature extractor for images:

        >>> nn_model = gl.neuralnet_classifier.create(reference, target='label')

        Construct the SimilaritySearchModel:

        >>> model = gl.similarity_search.create(reference, features='image',
        ...                                     feature_model=nn_model)

        Find the most similar items in the reference set for each query:

        >>> model.search(query)
        """

        _raise_error_if_not_of_type(row_label, [str, _NoneType])
        feature = self._state['features']
        _raise_error_if_column_exists(data, feature)

        if data[feature].dtype() != self._feature_type:
            raise ValueError('Feature columns must have the same data type in '
                             'both the reference and query sets')

        if row_label is not None:
            _raise_error_if_column_exists(data, row_label)

        if data[feature].dtype() == _Image:
            transformed_data = self._extractor.transform(data)
        else:
            transformed_data = data
            transformed_data[self._state['output_column_name']] = transformed_data[feature]

        return self._neighbors_model.query(transformed_data, label=row_label, k=k)
Example 54
    def __init__(self, name, stages=[[]], final_stage=None, environment=None,
                 _exec_dir=None, _task_output_paths=None, _job_type='PIPELINE'):
        """
        Construct a job.

        Parameters
        ----------
        name : str
            Name of this Job, must be unique.

        stages : list[list[Task]]
            Collection of task(s) to be executed. A small layout sketch
            follows this constructor.

        final_stage : list[Task] | Task
            Collection of task(s) whose outputs are to be returned.

        environment : Environment, optional
            Environment used for this execution. See
            :py:class:`~graphlab.deploy.environment.LocalAsync` for an example
            environment.

        """
        _raise_error_if_not_of_type(name, [str], 'name')
        _raise_error_if_not_of_type(stages, [list], 'stages')
        _raise_error_if_not_of_type(final_stage,
                                [list, _Task, type(None)], 'final_stage')

        self.name = name
        self.environment = environment

        self._stages = stages
        self._num_tasks = 0
        self._status = 'Pending'
        self._start_time = None
        self._end_time = None
        self._error = None

        self._job_type = _job_type

        # Set the packages
        self._packages = set()
        for task in self._stages:
            for t in task:
                self._num_tasks += 1
                self._packages.update(t.get_required_packages())


        self._final_stage = final_stage
        self._task_status = {}

        self._session = _gl.deploy._default_session
        if not _exec_dir:
            relative_path = "job-results-%s" % str(_uuid())
            self._exec_dir = self.get_path_join_method()(self._session.results_dir, relative_path)
        else:
            self._exec_dir = _exec_dir

        # Location where all the outputs for the tasks are saved.
        if not _task_output_paths:
            Job._update_exec_dir(self, self._exec_dir)
        else:
            self._task_output_paths = _task_output_paths
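
# Hedged sketch of the stages layout this constructor expects: a list of
# stages, where each stage is itself a list of Task objects that may run
# together, and final_stage names the task(s) whose outputs are returned.
# task_a, task_b and task_c are assumed to be already-constructed Task objects:
#
#     >>> job = Job('nightly-train',
#     ...           stages=[[task_a, task_b], [task_c]],
#     ...           final_stage=[task_c])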
    def __init__(self, features=None, excluded_features=None,
                 output_column_name='quadratic_features'):

        # Type checking
        _raise_error_if_not_of_type(output_column_name, [str])

        # Set up options
        opts = {
            'output_column_name': output_column_name
        }
        # Make a copy of the parameters.
        _features = _copy.copy(features)
        _exclude = _copy.copy(excluded_features)


        # Check that both are not set at the same time.
        if _features and _exclude:
            raise ValueError("The parameters 'features' and 'excluded_features' "
                             "cannot both be set. Please set one or the other.")
        if _features == [] and not _exclude:
            raise ValueError("Features cannot be an empty list.")

        # Check types
        _raise_error_if_not_of_type(_features, [NoneType, list, str, tuple], 'features')
        _raise_error_if_not_of_type(_exclude, [NoneType,  list, str, tuple], 'exclude')

        # Allow a single string or tuple in place of a list
        _features = [_features] if type(_features) == str or type(_features) == tuple else _features
        _exclude = [_exclude] if type(_exclude) == str or type(_exclude) == tuple else _exclude


        # Type check each feature/exclude
        if _features:
            for f in _features:
                _raise_error_if_not_of_type(f, [str, tuple], "Feature names")
        if _exclude:
            for e in _exclude:
                _raise_error_if_not_of_type(e, [str, tuple], "Excluded feature names")

        if _exclude:
            opts['exclude'] = True
            unprocessed_features = _exclude
        else:
            opts['exclude'] = False
            unprocessed_features = _features

        pair_list = set()

        if unprocessed_features is not None:
            if type(unprocessed_features[0]) is tuple:
                for t in unprocessed_features:
                    pair_list.add(tuple(sorted(t)))
            elif type(unprocessed_features[0]) is str:
                if _exclude:
                    for t in unprocessed_features:
                        pair_list.add(t)
                else:
                    for t in unprocessed_features:
                        for k in unprocessed_features:
                            pair_list.add(tuple(sorted((t, k))))

        if type(output_column_name) is not str:
            raise ValueError("'output_column_name' must be of type str")

        if unprocessed_features is not None:
            if type(unprocessed_features[0]) is str:
                opts['features'] = unprocessed_features
                if _exclude:
                    opts['feature_pairs'] = list(pair_list)
                else:
                    opts['feature_pairs'] = [list(x) for x in pair_list]
            else:
                opts['feature_pairs'] = [list(x) for x in pair_list ]
                opts['features'] = [list(x) for x in unprocessed_features]
        else:
            opts['feature_pairs'] = None
            opts['features'] = None


        # Initialize object
        proxy = _gl.extensions._QuadraticFeatures()
        proxy.init_transformer(opts)
        super(QuadraticFeatures, self).__init__(proxy, self.__class__)
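
# A standalone sketch of the pair expansion performed above when plain string
# feature names are given: every unordered pair (including self-pairs) is
# generated exactly once, mirroring the nested loop over unprocessed_features:
#
#     >>> feats = ['a', 'b', 'c']
#     >>> pairs = set()
#     >>> for t in feats:
#     ...     for k in feats:
#     ...         pairs.add(tuple(sorted((t, k))))
#     >>> sorted(pairs)
#     [('a', 'a'), ('a', 'b'), ('a', 'c'), ('b', 'b'), ('b', 'c'), ('c', 'c')]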
Example 56
  def sample(self, k, diversity=0.5, method=None, side_data=None, **kwargs):
    """
    After constructing a diverse sampler, sample a diverse set stochastically.
    The stochastic algorithm depends on the sampling method itself.

    Parameters
    ----------
    k : int
      The number of items to sample.

    diversity : double in [0, 1], optional 
      This is a tunable parameter that trades off between quality and diversity.
      A diversity factor of 0 will only consider quality when building a set
      (equivalent to using the method "quality_only"), while a diversity factor
      of 1 will only consider item similarity and will ignore quality. A value
      between 0 and 1 will force the algorithm to trade off between quality and
      diversity.

      The actual effect of the diversity factor depends on the algorithm:
        - When method="weighted_vertex_cover", the diversity factor changes the
          number of nearest-neighbors to remove when sampling an item.
          Specifically, the number of neighbors is set to
          floor((N-1)/(k-1) * diversity_factor); see the short sketch after
          this method.

        - When method="ipsen", the diversity factor will scale the similarity
          values by the value of diversity, and the quality values by
          (1-diversity).

    method : {'random', 'quality_only', 'weighted_vertex_cover', 'ipsen'},
              optional
      The sampling method to use. The options available are:

      - *"random"*: Returns a completely random set of items, with no reference
        to item qualities or similarities. Note that the greedy method is
        undefined for a random sampler.

      - *"quality_only"*: Form a sampling distribution with the item qualities,
        and return a set from this distribution. The sample() method will sample
        a set from this distribution, while the greedy() method will return the
        top-k items according to item quality. 

        Requirements: The column quality_feature must be present.

      - *"ipsen"*: Sample a diverse set using an approximation to the log-
        determinant. 

        One method of sampling diverse sets is to use determinantal point
        process (DPP) sampling (see: http://arxiv.org/abs/1207.6083). Given any
        set of items, one measure of diversity is the log-determinant of the
        items' similarity matrix L. The diagonal entry L_{ii} corresponds to the
        quality of item i, while the off-diagonal entries L_{ij} correspond to
        the similarity between items i and j. The log-determinant of this matrix
        corresponds directly to the joint quality-diversity of the items that
        define L (high-qualities lead to a larger value of the determinant,
        while large similarities diminish the log-determinant). However, DPP
        sampling does not scale well, so we can instead approximate the log-
        determinant of a similarity matrix using more scalable methods.

        The Ipsen sampler uses the block-approximation of the determinant given
        in http://arxiv.org/pdf/1105.0437v1.pdf in order to mimic DPP sampling
        in a scalable fashion.

        Requirements: The columns quality_feature and similarity_features
        must be present.

      - *"weighted_vertex_cover"*: Sample a set of items with high quality, and
        with no (or a minimum) of nearest-neighbors also in the set. Given a
        graph with a quality field on the vertices and edges connecting similar
        items, for each item, this algorithm either samples from a distribution
        formed by item qualities or selects the item with the maximum quality,
        and then "covers" (or removes from consideration) that item's neighbors.

        There are two options depending on whether you pass in an SGraph or an
        SFrame. You can:
         1. Define similarity by passing in an SGraph, where an edge between two
            vertices denotes the fact that those items are neighbors. Any time a
            point is sampled, all of its neighbors in the graph are removed.
         2. Or, you can pass in an SFrame and the additional keyword argument
            "num_neighbors". Then when a point is sampled, its num_neighbors
            nearest-neighbors will be removed from consideration.

        Requirements: The parameters quality_feature and similarity_features
        must be defined (where they either match column names or vertex and edge
        fields).

      If no method is given, the default is to use the weighted_vertex_cover
      algorithm with a diversity factor of 0.1.

    side_data : SFrame, optional
      An ID-based subset of the original data to sample from. Sometimes you may
      wish to sample from only a subset of the original data - e.g., only
      provide a diverse sample of movies from a particular user's top
      recommendations. In addition, some features may not be initially available
      when creating the sampler object. In order to sample from a subset of IDs,
      with the option to add additional features, set side_data to  an SFrame
      with a column of IDs and (optionally) additional features. Note that the
      model must be aware of these initial features when creating it by adding
      the column names for the side-quality or side-similarity features.

      The sampler will first subset the groundset by the list of IDs passed.
      Then the sampler will use any updated or additional quality or similarity
      features in side_data. If some feature is not available in side_data, the
      sampler uses the original features in the SFrame or SGraph passed in with
      create().

      If side_data is empty, then the sampler will return subsets from the
      original SFrame or SGraph passed in with the data parameter used in
      create().

    similarity_function : string
      TODO: I haven't added this yet

    **kwargs : optional
      Additional method-specific parameters for fine-tuning.

      - *wvc_neighbors*: For method=weighted_vertex_cover and a sampler
        constructed with an SFrame, remove this many nearest neighbors when a
        point is sampled.

    Examples
    --------
    Sample k items directly from the ground set passed in via create() with the
    default sampling methods:

    >>> sf = graphlab.SFrame.read_csv(
          'https://s3.amazonaws.com/dato-datasets/auto-mpg/auto-mpg.csv')
    >>> sampler = graphlab.diversity.diverse_sampler.create(data=sf, 
                                              item_id='name', 
                                              quality_feature='accel', 
                                              similarity_features=['mpg', 
                                                                   'displ', 
                                                                   'hp', 
                                                                   'weight'])
    >>> sampler.sample(k=5)
    +-----+-----+-------+-----+--------+-------+----+--------+----------------------+
    | mpg | cyl | displ |  hp | weight | accel | yr | origin |         name         |
    +-----+-----+-------+-----+--------+-------+----+--------+----------------------+
    |  15 |  8  | 318.0 | 150 |  3777  |  12.5 | 73 |   1    | dodge coronet custom |
    |  15 |  6  | 258.0 | 110 |  3730  |  19.0 | 75 |   1    |     amc matador      |
    |  30 |  4  |  97.0 |  67 |  1985  |  16.4 | 77 |   3    |      subaru dl       |
    |  34 |  4  |  86.0 |  65 |  1975  |  15.2 | 79 |   3    |   maxda glc deluxe   |
    |  32 |  4  |  98.0 |  70 |  2120  |  15.5 | 80 |   1    |  chevrolet chevette  |
    +-----+-----+-------+-----+--------+-------+----+--------+----------------------+

    This method returns an SFrame (or SGraph, depending on what was used to
    create the sampler) containing the sampled items. If the diverse sampler was
    created with an SGraph, the sampler will return an SFrame containing the
    sampled vertices and their associated fields. You can change the sampling
    method with the "method" keyword. The default algorithm is weighted vertex
    cover.

    >>> sf = sampler.sample(k=5, method='ipsen')
    +-----+-----+-------+-----+--------+-------+----+--------+-----------------------+
    | mpg | cyl | displ |  hp | weight | accel | yr | origin |          name         |
    +-----+-----+-------+-----+--------+-------+----+--------+-----------------------+
    |  15 |  8  | 350.0 | 165 |  3693  |  11.5 | 70 |   1    |   buick skylark 320   |
    |  17 |  8  | 302.0 | 140 |  3449  |  10.5 | 70 |   1    |      ford torino      |
    |  15 |  8  | 400.0 | 150 |  3761  |  9.5  | 70 |   1    | chevrolet monte carlo |
    |  22 |  6  | 198.0 |  95 |  2833  |  15.5 | 70 |   1    |    plymouth duster    |
    |  19 |  6  | 232.0 | 100 |  2634  |  13.0 | 71 |   1    |      amc gremlin      |
    +-----+-----+-------+-----+--------+-------+----+--------+-----------------------+

    Instead of stochastic sampling, you can also force the algorithm to try to
    form the best possible set by using the greedy method:

    >>> sf = sampler.sample(k=5, greedy=True)

    It's possible to tune the methods with the "diversity" keyword, which can
    range between 0 and 1. Larger values will favor reducing inter-item
    similarity (increasing diversity), while smaller values will favor high-
    quality items (decreasing diversity).

    >>> sf = sampler.sample(k=5, diversity=0.0, method='ipsen')
    +-----+-----+-------+-----+--------+-------+----+--------+--------------------+
    | mpg | cyl | displ |  hp | weight | accel | yr | origin |        name        |
    +-----+-----+-------+-----+--------+-------+----+--------+--------------------+
    |  14 |  8  | 440.0 | 215 |  4312  |  8.5  | 70 |   1    | plymouth fury iii  |
    |  15 |  8  | 390.0 | 190 |  3850  |  8.5  | 70 |   1    | amc ambassador dpl |
    |  18 |  6  | 199.0 |  97 |  2774  |  15.5 | 70 |   1    |     amc hornet     |
    |  18 |  6  | 232.0 | 100 |  3288  |  15.5 | 71 |   1    |    amc matador     |
    |  11 |  8  | 429.0 | 208 |  4633  |  11.0 | 72 |   1    |  mercury marquis   |
    +-----+-----+-------+-----+--------+-------+----+--------+--------------------+

    >>> sf = sampler.sample(k=5, diversity=1.0, method='ipsen')
    +-----+-----+-------+-----+--------+-------+----+--------+---------------------------+
    | mpg | cyl | displ |  hp | weight | accel | yr | origin |            name           |
    +-----+-----+-------+-----+--------+-------+----+--------+---------------------------+
    |  18 |  8  | 307.0 | 130 |  3504  |  12.0 | 70 |   1    | chevrolet chevelle malibu |
    |  15 |  8  | 350.0 | 165 |  3693  |  11.5 | 70 |   1    |     buick skylark 320     |
    |  18 |  8  | 318.0 | 150 |  3436  |  11.0 | 70 |   1    |     plymouth satellite    |
    |  16 |  8  | 304.0 | 150 |  3433  |  12.0 | 70 |   1    |       amc rebel sst       |
    |  18 |  6  | 171.0 |  97 |  2984  |  14.5 | 75 |   1    |         ford pinto        |
    +-----+-----+-------+-----+--------+-------+----+--------+---------------------------+

    Finally, if you want to restrict the ground set to a smaller subset, you can
    pass in a list of IDs with the "side_data" keyword:

    >>> ford_sf = sf[sf['name'].apply(lambda x: 'ford' in x)]['name']
    >>> sampler.sample(k=5, side_data=graphlab.SFrame({'name': ford_sf}))
    +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
    |          name         | mpg | cyl | displ |  hp | weight | accel | yr | origin |
    +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
    |  ford pinto runabout  |  21 |  4  | 122.0 |  86 |  2226  |  16.5 | 72 |   1    |
    |     ford maverick     |  18 |  6  | 250.0 |  88 |  3021  |  16.5 | 73 |   1    |
    | ford gran torino (sw) |  14 |  8  | 302.0 | 140 |  4638  |  16.0 | 74 |   1    |
    |  ford fairmont (auto) |  20 |  6  | 200.0 |  85 |  2965  |  15.8 | 78 |   1    |
    |    ford ltd landau    |  17 |  8  | 302.0 | 129 |  3725  |  13.4 | 79 |   1    |
    +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+

    You can also add updated features, or even features that weren't passed in
    when creating the model (as long as they are one of the features specified
    in "quality_feature" or "similarity_featurs"). These new features will be
    joined to the original dataset. However, if any new features were not
    specified in the "similarity_features" parameter during sampler creation,
    they will not be included when computing similarity between items.

    """
    _raise_error_if_not_of_type(k, int)

    if side_data is not None:
      _raise_error_if_not_of_type(side_data, _gl.SFrame)

    opts = dict()

    if method is not None:
      opts["method"] = method

    if diversity < 0.0 or diversity > 1.0:
      raise ValueError("The diversity parameter must be between 0.0 and 1.0.")
    opts["diversity"] = diversity

    if "wvc_neighbors" in kwargs.keys():
      opts["num_neighbors"] = kwargs["wvc_neighbors"]
    if "greedy" in kwargs.keys():
      opts["greedy"] = kwargs["greedy"]

    if side_data is None:
      return self.__proxy__.sample_from_ground_set(k, opts)
    else:
      return self.__proxy__.sample_from_frame_ref_data(k, side_data, opts)
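
# Hedged sketch of the neighbor-count rule quoted in the docstring above for
# method="weighted_vertex_cover": the number of neighbors removed per sampled
# item is floor((N-1)/(k-1) * diversity), where N is the ground-set size:
#
#     >>> import math
#     >>> N, k, diversity = 101, 6, 0.5
#     >>> int(math.floor((N - 1) / float(k - 1) * diversity))
#     10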
def create(graph, label_field,
           threshold=1e-3,
           weight_field='',
           self_weight=1.0,
           undirected=False,
           max_iterations=None,
           _single_precision=False,
           _distributed='auto',
           verbose=True):
    """
    Given a weighted graph with observed class labels of a subset of vertices,
    infer the label probability for the unobserved vertices using the
    "label propagation" algorithm.

    The algorithm iteratively updates the label probability of each vertex as
    a weighted sum of the label probabilities of itself and its neighboring
    vertices until convergence (a small numpy sketch of one update follows
    this function). See
    :class:`graphlab.label_propagation.LabelPropagationModel` for the details
    of the algorithm.

    Parameters
    ----------
    graph : SGraph
        The graph on which to compute the label propagation.

    label_field : str
        Vertex field storing the initial vertex labels. The values must be in
        [0, num_classes). None values indicate unobserved vertex labels.

    threshold : float, optional
        Threshold for convergence, measured as the average L2 norm of the
        change in each vertex's label probability vector.

    max_iterations : int, optional
        The max number of iterations to run. Default is unlimited.
        If set, the algorithm terminates when either max_iterations
        or convergence threshold is reached.

    weight_field : str, optional
        Edge field storing the edge weight. If empty, all edges are assumed
        to have unit weight.

    self_weight : float, optional
        The weight for the self edge.

    undirected : bool, optional
        If true, treat each edge as undirected and propagate labels in both
        directions.

    _single_precision : bool, optional
        If true, run label propagation in single precision. The resulting
        probability values may be less accurate, but the algorithm should run
        faster and use less memory.

    _distributed : distributed environment, internal

    verbose : bool, optional
        If True, print progress updates.

    Returns
    -------
    out : LabelPropagationModel

    References
    ----------
    - Zhu, X., & Ghahramani, Z. (2002). `Learning from labeled and unlabeled data
      with label propagation <http://www.cs.cmu.edu/~zhuxj/pub/CMU-CALD-02-107.pdf>`_.

    Examples
    --------
    If given an :class:`~graphlab.SGraph` ``g``, we can create
    a :class:`~graphlab.label_propagation.LabelPropagationModel` as follows:

    >>> g = graphlab.load_graph('http://snap.stanford.edu/data/email-Enron.txt.gz',
    ...                         format='snap')
    # Initialize random classes for a subset of vertices
    # Leave the unobserved vertices with None label.
    >>> import random
    >>> def init_label(vid):
    ...     x = random.random()
    ...     if x < 0.2:
    ...         return 0
    ...     elif x > 0.9:
    ...         return 1
    ...     else:
    ...         return None
    >>> g.vertices['label'] = g.vertices['__id'].apply(init_label, int)
    >>> m = graphlab.label_propagation.create(g, label_field='label')

    We can obtain for each vertex the predicted label and the probability of
    each label in the graph ``g`` using:

    >>> labels = m['labels']     # SFrame
    >>> labels
    +------+-------+-----------------+-------------------+----------------+
    | __id | label | predicted_label |         P0        |       P1       |
    +------+-------+-----------------+-------------------+----------------+
    |  5   |   1   |        1        |        0.0        |      1.0       |
    |  7   |  None |        0        |    0.8213214997   |  0.1786785003  |
    |  8   |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  10  |  None |        0        |   0.534984718273  | 0.465015281727 |
    |  27  |  None |        0        |   0.752801638549  | 0.247198361451 |
    |  29  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  33  |  None |        1        | 5.96046447754e-08 | 0.999999940395 |
    |  47  |   0   |        0        |        1.0        |      0.0       |
    |  50  |  None |        0        |   0.788279032657  | 0.211720967343 |
    |  52  |  None |        0        |   0.666666666667  | 0.333333333333 |
    +------+-------+-----------------+-------------------+----------------+
    [36692 rows x 5 columns]

    See Also
    --------
    LabelPropagationModel
    """
    _mt._get_metric_tracker().track('toolkit.graph_analytics.label_propagation.create')

    _raise_error_if_not_of_type(label_field, str)
    _raise_error_if_not_of_type(weight_field, str)

    if not isinstance(graph, _SGraph):
        raise TypeError('graph input must be an SGraph object.')

    if graph.vertices[label_field].dtype() != int:
        raise TypeError('label_field %s must be integer typed.' % label_field)

    opts = {'label_field': label_field,
            'threshold': threshold,
            'weight_field': weight_field,
            'self_weight': self_weight,
            'undirected': undirected,
            'max_iterations': max_iterations,
            'single_precision': _single_precision,
            'graph': graph.__proxy__}

    distributed_context = _get_distributed_execution_environment()
    if distributed_context is None:
        params = _main.run('label_propagation', opts, verbose)
        model = params['model']
    else:
        model = _distributed_run('distributed_labelprop', opts, env=_distributed, verbose=verbose)
    return LabelPropagationModel(model)
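
# Hedged sketch of the update rule summarized in the docstring above: each
# vertex's label distribution is replaced by a weighted sum of its own
# distribution (scaled by self_weight) and its neighbors' distributions, then
# renormalized. Pure numpy, not the toolkit's implementation (which also keeps
# observed labels fixed):
#
#     >>> import numpy as np
#     >>> def propagate_once(P, W, self_weight=1.0):
#     ...     # P: (n_vertices, n_classes) label probabilities
#     ...     # W: (n_vertices, n_vertices) symmetric edge-weight matrix
#     ...     P_new = self_weight * P + W.dot(P)
#     ...     return P_new / P_new.sum(axis=1, keepdims=True)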
Example 58
    def sample(self, k, diversity=0.1, subset_ids=None, **kwargs):
        """
        After constructing a diverse sampler, sample a diverse set
        stochastically. The stochastic algorithm depends on the sampling method
        itself.

        Parameters
        ----------
        k : int
            The number of items to sample.

        diversity : double in [0, 1], optional 
            This is a tunable parameter that trades off between quality and
            diversity. A diversity factor of 0 will only consider quality when
            building a set, while a diversity factor of 1 will only consider
            item similarity and will ignore quality. A value between 0 and 1
            will force the algorithm to trade off between quality and diversity.
            Note that this keyword argument is only applicable if both quality
            and similarity features were passed to create().

            The actual effect of the diversity factor depends on the algorithm:
                - When the method is vertex cover or weighted vertex cover, the
                  diversity factor changes the number of nearest-neighbors to
                  remove when sampling an item. Specifically, the number of
                  neighbors is set to floor((N-1)/(k-1) * diversity_factor).


        subset_ids : SArray, optional
            A list of IDs to sample from. Sometimes you may wish to sample from
            only a subset of the original data - e.g., only provide a diverse
            sample of movies from a particular user's top recommendations.
            If subset_ids is empty, then the sampler will return subsets from
            the original SFrame or SGraph passed in with the data parameter used
            in create().


        **kwargs : optional
            Additional method-specific parameters for fine-tuning.

            - *greedy*: Use the greedy algorithm to generate a set. Instead of
              stochastically building a set based on a distribution, for each
              item, take the mode of the current distribution. For instance, if
              only quality features are being considered, using the greedy
              option will return the top-k items. Usually the greedy algorithm
              provides the highest-quality and most-diverse set, but for each
              set of items and algorithm, there is only one set that greedy can
              generate.


        Based on which features were given to create(), different sampling
        methods will be used. One of the four following algorithms are chosen
        based on the initial feature set.

        - *"random"*: If no quality or similarity features are given. Returns a
          completely random set of items, with no reference to item qualities or
          similarities. Note that the greedy method is undefined for a random
          sampler, so it is ignored.

        - *"quality-only"*: If only a quality feature are given. Generate a
          distribution over items based on their quality, and sample from this
          distribution. If greedy is specified, the top-k items in terms of
          quality are returned.

        - *"vertex-cover"*: If only similarity features are given. An internal
          graph is generated if an SFrame is given, and each item is connected
          to its k-nearest neighbors, where k is determined by the diversity
          factor. When an item is sampled at random, its neighbors are removed
          from the candidate set. If an SGraph is given initially, all vertices
          connected to a sampled point are removed. Note that the greedy method
          for this algorithm is undefined, so it is ignored.

        - *"weighted_vertex_cover"*: The same as vertex cover, except each
          vertex has an associated quality field. When selecting the next point,
          it is sampled from a distribution over the remaining points'
          qualities. If greedy is specified, then the next point is the point
          with the highest quality in the remaining points.

        Examples
        --------
        Sample k items directly from the reference set passed in via create()
        with the default sampling methods:

        >>> cars = graphlab.SFrame.read_csv('https://s3.amazonaws.com/dato-datasets/auto-mpg/auto-mpg.csv')
        >>> sampler = graphlab.diverse_sampler.create(data=cars, 
                                                      item_id='name', 
                                                      quality_feature='accel', 
                                                      similarity_features=['mpg', 
                                                      'displ', 
                                                      'hp', 
                                                      'weight',
                                                      'origin'])
        >>> sampler.sample(k=5)
        +-----+-----+-------+-----+--------+-------+----+--------+----------------+
        | mpg | cyl | displ |  hp | weight | accel | yr | origin |      name      |
        +-----+-----+-------+-----+--------+-------+----+--------+----------------+
        |  26 |  4  | 121.0 | 113 |  2234  |  12.5 | 70 |   2    |    bmw 2002    |
        |  18 |  6  | 232.0 | 100 |  2945  |  16.0 | 73 |   1    |   amc hornet   |
        |  24 |  4  | 116.0 |  75 |  2158  |  15.5 | 73 |   2    |   opel manta   |
        |  36 |  4  |  98.0 |  70 |  2125  |  17.3 | 82 |   1    | mercury lynx l |
        |  44 |  4  |  97.0 |  52 |  2130  |  24.6 | 82 |   2    |   vw pickup    |
        +-----+-----+-------+-----+--------+-------+----+--------+----------------+

        This method returns an SFrame (or SGraph, depending on what was used to
        create the sampler) containing the sampled items. If the diverse sampler
        was created with an SGraph, the sampler will return an SFrame containing
        the sampled vertices and their associated fields. 

        Instead of stochastic sampling, you can also force the algorithm to try
        to form the best possible set by using the greedy method:

        >>> sampler.sample(k=5, greedy=True)
        +-----+-----+-------+----+--------+-------+----+--------+-------------------------------+
        | mpg | cyl | displ | hp | weight | accel | yr | origin |              name             |
        +-----+-----+-------+----+--------+-------+----+--------+-------------------------------+
        |  19 |  4  | 120.0 | 88 |  3270  |  21.9 | 76 |   2    |          peugeot 504          |
        |  27 |  4  | 141.0 | 71 |  3190  |  24.8 | 79 |   2    |          peugeot 504          |
        |  23 |  8  | 260.0 | 90 |  3420  |  22.2 | 79 |   1    | oldsmobile cutlass salon b... |
        |  43 |  4  |  90.0 | 48 |  2335  |  23.7 | 80 |   2    |       vw dasher (diesel)      |
        |  44 |  4  |  97.0 | 52 |  2130  |  24.6 | 82 |   2    |           vw pickup           |
        +-----+-----+-------+----+--------+-------+----+--------+-------------------------------+

        In this example, two Peugeot cars were selected. Although they were
        somewhat different based on the original similarity features we
        specified, it's possible to get an even more diverse sample. To increase
        diversity, the "diversity" keyword (which can range between 0 and 1) can
        be increased. Larger values will favor reducing inter-item similarity
        (increasing diversity), while smaller values will favor high-quality
        items (decreasing diversity).

        >>> sampler.sample(k=5, diversity=0.8, greedy=True)
        +-----+-----+-------+-----+--------+-------+----+--------+-------------------------------+
        | mpg | cyl | displ |  hp | weight | accel | yr | origin |              name             |
        +-----+-----+-------+-----+--------+-------+----+--------+-------------------------------+
        |  27 |  4  |  97.0 |  60 |  1834  |  19.0 | 71 |   2    |      volkswagen model 111     |
        |  32 |  4  |  71.0 |  65 |  1836  |  21.0 | 74 |   3    |      toyota corolla 1200      |
        |  17 |  6  | 231.0 | 110 |  3907  |  21.0 | 75 |   1    |         buick century         |
        |  27 |  4  | 141.0 |  71 |  3190  |  24.8 | 79 |   2    |          peugeot 504          |
        |  23 |  8  | 260.0 |  90 |  3420  |  22.2 | 79 |   1    | oldsmobile cutlass salon b... |
        +-----+-----+-------+-----+--------+-------+----+--------+-------------------------------+

        Finally, if you want to restrict the reference set to a smaller subset,
        you can pass in a list of IDs with the "subset_ids" keyword:

        >>> ford_names = gl.SArray([n for n in cars['name'] if 'ford' in n])
        >>> sampler.sample(5, diversity=1.0, subset_ids=ford_names)
        +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
        |          name         | mpg | cyl | displ |  hp | weight | accel | yr | origin |
        +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
        | ford gran torino (sw) |  13 |  8  | 302.0 | 140 |  4294  |  16.0 | 72 |   1    |
        |     ford maverick     |  15 |  6  | 250.0 |  72 |  3158  |  19.5 | 75 |   1    |
        |      ford fiesta      |  36 |  4  |  98.0 |  66 |  1800  |  14.4 | 78 |   1    |
        |     ford escort 2h    |  29 |  4  |  98.0 |  65 |  2380  |  20.7 | 81 |   1    |
        |  ford fairmont futura |  24 |  4  | 140.0 |  92 |  2865  |  16.4 | 82 |   1    |
        +-----------------------+-----+-----+-------+-----+--------+-------+----+--------+
        """
        _raise_error_if_not_of_type(k, int)

        if subset_ids is not None:
            _raise_error_if_not_of_type(subset_ids, _gl.SArray)

        if diversity < 0.0 or diversity > 1.0:
            raise ValueError("The diversity parameter must be between 0.0 and 1.0.")

        if k <= 0:
            raise ValueError("k must be greater than 0.")

        opts = dict()
        opts["diversity"] = diversity

        if "wvc_neighbors" in kwargs.keys():
            opts["num_neighbors"] = kwargs["wvc_neighbors"]
        if "greedy" in kwargs.keys():
            opts["greedy"] = kwargs["greedy"]

        if subset_ids is None:
            return self.__proxy__.sample_from_ground_set(k, opts)
        else:
            return self.__proxy__.sample_from_id_subset(k, subset_ids, opts)
Example 59
def create(data, features=None,
           bm25_k1=1.5,
           bm25_b=0.75,
           tfidf_threshold=0.01):
    """
    Create a searchable index of text columns in an SFrame.

    .. warning::
        This toolkit is currently in beta, and feedback is welcome!
        Please send comments to [email protected].

    Parameters
    ----------
    data : SFrame
      An SFrame containing at least one str column containing text that should
      be indexed.

    features : list of str
      A list of column names that contain text that should be indexed.
      Default: all str columns in the provided dataset.

    bm25_k1 : float
      Tuning parameter for the relative importance of term frequencies when
      computing the BM25 score between a query token and a document.

    bm25_b : float
      Tuning parameter to downweight scores of long documents when
      computing the BM25 score between a query token and a document. (The BM25
      formula these parameters tune is sketched after this function.)

    tfidf_threshold : float
      Tuning parameter to skip indexing words that have a TF-IDF score below
      this value.

    query_expansion_k : int
      Maximum number of nearest words to include from query token.

    query_expansion_epsilon : float
      Maximum distance to allow between query token and nearby word when
      doing query expansion. Must be between 0 and 1.

    query_expansion_near_match_weight : float
      Multiplier to use on BM25 scores for documents indexed via an
      approximate match with a given token. Must be between 0 and 1.


    Returns
    -------
    out
       SearchModel

    See Also
    --------
    SearchModel.query

    References
    ----------

    Christopher D. Manning, Hinrich Schutze, and Prabhakar Raghavan.
    Introduction to information retrieval.
    http://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf

    Examples
    --------

    >>> import graphlab as gl
    >>> sf = gl.SFrame({'text': ['Hello my friend', 'I love this burrito']})
    >>> m = gl._internal.search.create(sf)
    >>> print m.query('burrito')

    """

    # Input validation on data and features
    if features is None:
        features = _get_str_columns(data)

    if not isinstance(features, list):
        raise ValueError("Expected features to be a list.")

    _raise_error_if_not_of_type(data, [_gl.SFrame])
    _raise_error_if_not_of_type(features, [list])
    for f in features:
        if data[f].dtype() != str:
            raise _ToolkitError("Feature `%s` must be of type str" % f)

    # Store options
    options = {}
    options['bm25_b'] = bm25_b
    options['bm25_k1'] = bm25_k1
    options['tfidf_threshold'] = tfidf_threshold

    # Construct model
    proxy = _gl.extensions._SearchIndex()
    proxy.init_options(options)
    proxy.init_indexer(data)
    for f in features:
        proxy.index(f)

    return SearchModel(proxy)
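
# Hedged sketch of the per-token Okapi BM25 score that the bm25_k1 and bm25_b
# parameters above tune (textbook form, not necessarily the toolkit's exact
# implementation):
#
#     >>> import math
#     >>> def bm25_term(tf, df, n_docs, doc_len, avg_doc_len, k1=1.5, b=0.75):
#     ...     idf = math.log((n_docs - df + 0.5) / (df + 0.5))
#     ...     norm = 1 - b + b * float(doc_len) / avg_doc_len
#     ...     return idf * tf * (k1 + 1) / (tf + k1 * norm)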
    def __init__(self, features=None, excluded_features=None,
        n=2, method="word", to_lower=True, ignore_punct=True, ignore_space=True,
        delimiters=["\r", "\v", "\n", "\f", "\t", " ",
                    "!", "#", "$", "%", "&", "'", "(", ")",
                    "*", "+", ",", "-", ".", "/", ":", ";",
                    "<", "=", ">", "?", "@", "[", "\\", "]",
                    "^", "_", "`", "{", "|", "}", "~"],
        output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(features, [list, str, _NoneType])
        _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
        _raise_error_if_not_of_type(n, [int])
        _raise_error_if_not_of_type(method, [str])
        _raise_error_if_not_of_type(to_lower, [bool])
        _raise_error_if_not_of_type(ignore_punct, [bool])
        _raise_error_if_not_of_type(ignore_space, [bool])
        _raise_error_if_not_of_type(delimiters, [list, _NoneType])
        _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

        if delimiters is not None:
            for delim in delimiters:
                _raise_error_if_not_of_type(delim, str, "delimiters")
                if (len(delim) != 1):
                    raise ValueError("Delimiters must be single-character strings")

        if n < 1:
            raise ValueError("Input 'n' must be greater than 0")

        if n > 5 and method == 'word':
            warnings.warn("It is unusual for n-grams to be of size larger than 5.")

        if method != "word" and method != "character":
            raise ValueError("Invalid 'method' input  value. Please input " +
                             "either 'word' or 'character' ")

        # Set up options
        opts = {
          'n': n,
          'features': features,
          'ngram_type': method,
          'to_lower': to_lower,
          'ignore_punct': ignore_punct,
          'ignore_space': ignore_space,
          'delimiters': delimiters,
          'output_column_prefix' : output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._NGramCounter()
        proxy.init_transformer(opts)
        super(NGramCounter, self).__init__(proxy, self.__class__)
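
# Hedged usage sketch, assuming this __init__ backs a public NGramCounter
# transformer with fit/transform (the dataset and column name are
# illustrative):
#
#     >>> import graphlab as gl
#     >>> sf = gl.SFrame({'text': ['the quick brown fox', 'the quick red fox']})
#     >>> counter = NGramCounter(features=['text'], n=2)
#     >>> counter = counter.fit(sf)
#     >>> bigrams = counter.transform(sf)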