Python _SFrame Examples, graphlab.data_structures.sframe._SFrame Python Examples

Example #1

0

Show file

File: _parallel.py Project: eb777ez/Yelp-Recommendation-System

def _combine(task):
    '''
    Task body that merges the per-parameter-set results into one SFrame per
    output, tagging every row with the parameters that produced it.

    ``task.params[_COMBINE_PARAMETER_NAME]`` is iterated as (parameters, path)
    pairs. For each pair, every output named by ``task.get_outputs()`` is
    loaded from ``path`` joined with the output name, given a constant
    'parameters' column, and appended to ``task.outputs``. An output that
    cannot be loaded (IOError) is logged and skipped.
    '''
    # Every output starts out as an empty SFrame.
    for name in task.get_outputs():
        task.outputs[name] = _SFrame()

    for param_set, result_dir in task.params[_COMBINE_PARAMETER_NAME]:
        for name in task.get_outputs():
            try:
                partial = _SFrame(_path_join(result_dir, name))
            except IOError:
                _log.info("No output for %s with parameters: %s " % (name, str(param_set)))
            else:
                # Record which parameter set produced these rows, then add
                # them to what has been accumulated so far for this output.
                row_count = len(partial)
                partial['parameters'] = _SArray.from_const(param_set, row_count)
                partial.__materialize__()
                accumulated = task.outputs[name]
                task.outputs[name] = accumulated.append(partial)

Example #2

0

Show file

File: similarity_search.py Project: Mawul4j/Machine-Learning-Course

def get_default_options():
    """
    Describe the default options of the similarity search toolkit.

    Returns
    -------
    out : SFrame
        One row per parameter, with columns holding the parameter name,
        default value, lower and upper bounds, description, and type.
    """
    names = ['method', 'feature_model', 'verbose']
    defaults = ['lsh', 'auto', 'True']
    lower_bounds = [None, None, 0]
    upper_bounds = [None, None, 1]
    descriptions = ['Method for searching reference data',
                    'Trained model for extracting features from raw data objects',
                    'Whether progress output is printed']
    parameter_types = ['string', 'model', 'boolean']

    # One column per attribute; position i in every list describes names[i].
    return _SFrame({'name': names,
                    'default_value': defaults,
                    'lower_bound': lower_bounds,
                    'upper_bound': upper_bounds,
                    'description': descriptions,
                    'parameter_type': parameter_types})

Example #3

0

Show file

File: similarity_search.py Project: divya2661/food-recommendation-engine-

def get_default_options():
    """
    Describe the default options of the similarity search toolkit.

    Returns
    -------
    out : SFrame
        One row per parameter, with columns holding the parameter name,
        default value, lower and upper bounds, description, and type.
    """
    option_names = ['method', 'feature_model', 'verbose']
    default_values = ['lsh', 'auto', 'True']
    lower = [None, None, 0]
    upper = [None, None, 1]
    help_texts = ['Method for searching reference data',
                  'Trained model for extracting features from raw data objects',
                  'Whether progress output is printed']
    kinds = ['string', 'model', 'boolean']

    # One column per attribute; position i in every list describes
    # option_names[i].
    columns = {'name': option_names,
               'default_value': default_values,
               'lower_bound': lower,
               'upper_bound': upper,
               'description': help_texts,
               'parameter_type': kinds}
    return _SFrame(columns)

Example #4

0

Show file

File: _model.py Project: ishmnnit/Yelp-Recommendation-System

    def get_default_options_for_model(output_type = 'sframe'):
        """
        Get the default options for the toolkit :class:`~graphlab.{module_name}.{python_class_name}`.

        Parameters
        ----------
        output_type : str, optional

            Format of the returned description.

            - `sframe`: A table describing each option used in the model.
            - `json`: A dictionary keyed by option name whose values are the
              parsed JSON descriptions of each option.

            | Each dictionary/row in the JSON/SFrame object describes the
              following parameters of the given model.

            +------------------+-------------------------------------------------------+
            |      Name        |                  Description                          |
            +==================+=======================================================+
            | name             | Name of the option used in the model.                 |
            +------------------+-------------------------------------------------------+
            | description      | A detailed description of the option used.            |
            +------------------+-------------------------------------------------------+
            | type             | Option type (REAL, BOOL, INTEGER or CATEGORICAL)      |
            +------------------+-------------------------------------------------------+
            | default_value    | The default value for the option.                     |
            +------------------+-------------------------------------------------------+
            | possible_values  | List of acceptable values (CATEGORICAL only)          |
            +------------------+-------------------------------------------------------+
            | lower_bound      | Smallest acceptable value for this option (REAL only) |
            +------------------+-------------------------------------------------------+
            | upper_bound      | Largest acceptable value for this option (REAL only)  |
            +------------------+-------------------------------------------------------+

        Returns
        -------
        out : JSON/SFrame
            The parsed response when ``output_type`` is 'json'; an SFrame
            for any other value.

        See Also
        --------
        graphlab.{module_name}.{python_class_name}.get_current_options

        Examples
        --------
        .. sourcecode:: python

          >>> import graphlab

          # Returns an output as an SFrame
          >>> out_sframe = graphlab.{module_name}.get_default_options()

          # Returns the output as a JSON
          >>> out_json = graphlab.{module_name}.get_default_options('json')
        """
        def _parse_int(text):
            # Integer literals that Python promotes to `long` are returned as
            # floats instead; everything else stays a plain int.
            if type(int(text)) is long:
                return float(text)
            return int(text)

        _mt._get_metric_tracker().track('toolkit.%s.get_default_options' % module_name)
        response = _gl.extensions._toolkits_get_default_options(
            unity_server_model_name)

        # Each value arrives as a JSON string; decode it in place.
        for option_name in response.keys():
            response[option_name] = json.loads(response[option_name],
                                               parse_int=_parse_int)

        if output_type == 'json':
            return response

        # One dict per option: its name plus the decoded description stored
        # under the empty key.
        rows = [{'name': option_name, '': option}
                for option_name, option in response.items()]
        packed = _SFrame(rows)
        # NOTE(review): presumably the first unpack expands each row dict into
        # a 'name' column plus an auto-named 'X1' column (from the empty key),
        # and the second expands that nested description -- confirm against
        # SFrame.unpack.
        named = packed.unpack('X1', column_name_prefix='')
        return named.unpack('X1', column_name_prefix='')

Example #5

0

Show file

File: _internal_utils.py Project: eb777ez/Yelp-Recommendation-System

from array import array as _array
import json
from graphlab.data_structures.sframe import SArray as _SArray
from graphlab.data_structures.sframe import SFrame as _SFrame
from graphlab.data_structures.sgraph import SGraph as _SGraph
from graphlab.data_structures.sgraph import Vertex as _Vertex
from graphlab.data_structures.sgraph import Edge as _Edge
from graphlab.cython.cy_sarray import UnitySArrayProxy
from graphlab.cython.cy_sframe import UnitySFrameProxy
from graphlab.cython.cy_graph import UnityGraphProxy
from graphlab.toolkits._main import ToolkitError
import logging as _logging


# For each unity proxy class, a callable that wraps a proxy instance in the
# matching SFrame / SArray / SGraph object.
_proxy_map = {
    UnitySFrameProxy: (lambda proxy: _SFrame(_proxy=proxy)),
    UnitySArrayProxy: (lambda proxy: _SArray(_proxy=proxy)),
    UnityGraphProxy: (lambda proxy: _SGraph(_proxy=proxy)),
}

def _add_docstring(format_dict):
  """
  Format a doc-string on the fly.
  @arg format_dict: A dictionary to format the doc-strings
  Example:

    @add_docstring({'context': __doc_string_context})
    def predict(x):
      '''
      {context}
        >> model.predict(data)
      '''

Example #6

0

Show file

File: topic_model.py Project: ishmnnit/Yelp-Recommendation-System

    def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0,
                   output_type='topic_probabilities'):

        """
        Get the words associated with a given topic. The score column is the
        probability of choosing that word given that you have chosen a
        particular topic.

        Parameters
        ----------
        topic_ids : list of int, optional
            The topics to retrieve words. Topic ids are zero-based.
            Every id must be non-negative and less than m['num_topics'].
            When omitted, all topics are used.

        num_words : int, optional
            The number of words to show.

        cdf_cutoff : float, optional
            Allows one to only show the most probable words whose cumulative
            probability is below this cutoff. For example if there exist
            three words where

            .. math::
               p(word_1 | topic_k) = .1

               p(word_2 | topic_k) = .2

               p(word_3 | topic_k) = .05

            then setting :math:`cdf_{cutoff}=.3` would return only
            :math:`word_1` and :math:`word_2` since
            :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

        output_type : {'topic_probabilities' | 'topic_words'}, optional
            Determine the type of desired output. See below.

        Returns
        -------
        out : SFrame
            If output_type is 'topic_probabilities', then the returned value is
            an SFrame with a column of words ranked by a column of scores for
            each topic. Otherwise, the returned value is an SFrame with a
            single column 'words', where each element is a list of the most
            probable words for each topic.

        Raises
        ------
        ValueError
            If any element of `topic_ids` is a string, or if any id is
            negative or not less than the number of topics.

        Examples
        --------
        Get the highest ranked words for all topics.

        >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs,
                                            num_iterations=50)
        >>> m.get_topics()
        +-------+----------+-----------------+
        | topic |   word   |      score      |
        +-------+----------+-----------------+
        |   0   |   cell   |  0.028974400831 |
        |   0   |  input   | 0.0259470208503 |
        |   0   |  image   | 0.0215721599763 |
        |   0   |  visual  | 0.0173635081992 |
        |   0   |  object  | 0.0172447874156 |
        |   1   | function | 0.0482834508265 |
        |   1   |  input   | 0.0456270024091 |
        |   1   |  point   | 0.0302662839454 |
        |   1   |  result  | 0.0239474934631 |
        |   1   | problem  | 0.0231750116011 |
        |  ...  |   ...    |       ...       |
        +-------+----------+-----------------+

        Get the highest ranked words for topics 0 and 1 and show 15 words per
        topic.

        >>> m.get_topics([0, 1], num_words=15)
        +-------+----------+------------------+
        | topic |   word   |      score       |
        +-------+----------+------------------+
        |   0   |   cell   |  0.028974400831  |
        |   0   |  input   | 0.0259470208503  |
        |   0   |  image   | 0.0215721599763  |
        |   0   |  visual  | 0.0173635081992  |
        |   0   |  object  | 0.0172447874156  |
        |   0   | response | 0.0139740298286  |
        |   0   |  layer   | 0.0122585145062  |
        |   0   | features | 0.0115343177265  |
        |   0   | feature  | 0.0103530459301  |
        |   0   | spatial  | 0.00823387994361 |
        |  ...  |   ...    |       ...        |
        +-------+----------+------------------+

        If one wants to instead just get the top words per topic, one may
        change the format of the output as follows.

        >>> topics = m.get_topics(output_type='topic_words')
        dtype: list
        Rows: 10
        [['cell', 'image', 'input', 'object', 'visual'],
         ['algorithm', 'data', 'learning', 'method', 'set'],
         ['function', 'input', 'point', 'problem', 'result'],
         ['model', 'output', 'pattern', 'set', 'unit'],
         ['action', 'learning', 'net', 'problem', 'system'],
         ['error', 'function', 'network', 'parameter', 'weight'],
         ['information', 'level', 'neural', 'threshold', 'weight'],
         ['control', 'field', 'model', 'network', 'neuron'],
         ['hidden', 'layer', 'system', 'training', 'vector'],
         ['component', 'distribution', 'local', 'model', 'optimal']]
        """
        _mt._get_metric_tracker().track('toolkit.text.topic_model.get_topics')

        _check_categorical_option_type('output_type', output_type,
            ['topic_probabilities', 'topic_words'])

        if topic_ids is None:
            # Wrap in list() so the isinstance check below also holds on
            # interpreters where range() does not return a list.
            topic_ids = list(range(self.get('num_topics')))

        assert isinstance(topic_ids, list), \
            "The provided topic_ids is not a list."

        if any(isinstance(x, str) for x in topic_ids):
            raise ValueError(
                "Only integer topic_ids can be used at this point in time.")

        # Check every requested id, not just one: the bound is looked up once
        # and each element is compared against it. An empty list passes.
        num_topics = self['num_topics']
        if not all(0 <= x < num_topics for x in topic_ids):
            raise ValueError(
                "Topic id values must be non-negative and less than the " +
                "number of topics used to fit the model.")

        opts = {'model': self.__proxy__,
                'topic_ids': topic_ids,
                'num_words': num_words,
                'cdf_cutoff': cdf_cutoff}
        response = _graphlab.toolkits._main.run('text_topicmodel_get_topic',
                                               opts)
        ret = _map_unity_proxy_to_object(response['top_words'])

        if output_type != 'topic_probabilities':
            # Collapse the (topic, word, score) rows into one word list per
            # topic and return it as a single 'words' column.
            sa = ret.unstack(['word','score'], 'word')['word'].dict_keys()
            ret = _SFrame({'words': sa})

        return ret

Example #7

0

Show file

File: topic_model.py Project: ishmnnit/Yelp-Recommendation-System

    def get(self, field):
        """
        Look up a single field of the model by name. The queryable fields
        are listed below; the full list is also available from the
        :py:func:`~TopicModel.list_fields` method.

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | topics                | An SFrame containing a column with the unique|
        |                       | words observed during training, and a column |
        |                       | of arrays containing the probability values  |
        |                       | for each word given each of the topics.      |
        +-----------------------+----------------------------------------------+
        | vocabulary            | An SArray containing the words used. This is |
        |                       | same as the vocabulary column in the topics  |
        |                       | field above.                                 |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : string
            Name of the field to be retrieved.

        Returns
        -------
        out
            Value of the requested field. 'vocabulary' is returned as an
            SArray and 'topics' as an SFrame; any other field is returned
            as received from the toolkit.

        See Also
        --------
        list_fields

        Examples
        --------
        >>> docs = graphlab.SArray('http://s3.amazonaws.com/GraphLab-Datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs)
        >>> m.get('topics')
        +--------------------------------+------------+
        |      topic_probabilities       | vocabulary |
        +--------------------------------+------------+
        | array('d', [0.000514752462 ... |  limited   |
        | array('d', [6.120718939647 ... |  consider  |
        | array('d', [0.000337251613 ... | represent  |
        | array('d', [0.000104664293 ... |    lack    |
        | array('d', [6.120718939647 ... | desirable  |
        | array('d', [6.120718939647 ... |   focus    |
        | array('d', [6.120718939647 ... | generaliza |
        | array('d', [6.120718939647 ... | generalize |
        | array('d', [6.120718939647 ... |    row     |
        | array('d', [6.120718939647 ... |   depend   |
        |              ...               |    ...     |
        +--------------------------------+------------+

        You may also do m['topics'].
        """

        _mt._get_metric_tracker().track('toolkit.text.topic_model.get')
        request = {'model': self.__proxy__, 'field': field}
        value = _graphlab.toolkits._main.run("text_topicmodel_get_value",
                                             request)['value']

        # These two fields come back as proxies and are wrapped before being
        # handed to the caller.
        if field == 'topics':
            return _SFrame(None, _proxy=value)
        if field == 'vocabulary':
            return _SArray(None, _proxy=value)
        return value

Example #8

0

Show file

"""

import json
from graphlab.data_structures.sframe import SArray as _SArray
from graphlab.data_structures.sframe import SFrame as _SFrame
from graphlab.data_structures.sgraph import SGraph as _SGraph
from graphlab.data_structures.sgraph import Vertex as _Vertex
from graphlab.data_structures.sgraph import Edge as _Edge
from graphlab.cython.cy_sarray import UnitySArrayProxy
from graphlab.cython.cy_sframe import UnitySFrameProxy
from graphlab.cython.cy_graph import UnityGraphProxy
from graphlab.toolkits._main import ToolkitError
import logging as _logging


# For each unity proxy class, a callable that wraps a proxy instance in the
# matching SFrame / SArray / SGraph object.
_proxy_map = {
    UnitySFrameProxy: (lambda proxy: _SFrame(_proxy=proxy)),
    UnitySArrayProxy: (lambda proxy: _SArray(_proxy=proxy)),
    UnityGraphProxy: (lambda proxy: _SGraph(_proxy=proxy)),
}

def _add_docstring(format_dict):
  """
  Format a doc-string on the fly.
  @arg format_dict: A dictionary to format the doc-strings
  Example:

    @add_docstring({'context': __doc_string_context})
    def predict(x):
      '''
      {context}
        >> model.predict(data)
      '''

Example #9

0

Show file

File: _nearest_neighbors.py Project: Mawul4j/Machine-Learning-Course

    def similarity_graph(self, k=5, radius=None, include_self_edges=False,
                         output_type='SGraph', verbose=True):
        """
        Build the similarity graph over the reference dataset already stored
        in the model. Conceptually this resembles calling `query` with the
        reference set itself, but this method is optimized for that purpose,
        is syntactically simpler, and by default leaves out self-edges.

        Parameters
        ----------
        k : int, optional
            Maximum number of neighbors to return for each point in the dataset.
            Setting this to ``None`` deactivates the constraint, so that all
            neighbors are returned within ``radius`` of a given point.

        radius : float, optional
            For a given point, only neighbors within this distance are returned.
            The default is ``None``, in which case the ``k`` nearest neighbors
            are returned for each query point, regardless of distance.

        include_self_edges : bool, optional
            For most distance functions, each point in the model's reference
            dataset is its own nearest neighbor. If this parameter is set to
            False, this result is ignored, and the nearest neighbors are
            returned *excluding* the point itself.

        output_type : {'SGraph', 'SFrame'}, optional
            By default, the results are returned in the form of an SGraph, where
            each point in the reference dataset is a vertex and an edge A -> B
            indicates that vertex B is a nearest neighbor of vertex A. If
            'output_type' is set to 'SFrame', the output is in the same form as
            the results of the 'query' method: an SFrame with columns indicating
            the query label (in this case the query data is the same as the
            reference data), reference label, distance between the two points,
            and the rank of the neighbor.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame or SGraph
            The type of the output object depends on the 'output_type'
            parameter. See the parameter description for more detail.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, or ``radius`` is not a
            non-negative integer or float (``None`` is accepted for both).

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each data point is
          matched to the entire dataset. If the reference dataset has
          :math:`n` rows, the output is an SFrame with :math:`n^2` rows (or an
          SGraph with :math:`n^2` edges).

        - For models created with the 'lsh' method, the output similarity graph
          may have fewer vertices than there are data points in the original
          reference set. Because LSH is an approximate method, a query point may
          have fewer than 'k' neighbors. If LSH returns no neighbors at all for
          a query and self-edges are excluded, the query point is omitted from
          the results.

        Examples
        --------
        First construct an SFrame and create a nearest neighbors model:

        >>> sf = graphlab.SFrame({'x1': [0.98, 0.62, 0.11],
        ...                       'x2': [0.69, 0.58, 0.36]})
        ...
        >>> model = graphlab.nearest_neighbors.create(sf, distance='euclidean')

        Unlike the ``query`` method, there is no need for a second dataset with
        ``similarity_graph``.

        >>> g = model.similarity_graph(k=1)  # an SGraph
        >>> g.show()
        >>> g.edges
        +----------+----------+----------------+------+
        | __src_id | __dst_id |    distance    | rank |
        +----------+----------+----------------+------+
        |    0     |    1     | 0.376430604494 |  1   |
        |    2     |    1     | 0.55542776308  |  1   |
        |    1     |    0     | 0.376430604494 |  1   |
        +----------+----------+----------------+------+
        """
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors.similarity_graph')

        ## 'k': validate when given, otherwise pass the special value -1 to
        ## indicate 'None'.
        if k is None:
            k = -1
        else:
            if not isinstance(k, int):
                raise ValueError("Input 'k' must be an integer.")
            if k <= 0:
                raise ValueError("Input 'k' must be larger than 0.")

        ## 'radius': same treatment, with -1.0 as the special value.
        if radius is None:
            radius = -1.0
        else:
            if not isinstance(radius, (int, float)):
                raise ValueError("Input 'radius' must be an integer or float.")
            if radius < 0:
                raise ValueError("Input 'radius' must be non-negative.")

        toolkit_args = {'model': self.__proxy__,
                        'model_name': self.__name__,
                        'k': k,
                        'radius': radius,
                        'include_self_edges': include_self_edges}
        result = _graphlab.toolkits._main.run(
            '_nearest_neighbors.similarity_graph', toolkit_args, verbose)

        neighbors = _SFrame(None, _proxy=result['neighbors'])
        if output_type == "SFrame":
            return neighbors

        # Any other output_type yields a graph whose edges run from the
        # query label to the reference label.
        return _SGraph(edges=neighbors, src_field='query_label',
                       dst_field='reference_label')

Example #10

0

Show file

File: topic_model.py Project: divya2661/food-recommendation-engine-

    def get(self, field):
        """
        Look up a single field of the model by name. The queryable fields
        are listed below; the full list is also available from the
        :py:func:`~TopicModel.list_fields` method.

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | topics                | An SFrame containing a column with the unique|
        |                       | words observed during training, and a column |
        |                       | of arrays containing the probability values  |
        |                       | for each word given each of the topics.      |
        +-----------------------+----------------------------------------------+
        | vocabulary            | An SArray containing the words used. This is |
        |                       | same as the vocabulary column in the topics  |
        |                       | field above.                                 |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : string
            Name of the field to be retrieved.

        Returns
        -------
        out
            Value of the requested field. 'vocabulary' is returned as an
            SArray and 'topics' as an SFrame; any other field is returned
            as received from the toolkit.

        See Also
        --------
        list_fields

        Examples
        --------
        >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs)
        >>> m.get('topics')
        +--------------------------------+------------+
        |      topic_probabilities       | vocabulary |
        +--------------------------------+------------+
        | array('d', [0.000514752462 ... |  limited   |
        | array('d', [6.120718939647 ... |  consider  |
        | array('d', [0.000337251613 ... | represent  |
        | array('d', [0.000104664293 ... |    lack    |
        | array('d', [6.120718939647 ... | desirable  |
        | array('d', [6.120718939647 ... |   focus    |
        | array('d', [6.120718939647 ... | generaliza |
        | array('d', [6.120718939647 ... | generalize |
        | array('d', [6.120718939647 ... |    row     |
        | array('d', [6.120718939647 ... |   depend   |
        |              ...               |    ...     |
        +--------------------------------+------------+

        You may also do m['topics'].
        """

        request = {'model': self.__proxy__, 'field': field}
        value = _graphlab.toolkits._main.run("text_topicmodel_get_value",
                                             request)['value']

        # These two fields come back as proxies and are wrapped before being
        # handed to the caller.
        if field == 'topics':
            return _SFrame(None, _proxy=value)
        if field == 'vocabulary':
            return _SArray(None, _proxy=value)
        return value

Example #11

0

Show file

File: kmeans.py Project: divya2661/food-recommendation-engine-

    def get(self, field):
        """
        Look up a single field of the model by name.

        The queryable fields are listed below; the full list is also
        available from the ``list_fields`` method.

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | batch_size            | Number of randomly chosen examples to use in |
        |                       | each training iteration.                     |
        +-----------------------+----------------------------------------------+
        | cluster_id            | Cluster assignment for each data point and   |
        |                       | Euclidean distance to the cluster center     |
        +-----------------------+----------------------------------------------+
        | cluster_info          | Cluster centers, sum of squared Euclidean    |
        |                       | distances from each cluster member to the    |
        |                       | assigned center, and the number of data      |
        |                       | points belonging to the cluster              |
        +-----------------------+----------------------------------------------+
        | features              | Names of feature columns                     |
        +-----------------------+----------------------------------------------+
        | max_iterations        | Maximum number of iterations to perform      |
        +-----------------------+----------------------------------------------+
        | method                | Algorithm used to train the model.           |
        +-----------------------+----------------------------------------------+
        | num_clusters          | Number of clusters                           |
        +-----------------------+----------------------------------------------+
        | num_examples          | Number of examples in the dataset            |
        +-----------------------+----------------------------------------------+
        | num_features          | Number of feature columns used               |
        +-----------------------+----------------------------------------------+
        | num_unpacked_features | Number of features unpacked from the         |
        |                       | feature columns                              |
        +-----------------------+----------------------------------------------+
        | training_iterations   | Total number of iterations performed         |
        +-----------------------+----------------------------------------------+
        | training_time         | Total time taken to cluster the data         |
        +-----------------------+----------------------------------------------+
        | unpacked_features     | Names of features unpacked from the          |
        |                       | feature columns                              |
        +-----------------------+----------------------------------------------+
        | verbose               | True if model training should print progress |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : str
            The name of the field to query.

        Returns
        -------
        out
            Value of the requested field. 'cluster_id' and 'cluster_info'
            are returned as SFrames; any other field is returned as received
            from the toolkit.

        See Also
        --------
        list_fields

        Examples
        --------

        >>> model.get("cluster_info")
                d1        d2        d3        d4    sum_squared_distance  size
        0 -0.777484  1.048897  0.523926  0.487775       2.459470           4
        1  0.844906 -0.613151 -0.088785 -0.212908       3.651614           5
        2 -1.114592 -1.129836 -1.651781 -0.886557       0.000000           1

        [3 rows x 6 columns]
        """

        request = {'model': self.__proxy__,
                   'model_name': self.__name__,
                   'field': field}
        value = _gl.toolkits._main.run('kmeans_get_value', request)['value']

        # Everything except the two SFrame-valued fields is returned exactly
        # as the toolkit delivered it.
        if field not in ('cluster_id', 'cluster_info'):
            return value

        # cluster_id and cluster_info arrive as unity SFrames; cast them.
        return _SFrame(None, _proxy=value)

Example #12

0

Show file

File: _nearest_neighbors.py Project: divya2661/food-recommendation-engine-

    def query(self, dataset, label=None, k=5, radius=None, verbose=True):
        """
        Find, for every row of 'dataset', its nearest neighbors among the
        reference data stored in the model.

        The query data does not have to be the data the model was built from.
        To search the reference data against itself, see ``similarity_graph``,
        whose 'include_self_edges' option controls whether a point may be
        matched to itself.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~graphlab.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        label : str, optional
            Name of the column of 'dataset' that holds the row labels. The
            column must contain integers or strings and must not be one of
            the model's feature columns. When omitted, row numbers identify
            the query rows in the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation; must be larger than 0. The default is 5
            neighbors. Setting it to ``None`` returns all neighbors within
            ``radius`` of the query point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned; must be non-negative. The default is ``None``,
            in which case the ``k`` nearest neighbors are returned for each
            query point, regardless of distance.

        verbose: bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the label of the
            query observation, the second is the label of the nearby reference
            observation, the third is the distance between the query and
            reference observations, and the fourth is the rank of the reference
            observation among the query's k-nearest neighbors.

        Raises
        ------
        ValueError
            If 'label' is not a column of 'dataset' or is one of the model's
            features, if 'k' is not a positive integer, or if 'radius' is not
            a non-negative number.

        TypeError
            If the label column holds anything other than integers or strings.

        See Also
        --------
        similarity_graph

        Notes
        -----
        - The `dataset` input to this method *can* have missing values (in
          contrast to the reference dataset used to create the nearest
          neighbors model). Missing numeric values are imputed to be the mean
          of the corresponding feature in the reference dataset, and missing
          strings are imputed to be empty strings.

        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output
          is an SFrame with :math:`nm` rows.

        - For models created with the 'lsh' method, the query results may have
          fewer query labels than input query points. Because LSH is an
          approximate method, a query point may have fewer than 'k' neighbors.
          If LSH returns no neighbors at all for a query, the query point is
          omitted from the results.

        Examples
        --------
        First construct a toy SFrame and create a nearest neighbors model:

        >>> sf = graphlab.SFrame({'label': range(3),
        ...                       'feature1': [0.98, 0.62, 0.11],
        ...                       'feature2': [0.69, 0.58, 0.36]})
        >>> model = graphlab.nearest_neighbors.create(sf, 'label')

        A new SFrame contains query observations with same schema as the
        reference SFrame. This SFrame is passed to the ``query`` method.

        >>> queries = graphlab.SFrame({'label': range(3),
        ...                            'feature1': [0.05, 0.61, 0.99],
        ...                            'feature2': [0.06, 0.97, 0.86]})
        >>> model.query(queries, 'label', k=2)
        +-------------+-----------------+----------------+------+
        | query_label | reference_label |    distance    | rank |
        +-------------+-----------------+----------------+------+
        |      0      |        2        | 0.305941170816 |  1   |
        |      0      |        1        | 0.771556867638 |  2   |
        |      1      |        1        | 0.390128184063 |  1   |
        |      1      |        0        | 0.464004310325 |  2   |
        |      2      |        0        | 0.170293863659 |  1   |
        |      2      |        1        | 0.464004310325 |  2   |
        +-------------+-----------------+----------------+------+
        """

        _mt._get_metric_tracker().track('toolkit.nearest_neighbors.query')

        ## The query data must be a non-empty SFrame.
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Restrict the query data to the columns the model uses as features.
        ref_features = self.get('features')
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        ## Work out the labels that identify each query row.
        if label is not None:
            if label not in dataset.column_names():
                raise ValueError(
                    "Input 'label' must be a string matching the name of a "
                    "column in the reference SFrame 'dataset'.")

            if dataset[label].dtype() not in (str, int):
                raise TypeError(
                    "The label column must contain integers or strings.")

            if label in ref_features:
                raise ValueError(
                    "The label column cannot be one of the features.")

            query_labels = dataset[label]

        else:
            # No label column given: fall back to a sequence of row numbers.
            query_labels = _graphlab.SArray.from_sequence(len(dataset))

        ## Check 'k'; the toolkit call receives -1 in place of ``None``.
        if k is None:
            k = -1
        elif not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")
        elif k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

        ## Check 'radius'; the toolkit call receives -1.0 in place of ``None``.
        if radius is None:
            radius = -1.0
        elif not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")
        elif radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

        query_opts = {'model': self.__proxy__,
                      'model_name': self.__name__,
                      'features': sf_features,
                      'query_labels': query_labels,
                      'k': k,
                      'radius': radius}

        response = _graphlab.toolkits._main.run('_nearest_neighbors.query',
                                                query_opts, verbose)

        # Wrap the returned proxy in a user-facing SFrame.
        return _SFrame(None, _proxy=response['neighbors'])

Example #13

0

Show file

File: _nearest_neighbors.py Project: divya2661/food-recommendation-engine-

    def similarity_graph(self,
                         k=5,
                         radius=None,
                         include_self_edges=False,
                         output_type='SGraph',
                         verbose=True):
        """
        Build the similarity graph of the reference dataset already stored in
        the model. Conceptually this resembles calling `query` with the
        reference set itself, but this method is optimized for the purpose,
        needs no second dataset, and by default leaves out self-edges.

        Parameters
        ----------
        k : int, optional
            Maximum number of neighbors to return for each point in the
            dataset; must be larger than 0. Setting this to ``None``
            deactivates the constraint, so that all neighbors are returned
            within ``radius`` of a given point.

        radius : float, optional
            For a given point, only neighbors within this distance are
            returned; must be non-negative. The default is ``None``, in which
            case the ``k`` nearest neighbors are returned for each query
            point, regardless of distance.

        include_self_edges : bool, optional
            For most distance functions, each point in the model's reference
            dataset is its own nearest neighbor. If this parameter is set to
            False, this result is ignored, and the nearest neighbors are
            returned *excluding* the point itself.

        output_type : {'SGraph', 'SFrame'}, optional
            By default, the results are returned in the form of an SGraph,
            where each point in the reference dataset is a vertex and an edge A
            -> B indicates that vertex B is a nearest neighbor of vertex A. If
            'output_type' is set to 'SFrame', the output is in the same form as
            the results of the 'query' method: an SFrame with columns
            indicating the query label (in this case the query data is the same
            as the reference data), reference label, distance between the two
            points, and the rank of the neighbor. Any value other than
            'SFrame' produces an SGraph.

        verbose : bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame or SGraph
            The type of the output object depends on the 'output_type'
            parameter. See the parameter description for more detail.

        Raises
        ------
        ValueError
            If 'k' is not a positive integer or 'radius' is not a non-negative
            number.

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each data point is
          matched to the entire dataset. If the reference dataset has
          :math:`n` rows, the output is an SFrame with :math:`n^2` rows (or an
          SGraph with :math:`n^2` edges).

        - For models created with the 'lsh' method, the output similarity graph
          may have fewer vertices than there are data points in the original
          reference set. Because LSH is an approximate method, a query point
          may have fewer than 'k' neighbors. If LSH returns no neighbors at all
          for a query and self-edges are excluded, the query point is omitted
          from the results.

        Examples
        --------
        First construct an SFrame and create a nearest neighbors model:

        >>> sf = graphlab.SFrame({'x1': [0.98, 0.62, 0.11],
        ...                       'x2': [0.69, 0.58, 0.36]})
        ...
        >>> model = graphlab.nearest_neighbors.create(sf, distance='euclidean')

        Unlike the ``query`` method, there is no need for a second dataset with
        ``similarity_graph``.

        >>> g = model.similarity_graph(k=1)  # an SGraph
        >>> g.show()
        >>> g.edges
        +----------+----------+----------------+------+
        | __src_id | __dst_id |    distance    | rank |
        +----------+----------+----------------+------+
        |    0     |    1     | 0.376430604494 |  1   |
        |    2     |    1     | 0.55542776308  |  1   |
        |    1     |    0     | 0.376430604494 |  1   |
        +----------+----------+----------------+------+
        """
        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors.similarity_graph')

        ## Check 'k'; the toolkit call receives -1 in place of ``None``.
        if k is None:
            k = -1
        elif not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")
        elif k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

        ## Check 'radius'; the toolkit call receives -1.0 in place of ``None``.
        if radius is None:
            radius = -1.0
        elif not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")
        elif radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

        graph_opts = {'model': self.__proxy__,
                      'model_name': self.__name__,
                      'k': k,
                      'radius': radius,
                      'include_self_edges': include_self_edges}

        response = _graphlab.toolkits._main.run(
            '_nearest_neighbors.similarity_graph', graph_opts, verbose)

        neighbors = _SFrame(None, _proxy=response['neighbors'])

        # The tabular form is returned only on an exact 'SFrame' request.
        if output_type == "SFrame":
            return neighbors

        # Otherwise each neighbor row becomes an edge query -> reference.
        return _SGraph(edges=neighbors,
                       src_field='query_label',
                       dst_field='reference_label')

Example #14

0

Show file

File: _nearest_neighbors.py Project: eb777ez/Yelp-Recommendation-System

    def query(self, dataset, label=None, k=5, radius=None, verbose=True):
        """
        Retrieve the nearest neighbors from the reference set for each element
        of the query set. The query SFrame must include columns with the same
        names as the feature columns used to create the
        NearestNeighborsModel.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~graphlab.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        label : string, optional
            Name of the query SFrame column with row labels. The column must
            contain integers or strings and must not be one of the model's
            feature columns. If 'label' is not specified, row numbers are used
            to identify query dataset rows in the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation; must be larger than 0. The default is 5
            neighbors, but setting it to ``None`` will return all neighbors
            within ``radius`` of the query point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned; must be non-negative. The default is ``None``,
            in which case the ``k`` nearest neighbors are returned for each
            query point, regardless of distance.

        verbose: bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the label of the
            query observation, the second is the label of the nearby reference
            observation, the third is the distance between the query and
            reference observations, and the fourth is the rank of the reference
            observation among the query's k-nearest neighbors.

        Raises
        ------
        ValueError
            If 'label' is not a column of 'dataset' or is one of the model's
            features, if 'k' is not a positive integer, or if 'radius' is not
            a non-negative number.

        TypeError
            If the label column holds anything other than integers or strings.

        Notes
        -----
        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output is
          an SFrame with :math:`nm` rows.

        Examples
        --------
        First construct a toy SFrame and create a nearest neighbors model:

        >>> sf = graphlab.SFrame({'label': range(3),
        ...                       'feature1': [0.98, 0.62, 0.11],
        ...                       'feature2': [0.69, 0.58, 0.36]})
        >>> model = graphlab.nearest_neighbors.create(sf, 'label')

        A new SFrame contains query observations with same schema as the
        reference SFrame. This SFrame is passed to the ``query`` method.

        >>> queries = graphlab.SFrame({'label': range(3),
        ...                            'feature1': [0.05, 0.61, 0.99],
        ...                            'feature2': [0.06, 0.97, 0.86]})
        >>> model.query(queries, 'label', k=2)
        +-------------+-----------------+----------------+------+
        | query_label | reference_label |    distance    | rank |
        +-------------+-----------------+----------------+------+
        |      0      |        2        | 0.305941170816 |  1   |
        |      0      |        1        | 0.771556867638 |  2   |
        |      1      |        1        | 0.390128184063 |  1   |
        |      1      |        0        | 0.464004310325 |  2   |
        |      2      |        0        | 0.170293863659 |  1   |
        |      2      |        1        | 0.464004310325 |  2   |
        +-------------+-----------------+----------------+------+
        """

        _mt._get_metric_tracker().track(
            'toolkit.nearest_neighbors.query')

        ## Validate the 'dataset' input
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        ## Get model features
        ref_features = self.get('features')
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        ## Validate and preprocess the 'label' input. Either way the labels end
        ## up in a one-column SFrame named after the model's own label column.
        ref_label = self.get('label')

        if label is None:
            # No label given: number the rows, split that column off as the
            # label SFrame, then drop it again from the feature SFrame.
            sf_features = sf_features.add_row_number(column_name=ref_label)
            sf_label = sf_features[[ref_label]]
            sf_features.remove_column(ref_label)

        else:
            if label not in dataset.column_names():
                raise ValueError(
                    "Input 'label' must be a string matching the name of a "
                    "column in the reference SFrame 'dataset'.")

            if not dataset[label].dtype() == str and not dataset[label].dtype() == int:
                raise TypeError("The label column must contain integers or strings.")

            if label in ref_features:
                raise ValueError("The label column cannot be one of the features.")

            sf_label = _tkutl._toolkits_select_columns(dataset, [label])

            if label != ref_label:
                sf_label.rename({label: ref_label})

        ## Validate neighborhood parameters 'k' and 'radius'
        if k is not None:
            if not isinstance(k, int):
                raise ValueError("Input 'k' must be an integer.")

            if k <= 0:
                raise ValueError("Input 'k' must be larger than 0.")

        if radius is not None:
            if not isinstance(radius, (int, float)):
                raise ValueError("Input 'radius' must be an integer or float.")

            if radius < 0:
                raise ValueError("Input 'radius' must be non-negative.")

        ## Set k and radius to special values to indicate 'None'
        if k is None:
            k = -1

        if radius is None:
            radius = -1.0

        opts = {'model': self.__proxy__,
                'model_name': self.__name__,
                'features': sf_features,
                'label': sf_label,
                'k': k,
                'radius': radius}

        if verbose is True:
            # Single-argument parenthesised form: prints the same text under
            # the Python 2 print statement and the Python 3 print function.
            print("Starting model querying...")

        result = _graphlab.toolkits._main.run('_nearest_neighbors.query', opts,
                                              verbose)
        return _SFrame(None, _proxy=result['neighbors'])

Example #15

0

Show file

File: vowpal_wabbit.py Project: eb777ez/Yelp-Recommendation-System

def create(dataset, target,
           loss_function='squared',
           quadratic=[],
           l1_penalty=0.0, l2_penalty=0.0,
           bigram=False,
           step_size=0.5, num_bits=18, verbose=False,
           max_iterations=1,
           command_line_args=''):
    r"""
    create(dataset, target, loss_function='squared', quadratic=list(),
    l1_penalty=0.0, l2_penalty=0.0, bigram=False, step_size=0.5, num_bits=18,
    verbose=False, max_iterations=1, command_line_args='')

    Learn a large linear model using Vowpal Wabbit.

    Parameters
    ----------
    dataset : SFrame
        A data set. Due to the way Vowpal Wabbit creates features from each
        entry, ':' and '|' characters are not allowed in any columns containing
        strings. Each row of the dataset is translated into a string and passed
        to Vowpal Wabbit. Currently, the upper bound on the size of the string
        is 1MB. Based on the type of the SArray column, the values are passed in
        the following ways.

        - *integer* or *float*: the value is passed directly to VW.

        - *str*: the name of the column is used as the namespace, followed by
          the entire string.

        - *dict*: the name of the column is used as the namespace, and each
          key-value pair is a feature. The keys of the dictionary must be string
          or numeric and the values must be numeric (integer or float).

        - *array*: the name of the column is used as the namespace, the index of
          the array element is used as the name of the feature, and only numeric
          elements in the array are passed to VW.

        - *list (recursive type)*: the name of the column is used as the
          namespace, the index of the list element is used as the name of the
          feature, and currently only numeric elements (integer or float) are
          passed to VW.

        See the `VW input format guidelines
        <https://github.com/JohnLangford/vowpal_wabbit/wiki/Input-format>`_ for
        more details.

    target : string
        The name of the column in ``dataset`` that is the prediction target.
        This column must have a numeric type.

    loss_function : {'squared', 'hinge', 'logistic', 'quantile'}, optional
        This defines the `loss function
        <http://en.wikipedia.org/wiki/Loss_function>`_ used during optimization.
        Typical choices:

        - *real-valued target*: `squared error loss
          <http://en.wikipedia.org/wiki/Mean_squared_error>`_.

        - *binary target*: `logistic
          <http://en.wikipedia.org/wiki/Logistic_regression>`_. The target
          column must only contain -1 or 1.

        The `hinge loss <http://en.wikipedia.org/wiki/Hinge_loss>`_ is also used
        for classification, while `quantile loss
        <http://en.wikipedia.org/wiki/Quantile_regression>`_ can be good when
        one aims to predict quantities other than the mean.

    quadratic : list of pairs, optional
        This will add `interaction terms
        <http://en.wikipedia.org/wiki/Interaction_(statistics)>`_ to a linear
        model between a pair of columns. Quadratic terms add a parameter in the
        model for the product of two features, i.e. if we include an interaction
        between :math:`x_1` and :math:`x_2`, we can add a parameter :math:`b_3`.

            .. math:: y_i =  a + b_1 * x_{i1} + b_2 * x_{i2} + b_3 * x_{i1} * x_{i2}

        Multiple quadratic terms can be added by including multiple pairs, e.g.
        ``quadratic = [('a', 'b'), ('b', 'c')]`` would add interaction terms
        between columns names 'a' and 'b' as well as terms for interactions
        between 'b' and 'c'. Including ':' as one of the items in the pairs is a
        shortcut for adding quadratic terms for all pairs of features. Due to
        Vowpal Wabbit's implementation, quadratic terms are determined by the
        first letter of the column name.

    l1_penalty : float, optional
        This defines how strongly you want to keep parameters to be zero.

    l2_penalty : float, optional
        This defines how strongly you want to keep parameters near zero.
        Specifically it adds a penalty of :math:`.5 * \lambda * |w|_2^2` to the
        weight vector w, where lambda is the provided regularization value.

    bigram : bool, optional
        Add bigram features. For columns containing the text "my name is bob"
        this will add bigram features for "my name", "name is", "is bob".

    step_size : float, optional
        Set the learning rate for online learning.

    num_bits : int, optional
        Forwarded unchanged to the toolkit as the ``num_bits`` option.
        Presumably the number of bits of Vowpal Wabbit's feature hash table
        (VW's ``-b`` flag) -- TODO confirm against the VW documentation.

    verbose : bool, optional
        Print first 10 rows as they are seen by VowpalWabbit.
        This is useful for debugging.

    max_iterations : int, optional
        Number of passes to take over the data set.

    command_line_args : string, optional
        Additional arguments to pass to Vowpal Wabbit, just as one would use
        when using VW via the command line.

    Returns
    -------
    out : VowpalWabbitModel
        A model that can be used for predicting new cases. Its
        ``training_accuracy`` field is set when ``loss_function`` is
        'logistic'; otherwise its ``training_rmse`` field is set.

    Raises
    ------
    TypeError
        If ``dataset`` is not an SFrame, or if ``loss_function`` is 'logistic'
        and the target column holds values other than 1 and -1.

    AssertionError
        If ``target`` is not the name of a column of ``dataset``.

    See Also
    --------
    VowpalWabbitModel.predict, VowpalWabbitModel.evaluate

    Notes
    -----
    - Other desired command line arguments can be provided manually through the
      command_line_args keyword argument. See the `VW documentation
      <http://github.com/JohnLangford/vowpal_wabbit/wiki/Command-line-arguments>`_
      for more details.

    - Several Vowpal Wabbit features are not yet supported, including importance
      weighted learning.

    Examples
    --------
    >>> data =  graphlab.SFrame('http://s3.amazonaws.com/GraphLab-Datasets/regression/houses.csv')
    >>> data['price'] = data['price'].apply(lambda x: 1 if x > 30000 else -1)
    >>> m = graphlab.vowpal_wabbit.create(data, 'price')

    To add quadratic terms between 'user' and 'movie' columns:

    >>> m = graphlab.vowpal_wabbit.create(sf, 'rating', quadratic=[('user', 'movie')])

    If a column contains text, each space-separated word is used as a
    unique feature. Often times it is useful to also include bigrams as
    features. This can be done easily with the ``bigram`` argument:

    >>> m = graphlab.vowpal_wabbit.create(sf, 'rating', bigram=True)
    """

    _mt._get_metric_tracker().track('toolkit.vowpal_wabbit.create')

    if not (isinstance(dataset, _SFrame)):
        raise TypeError("Input 'dataset' must be an SFrame")

    # Instances of SFrame subclasses are converted to a plain SFrame.
    if type(dataset) != _SFrame:
        dataset = _SFrame(dataset)

    # Raised explicitly (instead of via ``assert``) so the check still runs
    # under ``python -O``; the exception type is kept for existing callers.
    if target not in dataset.column_names():
        raise AssertionError("No target provided.")

    # One ' -q XY' flag per pair; VW uses the first letter of each column
    # name to describe its namespace. ``quadratic`` is only read, never
    # mutated, so the shared default list is harmless here.
    quadratic_command = ''.join(
        ' -q ' + feature_a[0] + feature_b[0]
        for (feature_a, feature_b) in quadratic)

    opts = {'verbose': verbose,
            'target': target,
            'loss_function': loss_function,
            'quadratic': quadratic_command,
            'step_size': step_size,
            'l1_penalty': l1_penalty,
            'l2_penalty': l2_penalty,
            'num_bits' : num_bits,
            'max_iterations': max_iterations,
            'bigram': bigram,
            'extra_command_line_args': command_line_args}

    # Initialize the model with basic parameters
    response = _graphlab.toolkits._main.run("vw_init", opts)
    m = VowpalWabbitModel(response['model'])

    # Train the model on the given data set and retrieve predictions
    opts = {'model': m.__proxy__,
            'data': dataset}
    response = _graphlab.toolkits._main.run("vw_train", opts)
    m = VowpalWabbitModel(response['model'])

    yhat = _SArray(None, _proxy=response['predictions'])

    # Evaluate the model on its own training data
    y = dataset[target]

    if loss_function == 'logistic':
        is_one_or_neg_one = y.apply(lambda x: x == 1 or x == -1)
        if not all(is_one_or_neg_one):
            raise TypeError('When using `logistic` as a loss function, the target column must contain only 1\'s and -1\'s.')
        # Map the -1/1 targets onto 0/1 before computing accuracy.
        y = y.apply(lambda x: int(x*.5 + .5))
        m = m._set('training_accuracy', _graphlab.evaluation.accuracy(y, yhat))
    else:
        m = m._set('training_rmse', _graphlab.evaluation.rmse(y, yhat))
    return m

Example #16

0

Show file

File: kmeans.py Project: vandosant/flask-spike

    def get(self, field):
        """
        Look up a single field of the model and return its value.

        The queryable fields are listed below; the same list can be obtained
        with the ``list_fields`` method.

        +-----------------------+----------------------------------------------+
        |      Field            | Description                                  |
        +=======================+==============================================+
        | batch_size            | Number of randomly chosen examples to use in |
        |                       | each training iteration.                     |
        +-----------------------+----------------------------------------------+
        | cluster_id            | Cluster assignment for each data point and   |
        |                       | Euclidean distance to the cluster center     |
        +-----------------------+----------------------------------------------+
        | cluster_info          | Cluster centers, sum of squared Euclidean    |
        |                       | distances from each cluster member to the    |
        |                       | assigned center, and the number of data      |
        |                       | points belonging to the cluster              |
        +-----------------------+----------------------------------------------+
        | features              | Names of feature columns                     |
        +-----------------------+----------------------------------------------+
        | max_iterations        | Maximum number of iterations to perform      |
        +-----------------------+----------------------------------------------+
        | method                | Algorithm used to train the model.           |
        +-----------------------+----------------------------------------------+
        | num_clusters          | Number of clusters                           |
        +-----------------------+----------------------------------------------+
        | num_examples          | Number of examples in the dataset            |
        +-----------------------+----------------------------------------------+
        | num_features          | Number of feature columns used               |
        +-----------------------+----------------------------------------------+
        | num_unpacked_features | Number of features unpacked from the         |
        |                       | feature columns                              |
        +-----------------------+----------------------------------------------+
        | training_iterations   | Total number of iterations performed         |
        +-----------------------+----------------------------------------------+
        | training_time         | Total time taken to cluster the data         |
        +-----------------------+----------------------------------------------+
        | unpacked_features     | Names of features unpacked from the          |
        |                       | feature columns                              |
        +-----------------------+----------------------------------------------+
        | verbose               | True if model training should print progress |
        +-----------------------+----------------------------------------------+

        Parameters
        ----------
        field : str
            The name of the field to query.

        Returns
        -------
        out
            Value of the requested field. The 'cluster_id' and 'cluster_info'
            fields are returned as SFrames; every other field is returned as
            received from the toolkit.

        See Also
        --------
        list_fields

        Examples
        --------

        >>> model.get("cluster_info")
                d1        d2        d3        d4    sum_squared_distance  size
        0 -0.777484  1.048897  0.523926  0.487775       2.459470           4
        1  0.844906 -0.613151 -0.088785 -0.212908       3.651614           5
        2 -1.114592 -1.129836 -1.651781 -0.886557       0.000000           1

        [3 rows x 6 columns]
        """

        _mt._get_metric_tracker().track('toolkit.kmeans.get')

        request = {'model': self.__proxy__,
                   'model_name': self.__name__,
                   'field': field}
        value = _graphlab.toolkits._main.run('kmeans_get_value',
                                             request)['value']

        # These two fields come back as a unity SFrame proxy and have to be
        # wrapped in a user-facing SFrame.
        if field in ('cluster_id', 'cluster_info'):
            return _SFrame(None, _proxy=value)

        return value

Example #17

0

Show file

File: _model.py Project: zhmz90/Dato-Core

    def get_default_options_for_model(output_type='sframe'):
        """
        Get the default options for the toolkit
        :class:`~graphlab.{module_name}.{python_class_name}`.

        Parameters
        ----------
        output_type : str, optional

            The output can be of the following types.

            - `sframe`: A table describing each option used in the model.
            - `json`: The decoded JSON description of each option, keyed by
              option name.

            Any value other than `json` produces the SFrame form.

            | Each dictionary/row in the JSON/SFrame object describes the
              following parameters of the given model.

            +------------------+-------------------------------------------------------+
            |      Name        |                  Description                          |
            +==================+=======================================================+
            | name             | Name of the option used in the model.                 |
            +------------------+-------------------------------------------------------+
            | description      | A detailed description of the option used.            |
            +------------------+-------------------------------------------------------+
            | type             | Option type (REAL, BOOL, INTEGER or CATEGORICAL)      |
            +------------------+-------------------------------------------------------+
            | default_value    | The default value for the option.                     |
            +------------------+-------------------------------------------------------+
            | possible_values  | List of acceptable values (CATEGORICAL only)          |
            +------------------+-------------------------------------------------------+
            | lower_bound      | Smallest acceptable value for this option (REAL only) |
            +------------------+-------------------------------------------------------+
            | upper_bound      | Largest acceptable value for this option (REAL only)  |
            +------------------+-------------------------------------------------------+

        Returns
        -------
        out : JSON/SFrame

        See Also
        --------
        graphlab.{module_name}.{python_class_name}.get_current_options

        Examples
        --------
        .. sourcecode:: python

          >>> import graphlab

          # SFrame formatted output.
          >>> out_sframe = graphlab.{module_name}.get_default_options()

          # JSON formatted output.
          >>> out_json = graphlab.{module_name}.get_default_options('json')
        """
        # NOTE(review): the {module_name} / {python_class_name} fields in the
        # docstring look like str.format placeholders filled in elsewhere --
        # confirm before adding any literal braces to the docstring.
        _mt._get_metric_tracker().track('toolkit.%s.get_default_options' %
                                        module_name)

        # Select the extension entry point first, then make a single call.
        if sdk_model:
            fetch_defaults = _gl.extensions._toolkits_sdk_get_default_options
        else:
            fetch_defaults = _gl.extensions._toolkits_get_default_options
        response = fetch_defaults(unity_server_model_name)

        def _decode_int(text):
            # Integer literals that Python promotes to `long` are handed back
            # as floats; everything else stays a plain int.
            number = int(text)
            if type(number) is long:
                return float(text)
            return number

        # Each value arrives as a JSON string; decode it in place.
        for option_name in response.keys():
            response[option_name] = json.loads(response[option_name],
                                               parse_int=_decode_int)

        if output_type == 'json':
            return response

        # One row per option; the two unpack calls presumably flatten first
        # the row dictionary and then the per-option dictionary -- TODO confirm
        # against SFrame.unpack column naming.
        rows = [{'name': key, '': value} for key, value in response.items()]
        unpacked_once = _SFrame(rows).unpack('X1', column_name_prefix='')
        return unpacked_once.unpack('X1', column_name_prefix='')

Example #18

0

Show file

File: topic_model.py Project: divya2661/food-recommendation-engine-

    def get_topics(self, topic_ids=None, num_words=5, cdf_cutoff=1.0,
                   output_type='topic_probabilities'):

        """
        Get the words associated with a given topic. The score column is the
        probability of choosing that word given that you have chosen a
        particular topic.

        Parameters
        ----------
        topic_ids : list of int, optional
            The topics to retrieve words. Topic ids are zero-based.
            Throws an error if greater than or equal to m['num_topics'], or
            if the requested topic name is not present. Defaults to all
            topics of the model.

        num_words : int, optional
            The number of words to show.

        cdf_cutoff : float, optional
            Allows one to only show the most probable words whose cumulative
            probability is below this cutoff. For example if there exist
            three words where

            .. math::
               p(word_1 | topic_k) = .1

               p(word_2 | topic_k) = .2

               p(word_3 | topic_k) = .05

            then setting :math:`cdf_{cutoff}=.3` would return only
            :math:`word_1` and :math:`word_2` since
            :math:`p(word_1 | topic_k) + p(word_2 | topic_k) <= cdf_{cutoff}`

        output_type : {'topic_probabilities' | 'topic_words'}, optional
            Determine the type of desired output. See below.

        Returns
        -------
        out : SFrame
            If output_type is 'topic_probabilities', then the returned value is
            an SFrame with a column of words ranked by a column of scores for
            each topic. Otherwise, the returned value is an SFrame with a
            single column 'words', one row per topic (sorted by topic id),
            where each element is a list of that topic's words ordered by
            decreasing score.

        Raises
        ------
        AssertionError
            If `topic_ids` is provided but is not a list.

        ValueError
            If `topic_ids` contains a string, or an id that is negative or
            not less than the model's number of topics.

        Examples
        --------
        Get the highest ranked words for all topics.

        >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
        >>> m = graphlab.topic_model.create(docs,
                                            num_iterations=50)
        >>> m.get_topics()
        +-------+----------+-----------------+
        | topic |   word   |      score      |
        +-------+----------+-----------------+
        |   0   |   cell   |  0.028974400831 |
        |   0   |  input   | 0.0259470208503 |
        |   0   |  image   | 0.0215721599763 |
        |   0   |  visual  | 0.0173635081992 |
        |   0   |  object  | 0.0172447874156 |
        |   1   | function | 0.0482834508265 |
        |   1   |  input   | 0.0456270024091 |
        |   1   |  point   | 0.0302662839454 |
        |   1   |  result  | 0.0239474934631 |
        |   1   | problem  | 0.0231750116011 |
        |  ...  |   ...    |       ...       |
        +-------+----------+-----------------+

        Get the highest ranked words for topics 0 and 1 and show 15 words per
        topic.

        >>> m.get_topics([0, 1], num_words=15)
        +-------+----------+------------------+
        | topic |   word   |      score       |
        +-------+----------+------------------+
        |   0   |   cell   |  0.028974400831  |
        |   0   |  input   | 0.0259470208503  |
        |   0   |  image   | 0.0215721599763  |
        |   0   |  visual  | 0.0173635081992  |
        |   0   |  object  | 0.0172447874156  |
        |   0   | response | 0.0139740298286  |
        |   0   |  layer   | 0.0122585145062  |
        |   0   | features | 0.0115343177265  |
        |   0   | feature  | 0.0103530459301  |
        |   0   | spatial  | 0.00823387994361 |
        |  ...  |   ...    |       ...        |
        +-------+----------+------------------+

        If one wants to instead just get the top words per topic, one may
        change the format of the output as follows.

        >>> topics = m.get_topics(output_type='topic_words')
        >>> topics['words']
        dtype: list
        Rows: 10
        [['cell', 'image', 'input', 'object', 'visual'],
         ['algorithm', 'data', 'learning', 'method', 'set'],
         ['function', 'input', 'point', 'problem', 'result'],
         ['model', 'output', 'pattern', 'set', 'unit'],
         ['action', 'learning', 'net', 'problem', 'system'],
         ['error', 'function', 'network', 'parameter', 'weight'],
         ['information', 'level', 'neural', 'threshold', 'weight'],
         ['control', 'field', 'model', 'network', 'neuron'],
         ['hidden', 'layer', 'system', 'training', 'vector'],
         ['component', 'distribution', 'local', 'model', 'optimal']]
        """
        _mt._get_metric_tracker().track('toolkit.text.topic_model.get_topics')

        _check_categorical_option_type('output_type', output_type,
            ['topic_probabilities', 'topic_words'])

        if topic_ids is None:
            topic_ids = list(range(self.get('num_topics')))

        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; the exception type and message are
        # kept for existing callers.
        if not isinstance(topic_ids, list):
            raise AssertionError("The provided topic_ids is not a list.")

        if any(isinstance(x, str) for x in topic_ids):
            raise ValueError("Only integer topic_ids can be used at this point in time.")

        # Look the topic count up once rather than once per requested id.
        num_topics = self['num_topics']
        if not all(0 <= x < num_topics for x in topic_ids):
            raise ValueError("Topic id values must be non-negative and less than the " + \
                "number of topics used to fit the model.")

        opts = {'model': self.__proxy__,
                'topic_ids': topic_ids,
                'num_words': num_words,
                'cdf_cutoff': cdf_cutoff}
        response = _graphlab.toolkits._main.run('text_topicmodel_get_topic',
                                               opts)
        ret = _map_unity_proxy_to_object(response['top_words'])

        if output_type == 'topic_probabilities':
            return ret

        def sort_wordlist_by_prob(z):
            # `z` maps word -> score for one topic; highest score first.
            words = sorted(z.items(), key=_operator.itemgetter(1), reverse=True)
            return [word for (word, prob) in words]

        # Collapse to one row per topic holding a word -> score dictionary,
        # then turn each dictionary into a score-ordered list of words.
        ret = ret.groupby('topic',
                {'word': _graphlab.aggregate.CONCAT('word', 'score')})
        words = ret.sort('topic')['word'].apply(sort_wordlist_by_prob)
        return _SFrame({'words': words})

Example #19

0

Show file

File: _nearest_neighbors.py Project: Mawul4j/Machine-Learning-Course

    def query(self, dataset, label=None, k=5, radius=None, verbose=True):
        """
        Retrieve, for every row of the input 'dataset', its nearest neighbors
        among the data stored in the model. The query dataset does not need
        to be the same as the reference data stored in the model.

        Parameters
        ----------
        dataset : SFrame
            Query data. Must contain columns with the same names and types as
            the features used to train the model. Additional columns are
            allowed, but ignored. Please see the nearest neighbors
            :func:`~graphlab.nearest_neighbors.create` documentation for more
            detail on allowable data types.

        label : str, optional
            Name of the query SFrame column with row labels. The column must
            hold integers or strings and cannot be one of the model's
            features. If 'label' is not specified, row numbers are used to
            identify query dataset rows in the output SFrame.

        k : int, optional
            Number of nearest neighbors to return from the reference set for
            each query observation. The default is 5 neighbors, but setting it
            to ``None`` will return all neighbors within ``radius`` of the query
            point.

        radius : float, optional
            Only neighbors whose distance to a query point is smaller than this
            value are returned. The default is ``None``, in which case the ``k``
            nearest neighbors are returned for each query point, regardless of
            distance.

        verbose: bool, optional
            If True, print progress updates and model details.

        Returns
        -------
        out : SFrame
            An SFrame with the k-nearest neighbors of each query observation.
            The result contains four columns: the first is the label of the
            query observation, the second is the label of the nearby reference
            observation, the third is the distance between the query and
            reference observations, and the fourth is the rank of the reference
            observation among the query's k-nearest neighbors.

        See Also
        --------
        similarity_graph

        Notes
        -----
        - The `dataset` input to this method *can* have missing values (in
          contrast to the reference dataset used to create the nearest
          neighbors model). Missing numeric values are imputed to be the mean
          of the corresponding feature in the reference dataset, and missing
          strings are imputed to be empty strings.

        - If both ``k`` and ``radius`` are set to ``None``, each query point
          returns all of the reference set. If the reference dataset has
          :math:`n` rows and the query dataset has :math:`m` rows, the output is
          an SFrame with :math:`nm` rows.

        - For models created with the 'lsh' method, the query results may have
          fewer query labels than input query points. Because LSH is an
          approximate method, a query point may have fewer than 'k' neighbors.
          If LSH returns no neighbors at all for a query, the query point is
          omitted from the results.

        Examples
        --------
        First construct a toy SFrame and create a nearest neighbors model:

        >>> sf = graphlab.SFrame({'label': range(3),
        ...                       'feature1': [0.98, 0.62, 0.11],
        ...                       'feature2': [0.69, 0.58, 0.36]})
        >>> model = graphlab.nearest_neighbors.create(sf, 'label')

        A new SFrame contains query observations with same schema as the
        reference SFrame. This SFrame is passed to the ``query`` method.

        >>> queries = graphlab.SFrame({'label': range(3),
        ...                            'feature1': [0.05, 0.61, 0.99],
        ...                            'feature2': [0.06, 0.97, 0.86]})
        >>> model.query(queries, 'label', k=2)
        +-------------+-----------------+----------------+------+
        | query_label | reference_label |    distance    | rank |
        +-------------+-----------------+----------------+------+
        |      0      |        2        | 0.305941170816 |  1   |
        |      0      |        1        | 0.771556867638 |  2   |
        |      1      |        1        | 0.390128184063 |  1   |
        |      1      |        0        | 0.464004310325 |  2   |
        |      2      |        0        | 0.170293863659 |  1   |
        |      2      |        1        | 0.464004310325 |  2   |
        +-------------+-----------------+----------------+------+
        """

        _mt._get_metric_tracker().track('toolkit.nearest_neighbors.query')

        # The query data has to be a non-empty SFrame.
        _tkutl._raise_error_if_not_sframe(dataset, "dataset")
        _tkutl._raise_error_if_sframe_empty(dataset, "dataset")

        # Restrict the query data to the features stored with the model.
        ref_features = self.get('features')
        sf_features = _tkutl._toolkits_select_columns(dataset, ref_features)

        # Work out the labels that identify each query row.
        if label is not None:
            if label not in dataset.column_names():
                raise ValueError(
                    "Input 'label' must be a string matching the name of a " +
                    "column in the reference SFrame 'dataset'.")

            if dataset[label].dtype() not in (str, int):
                raise TypeError("The label column must contain integers or strings.")

            if label in ref_features:
                raise ValueError("The label column cannot be one of the features.")

            query_labels = dataset[label]
        else:
            # No label column given: fall back to row numbers.
            query_labels = _graphlab.SArray.from_sequence(len(dataset))

        # 'k' is validated first, then 'radius'. A value of None is replaced
        # by the special value -1 / -1.0 that is passed on to indicate 'None'.
        if k is None:
            k = -1
        elif not isinstance(k, int):
            raise ValueError("Input 'k' must be an integer.")
        elif k <= 0:
            raise ValueError("Input 'k' must be larger than 0.")

        if radius is None:
            radius = -1.0
        elif not isinstance(radius, (int, float)):
            raise ValueError("Input 'radius' must be an integer or float.")
        elif radius < 0:
            raise ValueError("Input 'radius' must be non-negative.")

        query_opts = {
            'model': self.__proxy__,
            'model_name': self.__name__,
            'features': sf_features,
            'query_labels': query_labels,
            'k': k,
            'radius': radius,
        }

        result = _graphlab.toolkits._main.run('_nearest_neighbors.query',
                                              query_opts, verbose)
        return _SFrame(None, _proxy=result['neighbors'])