Example #1
class _SampleTransformer(Transformer):

    get_default_options = staticmethod(
        _get_default_options_wrapper('_SampleTransformer',
                                     '_SampleTransformer',
                                     '_SampleTransformer', True))

    def __init__(self, features=None, constant=0.5):

        # Set up options
        opts = {}
        opts['features'] = features
        opts['constant'] = constant

        # Initialize object
        proxy = _gl.extensions._SampleTransformer()
        proxy.init_transformer(opts)
        super(_SampleTransformer, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where
        relevant) the schema of the training data, description of the training
        data, training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')

        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        section = []
        section_titles = ['Attributes']
        for f in self.list_fields():
            section.append(("%s" % f, "%s" % f))

        return ([section], section_titles)

    def __repr__(self):
        (section, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, section, section_titles, width=30)
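
# Illustrative usage of the sample transformer above (not part of the original
# source; assumes graphlab is installed and the _SampleTransformer SDK
# extension is registered):
#
#     >>> import graphlab
#     >>> sf = graphlab.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]})
#     >>> transformer = _SampleTransformer(features=['a', 'b'], constant=0.5)
#     >>> transformer = transformer.fit(sf)
#     >>> out = transformer.transform(sf)
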
from graphlab.toolkits._model import Model as _Model
from graphlab.data_structures.sframe import SFrame as _SFrame
from graphlab.data_structures.sarray import SArray as _SArray
from graphlab.toolkits.text_analytics._util import _check_input
from graphlab.toolkits.text_analytics._util import random_split as _random_split
from graphlab.toolkits._internal_utils import _check_categorical_option_type
from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object
from itertools import izip as _izip
import array as _array
import json as _json
from graphlab.toolkits._model import _get_default_options_wrapper



get_default_options = _get_default_options_wrapper(
                          'cgs_topic_model',
                          'topic_model',
                          'TopicModel')

def create(dataset,
           num_topics=10,
           initial_topics=None,
           alpha=None,
           beta=.1,
           num_iterations=10,
           associations=None,
           verbose=False,
           print_interval=10,
           validation_set=None,
           method='auto'):
    """
    Create a topic model from the given data set. A topic model assumes each
import graphlab.toolkits._supervised_learning as _sl
from graphlab.toolkits._supervised_learning import Classifier as _Classifier
from graphlab.toolkits._model import _get_default_options_wrapper
from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe, \
                                            _map_unity_proxy_to_object, \
                                            _toolkit_repr_print, \
                                            _numeric_param_check_range
from graphlab.toolkits._model_workflow import _collect_model_workflow
from graphlab.util import cloudpickle as _cloudpickle

import logging as _logging
from copy import copy as _copy


get_default_options = _get_default_options_wrapper(
                          'neuralnet_classifier_v2',
                          'neuralnet_classifier',
                          'NeuralNetClassifier')

_context_doc_string = '''
>>> data = graphlab.SFrame('http://s3.amazonaws.com/dato-datasets/mnist/sframe/train')
>>> training_data, validation_data = data.random_split(0.8)
>>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
>>> m = graphlab.neuralnet_classifier.create(training_data,
...                                          target='label',
...                                          network=net,
...                                          max_iterations=3)
'''


class NeuralNetClassifier(_Classifier):
    """
detection tool.
"""

import time as _time
import graphlab as _gl
import graphlab.connect as _mt
from graphlab.toolkits._model import SDKModel as _SDKModel
import graphlab.toolkits._internal_utils as _tkutl
from graphlab.toolkits._private_utils import _summarize_accessible_fields
from graphlab.toolkits._main import ToolkitError as _ToolkitError
from graphlab.toolkits._model import _get_default_options_wrapper
import datetime as _dt
import logging as _logging

get_default_options = _get_default_options_wrapper(
    '_BayesianOnlineChangepoint', '_BayesianOnlineChangepoint',
    '_BayesianOnlineChangepoint', True)


def create(dataset, feature=None, expected_runlength=250, lag=7):
    """
    Create a `BayesianChangepointsModel`. Changepoint detection identifies
    where a univariate time series shifts in mean or variance. The model
    computes the probability that a given point is a changepoint, given the
    data observed up to that point. The BayesianChangepointsModel works with
    TimeSeries, SArray, or SFrame inputs.

    The model created by this function contains a table `scores` that holds
    the computed anomaly scores. The type of `scores` matches the type of the
    input `dataset`, and the table contains 4 columns:
from graphlab.util import _make_internal_url

_RANDOM_FOREST_MODEL_PARAMS_KEYS = [
    'max_depth', 'min_child_weight', 'min_loss_reduction', 'row_subsample'
]
_RANDOM_FOREST_TRAINING_PARAMS_KEYS = [
    'objective', 'training_time', 'training_error', 'validation_error',
    'evaluation_metric'
]
_RANDOM_FOREST_TRAINING_DATA_PARAMS_KEYS = [
    'target', 'features', 'num_features', 'num_examples',
    'num_validation_examples'
]
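
# Illustration only (not part of the original module): these *_PARAMS_KEYS
# lists are assumed to group a trained model's fields into labelled sections
# for display, mirroring the (label, field) tuples built by the
# _get_summary_struct methods elsewhere in this file, e.g.
#
#     model_section = [(k, k) for k in _RANDOM_FOREST_MODEL_PARAMS_KEYS]
#     sections = [model_section]
#     section_titles = ['Model parameters']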

get_default_options = _get_default_options_wrapper('random_forest_regression',
                                                   'random_forest_regression',
                                                   'RandomForestRegression')


class RandomForestRegression(_SupervisedLearningModel, _TreeModelMixin):
    """
    Encapsulates random forest models for regression tasks.

    The prediction is based on a collection of base learners, `regression trees
    <http://en.wikipedia.org/wiki/Decision_tree_learning>`_, combined
    through a technique called `random forest <http://en.wikipedia.org/wiki/Random_forest>`_.

    Unlike linear models such as linear regression, random forests can model
    non-linear interactions between the features and the target, using
    decision trees as the subroutine. They are good for handling numerical
    features and categorical features with
Example #6
class FeatureHasher(Transformer):
    '''
    Hashes an input feature space to an n-bit feature space.

    Feature hashing is an efficient way of vectorizing features, and performing
    dimensionality reduction or expansion along the way. Supported types include
    array.array, list, dict, float, int, and string.  The behavior for creating
    keys and values for different input data column types is given below.

    * **array.array** : Keys are created by 1) combining the index
      of an element with the column name, then 2) hashing that combination.
      The elements of the array become the values in the returned dictionary.

    * **list** : Behaves the same as array.array, but if an element is
      non-numerical, the element is combined with the column name and hashed,
      and 1 is used as the value.

    * **dict** : Each key in the dictionary is combined with the column name
      and hashed, and the value is kept. If the value is non-numerical, it is
      combined with the column name and hashed, and 1 is used as the value.

    * **float** : The column name is hashed, and the column
      entry becomes the value.

    * **int** : Same behavior as float

    * **string** : Hash the string and use it as a key, and use 1 as the value.

    The hashed values are collapsed into a single sparse representation of a
    vector, so all hashed columns are replaced by a single column with name
    specified by 'output_column_name'.

    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then all
        columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    num_bits : int, optional
        The number of bits to hash to. There will be :math:`2^{num\_bits}`
        indices in the resulting vector.

    output_column_name : str, optional
        The name of the output column. If the column already exists, then a
        suffix is appended to the name.

    Returns
    -------
    out : FeatureHasher
        A FeatureHasher object which is initialized with the defined
        parameters.

    Notes
    -----
    - Each time a key is hashed, the corresponding value is multiplied by
      either 1.0 or -1.0,  chosen with equal probability.  The final hashed
      feature value is the accumulation of values for all keys hashed to that
      bucket.

    References
    ----------
    - Collaborative Spam Filtering with the Hashing Trick. J. Attenberg,
      K. Q. Weinberger, A. Smola, A. Dasgupta, M. Zinkevich Virus Bulletin
      (VB) 2009.

    See Also
    --------
    graphlab.toolkits.feature_engineering._feature_hasher.FeatureHasher
    graphlab.toolkits.feature_engineering.create

    Examples
    --------

    .. sourcecode:: python

        from graphlab.toolkits.feature_engineering import *

        # Hash the feature space ['a', 'b', 'c'] into a single space.
        >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4], 'c': [9,10,11]})
        >>> hasher = graphlab.feature_engineering.create(sf,
                                FeatureHasher(features = ['a', 'b', 'c']))

        # Transform the data using the hasher.
        >>> hashed_sf = hasher.transform(sf)
        >>> hashed_sf

        Columns:
          hashed_features  dict

        Rows: 3

        Data:
        +-------------------------------+
        |        hashed_features        |
        +-------------------------------+
        | {79785: -1, 188475: -2, 21... |
        | {79785: -2, 188475: -3, 21... |
        | {79785: -3, 188475: -4, 21... |
        +-------------------------------+
        [3 rows x 1 columns]


        # Save the transformer.
        >>> hasher.save('save-path')
    '''

    _fit_examples_doc = _fit_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc
    _transform_examples_doc = _transform_examples_doc

    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_FeatureHasher', 'toolkits.feature_engineering._feature_hasher',
            'FeatureHasher', True))

    def __init__(self,
                 features=None,
                 excluded_features=None,
                 num_bits=18,
                 output_column_name='hashed_features'):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(num_bits, [int])
        _raise_error_if_not_of_type(output_column_name, [str])

        # Set up options
        opts = {
            'num_bits': num_bits,
            'output_column_name': output_column_name,
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._FeatureHasher()
        proxy.init_transformer(opts)
        super(FeatureHasher, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))
        fields = [("Features", _features), ("Excluded features", _exclude),
                  ("Output column name", 'output_column_name'),
                  ("Number of bits", 'num_bits')]
        section_titles = ['Model fields']

        return ([fields], section_titles)

    def __repr__(self):

        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, width=30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]})
        hasher = _gl.feature_engineering.FeatureHasher(features=['a', 'b'])
        return hasher.fit(sf), sf
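
# A minimal, standalone sketch of the hashing-trick idea described in the
# FeatureHasher docstring above. Illustration only: graphlab's _FeatureHasher
# extension uses its own internal hash function, not Python's hash().
def _toy_feature_hash(row, num_bits=18):
    """Collapse a {column: numeric value} row into a 2**num_bits sparse dict."""
    hashed = {}
    for column, value in row.items():
        h = hash(column)
        index = h % (2 ** num_bits)                        # bucket in [0, 2**num_bits)
        sign = 1.0 if ((h >> num_bits) & 1) == 0 else -1.0  # pseudo-random +/- 1
        hashed[index] = hashed.get(index, 0.0) + sign * value
    return hashed

# Example: three numeric columns collapse into one sparse dictionary, with
# collisions accumulating signed values as described in the Notes section.
# _toy_feature_hash({'a': 1, 'b': 2, 'c': 9})
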
from graphlab.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid
from graphlab.toolkits._internal_utils import _raise_error_if_column_exists
from graphlab.toolkits._model_workflow import _collect_model_workflow
from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin


_RANDOM_FOREST_MODEL_PARAMS_KEYS = ['num_trees', 'step_size', 'max_depth',
                                    'min_child_weight', 'min_loss_reduction',
                                    'row_subsample']
_RANDOM_FOREST_TRAINING_PARAMS_KEYS = ['objective', 'training_time',
                                       'training_error', 'validation_error',
                                       'evaluation_metric']
_RANDOM_FOREST_TRAINING_DATA_PARAMS_KEYS = ['target', 'features',
                                            'num_features', 'num_examples',
                                            'num_validation_examples']


get_default_options = _get_default_options_wrapper(
                          'random_forest_regression',
                          'random_forest_regression',
                          'RandomForestRegression')


class RandomForestRegression(_SupervisedLearningModel, _TreeModelMixin):
    """
    Encapsulates random forest models for regression tasks.

    The prediction is based on a collection of base learners, `regression trees
    <http://en.wikipedia.org/wiki/Decision_tree_learning>`_, combined
    through a technique called `random forest <http://en.wikipedia.org/wiki/Random_forest>`_.

    Unlike linear models such as linear regression, random forests can model
    non-linear interactions between the features and the target, using
    decision trees as the subroutine. They are good for handling numerical
    features and categorical features with
class BM25(Transformer):
    '''
    Transform an SFrame into BM25 scores for a given query.

    If we have a query with words :math:`q_1, ..., q_n` the BM25 score for
    a document is:

    .. math:: \sum_{i=1}^n \mbox{IDF}(q_i)\\frac{f(q_i) * (k_1+1)}{f(q_i) + k_1 * (1-b+b*|D|/d_{avg})}

    where we use the natural logarithm and

      * :math:`\mbox{IDF}(q_i) = log((N - n(q_i) + .5)/(n(q_i) + .5))` is the inverse document frequency of :math:`q_i`
      * :math:`N` is the number of documents (in the training corpus)
      * :math:`n(q_i)` is the number of documents (in the training corpus) containing :math:`q_i`
      * :math:`f(q_i)` is the number of times :math:`q_i` occurs in the document
      * :math:`|D|` is the number of words in the document
      * :math:`d_{avg}` is the average number of words per document (in the training corpus)
      * :math:`k_1` and :math:`b` are free parameters.

    The transformed output is a column of type float with the BM25 score for each document.

    The behavior of BM25 for different input data column types is as follows:

    * **dict** : Each (key, value) pair is treated as a count associated with
      the key for this row. A common example is to have a dict
      element contain a bag-of-words representation of a document,
      where each key is a word and each value is the number of times
      that word occurs in the document. All non-numeric values are
      ignored.
    * **list** : The list is converted to a bag-of-words format, where the
      keys are the unique elements in the list and the values are the
      counts of those unique elements. After this step, the behaviour
      is identical to dict.
    * **string** : Behaves identically to a **dict**, where the dictionary is
      generated by converting the string into a bag-of-words format. For
      example, "I really like really fluffy dogs" would get converted to
      {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs':1}.

    Parameters
    ----------

    feature : str
        Name of the feature column to be transformed.

    query : A list, set, or SArray of type str
        A list, set or SArray where each element is a word.

    k1 : float, optional
        Free parameter which controls the relative importance of term frequencies.
        Recommended values are in [1.2, 2.0]. Default is 1.5.

    b : float, optional
        Free parameter which controls how much to downweight scores for long documents.
        The recommended value is 0.75, which is also the default.

    max_document_frequency: float, optional
        The maximum ratio of document_frequency to num_documents that is
        encoded. All query terms with a document frequency higher than this are
        discarded. This value must be between 0 and 1.

    min_document_frequency: float, optional
        The minimum ratio of document_frequency to num_documents that is
        encoded. All query terms with a document frequency lower than this are
        discarded. This value must be between 0 and 1.


    output_column_name : str, optional
        The name of the output column. If specified, a new column with this
        name is added to the input SFrame. Otherwise, the 'feature' column is
        overwritten.

    Returns
    -------

    out : BM25
        A BM25 object which is initialized with the defined
        parameters.

    Notes
    -----
    - `None` values are treated as separate categories and are encoded
       along with the rest of the values.

    References
    ----------

    - For more details about BM25,
      see http://en.wikipedia.org/wiki/Okapi_BM25

    See Also
    --------

    graphlab.toolkits.feature_engineering._tfidf.TFIDF,
    graphlab.toolkits.feature_engineering.create

    Examples
    --------
    .. sourcecode:: python

      >>> import graphlab as gl

      # Create data
      >>> sf = gl.SFrame(
          {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
          {'this': 1, 'is': 1, 'another': 2, 'example': 3},
          {'final': 1, 'doc': 1, 'here': 2}]})

      # Create a query set
      >>> query = ['a','query','example']

      # Create a BM25 encoder
      >>> from graphlab.toolkits.feature_engineering import BM25
      >>> encoder = gl.feature_engineering.create(dataset = sf, transformers = BM25('docs'))

      # Transform the data
      >>> transformed_sf = encoder.transform(data = sf)
      Data:
      +----------------+
      |      docs      |
      +----------------+
      | 0.744711615513 |
      | 0.789682123696 |
      |      0.0       |
      +----------------+
      [3 rows x 1 columns]

      # Save the transformer.
      >>> encoder.save('save-path')

      # Return the indices in the encoding.
      >>> encoder['document_frequencies']
      Data:
      +----------------+---------+--------------------+
      | feature_column |   term  | document_frequency |
      +----------------+---------+--------------------+
      |      docs      |    a    |         1          |
      |      docs      | example |         1          |
      +----------------+---------+--------------------+

    '''

    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc
    _transform_examples_doc  = _transform_examples_doc

    # Default options
    get_default_options = staticmethod(_get_default_options_wrapper(
            '_BM25', 'toolkits.feature_engineering._bm25',
                                                'BM25', True))

    def __init__(self, feature, query, k1=1.5, b=0.75,
                 min_document_frequency=0.0, max_document_frequency=1.0,
                 output_column_name=None):

        # Convert query to list if necessary
        if isinstance(query, _gl.SArray):
            query = list(query)
        if isinstance(query, set):
            query = list(query)

        # Type checking
        _raise_error_if_not_of_type(feature, [str])
        for q in query:
            _raise_error_if_not_of_type(q, [str]) # query must be list of strings
        _raise_error_if_not_of_type(k1, [float, int])
        _raise_error_if_not_of_type(b, [float, int])
        _raise_error_if_not_of_type(min_document_frequency, [float, int])
        _raise_error_if_not_of_type(max_document_frequency, [float, int])
        _raise_error_if_not_of_type(output_column_name, [str, type(None)])

        # Set up options
        opts = {
          'features': [feature],
          'query': query,
          'k1': k1,
          'b': b,
          'min_document_frequency': min_document_frequency,
          'max_document_frequency': max_document_frequency,
          'output_column_name' : output_column_name
        }

        # Initialize object
        proxy = _gl.extensions._BM25()
        proxy.init_transformer(opts)
        super(BM25, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        fields = [
            ("Features", _features),
            ("query", 'query'),
            ("k1", 'k1'),
            ("b", 'b'),
            ("Minimimum Document Frequency", 'min_document_frequency'),
            ("Maximimum Document Frequency", 'max_document_frequency'),
            ("Output Column Name", 'output_column_name')
        ]
        section_titles = ['Model fields']
        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, 30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({'docs': ["this is a test", "this is another test"]})
        encoder = _gl.feature_engineering.BM25('docs', ['a', 'test'])
        encoder = encoder.fit(sf)
        return encoder, sf
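
# A standalone, pure-Python sketch of the BM25 formula documented above.
# Illustration only: the _BM25 extension performs this computation internally,
# and may additionally drop terms by the document-frequency cutoffs.
import math as _math

def _toy_bm25(query, document, corpus, k1=1.5, b=0.75):
    """Score one bag-of-words `document` against `query` over `corpus`."""
    N = len(corpus)
    d_avg = sum(sum(doc.values()) for doc in corpus) / float(N)
    D = sum(document.values())                       # words in this document
    score = 0.0
    for q in set(query):
        n_q = sum(1 for doc in corpus if q in doc)   # docs containing q
        if n_q == 0:
            continue                                 # unseen terms contribute nothing
        idf = _math.log((N - n_q + 0.5) / (n_q + 0.5))
        f_q = document.get(q, 0)                     # term frequency in this document
        score += idf * f_q * (k1 + 1) / (f_q + k1 * (1 - b + b * D / d_avg))
    return score
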
class FeatureBinner(Transformer):
    '''
    Feature binning is a method of turning continuous variables into categorical
    values.

    This is accomplished by grouping the values into a pre-defined number of bins.
    The continuous value then gets replaced by a string describing the bin
    that contains that value.

    FeatureBinner supports both 'logarithmic' and 'quantile' binning strategies
    for either int or float columns.

    Parameters
    ----------
    features : list[str] , optional
        Column names of features to be transformed. If None, all columns are
        selected.

    excluded_features : list[str] | str | None, optional
        Column names of features to be ignored in transformation. Can be string
        or list of strings. Either 'excluded_features' or 'features' can be
        passed, but not both.

    strategy : 'logarithmic' | 'quantile', optional
        If the strategy is 'logarithmic', bin break points are defined by
        :math:`10^i` for i in [0,...,num_bins-2]. For instance, if
        num_bins = 2, the bins become (-Inf, 1], (1, Inf]. If num_bins = 3,
        the bins become (-Inf, 1], (1, 10], (10, Inf].

        If the strategy is 'quantile', the bin breaks are defined by the
        'num_bins'-quantiles for that column's data. Quantiles are values that
        separate the data into roughly equal-sized subsets.

    num_bins : int, optional
        The number of bins to group the continuous variables into.

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : FeatureBinner
        A FeatureBinner object which is initialized with the defined
        parameters.

    See Also
    --------
    graphlab.toolkits.feature_engineering._feature_binner.FeatureBinner
    graphlab.toolkits.feature_engineering.create

    Notes
    -----
    - If the SFrame to be transformed already contains a column with the
      designated output column name, then that column will be replaced with the
      new output. In particular, this means that `output_column_prefix=None` will
      overwrite the original feature columns.

    Examples
    --------

    .. sourcecode:: python

        >>> from graphlab.toolkits.feature_engineering import *

        # Construct a feature binner with default options.
        >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4], 'c': [9,10,11]})
        >>> binner = graphlab.feature_engineering.create(sf,
              FeatureBinner(features = ['a', 'b', 'c'], strategy = 'quantile'))

        # Transform the data using the binner.
        >>> binned_sf = binner.transform(sf)

        # Save the transformer.
        >>> binner.save('save-path')

        # Return the details about the bins
        >>> binner['bins']

       Columns:
        column  str
        name    str
        left    float
        right   float

        Rows: 30

        Data:
        +--------+------+---------------------+--------------------+
        | column | name |         left        |       right        |
        +--------+------+---------------------+--------------------+
        |   a    | a_0  | -1.79769313486e+308 |        1.0         |
        |   a    | a_1  |         1.0         |        1.0         |
        |   a    | a_2  |         1.0         |        1.0         |
        |   a    | a_3  |         1.0         |        1.0         |
        |   a    | a_4  |         1.0         |        2.0         |
        |   a    | a_5  |         2.0         |        2.0         |
        |   a    | a_6  |         2.0         |        2.0         |
        |   a    | a_7  |         2.0         |        3.0         |
        |   a    | a_8  |         3.0         |        3.0         |
        |   a    | a_9  |         3.0         | 1.79769313486e+308 |
        +--------+------+---------------------+--------------------+
        [30 rows x 4 columns]
        Note: Only the head of the SFrame is printed.
        You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.
        '''

    _fit_examples_doc = _fit_examples_doc
    _transform_examples_doc = _transform_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc

    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_FeatureBinner', 'toolkits.feature_engineering._feature_binner',
            'FeatureBinner', True))

    def __init__(self,
                 features=None,
                 excluded_features=None,
                 strategy='logarithmic',
                 num_bins=10,
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(num_bins, [int])
        _raise_error_if_not_of_type(strategy, [str])

        # Set up options
        opts = {
            'strategy': strategy,
            'num_bins': num_bins,
            'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._FeatureBinner()
        proxy.init_transformer(opts)
        super(FeatureBinner, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))

        fields = [("Features", _features), ("Excluded_features", _exclude),
                  ("Strategy for creating bins", 'strategy'),
                  ("Number of bins to use", 'num_bins')]
        section_titles = ['Model fields']

        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, width=30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]})
        binner = _gl.feature_engineering.FeatureBinner(features=['a', 'b'],
                                                       strategy='quantile')
        return binner.fit(sf), sf
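
# A small standalone sketch of the 'logarithmic' strategy described in the
# FeatureBinner docstring above (illustration only): break points sit at
# 10**i for i in 0..num_bins-2, so num_bins=3 yields bins
# (-Inf, 1], (1, 10], (10, Inf].
def _logarithmic_breaks(num_bins):
    """Return the interior bin break points for the 'logarithmic' strategy."""
    return [10 ** i for i in range(num_bins - 1)]

# _logarithmic_breaks(3)  ->  [1, 10]
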
class NumericImputer(Transformer):
    '''
    Impute missing values with feature means.

    Input columns to the NumericImputer must be of type *int*, *float*,
    *dict*, *list*, or *array.array*. For each column in the input, the
    transformed output is a column where values that are not missing are
    retained as is, and missing values are replaced with the mean value of
    that feature.

    The behavior for different input data column types is as follows
    (see :func:`~graphlab.feature_engineering.NumericImputer.transform`
    for examples).


    * **float** : If there is a missing value, it is replaced with the mean
      of that column.

    * **int**   : Behaves the same way as *float*.

    * **list**  : Each index of the list is treated as a feature column, and
      missing values are replaced with per-feature means. This is the same as
      unpacking, computing the mean, and re-packing. All elements must be of
      type *float*, *int*, or *None*. See :func:`~graphlab.SFrame.pack_columns`
      for more information.

    * **array** : Same behavior as *list*

    * **dict**  : Same behavior as *list*, except keys not present in a
      particular row are implicitly interpreted as having the value 0. This
      makes the  *dict* type a sparse representation of a vector.


    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then all
        feature columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    strategy : 'auto' | 'mean', optional
        The strategy with which to perform imputation. Currently 'auto' and
        'mean' are supported, and both perform mean imputation.

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : NumericImputer
        A NumericImputer object which is initialized with the defined parameters.

    See Also
    --------
    graphlab.toolkits.feature_engineering._numeric_imputer.NumericImputer
    graphlab.toolkits.feature_engineering.create

    Notes
    -----
    - If the SFrame to be transformed already contains a column with the
      designated output column name, then that column will be replaced with the
      new output. In particular, this means that `output_column_prefix=None` will
      overwrite the original feature columns.

    Examples
    --------

    .. sourcecode:: python

        # Create data.
        >>> sf = graphlab.SFrame({'a': [1,3], 'b' : [2,4]})

        # Create a transformer.
        >>> from graphlab.toolkits.feature_engineering import NumericImputer
        >>> imputer = graphlab.feature_engineering.create(sf,
                     NumericImputer(features = ['a', 'b'], strategy = 'mean'))

        # Transform the data.
        >>> new_sf = graphlab.SFrame({'a': [1,None,3], 'b' : [2, None,4]})
        >>> transformed_sf = imputer.transform(new_sf)

        # Save the transformer.
        >>> imputer.save('save-path')

        # Return the means.
        >>> imputer['means']
        Columns:
            a  float
            b  float

        Rows: 1

        Data:
        +-----+-----+
        |  a  |  b  |
        +-----+-----+
        | 2.0 | 3.0 |
        +-----+-----+
        [1 rows x 2 columns]
    '''
    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _transform_examples_doc = _transform_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc

    # Default options
    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_MeanImputer', 'toolkits.feature_engineering._mean_imputer',
            'MeanImputer', True))

    def __init__(self,
                 features=None,
                 excluded_features=None,
                 strategy='auto',
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(strategy, [str])

        # Set up options
        opts = {
            'strategy': strategy,
            'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._MeanImputer()
        proxy.init_transformer(opts)
        super(NumericImputer, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))
        fields = [
            ("Features", _features),
            ("Excluded features", _exclude),
        ]
        section_titles = ['Model fields']
        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, 30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]})
        imputer = _gl.feature_engineering.NumericImputer(features=['a', 'b'],
                                                         strategy='mean')
        return imputer.fit(sf), sf
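
# A standalone sketch of the mean-imputation rule documented above
# (illustration only; the _MeanImputer extension performs this internally):
def _toy_mean_impute(column):
    """Replace None entries with the mean of the observed values."""
    observed = [v for v in column if v is not None]
    mean = sum(observed) / float(len(observed))
    return [mean if v is None else v for v in column]

# _toy_mean_impute([1, None, 3])  ->  [1, 2.0, 3]   (matches imputer['means']
# for column 'a' in the docstring example, where the mean is 2.0)
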
Example #11
            possible_args = set(get_default_options()["name"])
        except (RuntimeError, KeyError):
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments))

        opts.update(kwargs)

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)

    return FactorizationRecommender(response['model'])

get_default_options = _get_default_options_wrapper(
                          'factorization_recommender',
                          'recommender.factorization_recommender',
                          'FactorizationRecommender')

class FactorizationRecommender(_Recommender):
    r"""
    A FactorizationRecommender learns latent factors for each
    user and item and uses them to make rating predictions.

    FactorizationRecommender [Koren_et_al]_ contains a number of options that
    tailor it to a variety of datasets and evaluation metrics, making it one of
    the most powerful models in the GraphLab Create recommender toolkit.

    **Side information**

    Side features may be provided via the `user_data` and `item_data` options
    when the model is created.
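
    For illustration only (hypothetical variable and column names;
    `observations` is an SFrame of user-item-rating triples, and the side
    SFrames are keyed by the same id columns):

    .. sourcecode:: python

        >>> m = graphlab.recommender.factorization_recommender.create(
        ...         observations, user_id='user_id', item_id='item_id',
        ...         target='rating',
        ...         user_data=user_side_data,
        ...         item_data=item_side_data)
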
Example #12
class CategoricalImputer(Transformer):
    '''
    This imputer fills missing values (None) in columns of categorical data.
    For instance, if a "feature" column has values in some rows and None in
    others, the imputer fills in the missing entries. It also returns a
    probability associated with each imputed value.

    This is accomplished by grouping the rows based on the provided
    reference_features (unsupervised clustering), then assigning values to the
    feature column following a graph walk among the resulting clusters.

    Parameters
    ----------
    reference_features : list[str], optional
        Column names of reference_features to be used for clustering. If None,
        all columns are selected.

    feature : str, optional
        Name of the column to impute. This column should contain some
        categorical values, as well as rows with None. Those rows will be
        imputed.


    Returns
    -------
    out : CategoricalImputer
        A CategoricalImputer object which is initialized with the defined
        parameters.

    See Also
    --------
    graphlab.toolkits.feature_engineering._categorical_imputer.CategoricalImputer
    graphlab.toolkits.feature_engineering.create

    Examples
    --------

    .. sourcecode:: python

        from graphlab.toolkits.feature_engineering import *

        # Impute the column "feature" using information from columns ['a', 'b']
        >>> sf = graphlab.SFrame({'a' : [0,1,1], 'b' : [1,0,0], 'label' : [1,2,None]})
        >>> imputer = graphlab.feature_engineering.CategoricalImputer(
                             feature = 'label', reference_features = ['a', 'b'])
        >>> imputer.fit(sf)

        # Print the input data.
        >>> sf
        Columns:
        a    int
        b    int
        label    int

        Rows: 3

        Data:
        +---+---+-------+
        | a | b | label |
        +---+---+-------+
        | 0 | 1 |   1   |
        | 1 | 0 |   2   |
        | 1 | 0 |  None |
        +---+---+-------+
        [3 rows x 3 columns]

        # Transform the data using the imputer.
        >>> imputed_sf = imputer.transform(sf)

        # Retrieve the imputed data.
        >>> imputed_sf
        Columns:
            a    int
            b    int
            label    int
            predicted_feature_label    int
            feature_probability_label    float

        Rows: 3

        Data:
        +---+---+---------+-------------------------+---------------------------+
        | a | b |  label  | predicted_feature_label | feature_probability_label |
        +---+---+---------+-------------------------+---------------------------+
        | 0 | 1 |    1    |            1            |            1.0            |
        | 1 | 0 |    2    |            2            |            1.0            |
        | 1 | 0 |   None  |            2            |            1.0            |
        +---+---+---------+-------------------------+---------------------------+
        [3 rows x 5 columns]

        # Save the transformer.
        >>> imputer.save('save-path')

        # Bin only a single column 'a'.
        >>> imputer = graphlab.feature_engineering.create(sf,
                graphlab.feature_engineering.CategoricalImputer(
                    reference_features = ['a'], feature='label'))


    '''

    _fit_examples_doc = _fit_examples_doc
    _transform_examples_doc = _transform_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc

    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_CategoricalImputer',
            'toolkits.feature_engineering._categorical_imputer',
            'CategoricalImputer', True))

    def __init__(self,
                 reference_features=None,
                 feature="feature",
                 verbose=False):

        # Process and make a copy of the reference_features
        _reference_features, _exclude = _internal_utils.process_features(
            reference_features, None)

        # Type checking
        _raise_error_if_not_of_type(feature, [str])

        # Set up options
        opts = {
            'reference_features': _reference_features,
            'feature': feature,
            'verbose': verbose
        }

        # Initialize object
        proxy = _gl.extensions._CategoricalImputer()
        proxy.init_transformer(opts)
        super(CategoricalImputer, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        _reference_features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('reference_features')))

        fields = [("reference_features", _reference_features),
                  ("Column to impute", 'feature')]
        section_titles = ['Model fields']

        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, width=30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({
            'a': [1, 1, 1],
            'b': [1, 0, 1],
            'feature': [1, 2, None]
        })
        imputer = _gl.feature_engineering.CategoricalImputer(
            feature='feature', reference_features=['a', 'b'])
        return imputer.fit(sf), sf
Example #13
from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object
from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin

_DECISION_TREE_MODEL_PARAMS_KEYS = [
    'max_depth', 'min_child_weight', 'min_loss_reduction'
]
_DECISION_TREE_TRAINING_PARAMS_KEYS = [
    'objective', 'training_time', 'training_error', 'validation_error',
    'evaluation_metric'
]
_DECISION_TREE_TRAINING_DATA_PARAMS_KEYS = [
    'target', 'features', 'num_features', 'num_examples',
    'num_validation_examples'
]
get_default_options = _get_default_options_wrapper('decision_tree_classifier',
                                                   'decision_tree_classifier',
                                                   'DecisionTreeClassifier')

__doc_string_context = '''
      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = graphlab.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = graphlab.decision_tree_classifier.create(train, target='label')
'''


class DecisionTreeClassifier(_Classifier, _TreeModelMixin):
    """
    Special case of gradient boosted trees with the number of trees set to 1.
from graphlab.toolkits._supervised_learning import SupervisedLearningModel as _SupervisedLearningModel
from graphlab.toolkits._internal_utils import _toolkit_repr_print, \
                                        _toolkit_get_topk_bottomk, \
                                        _summarize_coefficients, \
                                        _raise_error_evaluation_metric_is_valid
from graphlab.toolkits._model_workflow import _collect_model_workflow
from graphlab.toolkits._model import _get_default_options_wrapper

_DEFAULT_SOLVER_OPTIONS = {
    'convergence_threshold': 1e-2,
    'step_size': 1.0,
    'lbfgs_memory_level': 11,
    'max_iterations': 10
}

get_default_options = _get_default_options_wrapper(
    'regression_linear_regression', 'linear_regression', 'LinearRegression')


def create(
        dataset,
        target,
        features=None,
        l2_penalty=1e-2,
        l1_penalty=0.0,
        solver='auto',
        feature_rescaling=True,
        convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
        step_size=_DEFAULT_SOLVER_OPTIONS['step_size'],
        lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
        max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'],
        validation_set="auto",
Example #15
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            raise TypeError("Bad Keyword Arguments: " +
                            ', '.join(bad_arguments))

        opts.update(kwargs)

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)

    return RankingFactorizationRecommender(response['model'])


get_default_options = _get_default_options_wrapper(
    'ranking_factorization_recommender',
    'recommender.RankingFactorizationRecommender',
    'RankingFactorizationRecommender')


class RankingFactorizationRecommender(_Recommender):
    r"""
    A RankingFactorizationRecommender learns latent factors for each
    user and item and uses them to rank recommended items according to
    the likelihood of observing those (user, item) pairs. This is
    commonly desired when performing collaborative filtering for
    implicit feedback datasets or datasets with explicit ratings
    for which ranking prediction is desired.

    RankingFactorizationRecommender contains a number of options that
    tailor it to a variety of datasets and evaluation metrics, making
    it one of the most powerful models in the GraphLab Create
                                        _toolkit_get_topk_bottomk, \
                                        _raise_error_if_not_sframe, \
                                        _check_categorical_option_type, \
                                        _map_unity_proxy_to_object, \
                                        _raise_error_evaluation_metric_is_valid, \
                                        _summarize_coefficients
from graphlab.toolkits._model_workflow import _collect_model_workflow

_DEFAULT_SOLVER_OPTIONS = {
    'convergence_threshold': 1e-2,
    'step_size': 1.0,
    'lbfgs_memory_level': 11,
    'max_iterations': 10}

get_default_options = _get_default_options_wrapper(
                          'classifier_logistic_regression',
                          'logistic_classifier',
                          'LogisticClassifier')

def create(dataset, target, features=None,
           l2_penalty=0.01, l1_penalty=0.0,
           solver='auto', feature_rescaling=True,
           convergence_threshold=_DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
           step_size=_DEFAULT_SOLVER_OPTIONS['step_size'],
           lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
           max_iterations=_DEFAULT_SOLVER_OPTIONS['max_iterations'],
           class_weights=None,
           validation_set='auto',
           verbose=True):
    """
    Create a :class:`~graphlab.logistic_classifier.LogisticClassifier` (using
    logistic regression as a classifier) to predict the class of a discrete
_BOOSTED_TREE_TRAINING_DATA_PARAMS_KEYS = ['target', 'features',
                                           'num_features', 'num_examples',
                                           'num_validation_examples']


DEFAULT_HYPER_PARAMETER_RANGE = {
    'max_depth': [6, 8, 10],
    'step_size': 0.3,
    'min_loss_reduction': [0, 1, 10],
    'min_child_weight': 0.1,
    'row_subsample': 1,
    'column_subsample': 1,
    'max_iterations': [10, 50, 100]
}
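
# Illustration only (not part of the original module): one way to expand a
# range dictionary like the one above -- scalars are treated as fixed values,
# lists as candidate values -- into a full grid of parameter settings.
import itertools as _itertools

def _expand_hyperparameter_grid(hyper_range):
    """Return a list of {name: value} dicts covering the cartesian product."""
    keys = sorted(hyper_range)
    candidates = [hyper_range[k] if isinstance(hyper_range[k], list)
                  else [hyper_range[k]] for k in keys]
    return [dict(zip(keys, combo)) for combo in _itertools.product(*candidates)]

# len(_expand_hyperparameter_grid(DEFAULT_HYPER_PARAMETER_RANGE))  ->  27
# (3 choices of max_depth x 3 of min_loss_reduction x 3 of max_iterations)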

get_default_options = _get_default_options_wrapper(
                          'boosted_trees_regression', 
                          'boosted_trees_regression', 
                          'BoostedTreesRegression')


class BoostedTreesRegression(_SupervisedLearningModel):
    """
    Encapsulates gradient boosted trees for regression tasks.

    The prediction is based on a collection of base learners, `regression trees
    <http://en.wikipedia.org/wiki/Decision_tree_learning>`_.


    Unlike linear models such as linear regression, the gradient boosted
    trees model can capture non-linear interactions between the features and
    the target, using decision trees as the subroutine. It is good for
    handling numerical features and categorical features with
Example #18
        'user_id': user_id,
        'item_id': item_id,
        'target': target,
        'user_data': user_data,
        'item_data': item_data,
        'nearest_items': _graphlab.SFrame(),
        'model': model_proxy,
        'random_seed': 1
    }

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)
    return PopularityRecommender(response['model'])


get_default_options = _get_default_options_wrapper(
    'popularity', 'recommender.popularity_recommender',
    'PopularityRecommender')


class PopularityRecommender(_Recommender):
    """
    The Popularity Model ranks an item according to its overall popularity.

    When making recommendations, each item is scored by the number of times it
    is seen in the training set. The item scores are the same for all users.
    Hence the recommendations are not tailored to individuals.

    The Popularity Recommender is simple and fast and provides a reasonable baseline.
    It can work well when observation data is sparse. It can be used as a
    "background" model for new users.
class TFIDF(Transformer):
    '''
    Transform an SFrame into TF-IDF scores.

    The prototypical application of TF-IDF transformations involves
    document collections, where each element represents a document in
    bag-of-words format, i.e. a dictionary whose keys are words and whose
    values are the number of times the word occurs in the document. For more
    details, check the reference section for further reading.

    The TF-IDF transformation performs the following computation

    .. math::
        \mbox{TF-IDF}(w, d) = tf(w, d) * log(N / f(w))

    where :math:`tf(w, d)` is the number of times word :math:`w` appeared in
    document :math:`d`, :math:`f(w)` is the number of documents word :math:`w`
    appeared in, :math:`N` is the number of documents, and we use the
    natural logarithm.

    The transformed output is a column of type dict, where each key is a word
    and each value is the corresponding TF-IDF score for that document.

    The behavior of TF-IDF for each supported input column type is as follows
    (see :func:`~graphlab.feature_engineering.TFIDF.transform`
    for examples).


    * **dict** : Each (key, value) pair is treated as count associated with
      the key for this row. A common example is to have a dict element contain
      a bag-of-words representation of a document, where each key is a word
      and each value is the number of times that word occurs in the document.
      All non-numeric values are ignored.

    * **list** : The list is converted to a bag-of-words format, where the keys
      are the unique elements in the list and the values are the counts of
      those unique elements. After this step, the behaviour is identical to
      dict.

    * **string** : Behaves identically to a **dict**, where the dictionary is
      generated by converting the string into a bag-of-words format. For
      example, "I really like really fluffy dogs" would get converted to
      {'I' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs':1}.


    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then
        all columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    max_document_frequency : float, optional
        The maximum ratio of document_frequency to num_documents that is
        encoded. All terms with a document frequency higher than this are
        discarded. This value must be between 0 and 1.

    min_document_frequency : float, optional
        The minimum ratio of document_frequency to num_documents that is
        encoded. All terms with a document frequency lower than this are
        discarded. This value must be between 0 and 1.

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : TFIDF
        A TFIDF object which is initialized with the defined
        parameters.

    Notes
    -----
    - `None` values are treated as separate categories and are encoded along
      with the rest of the values.
    - If the SFrame to be transformed already contains a column with the
      designated output column name, then that column will be replaced with the
      new output. In particular, this means that `output_column_prefix=None` will
      overwrite the original feature columns.

    References
    ----------
    For more details about tf-idf,
    see http://en.wikipedia.org/wiki/Tf%E2%80%93idf

    See Also
    --------
    graphlab.toolkits.feature_engineering._tfidf.TFIDF,
    graphlab.toolkits.feature_engineering.create

    Examples
    --------

    .. sourcecode:: python

        >>> import graphlab as gl

        # Create the data
        >>> sf = gl.SFrame(
            {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
                      {'this': 1, 'is': 1, 'another': 2, 'example': 3}]})

        # Create a TFIDF encoder object.
        >>> encoder = gl.feature_engineering.TFIDF('docs')

        # Fit the encoder for a given dataset.
        >>> encoder = encoder.fit(sf)

        >>> result = encoder.transform(sf)
        >>> result.print_rows(max_column_width=60)
        +-------------------------------------------------------------+
        |                             docs                            |
        +-------------------------------------------------------------+
        | {'this': 0.0, 'a': 1.3862943611198906, 'is': 0.0, 'sampl... |
        | {'this': 0.0, 'is': 0.0, 'example': 2.0794415416798357, ... |
        +-------------------------------------------------------------+

        '''

    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc
    _transform_examples_doc  = _transform_examples_doc

    # Default options
    get_default_options = staticmethod(_get_default_options_wrapper(
            '_TFIDF', 'toolkits.feature_engineering._tfidf',
                                                'TFIDF', True))

    def __init__(self, features=None, excluded_features=None,
                 min_document_frequency=0.0,
                 max_document_frequency=1.0,
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(min_document_frequency, [float, int])
        _raise_error_if_not_of_type(max_document_frequency, [float, int])
        _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

        # Set up options
        opts = {
          'min_document_frequency': min_document_frequency,
          'max_document_frequency': max_document_frequency,
          'output_column_prefix' : output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._TFIDF()
        proxy.init_transformer(opts)
        super(TFIDF, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        fields = [
            ("Features", _features),
            ("Minimimum Document Frequency", 'min_document_frequency'),
            ("Maximimum Document Frequency", 'max_document_frequency'),
            ("Output Column Prefix", 'output_column_prefix')
        ]
        section_titles = ['Model fields']
        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, 30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame(
            {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
                      {'this': 1, 'is': 1, 'another': 2, 'example': 3}]})
        encoder = _gl.feature_engineering.TFIDF(features=['docs'])
        encoder = encoder.fit(sf)
        return encoder, sf
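
# A minimal pure-Python sketch (not part of the toolkit source) that reproduces
# the tf-idf values shown in the TFIDF docstring example above, assuming the
# weighting
#     tfidf(w, doc) = count(w, doc) * ln(num_documents / num_documents_containing_w)
# with the natural logarithm and no smoothing. The helper name is made up for
# illustration only.
import math

def _tfidf_by_hand(docs):
    """docs is a list of {word: count} dictionaries (bag-of-words)."""
    num_docs = len(docs)
    doc_freq = {}
    for doc in docs:
        for word in doc:
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return [{w: tf * math.log(float(num_docs) / doc_freq[w])
             for w, tf in doc.items()}
            for doc in docs]

# 'a' occurs twice in the first document and in 1 of the 2 documents, so its
# score is 2 * ln(2) ~= 1.386, matching the docstring output above.
_tfidf_by_hand([{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
                {'this': 1, 'is': 1, 'another': 2, 'example': 3}])
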
Example #20
class WordCounter(Transformer):
    '''
    __init__(features=None, excluded_features=None,
        to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "],
        output_column_prefix=None)

    Transform string/dict/list columns of an SFrame into their respective
    bag-of-words representation.

    Bag-of-words is a common text representation. An input text string is first
    tokenized, and each token is treated as a word. The output is a dictionary
    mapping each unique word to the number of times it appears in the text
    string. The dictionary is a sparse representation: most words in the
    vocabulary do not appear in any given string, so their counts are zero and
    they are omitted from the dictionary rather than stored explicitly.

    WordCounter can be applied to all the string-, dictionary-, and list-typed
    columns in a given SFrame. Its behavior for each supported input column
    type is as follows. (See :func:`~graphlab.feature_engineering.WordCounter.transform`
    for usage examples).

    * **string** : The string is first tokenized. By default, all letters are
      first converted to lower case, then tokenized by space characters. The
      user can specify a custom delimiter list, or use Penn tree-bank style
      tokenization (see input parameter description for details). Each token
      is taken to be a word, and a dictionary is generated where each key is a
      unique word that appears in the input text string, and the value is the
      number of times the word appears. For example, "I really like Really
      fluffy dogs" would get converted to
      {'i' : 1, 'really': 2, 'like': 1, 'fluffy': 1, 'dogs':1}.

    * **list** : Each element of the list must be a string, which is tokenized
      according to the input method and tokenization settings, followed by
      counting. The behavior is analogous to that of dict-type input, where the
      count of each list element is taken to be 1. For example, under default
      settings, an input list of ['alice bob Bob', 'Alice bob'] generates an
      output bag-of-words dictionary of {'alice': 2, 'bob': 3}.

    * **dict** : The method first obtains the list of keys in the dictionary.
      This list is processed as described above.

    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then all
        feature columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    to_lower : bool, optional
        Indicates whether to map the input strings to lower case before counting.

    delimiters : list[str], optional
        A list of delimiter characters for tokenization. By default, the list
        is defined to be the list of space characters. The user can define
        any custom list of single-character delimiters. Alternatively, setting
        `delimiters=None` will use a Penn treebank type tokenization, which
        is better at handling punctuation. (See reference below for details.)

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : WordCounter
        A WordCounter feature engineering object which is initialized with
        the defined parameters.

    Notes
    -----
    If the SFrame to be transformed already contains a column with the
    designated output column name, then that column will be replaced with the
    new output. In particular, this means that `output_column_prefix=None` will
    overwrite the original feature columns.

    References
    ----------
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    See Also
    --------
    graphlab.toolkits.text_analytics.count_words,
    graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter,
    graphlab.toolkits.feature_engineering._tfidf.TFIDF,
    graphlab.toolkits.feature_engineering._tokenizer.Tokenizer,
    graphlab.toolkits.feature_engineering.create

    Examples
    --------

    .. sourcecode:: python

        >>> import graphlab as gl

        # Create data.
        >>> sf = gl.SFrame({
        ...    'string': ['sentences Sentences', 'another sentence'],
        ...    'dict': [{'bob': 1, 'Bob': 0.5}, {'a': 0, 'cat': 5}],
        ...    'list': [['one', 'two', 'three'], ['a', 'cat']]})

        # Create a WordCounter transformer.
        >>> from graphlab.toolkits.feature_engineering import WordCounter
        >>> encoder = WordCounter()

        # Fit and transform the data.
        >>> transformed_sf = encoder.fit_transform(sf)
        Columns:
            dict    dict
            list    dict
            string  dict

        Rows: 2

        Data:
        +------------------------+----------------------------------+
        |          dict          |               list               |
        +------------------------+----------------------------------+
        |      {'bob': 1.5}      | {'one': 1, 'three': 1, 'two': 1} |
        | {'a': 0, 'cat': 5}     |        {'a': 1, 'cat': 1}        |
        +------------------------+----------------------------------+
        +-------------------------------+
        |             string            |
        +-------------------------------+
        |        {'sentences': 2}       |
        | {'another': 1, 'sentence': 1} |
        +-------------------------------+
        [2 rows x 3 columns]

        # Penn treebank-style tokenization (recommended for smarter handling
        #    of punctuation)
        >>> sf = gl.SFrame({'string': ['sentence $$one', 'sentence two...']})
        >>> WordCounter(delimiters=None).fit_transform(sf)
        Columns:
            string  dict

        Rows: 2

        Data:
        +-----------------------------------+
        |               string              |
        +-----------------------------------+
        | {'sentence': 1, '$': 2, 'one': 1} |
        | {'sentence': 1, 'two': 1, '.': 3} |
        +-----------------------------------+
        [2 rows x 1 columns]

        # Save the transformer.
        >>> encoder.save('save-path')
'''

    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc
    _transform_examples_doc = _transform_examples_doc

    # Default options
    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_WordCounter', 'toolkits.feature_engineering._word_counter',
            'WordCounter', True))

    def __init__(self,
                 features=None,
                 excluded_features=None,
                 to_lower=True,
                 delimiters=["\r", "\v", "\n", "\f", "\t", " "],
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(features, [list, str, _NoneType])
        _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
        _raise_error_if_not_of_type(to_lower, [bool])
        _raise_error_if_not_of_type(delimiters, [list, _NoneType])
        _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

        if delimiters is not None:
            for delim in delimiters:
                _raise_error_if_not_of_type(delim, str, "delimiters")
                if (len(delim) != 1):
                    raise ValueError(
                        "Delimiters must be single-character strings")

        # Set up options
        opts = {
            'features': features,
            'to_lower': to_lower,
            'delimiters': delimiters,
            'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._WordCounter()
        proxy.init_transformer(opts)
        super(WordCounter, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        fields = [("Features", _features),
                  ("Convert strings to lower case", 'to_lower'),
                  ("Delimiters", "delimiters"),
                  ("Output column prefix", 'output_column_prefix')]
        section_titles = ['Model fields']
        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, 30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({
            'docs': [{
                'this': 1,
                'is': 1,
                'a': 2,
                'sample': 1
            }, {
                'this': 1,
                'is': 1,
                'another': 2,
                'example': 3
            }]
        })
        encoder = _gl.feature_engineering.WordCounter('docs')
        encoder = encoder.fit(sf)
        return encoder, sf
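
# A minimal pure-Python sketch (not the toolkit implementation) of the
# bag-of-words counting described in the WordCounter docstring: lower-case the
# string, split on the default space-like delimiters, and count the tokens.
# The helper name is made up for illustration.
import re

def _count_words_by_hand(text, to_lower=True,
                         delimiters=("\r", "\v", "\n", "\f", "\t", " ")):
    if to_lower:
        text = text.lower()
    pattern = "[" + re.escape("".join(delimiters)) + "]"
    tokens = [t for t in re.split(pattern, text) if t]
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1
    return counts

# 'sentences Sentences' -> {'sentences': 2}, matching the fit_transform output
# shown in the WordCounter docstring above.
_count_words_by_hand('sentences Sentences')
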
_DEFAULT_SOLVER_OPTIONS = {
    'convergence_threshold': 1e-2,
    'step_size': 1.0,
    'lbfgs_memory_level': 11,
    'mini_batch_size': 1,
    'auto_tuning': True,
    'max_iterations': 10}

DEFAULT_HYPER_PARAMETER_RANGE = {
    'l1_penalty' : [0.0, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    'l2_penalty' : [0.0, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
}

get_default_options = _get_default_options_wrapper(
                          'regression_linear_regression', 
                          'linear_regression', 
                          'LinearRegression')

def create(dataset, target, features=None, l2_penalty=1e-2, l1_penalty=0.0,
    solver='auto', feature_rescaling=True,
    convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
    step_size = _DEFAULT_SOLVER_OPTIONS['step_size'],
    lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
    mini_batch_size = _DEFAULT_SOLVER_OPTIONS['mini_batch_size'],
    max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'], 
    auto_tuning = _DEFAULT_SOLVER_OPTIONS['auto_tuning'], verbose=True):

    """
    Create a :class:`~graphlab.linear_regression.LinearRegression` to
    predict a scalar target variable as a linear function of one or more
    features. In addition to standard numeric and categorical types, features
from graphlab.toolkits._supervised_learning import Classifier as _Classifier
import graphlab.toolkits._supervised_learning as _sl
from graphlab.toolkits._model_workflow import _collect_model_workflow
from graphlab.toolkits._internal_utils import _toolkit_repr_print
from graphlab.toolkits._supervised_learning import _show_model_tree
from graphlab.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid
from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe
from graphlab.toolkits._internal_utils import _raise_error_if_column_exists
from graphlab.toolkits._internal_utils import _check_categorical_option_type
from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object
from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin
from graphlab.util import _make_internal_url
import logging as _logging

get_default_options = _get_default_options_wrapper('random_forest_classifier',
                                                   'random_forest_classifier',
                                                   'RandomForestClassifier')

__doc_string_context = '''
      >>> url = 'https://static.turi.com/datasets/xgboost/mushroom.csv'
      >>> data = graphlab.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = graphlab.random_forest_classifier.create(train, target='label')
'''


class RandomForestClassifier(_Classifier, _TreeModelMixin):
    """
    The random forest model can be used as a classifier for predictive
    tasks.
    >>> import graphlab as gl
    >>> import datetime

    # Load a data set.
    >>> sf = gl.SFrame(
    ... 'https://static.turi.com/datasets/churn-prediction/online_retail.csv')

    # Convert InvoiceDate from string to datetime.
    >>> import dateutil
    >>> from dateutil import parser
    >>> sf['InvoiceDate'] = sf['InvoiceDate'].apply(parser.parse)

    # Convert SFrame into TimeSeries.
    >>> time_series = gl.TimeSeries(sf, 'InvoiceDate')

    # Create a train-test split.
    >>> train, valid = gl.churn_predictor.random_split(time_series,
    ...           user_id='CustomerID', fraction=0.9)

    # Train a churn prediction model.
    >>> model = gl.churn_predictor.create(train, user_id='CustomerID',
    ...                       features = ['Quantity'])
"""
from ._churn_predictor import create
from ._churn_predictor import ChurnPredictor
from ._churn_predictor import random_split
from graphlab.toolkits._model import _get_default_options_wrapper

get_default_options = _get_default_options_wrapper(
    '_ChurnPredictor', 'churn_predictor', 'ChurnPredictor', True)
            'item_data': item_data,
            'nearest_items': nearest_items,
            'model': model_proxy,
            'random_seed': 1,
            'similarity_type': similarity_type,
            'training_method': training_method,
            'threshold': threshold,
            'only_top_k': only_top_k}

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)
    return ItemSimilarityRecommender(response['model'])



get_default_options = _get_default_options_wrapper(
                          'item_similarity',
                          'recommender.item_similarity',
                          'ItemSimilarityRecommender')

class ItemSimilarityRecommender(_Recommender):
    """
    A model that ranks an item according to its similarity to other items
    observed for the user in question.

    **Creating an ItemSimilarityRecommender**

    This model cannot be constructed directly.  Instead, use
    :func:`graphlab.recommender.item_similarity_recommender.create`
    to create an instance
    of this model. A detailed list of parameter options and code samples
    are available in the documentation for the create function.
class CountFeaturizer(Transformer):
    '''
    Replaces a collection of categorical columns with counts of a target column.

    The CountFeaturizer is an efficient way of reducing high dimensional
    categorical columns into simple counts for the purpose of classification.
    Only str and int types are supported, and both are interpreted
    categorically. The CountFeaturizer is effective for significantly
    accelerating downstream learning procedures without loss of accuracy for
    extremely large datasets.

    Assume we are trying to predict column Y, which has K unique classes.
    Then every column X is replaced with 2 columns,
    "count_X" and "prob_X". The column count_X contains an array of length K
    holding the counts of each unique value of Y among the rows where X takes
    the same value as in the current row. The column prob_X contains count_X
    normalized to sum to 1, with the last value dropped.

    For instance, given the following SFrame:

    .. sourcecode:: python

        >>> sf = graphlab.SFrame({'a' : [1,1,2], 'y':[0,1,0]})
        +---+---+
        | a | y |
        +---+---+
        | 1 | 0 |
        | 1 | 1 |
        | 2 | 0 |
        +---+---+

    After fit_transform the output SFrame is

    .. sourcecode:: python

        >>> cf = graphlab.feature_engineering.CountFeaturizer(target = 'y', laplace_smearing=0)
        >>> cf.fit_transform(sf)
        +------------+--------+---+
        |  count_a   | prob_a | y |
        +------------+--------+---+
        | [1.0, 1.0] | [0.5]  | 0 |
        | [1.0, 1.0] | [0.5]  | 1 |
        | [1.0, 0.0] | [1.0]  | 0 |
        +------------+--------+---+
        [3 rows x 3 columns]

    Observe that in the original SFrame, there is 1 occurrence where a = 1 &
    y = 0 and 1 occurrence where a = 1 & y = 1. Thus in every row where a = 1,
    we output [1.0, 1.0] in the count_a column. Similarly, for the case of
    a = 2, there is 1 occurrence where a = 2 & y = 0 and no occurrences where
    a = 2 & y = 1. Hence in every row where a = 2, we output [1.0, 0.0] in the
    count_a column. The prob_a column is the count_a column normalized to sum
    to 1, with the last value dropped.

    The laplace_smearing parameter controls the amount of noise added to the
    result, which allows fit() and transform() to be performed on the same
    dataset. Tuning this parameter can be difficult in practice, however.
    Therefore it is highly recommended (and is the default behavior) to set
    laplace_smearing=0 and split the training dataset into two sets, where one
    set is used only in fit() and the other set is used only in transform().

    Parameters
    ----------
    target: str, required
        The target column we are trying to predict.

    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then all
        columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    num_bits : int, optional
        This parameter is the size of the countmin sketch used to approximate
        the counts and controls the accuracy of the counts.  The higher the
        value, the more accurate the counts, but takes up more memory. Defaults
        to 20.

    laplace_smearing : float, optional
        Defaults to 0. Adds some noise to the transform result to allow the same
        dataset to be used for both fit and transform. When the number of rows
        is small, this parameter can be reduced in value. If set to 0, it is
        recommended that the training set be split into two sets, where one
        set is used only in fit() and the other only in transform().

    random_seed : int, optional
        A random seed. Fix this to get deterministic outcomes.

    count_column_prefix : str, optional
        The prefix added to the input column name to produce the output column
        name containing the counts. Defaults to `count_`.

    prob_column_prefix : str, optional
        The prefix added to the input column name to produce the output column
        name containing the normalized counts. Defaults to `prob_`.

    Returns
    -------
    out : CountFeaturizer
        A CountFeaturizer object which is initialized with the defined
        parameters.

    Notes
    -----
    The prob_X columns have one value dropped to eliminate a linear dependency.

    References
    ----------
    Implements the method described in `this blog
    <https://blogs.technet.microsoft.com/machinelearning/2015/02/17/big-learning-made-easy-with-counts/>`_.


    Examples
    --------

    .. sourcecode:: python

        >>> from graphlab.toolkits.feature_engineering import *

        # Perform Count Featurization on columns 'a' and 'b' with respect to
        # the target 'y'
        >>> sf = graphlab.SFrame({'a' : [1,1,2], 'b' : [2,2,3], 'y':[0,1,0]})
        >>> cf = graphlab.feature_engineering.create(sf,
        ...               graphlab.feature_engineering.CountFeaturizer(
        ...                     features = ['a', 'b'], target = 'y'))

        # Transform the data
        >>> out_sf = cf.fit_transform(sf)
        >>> out_sf
        +------------+--------+------------+--------+---+
        |  count_a   | prob_a |  count_b   | prob_b | y |
        +------------+--------+------------+--------+---+
        | [1.0, 1.0] | [0.5]  | [1.0, 1.0] | [0.5]  | 0 |
        | [1.0, 1.0] | [0.5]  | [1.0, 1.0] | [0.5]  | 1 |
        | [1.0, 0.0] | [1.0]  | [1.0, 0.0] | [1.0]  | 0 |
        +------------+--------+------------+--------+---+
        [3 rows x 5 columns]

        # Save the transformer.
        >>> cf.save('save-path')
    '''

    _fit_examples_doc = _fit_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc
    _transform_examples_doc = _transform_examples_doc


    get_default_options = staticmethod(_get_default_options_wrapper(
        '_CountFeaturizer', 'toolkits.feature_engineering._count_featurizer',
                                                    'CountFeaturizer', True))

    _metric_handle = 'toolkits.feature_engineering.count_featurizer'

    def __init__(self, target, features=None, excluded_features=None,
            random_seed=None, laplace_smearing=0.0, num_bits=20,
            count_column_prefix='count_', prob_column_prefix='prob_'):

        _mt._get_metric_tracker().track(self._metric_handle + '.__init__')

        if count_column_prefix == prob_column_prefix:
            raise RuntimeError("count_column_prefix cannot be equal to prob_column_prefix")

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
                                        features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(num_bits, [int])

        # Set up options
        opts = {
            'target': target,
            'num_bits': num_bits,
            'random_seed': random_seed,
            'laplace_smearing': laplace_smearing,
            'count_column_prefix': count_column_prefix,
            'prob_column_prefix': prob_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._CountFeaturizer()
        proxy.init_transformer(opts)
        super(CountFeaturizer, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))
        fields = [
            ("Target", "target"),
            ("Features", _features),
            ("Excluded features", _exclude),
            ("Number of bits", 'num_bits'),
            ("Random seed", 'random_seed'),
            ("Laplace Smearing", 'laplace_smearing'),
            ("Count Column Prefix", 'count_column_prefix'),
            ("Probability Column Prefix", 'prob_column_prefix')
            ]
        section_titles = [ 'Model fields' ]
        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, width=30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({'a' : [1,1,2], 'b' : [2,2,3], 'y':[0,1,0]})
        cf = _gl.feature_engineering.CountFeaturizer(features = ['a', 'b'], target='y')
        return cf.fit(sf), sf
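
# A minimal pure-Python sketch (not the toolkit implementation) of the counting
# scheme described in the CountFeaturizer docstring: for each value of column
# 'a', count_a holds the per-class counts of the target 'y', and prob_a is
# count_a normalized to sum to 1 with the last entry dropped. This sketch
# ignores the countmin sketch approximation and laplace_smearing; the helper
# name is made up for illustration.
def _count_featurize_by_hand(xs, ys, num_classes=2):
    counts = {}
    for x, y in zip(xs, ys):
        counts.setdefault(x, [0.0] * num_classes)[y] += 1.0
    count_col, prob_col = [], []
    for x in xs:
        c = counts[x]
        total = sum(c)
        count_col.append(list(c))
        prob_col.append([v / total for v in c][:-1])  # drop the last value
    return count_col, prob_col

# Reproduces the docstring example: count_a = [[1, 1], [1, 1], [1, 0]] and
# prob_a = [[0.5], [0.5], [1.0]] for a = [1, 1, 2], y = [0, 1, 0].
_count_featurize_by_hand([1, 1, 2], [0, 1, 0])
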
# Utils
from graphlab.util import _raise_error_if_not_of_type
from graphlab.toolkits._internal_utils import _toolkit_repr_print, \
                                              _precomputed_field, \
                                              _raise_error_if_not_sframe, \
                                              _check_categorical_option_type
from graphlab.toolkits._model import _get_default_options_wrapper
from graphlab.toolkits._model import SDKModel as _SDKModel

_DEFAULT_OPTIONS = {
    'min_support': 1,
    'max_patterns': 100,
    'min_length': 1,
}

get_default_options = _get_default_options_wrapper(
    '_FPGrowth', 'frequent_pattern_mining', 'FrequentPatternMiner', True)


def create(dataset, item, features=None, min_support=1, max_patterns=100,
           min_length=1):
    """
    Create a :class:`~graphlab.frequent_pattern_mining.FrequentPatternMiner` to
    extract the set of frequently occurring items in an event-series.

    Parameters
    ----------

    dataset : SFrame
        Dataset for training the model.

    item: string
        except (RuntimeError, KeyError):
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments))

        opts.update(kwargs)

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)

    return RankingFactorizationRecommender(response['model'])


get_default_options = _get_default_options_wrapper(
                          'ranking_factorization_recommender',
                          'recommender.RankingFactorizationRecommender',
                          'RankingFactorizationRecommender')

class RankingFactorizationRecommender(_Recommender):
    r"""
    A RankingFactorizationRecommender learns latent factors for each
    user and item and uses them to rank recommended items according to
    the likelihood of observing those (user, item) pairs. This is
    commonly desired when performing collaborative filtering for
    implicit feedback datasets or datasets with explicit ratings
    for which ranking prediction is desired.

    RankingFactorizationRecommender contains a number of options that
    tailor to a variety of datasets and evaluation metrics, making
    this one of the most powerful models in the GraphLab Create
    recommender toolkit.  
from graphlab.toolkits._model import _get_default_options_wrapper
from graphlab.toolkits._supervised_learning import Classifier as _Classifier
import graphlab.toolkits._supervised_learning as _sl
from graphlab.toolkits._model_workflow import _collect_model_workflow
from graphlab.toolkits._internal_utils import _toolkit_repr_print
from graphlab.toolkits._supervised_learning import _show_model_tree
from graphlab.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid
from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe
from graphlab.toolkits._internal_utils import _raise_error_if_column_exists
from graphlab.toolkits._internal_utils import _check_categorical_option_type
from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object
from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin


get_default_options = _get_default_options_wrapper(
                          'random_forest_classifier',
                          'random_forest_classifier',
                          'RandomForestClassifier')

__doc_string_context = '''
      >>> url = 'http://s3.amazonaws.com/gl-testdata/xgboost/mushroom.csv'
      >>> data = graphlab.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = graphlab.random_forest_classifier.create(train, target='label')
'''

class RandomForestClassifier(_Classifier, _TreeModelMixin):
    """
    The random forest model can be used as a classifier for predictive
    tasks.
    | EmpId | stay_probability |
    +-------+------------------+
    |   1   |  0.841130895119  |
    |   2   |  0.121616783954  |
    |   3   |  0.121616783954  |
    |   4   |  0.121616783954  |
    |   5   |  0.121616783954  |
    |   6   |  0.121616783954  |
    |   7   |  0.121616783954  |
    |   8   |  0.121616783954  |
    |   9   |  0.121616783954  |
    |   10  |  0.121616783954  |
    +-------+------------------+
    [49 rows x 2 columns]
    
    # Note that the output of the model is the user ID together with the
    # probability of the user staying (not churning): a probability of 1.0 means
    # the user will stay (not churn), and 0.0 means the user will almost
    # certainly churn.
    
    >>> model.save("model_file")
    >>> load_model = gl.load_model("model_file")

"""

from _churn_predictor import create
from _churn_predictor import ChurnPredictor
from graphlab.toolkits._model import _get_default_options_wrapper

get_default_options = _get_default_options_wrapper(
    '_ChurnPredictor', 'churn_predictor', 'ChurnPredictor', True)
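
# A small hedged sketch (not from the toolkit) illustrating the note in the
# docstring above: the model reports the probability that each user *stays*,
# so the churn probability is its complement. The column name
# 'stay_probability' follows the example output shown above, and this SFrame is
# only a stand-in for the model's predictions.
import graphlab as _gl_demo

_example_predictions = _gl_demo.SFrame(
    {'EmpId': [1, 2], 'stay_probability': [0.841130895119, 0.121616783954]})
_example_predictions['churn_probability'] = \
    _example_predictions['stay_probability'].apply(lambda p: 1.0 - p)
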
class CountThresholder(Transformer):
    '''
    Map infrequent categorical variables to a `new/separate` category.

    Input columns to the CountThresholder must be of type *int*, *string*,
    *dict*, or *list*.  For each column in the input, the transformed output is
    a column where the input category is retained as is if:

     * it has occurred at least `threshold` times in the training data.

    Categories that do not satisfy the above are set to `output_category_name`.

    The behavior for different input data column types is as follows
    (see :func:`~graphlab.feature_engineering.CountThresholder.transform`
    for examples).


    * **string** : Strings are marked with the `output_category_name` if the
      threshold condition described above is not satisfied.

    * **int** : Behaves the same way as *string*. If `output_category_name` is
      of type *string*, then the entire column is cast to string.

    * **list** : Each value in the list is mapped in the same way as
      a string value.

    * **dict** : The key of the dictionary is treated as a `namespace` and the
      value is treated as a `sub-category` in the `namespace`. The categorical
      variable passed through the transformer is a combination of the
      `namespace` and the `sub-category`.


    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then all
        feature columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    threshold : int, optional
        Ignore all categories that have not occurred at least `threshold` times.
        All categories that do not occur at least `threshold` times are
        mapped to the `output_category_name`.

    output_category_name : str | None, optional
        The value to use for the categories that do not satisfy the `threshold`
        condition.

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : CountThresholder
        A CountThresholder object which is initialized with the defined parameters.

    See Also
    --------
    graphlab.toolkits.feature_engineering._count_thresholder.CountThresholder
    graphlab.toolkits.feature_engineering.create

    Notes
    -----
    - If the SFrame to be transformed already contains a column with the
      designated output column name, then that column will be replaced with the
      new output. In particular, this means that `output_column_prefix=None` will
      overwrite the original feature columns.
    - If the `output_category_name` and input feature column are not of the same
      type, then the output column is cast to `str`.
    - `None` values are treated as separate categories and are encoded along
      with the rest of the values.

    Examples
    --------

    .. sourcecode:: python

        # Create data.
        >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4]})

        # Create a transformer.
        >>> from graphlab.toolkits.feature_engineering import CountThresholder
        >>> count_tr = graphlab.feature_engineering.create(sf,
                CountThresholder(features = ['a', 'b'], threshold = 1))

        # Transform the data.
        >>> transformed_sf = count_tr.transform(sf)

        # Save the transformer.
        >>> count_tr.save('save-path')

        # Return the categories that are not discarded.
        >>> count_tr['categories']
        Columns:
                feature str
                category    str

        Rows: 6

        Data:
            +---------+----------+
            | feature | category |
            +---------+----------+
            |    a    |    1     |
            |    a    |    2     |
            |    a    |    3     |
            |    b    |    2     |
            |    b    |    3     |
            |    b    |    4     |
            +---------+----------+
            [6 rows x 2 columns]
    '''
    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _transform_examples_doc = _transform_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc

    # Default options
    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_CountThresholder',
            'toolkits.feature_engineering._count_thresholder',
            'CountThresholder', True))

    def __init__(self,
                 features=None,
                 excluded_features=None,
                 threshold=1,
                 output_category_name=None,
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(threshold, [int, type(None)])

        # Set up options
        opts = {
            'threshold': threshold,
            'output_category_name': output_category_name,
            'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._CountThresholder()
        proxy.init_transformer(opts)
        super(CountThresholder, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))
        fields = [
            ("Features", _features),
            ("Excluded features", _exclude),
            ("New category name", 'output_category_name'),
            ("Occurrence threshold", 'threshold'),
        ]
        section_titles = ['Model fields']

        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, 30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({'a': [1, 2, 3, 2, 3], 'b': [2, 3, 4, 2, 3]})
        count_tr = _gl.feature_engineering.CountThresholder(
            features=['a', 'b'], threshold=2, output_category_name='junk')
        return count_tr.fit(sf), sf
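
# A minimal pure-Python sketch (not the toolkit code) of the thresholding rule
# described in the CountThresholder docstring for a single string/int column:
# values seen fewer than `threshold` times during fit are replaced with
# `output_category_name` at transform time. In the real transformer the column
# is cast to str when the types differ; this sketch keeps the raw values. The
# helper name is made up for illustration.
def _threshold_by_hand(train_values, new_values, threshold=1,
                       output_category_name=None):
    counts = {}
    for v in train_values:
        counts[v] = counts.get(v, 0) + 1
    kept = set(v for v, c in counts.items() if c >= threshold)
    return [v if v in kept else output_category_name for v in new_values]

# With threshold=2, the value 1 (seen only once in training) is mapped to
# 'junk', while 2 and 3 are retained: ['junk', 2, 3].
_threshold_by_hand([1, 2, 3, 2, 3], [1, 2, 3],
                   threshold=2, output_category_name='junk')
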
class RareWordTrimmer(Transformer):
    '''
    Remove words that occur fewer than a certain number of times in a given column.
    This is a common method of cleaning text before it is used, and can increase the
    quality and explainability of the models learned on the transformed data.

    RareWordTrimmer can be applied to all the string-, dictionary-, and list-typed
    columns in a given SFrame. Its behavior for each supported input column
    type is as follows. (See :func:`~graphlab.feature_engineering.RareWordTrimmer.transform`
    for usage examples).

    * **string** : The string is first tokenized. By default, all letters are
      first converted to lower case, then tokenized by space characters. Each
      token is taken to be a word, and the words occurring fewer than a
      threshold number of times across the entire column are removed; the
      remaining tokens are then concatenated back into a string.

    * **list** : Each element of the list must be a string, and each element
      is assumed to be a token. The tokens are then filtered by their
      occurrence counts against the threshold value.

    * **dict** : The method first obtains the list of keys in the dictionary.
      This list is then processed as a standard list, except the value of each
      key must be of integer type and is considered to be the count of that key.

    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then all
        feature columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    threshold : int, optional
        The count below which words are removed from the input.

    stopwords: list[str], optional
        A manually specified list of stopwords, which are removed regardless
        of count.

    to_lower : bool, optional
        Indicates whether to map the input strings to lower case before counting.

    delimiters : list[str], optional
        A list of delimiter characters for tokenization. By default, the list
        is defined to be the list of space characters. The user can define
        any custom list of single-character delimiters. Alternatively, setting
        `delimiters=None` will use a Penn treebank type tokenization, which
        is better at handling punctuation. (See reference below for details.)

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : RareWordTrimmer
        A RareWordTrimmer feature engineering object which is initialized with
        the defined parameters.

    Notes
    -----
    If the SFrame to be transformed already contains a column with the
    designated output column name, then that column will be replaced with the
    new output. In particular, this means that `output_column_prefix=None` will
    overwrite the original feature columns.

    References
    ----------
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    See Also
    --------
    graphlab.toolkits.text_analytics.count_words,
    graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter,
    graphlab.toolkits.feature_engineering._tfidf.TFIDF,
    graphlab.toolkits.feature_engineering._tokenizer.Tokenizer,
    graphlab.toolkits.feature_engineering.create

    Examples
    --------

    .. sourcecode:: python

        >>> import graphlab as gl

        # Create data.
        >>> sf = gl.SFrame({
        ...    'string': ['sentences Sentences', 'another sentence another year'],
        ...    'dict': [{'bob': 1, 'Bob': 2}, {'a': 0, 'cat': 5}],
        ...    'list': [['one', 'two', 'three', 'Three'], ['a', 'cat', 'Cat']]})

        # Create a RareWordTrimmer transformer.
        >>> from graphlab.toolkits.feature_engineering import RareWordTrimmer
        >>> trimmer = RareWordTrimmer()

        # Fit and transform the data.
        >>> transformed_sf = trimmer.fit_transform(sf)
        Columns:
            dict    dict
            list    list
            string  str

        Rows: 2

        Data:
        +------------+----------------+---------------------+
        |    dict    |      list      |        string       |
        +------------+----------------+---------------------+
        | {'bob': 2} | [three, three] | sentences sentences |
        | {'cat': 5} |   [cat, cat]   |   another another   |
        +------------+----------------+---------------------+
        [2 rows x 3 columns]

        # Save the transformer.
        >>> trimmer.save('save-path')
'''

    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _transform_examples_doc = _transform_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc

    # Default options
    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_RareWordTrimmer', 'toolkits.feature_engineering._word_trimmer',
            'RareWordTrimmer', True))

    def __init__(self,
                 features=None,
                 excluded_features=None,
                 threshold=2,
                 stopwords=None,
                 to_lower=True,
                 delimiters=["\r", "\v", "\n", "\f", "\t", " "],
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(features, [list, str, type(None)])
        _raise_error_if_not_of_type(threshold, [int, type(None)])
        _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])
        _raise_error_if_not_of_type(stopwords, [list, type(None)])
        _raise_error_if_not_of_type(to_lower, [bool])
        _raise_error_if_not_of_type(delimiters, [list, type(None)])

        if delimiters is not None:
            for delim in delimiters:
                _raise_error_if_not_of_type(delim, str, "delimiters")
                if (len(delim) != 1):
                    raise ValueError(
                        "Delimiters must be single-character strings")

        # Set up options
        opts = {
            'threshold': threshold,
            'output_column_prefix': output_column_prefix,
            'to_lower': to_lower,
            'stopwords': stopwords,
            'delimiters': delimiters
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._RareWordTrimmer()
        proxy.init_transformer(opts)
        super(RareWordTrimmer, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))
        _stopwords = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('stopwords')))

        fields = [("Features", _features), ("Excluded features", _exclude),
                  ("Output column name", 'output_column_prefix'),
                  ("Word count threshold", 'threshold'),
                  ("Manually specified stopwords", _stopwords),
                  ("Whether to convert to lowercase", "to_lower"),
                  ("Delimiters", "delimiters")]
        section_titles = ['Model fields']

        return ([fields], section_titles)

    def __repr__(self):
        """
        Return a string description of the model, including a description of
        the training data, training statistics, and model hyper-parameters.

        Returns
        -------
        out : string
            A description of the model.
        """
        accessible_fields = {
            "vocabulary": "The vocabulary of the trimmed input."
        }
        (sections, section_titles) = self._get_summary_struct()
        out = _toolkit_repr_print(self, sections, section_titles, width=30)
        out2 = _summarize_accessible_fields(accessible_fields, width=30)
        return out + "\n" + out2

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({
            'a': ['dog', 'dog', 'dog'],
            'b': ['cat', 'one', 'one']
        })
        trimmer = _gl.feature_engineering.RareWordTrimmer(features=['a', 'b'])
        return trimmer.fit(sf), sf
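
# A minimal pure-Python sketch (not the toolkit code) of the string-column
# behavior described in the RareWordTrimmer docstring: lower-case, tokenize
# (split on whitespace here, for simplicity), count word occurrences across the
# whole column, drop words below `threshold`, and join the surviving tokens
# back into strings. The helper name is made up for illustration.
def _trim_rare_words_by_hand(column, threshold=2, to_lower=True):
    tokenized = [(s.lower() if to_lower else s).split() for s in column]
    counts = {}
    for tokens in tokenized:
        for t in tokens:
            counts[t] = counts.get(t, 0) + 1
    return [" ".join(t for t in tokens if counts[t] >= threshold)
            for tokens in tokenized]

# ['sentences Sentences', 'another sentence another year'] ->
# ['sentences sentences', 'another another'], matching the docstring example.
_trim_rare_words_by_hand(['sentences Sentences', 'another sentence another year'])
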
            possible_args = set(get_default_options()["name"])
        except (RuntimeError, KeyError):
            possible_args = set()

        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            raise TypeError("Bad Keyword Arguments: " + ', '.join(bad_arguments))

        opts.update(kwargs)

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)

    return FactorizationRecommender(response['model'])

get_default_options = _get_default_options_wrapper(
                          'factorization_recommender',
                          'recommender.factorization_recommender',
                          'FactorizationRecommender')

class FactorizationRecommender(_Recommender):
    r"""
    A FactorizationRecommender learns latent factors for each
    user and item and uses them to make rating predictions.

    FactorizationRecommender [Koren_et_al]_ contains a number of options that
    tailor to a variety of datasets and evaluation metrics, making this one of
    the most powerful models in the GraphLab Create recommender toolkit.

    **Side information**

    Side features may be provided via the `user_data` and `item_data` options
    when the model is created.
Example #33
class QuadraticFeatures(Transformer):
    '''
    Calculates quadratic interaction terms between features.

    Adding interaction terms is a good way of injecting complex relationships
    between predictor variables while still using a simple learning algorithm
    (e.g. logistic regression) that is easy to use and explain. The QuadraticFeatures
    transformer accomplishes this by taking a row of the SFrame, and multiplying
    the specified features together. If the features are of array.array or dictionary
    type, multiplications of all possible pairs are computed. If a non-numeric
    value is encountered, 1 is substituted for the value and the old string
    value becomes part of the interaction term name. Supported types are int,
    float, string, array.array, list, and dict.

    When the transformer is applied, an additional column with name
    specified by 'output_column_name' is added to the input SFrame.
    In this column of dictionary type, interactions are specified in the
    key names (by concatenating column names and keys/indices if applicable)
    and values are the multiplied values.

    Parameters
    ----------
    features : list | str | tuple | None, optional
        Can be a list of tuples, a list of feature name strings, a
        feature name string, a tuple, or None. If it is a list of tuples,
        each tuple names a pair of features whose interaction term is
        calculated. If a list of feature names is provided, all pairs
        between those feature names are calculated. If set to None, all
        feature pairs in the SFrame the transformer is applied to are
        calculated.

    excluded_features : list | str | tuple | None, optional
        Can be a list of tuples, a list of feature name strings, a
        feature name string, a tuple, or None. In the case
        of tuples, those particular interactions are excluded. In the case
        of feature names, all interactions with those features are excluded.
        `excluded_features` and `features` cannot both be set.

    output_column_name : str, optional
        The name of the output column.

    Returns
    -------
    out : QuadraticFeatures
        A QuadraticFeatures object which is initialized with the defined
        parameters.

    See Also
    --------
    graphlab.toolkits.feature_engineering.QuadraticFeatures
    graphlab.toolkits.feature_engineering.create

    Examples
    --------

    .. sourcecode:: python

        from graphlab.toolkits.feature_engineering import *

        # Construct a quadratic features transformer with default options.
        >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4], 'c': [9,10,11]})
        >>> quadratic = graphlab.feature_engineering.create(sf,
                    QuadraticFeatures(features = ['a', 'b', 'c']))

        # Transform the data.
        >>> quadratic_sf = quadratic.transform(sf)

        # Save the transformer.
        >>> quadratic.save('save-path')

    '''

    _fit_examples_doc = _fit_examples_doc
    _transform_examples_doc = _transform_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc

    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_QuadraticFeatures',
            'toolkits.feature_engineering._quadratic_features',
            'QuadraticFeatures', True))

    def __init__(self,
                 features=None,
                 excluded_features=None,
                 output_column_name='quadratic_features'):

        # Type checking
        _raise_error_if_not_of_type(output_column_name, [str])

        # Set up options
        opts = {'output_column_name': output_column_name}
        # Make a copy of the parameters.
        _features = _copy.copy(features)
        _exclude = _copy.copy(excluded_features)

        # Check that 'features' and 'excluded_features' are not both set.
        if _features and _exclude:
            raise ValueError(
                "The parameters 'features' and 'exclude' cannot both be set."
                " Please set one or the other.")
        if _features == [] and not _exclude:
            raise ValueError("Features cannot be an empty list.")

        # Check types
        _raise_error_if_not_of_type(_features, [NoneType, list, str, tuple],
                                    'features')
        _raise_error_if_not_of_type(_exclude, [NoneType, list, str, tuple],
                                    'exclude')

        # Allow a single list
        _features = [
            _features
        ] if type(_features) == str or type(_features) == tuple else _features
        _exclude = [
            _exclude
        ] if type(_exclude) == str or type(_exclude) == tuple else _exclude

        # Type check each feature/exclude
        if _features:
            for f in _features:
                _raise_error_if_not_of_type(f, [str, tuple], "Feature names")
        if _exclude:
            for e in _exclude:
                _raise_error_if_not_of_type(e, [str, tuple],
                                            "Excluded feature names")

        if _exclude:
            opts['exclude'] = True
            unprocessed_features = _exclude
        else:
            opts['exclude'] = False
            unprocessed_features = _features

        pair_list = set()

        if unprocessed_features is not None:
            if type(unprocessed_features[0]) is tuple:
                for t in unprocessed_features:
                    pair_list.add(tuple(sorted(t)))
            elif type(unprocessed_features[0]) is str:
                if _exclude:
                    for t in unprocessed_features:
                        pair_list.add(t)
                else:
                    for t in unprocessed_features:
                        for k in unprocessed_features:
                            pair_list.add(tuple(sorted((t, k))))

        if type(output_column_name) is not str:
            raise ValueError("'output_column_name' must be of type str")

        if unprocessed_features is not None:
            if type(unprocessed_features[0]) is str:
                opts['features'] = unprocessed_features
                if _exclude:
                    opts['feature_pairs'] = list(pair_list)
                else:
                    opts['feature_pairs'] = [list(x) for x in pair_list]
            else:
                opts['feature_pairs'] = [list(x) for x in pair_list]
                opts['features'] = [list(x) for x in unprocessed_features]
        else:
            opts['feature_pairs'] = None
            opts['features'] = None

        # initialize object
        proxy = _gl.extensions._QuadraticFeatures()
        proxy.init_transformer(opts)
        super(QuadraticFeatures, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))
        fields = [("Features", _features), ("Excluded features", _exclude),
                  ("Output column name", 'output_column_name')]
        section_titles = ['Model fields']

        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, width=30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]})
        encoder = _gl.feature_engineering.QuadraticFeatures(
            features=['a', 'b'])
        return encoder.fit(sf), sf
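

# A minimal usage sketch for the transformer above (hedged: it assumes
# graphlab is installed and that QuadraticFeatures is exposed under
# graphlab.feature_engineering, as its own _get_instance_and_data helper
# does). Guarded so the example never runs on import.
if __name__ == '__main__':
    import graphlab as gl

    sf = gl.SFrame({'a': [1, 2, 3], 'b': [2, 3, 4]})
    quad = gl.feature_engineering.QuadraticFeatures(features=['a', 'b'])
    quad = quad.fit(sf)
    # The transform adds a 'quadratic_features' column covering every
    # sorted feature pair, including self-pairs such as ('a', 'a').
    print(quad.transform(sf))
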
        See Also
        --------
        get_default_options, list_fields, get

        Examples
        --------
        >>> sf = graphlab.SFrame({'a' : [0.1, 8, 3.5], 'b':[-3, 7.6, 3]})
        >>> model = graphlab.kmeans.create(sf, 2)
        >>> model.get_current_options()
        {'num_clusters': 2, 'max_iterations': 10}
        """
        opts = {'model': self.__proxy__, 'model_name': self.__name__}

        return _gl.toolkits._main.run('kmeans_get_current_options', opts)


get_default_options = _get_default_options_wrapper(
                          'kmeans',
                          'kmeans',
                          'KmeansModel')


def create(dataset, num_clusters=None, features=None, label=None,
           initial_centers=None, max_iterations=10, batch_size=None,
           verbose=True):
    """
    Create a k-means clustering model. The KmeansModel object contains the
    computed cluster centers and the cluster assignment for each instance in
    the input 'dataset'.

    Given a number of clusters, k-means iteratively chooses the best cluster
    centers and assigns nearby points to the best cluster. If no points change
    cluster membership between iterations, the algorithm terminates.
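
    A minimal pure-Python sketch of that loop (illustrative only, not the
    toolkit's actual implementation; the function name and structure below
    are assumptions made for this sketch):

    .. sourcecode:: python

        import random

        def kmeans_sketch(points, k, max_iterations=10):
            # points: a list of equal-length numeric tuples
            centers = [list(c) for c in random.sample(points, k)]
            assignment = None
            for _ in range(max_iterations):
                # Assign each point to its nearest center (squared distance).
                new_assignment = [
                    min(range(k),
                        key=lambda c: sum((a - b) ** 2
                                          for a, b in zip(p, centers[c])))
                    for p in points]
                if new_assignment == assignment:
                    # No point changed cluster membership: converged.
                    break
                assignment = new_assignment
                # Recompute each center as the mean of its assigned points.
                for c in range(k):
                    members = [p for p, a in zip(points, assignment) if a == c]
                    if members:
                        centers[c] = [sum(dim) / float(len(members))
                                      for dim in zip(*members)]
            return centers, assignment
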
        bad_arguments = set(kwargs.keys()).difference(possible_args)
        if bad_arguments:
            raise TypeError("Bad Keyword Arguments: " +
                            ', '.join(bad_arguments))

        opts.update(kwargs)

    opts.update(kwargs)

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)
    return ItemSimilarityRecommender(response['model'])


get_default_options = _get_default_options_wrapper(
    'item_similarity', 'recommender.item_similarity',
    'ItemSimilarityRecommender')


class ItemSimilarityRecommender(_Recommender):
    """
    A model that ranks an item according to its similarity to other items
    observed for the user in question.

    **Creating an ItemSimilarityRecommender**

    This model cannot be constructed directly.  Instead, use
    :func:`graphlab.recommender.item_similarity_recommender.create`
    to create an instance of this model. A detailed list of parameter
    options and code samples is available in the documentation for the
    create function.
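
    A minimal usage sketch (the data and column names below are illustrative
    assumptions, not taken from the original snippet):

    .. sourcecode:: python

        >>> sf = graphlab.SFrame({'user_id': ['u1', 'u1', 'u2', 'u3'],
        ...                       'item_id': ['i1', 'i2', 'i2', 'i3']})
        >>> m = graphlab.recommender.item_similarity_recommender.create(
        ...         sf, user_id='user_id', item_id='item_id')
        >>> recs = m.recommend()
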
_DEFAULT_SOLVER_OPTIONS = {
    "convergence_threshold": 1e-2,
    "step_size": 1.0,
    "lbfgs_memory_level": 11,
    "max_iterations": 10,
}

DEFAULT_HYPER_PARAMETER_RANGE = {
    "l1_penalty": [0.0, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
    "l2_penalty": [0.0, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
}


get_default_options = _get_default_options_wrapper(
    "classifier_logistic_regression", "logistic_classifier", "LogisticClassifier"
)


def create(
    dataset,
    target,
    features=None,
    l2_penalty=0.01,
    l1_penalty=0.0,
    solver="auto",
    feature_rescaling=True,
    convergence_threshold=_DEFAULT_SOLVER_OPTIONS["convergence_threshold"],
    step_size=_DEFAULT_SOLVER_OPTIONS["step_size"],
    lbfgs_memory_level=_DEFAULT_SOLVER_OPTIONS["lbfgs_memory_level"],
    max_iterations=_DEFAULT_SOLVER_OPTIONS["max_iterations"],
class OneHotEncoder(Transformer):
    '''
    Encode a collection of categorical features using a *1-of-K* encoding scheme.

    Input columns to the one-hot-encoder must be of type *int*, *string*,
    *dict*, or *list*. The transformed output is a column of type dictionary
    (a sparse vector with at most `max_categories` dimensions per input
    column) where the key corresponds to the index of the categorical
    variable and the value is `1`.

    The behaviour of the one-hot-encoder for each input column type is as
    follows (see :func:`~graphlab.feature_engineering.OneHotEncoder.transform`
    for examples).


    * **string** : The key in the output dictionary is the string category and
      the value is 1.

    * **int** : Behaves similarly to *string* columns.

    * **list** : Each value in the list is treated like an individual string.
      Hence, a *list* of categorical variables can be used to represent a
      feature where all categories in the list are simultaneously `hot`.

    * **dict** : The key of the dictionary is treated as a `namespace` and the
      value is treated as a `sub-category` in the `namespace`. The categorical
      variable being encoded in this case is a combination of the `namespace`
      and the `sub-category` (see the sketch after this list).
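
    A hedged sketch of the *dict* case (the exact integer indices in the
    output depend on the fitted encoding and are not reproduced here):

    .. sourcecode:: python

        >>> from graphlab.toolkits.feature_engineering import OneHotEncoder
        >>> sf = graphlab.SFrame({'attrs': [{'color': 'red', 'size': 'small'},
        ...                                 {'color': 'blue'}]})
        >>> encoder = graphlab.feature_engineering.create(sf,
        ...                     OneHotEncoder(features=['attrs']))
        >>> encoded_sf = encoder.transform(sf)
        # Each (namespace, sub-category) pair, e.g. ('color', 'red'), receives
        # its own index; see encoder['feature_encoding'] for the mapping.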


    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then all
        columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    max_categories: int, optional
        The maximum number of categories (per feature column) to use in the
        encoding. If the number of unique categories in a column exceeds
        `max_categories`, then only the most frequently used categories are
        retained. If set to None, then all categories in the column are used.

    output_column_name : str, optional
        The name of the output column. If the column already exists, then a
        suffix is appended to the name.

    Returns
    -------
    out : OneHotEncoder
        A OneHotEncoder object which is initialized with the defined
        parameters.

    Notes
    -----
    - `None` values are treated as separate categories and are encoded along with the rest of the values.

    See Also
    --------
    graphlab.toolkits.feature_engineering._count_thresholder.OneHotEncoder, graphlab.toolkits.feature_engineering.create

    Examples
    --------

    .. sourcecode:: python

        # Create data.
        >>> sf = graphlab.SFrame({'a': [1,2,3], 'b' : [2,3,4]})

        # Create a one-hot encoder on the features ['a', 'b'].
        >>> from graphlab.toolkits.feature_engineering import OneHotEncoder
        >>> encoder = graphlab.feature_engineering.create(sf,
                            OneHotEncoder(features = ['a', 'b']))

        # Transform data.
        >>> transformed_sf = encoder.transform(sf)
        Columns:
        encoded_features        dict

        Rows: 3

        Data:
        +------------------+
        | encoded_features |
        +------------------+
        |   {0: 1, 3: 1}   |
        |   {1: 1, 4: 1}   |
        |   {2: 1, 5: 1}   |
        +------------------+
        [3 rows x 1 columns]

        # Save the transformer.
        >>> encoder.save('save-path')

        # Return the indices in the encoding.
        >>> encoder['feature_encoding']
        Columns:
                feature str
                category        str
                index   int

        Rows: 6

        Data:
        +---------+----------+-------+
        | feature | category | index |
        +---------+----------+-------+
        |    a    |    1     |   0   |
        |    a    |    2     |   1   |
        |    a    |    3     |   2   |
        |    b    |    2     |   3   |
        |    b    |    3     |   4   |
        |    b    |    4     |   5   |
        +---------+----------+-------+
    '''

    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _transform_examples_doc = _transform_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc

    # Default options
    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_OneHotEncoder', 'toolkits.feature_engineering._one_hot_encoder',
            'OneHotEncoder', True))

    def __init__(self,
                 features=None,
                 excluded_features=None,
                 max_categories=None,
                 output_column_name='encoded_features'):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(max_categories, [int, type(None)])
        _raise_error_if_not_of_type(output_column_name, [str])

        # Set up options
        opts = {
            'max_categories': max_categories,
            'output_column_name': output_column_name,
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._OneHotEncoder()
        proxy.init_transformer(opts)
        super(OneHotEncoder, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))
        fields = [
            ("Features", _features),
            ("Excluded features", _exclude),
            ("Output column name", 'output_column_name'),
            ("Max categories per column", 'max_categories'),
        ]
        section_titles = ['Model fields']

        return ([fields], section_titles)

    def __repr__(self):
        """
        Return a string description of the model, including a description of
        the training data, training statistics, and model hyper-parameters.

        Returns
        -------
        out : string
            A description of the model.
        """
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, width=30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({'a': [1, 2, 3, 2, 3], 'b': [2, 3, 4, 2, 3]})
        encoder = _gl.feature_engineering.OneHotEncoder(features=['a', 'b'],
                                                        max_categories=2)
        return encoder.fit(sf), sf
from graphlab.toolkits._supervised_learning import Classifier as _Classifier
import graphlab.toolkits._supervised_learning as _sl
import graphlab.toolkits._main as _toolkits_main
from graphlab.toolkits._model_workflow import _collect_model_workflow
from graphlab.toolkits._internal_utils import _toolkit_repr_print
from graphlab.toolkits._supervised_learning import _show_model_tree
from graphlab.toolkits._internal_utils import _raise_error_evaluation_metric_is_valid
from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe
from graphlab.toolkits._internal_utils import _raise_error_if_column_exists
from graphlab.toolkits._internal_utils import _check_categorical_option_type
from graphlab.toolkits._internal_utils import _map_unity_proxy_to_object
from graphlab.toolkits._tree_model_mixin import TreeModelMixin as _TreeModelMixin


get_default_options = _get_default_options_wrapper(
                          'boosted_trees_classifier',
                          'boosted_trees_classifier',
                          'BoostedTreesClassifier')

__doc_string_context = '''
      >>> url = 'http://s3.amazonaws.com/gl-testdata/xgboost/mushroom.csv'
      >>> data = graphlab.SFrame.read_csv(url)

      >>> train, test = data.random_split(0.8)
      >>> model = graphlab.boosted_trees_classifier.create(train, target='label')
'''

class BoostedTreesClassifier(_Classifier, _TreeModelMixin):
    """
    The gradient boosted trees model can be used as a classifier for predictive
    tasks.
import graphlab.toolkits._supervised_learning as _sl
from graphlab.toolkits._supervised_learning import Classifier as _Classifier
from graphlab.toolkits._model import _get_default_options_wrapper
from graphlab.toolkits._internal_utils import _raise_error_if_not_sframe, \
                                            _map_unity_proxy_to_object, \
                                            _toolkit_repr_print, \
                                            _numeric_param_check_range
from graphlab.toolkits._model_workflow import _collect_model_workflow
from graphlab.util import cloudpickle as _cloudpickle

import logging as _logging
from copy import copy as _copy
import six as _six

get_default_options = _get_default_options_wrapper('neuralnet_classifier_v2',
                                                   'neuralnet_classifier',
                                                   'NeuralNetClassifier')

_context_doc_string = '''
>>> data = graphlab.SFrame('https://static.turi.com/datasets/mnist/sframe/train')
>>> training_data, validation_data = data.random_split(0.8)
>>> net = graphlab.deeplearning.get_builtin_neuralnet('mnist')
>>> m = graphlab.neuralnet_classifier.create(training_data,
...                                          target='label',
...                                          network=net,
...                                          max_iterations=3)
'''


class NeuralNetClassifier(_Classifier):
    """
Example #40
0
class TransformToFlatDictionary(Transformer):
    '''
    Transforms column values into dictionaries with flat, non-nested
    string keys and numeric values.  Each output key is the concatenation
    of the keys in the nested containers, joined by `separator`.  For
    example, if ``separator = "."``, then

      {"a" : {"b" : 1}, "c" : 2}

    becomes

      {"a.b" : 1, "c" : 2}.

    - List and vector elements are handled by converting the index of
      the appropriate element to a string, then treating that as the key.

    - String values are handled by treating them as a single
      {"string_value" : 1} pair.

    - None values are handled by replacing them with the
      string contents of `none_tag`.

    - Image and datetime values are currently not supported and raise an
      error.

    
    Parameters
    ----------
    features : list, str
        Name of feature column(s) to be transformed.

    excluded_features : list, str
        Name(s) of feature column(s) to be excluded from the transformation.

    separator : str
        The separator string inserted between the keys of nested dicts or
        lists.

    none_tag : str, optional
        The string used in place of None values in the output dictionaries.

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : TransformToFlatDictionary
        A TransformToFlatDictionary object which is initialized with the defined
        parameters.

    Examples
    --------

    .. sourcecode:: python

        >>> import graphlab as gl

        # Create the data
        >>> sf = gl.SFrame(
            {'values': [{"a" : {"b" : 3}, "c": 2},
                        { "a" : { "b" : 3, "c" : 2.5 }, "c" : 2 },
                        {"a" : [1,2,4] , "c" : 2 },
                        { "a" : "b", "c" : 2 }]})

        # Create a TransformToFlatDictionary transformer object.
        >>> ft = gl.feature_engineering.TransformToFlatDictionary('values')

        # Fit the encoder for a given dataset.
        >>> ft = ft.fit(sf)

        >>> transformed_sf = ft.transform(sf)
        >>> transformed_sf.print_rows(max_column_width=60)
        +----------------------------------------------+
        |                    values                    |
        +----------------------------------------------+
        |              {'c': 2, 'a.b': 3}              |
        |        {'c': 2, 'a.b': 3, 'a.c': 2.5}        |
        | {'c': 2, 'a.0': 1.0, 'a.1': 2.0, 'a.2': 4.0} |
        |              {'c': 2, 'a.b': 1}              |
        +----------------------------------------------+
        [4 rows x 1 columns]
        '''

    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc
    _transform_examples_doc  = _transform_examples_doc

    # Default options
    get_default_options = staticmethod(_get_default_options_wrapper(
            '_TransformToFlatDictionary',
            'toolkits.feature_engineering._transform_to_flat_dictionary',
            'TransformToFlatDictionary', True))

    def __init__(self, features=None, excluded_features=None,
                 separator = ".", none_tag = "__none__",
                 output_column_prefix = None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])
        if output_column_prefix is None:
            output_column_prefix = ''
        
        opts = {
            'separator' : separator,
            'none_tag' : none_tag,
            'output_column_prefix' : output_column_prefix
            }
            
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features
            
        # Initialize object
        proxy = _gl.extensions._TransformToFlatDictionary()
        proxy.init_transformer(opts)
        super(TransformToFlatDictionary, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        _exclude = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('excluded_features')))
        fields = [
            ("Features", _features),
            ("Excluded features", _exclude),
            ("Separator", "separator"),
            ("None Tag", "none_tag"),
            ("Output Column Prefix", 'output_column_prefix')
        ]
            
        section_titles = ['Model fields']
        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, 30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame(
            {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
                      {'this': 1, 'is': 1, 'another': 2, 'example': 3}]})
        encoder = _gl.feature_engineering.TransformToFlatDictionary(
            features=['docs'])
        encoder = encoder.fit(sf)
        return encoder, sf
Example #41
0
from graphlab.toolkits._internal_utils import _toolkit_repr_print, \
                                        _toolkit_get_topk_bottomk, \
                                        _raise_error_evaluation_metric_is_valid, \
                                        _summarize_coefficients
from graphlab.toolkits._model_workflow import _collect_model_workflow
from graphlab.toolkits._model import _get_default_options_wrapper

_DEFAULT_SOLVER_OPTIONS = {
'convergence_threshold': 1e-2,
'max_iterations': 10,
'lbfgs_memory_level': 11,
}


get_default_options = _get_default_options_wrapper(
                          'classifier_svm',
                          'svm_classifier',
                          'SVMClassifier')

def create(dataset, target, features=None,
    penalty=1.0, solver='auto',
    feature_rescaling=True,
    convergence_threshold = _DEFAULT_SOLVER_OPTIONS['convergence_threshold'],
    lbfgs_memory_level = _DEFAULT_SOLVER_OPTIONS['lbfgs_memory_level'],
    max_iterations = _DEFAULT_SOLVER_OPTIONS['max_iterations'],
    class_weights = None,
    validation_set = 'auto',
    verbose=True):
    """
    Create a :class:`~graphlab.svm_classifier.SVMClassifier` to predict the class of a binary
    target variable based on a model of which side of a hyperplane the example
    falls on. In addition to standard numeric and categorical types, features
Example #42
0
class NGramCounter(Transformer):
    '''
    __init__(self, features=None, excluded_features=None,
    n=2, method="word", to_lower=True, ignore_punct=True, ignore_space=True,
    delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " ", \
    "!", "#", "$", "%", "&", "'", "(", ")", \
    "*", "+", ",", "-", ".", "/", ":", ";", \
    "<", "=", ">", "?", "@", "[", "\\\\", "]", \
    "^", "_", "`", "{", "|", "}", "~"], \
    output_column_prefix=None)

    Transform string/dict/list columns of an SFrame into their respective
    bag-of-ngrams representation.

    An ngram is a sequence of n consecutive tokens. NGrams are often used to
    represent natural text. Text ngrams can be word-based or character-based.
    To formulate word-based ngrams, a text string is first tokenized into words.
    An ngram is then a sliding window of n words. For character ngrams, no
    tokenization is necessary, and the sliding window is taken directly over
    accepted characters.

    The output is a dictionary of the number of times each unique ngram
    appears in the text string. This dictionary is a sparse representation:
    most ngrams do not appear in any given sentence, so they have a zero
    count and are not explicitly included in the dictionary.

    NGramCounter can be applied to all the string-, dictionary-, and list-typed
    columns in a given SFrame. Its behavior for each supported input column
    type is as follows. (See :func:`~graphlab.feature_engineering.NGramCounter.transform`
    for usage examples).

    * **string** : By default, all letters are first converted to lower case.
      Then, if computing word ngrams, each string is tokenized by space and
      punctuation characters. (The user can specify a custom delimiter
      list, or use Penn tree-bank style tokenization. See input parameter
      description for details.) If computing character ngrams, then each
      accepted character is understood to be a token. What is accepted is
      determined based on the flags `ignore_punct` and `ignore_space`.
      A dictionary is generated where each key is a sequence of `n` tokens that
      appears in the input text string, and the value is the number of times
      the ngram appears. For example, based on default settings, the string "I
      really like Really fluffy dogs" would generate these 2-gram counts:
      {'i really': 1, 'really like': 1, 'like really': 1, 'really fluffy': 1, 'fluffy dogs': 1}.
      The string "aaa..hhh" would generate these character 2-gram counts:
      {'aa': 2, 'ah': 1, 'hh': 2}.

    * **dict** : Each (key, value) pair is treated as a string-count pair. The
      keys are tokenized according to either word or character tokenization
      methods. Input keys must be strings and input values numeric (integer or
      float). The output dictionary is a sum of the input values for the
      ngrams in the key string. For example, under default settings, the input
      dictionary {'alice bob Bob': 1, 'Alice bob': 2.5} would generate a word
      2-gram dictionary of {'alice bob': 3.5, 'bob bob': 1}.

    * **list** : Each element of the list must be a string, which is tokenized
      according to the input method and tokenization settings, followed by
      ngram counting. The behavior is analogous to that of dict-type input,
      where the count of each list element is taken to be 1. For example, under
      the default settings, an input list of ['alice bob Bob', 'Alice bob']
      generates an output word 2-gram dictionary of {'alice bob': 2, 'bob bob': 1}.

    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then all
        feature columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    n : int, optional
        The number of words in each n-gram. An ``n`` value of 1 returns word
        counts.

    method : {'word', 'character'}, optional
        If "word", the function performs a count of word n-grams. If
        "character", it performs a character n-gram count.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    ignore_punct : bool, optional
        If method is "character", indicates if *punctuations* between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun.games", if this parameter is set to False one
        tri-gram would be 'n.g'. If ``ignore_punct`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    ignore_space : bool, optional
        If method is "character", indicates if *spaces* between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun games", if this parameter is set to False one
        tri-gram would be 'n g'. If ``ignore_space`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    delimiters: list[string], optional
        A list of delimiter characters for tokenization. By default, the list
        is defined to be the list of space and punctuation characters. The
        user can define any custom list of single-character delimiters.
        Alternatively, setting `delimiters=None` will use a Penn treebank type
        tokenization, which is better at handling punctuations. (See reference
        below for details.)

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : NGramCounter
        A NGramCounter feature engineering object which is initialized with
        the defined parameters.

    Notes
    -----
    If the SFrame to be transformed already contains a column with the
    designated output column name, then that column will be replaced with the
    new output. In particular, this means that `output_column_prefix=None` will
    overwrite the original feature columns.

    A bag-of-words representation is essentially an ngram where `n=1`. Larger
    `n` generates more unique ngrams, so the output dictionary will be
    sparser, contain more unique keys, and be more expensive to compute.
    Calling this function with large values of `n` (larger than 3 or 4)
    should be done very carefully.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    See Also
    --------
    graphlab.toolkits.text_analytics.count_ngrams,
    graphlab.toolkits.feature_engineering._ngram_counter.WordCounter,
    graphlab.toolkits.feature_engineering._tfidf.TFIDF,
    graphlab.toolkits.feature_engineering._tokenizer.Tokenizer,
    graphlab.toolkits.feature_engineering.create

    Examples
    --------

    .. sourcecode:: python

        import graphlab as gl

        # Create data.
        >>> sf = gl.SFrame({
        ...    'string': ['sent.ences Sent.ences', 'another sentence'],
        ...    'dict': [{'alice bob': 1, 'Bob alice': 0.5}, {'a dog': 0, 'a dog cat': 5}],
        ...    'list': [['one', 'bar bah'], ['a dog', 'a dog cat']]})

        # Create a NGramCounter transformer.
        >>> from graphlab.toolkits.feature_engineering import NGramCounter
        >>> encoder = NGramCounter()

        # Save the transformer.
        >>> encoder.save('save-path')

        # Fit and transform the data.
        >>> transformed_sf = encoder.fit_transform(sf)
        Columns:
            dict    dict
            list    dict
            string  dict

        Rows: 2

        Data:
        +------------------------------------+----------------------------+
        |                dict                |            list            |
        +------------------------------------+----------------------------+
        | {'bob alice': 0.5, 'alice bob': 1} |       {'bar bah': 1}       |
        |     {'dog cat': 5, 'a dog': 5}     | {'dog cat': 1, 'a dog': 2} |
        +------------------------------------+----------------------------+
        +------------------------------------+
        |               string               |
        +------------------------------------+
        | {'sent ences': 2, 'ences sent': 1} |
        |      {'another sentence': 1}       |
        +------------------------------------+
        [2 rows x 3 columns]

        # Penn treebank-style tokenization (recommended for smarter handling
        #    of punctuations)
        >>> sf = gl.SFrame({'string': ['sentence $$one', 'sentence two...']})
        >>> NGramCounter(delimiters=None).fit_transform(sf)
        Columns:
            string  dict

        Rows: 2

        Data:
        +-------------------------------------------+
        |                   string                  |
        +-------------------------------------------+
        |  {'sentence $': 1, '$ $': 1, '$ one': 1}  |
        | {'sentence two': 1, '. .': 2, 'two .': 1} |
        +-------------------------------------------+
        [2 rows x 1 columns]

        # Character n-grams
        >>> sf = gl.SFrame({'string': ['aa$bb.', ' aa bb  ']})
        >>> NGramCounter(method='character').fit_transform(sf)
        Columns:
            string  dict

        Rows: 2

        Data:
        +-----------------------------+
        |            string           |
        +-----------------------------+
        | {'aa': 1, 'ab': 1, 'bb': 1} |
        | {'aa': 1, 'ab': 1, 'bb': 1} |
        +-----------------------------+
        [2 rows x 1 columns]

        # Character n-grams, not skipping over spaces or punctuations
        >>> sf = gl.SFrame({'string': ['aa$bb.', ' aa bb  ']})
        >>> encoder = NGramCounter(method='character', ignore_punct=False, ignore_space=False)
        >>> encoder.fit_transform(sf)
        Columns:
            string  dict

        Rows: 2
        Data:
        +-----------------------------------------------------------------+
        |                              string                             |
        +-----------------------------------------------------------------+
        |          {'aa': 1, 'b.': 1, '$b': 1, 'a$': 1, 'bb': 1}          |
        | {' b': 1, 'aa': 1, '  ': 1, ' a': 1, 'b ': 1, 'bb': 1, 'a ': 1} |
        +-----------------------------------------------------------------+
        [2 rows x 1 columns]

    '''

    # Doc strings
    _fit_examples_doc = _fit_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc
    _transform_examples_doc  = _transform_examples_doc

    # Default options
    get_default_options = staticmethod(_get_default_options_wrapper(
            '_NGramCounter', 'toolkits.feature_engineering._ngram_counter',
                                                'NGramCounter', True))

    def __init__(self, features=None, excluded_features=None,
        n=2, method="word", to_lower=True, ignore_punct=True, ignore_space=True,
        delimiters=["\r", "\v", "\n", "\f", "\t", " ",
                    "!", "#", "$", "%", "&", "'", "(", ")",
                    "*", "+", ",", "-", ".", "/", ":", ";",
                    "<", "=", ">", "?", "@", "[", "\\", "]",
                    "^", "_", "`", "{", "|", "}", "~"],
        output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(features, [list, str, _NoneType])
        _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
        _raise_error_if_not_of_type(n, [int])
        _raise_error_if_not_of_type(method, [str])
        _raise_error_if_not_of_type(to_lower, [bool])
        _raise_error_if_not_of_type(ignore_punct, [bool])
        _raise_error_if_not_of_type(ignore_space, [bool])
        _raise_error_if_not_of_type(delimiters, [list, _NoneType])
        _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

        if delimiters is not None:
            for delim in delimiters:
                _raise_error_if_not_of_type(delim, str, "delimiters")
                if (len(delim) != 1):
                    raise ValueError("Delimiters must be single-character strings")

        if n < 1:
            raise ValueError("Input 'n' must be greater than 0")

        if n > 5 and method == 'word':
            warnings.warn("It is unusual for n-grams to be of size larger than 5.")

        if method != "word" and method != "character":
            raise ValueError("Invalid 'method' value. Please input " +
                             "either 'word' or 'character'.")

        # Set up options
        opts = {
          'n': n,
          'features': features,
          'ngram_type': method,
          'to_lower': to_lower,
          'ignore_punct': ignore_punct,
          'ignore_space': ignore_space,
          'delimiters': delimiters,
          'output_column_prefix' : output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._NGramCounter()
        proxy.init_transformer(opts)
        super(NGramCounter, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))
        fields = [
            ("NGram length", 'n'),
            ("NGram type (word or character)", 'ngram_type'),
            ("Convert strings to lower case", 'to_lower'),
            ("Ignore punctuation in character ngram", 'ignore_punct'),
            ("Ignore space in character ngram", 'ignore_space'),
            ("Delimiters", "delimiters"),
            ("Features", _features),
            ("Output column prefix", 'output_column_prefix')
        ]
        section_titles = ['Model fields']
        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, 30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame(
            {'docs': [{'this': 1, 'is': 1, 'a': 2, 'sample': 1},
                      {'this': 1, 'is': 1, 'another': 2, 'example': 3}]})
        encoder = _gl.feature_engineering.NGramCounter('docs')
        encoder = encoder.fit(sf)
        return encoder, sf
Example #43
0
        >>> sf = graphlab.SFrame({'a' : [0.1, 8, 3.5], 'b':[-3, 7.6, 3]})
        >>> model = graphlab.kmeans.create(sf, 2)
        >>> model.get_current_options()
        {'num_clusters': 2, 'max_iterations': 10}
        """

        _mt._get_metric_tracker().track('toolkit.kmeans.get_current_options')

        opts = {'model': self.__proxy__, 'model_name': self.__name__}

        return _graphlab.toolkits._main.run(
            'kmeans_get_current_options', opts)


get_default_options = _get_default_options_wrapper(
                          'kmeans',
                          'kmeans',
                          'KmeansModel')


def create(dataset, num_clusters=None, features=None, initial_centers=None,
           max_iterations=10, batch_size=None, verbose=True):
    r"""
    Run the k-means++ clustering algorithm, returning a KmeansModel object
    that contains the cluster centers and the cluster assignment for
    each data point in the dataset.

    Given a number of clusters, k-means++ iteratively chooses the best cluster
    centers and assigns nearby points to the best cluster. If no points change
    cluster membership between iterations, the algorithm terminates.

    Parameters
Example #44
0
class Tokenizer(Transformer):
    '''
    __init__(features=None, excluded_features=None, 
        to_lower=False, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "],
        output_column_prefix=None)

    Tokenizing is a method of breaking natural language text into its smallest
    standalone and meaningful components (in English, usually space-delimited
    words, but not always).

    By default, Tokenizer tokenizes strings by space characters. The user may 
    specify a customized list of delimiters, or use Penn treebank-style 
    tokenization. 

    .. warning:: 
        The default tokenization setting is now different from that of 
        GraphLab Create v1.6. The old default was Penn treebank-style 
        tokenization. (This is still available by setting `delimiters=None`.) 
        The current default is to tokenize by space characters. 

    Parameters
    ----------
    features : list[str] | str | None, optional
        Name(s) of feature column(s) to be transformed. If set to None, then all
        feature columns are used.

    excluded_features : list[str] | str | None, optional
        Name(s) of feature columns in the input dataset to be ignored. Either
        `excluded_features` or `features` can be passed, but not both.

    to_lower : bool, optional
        Indicates whether to map the input strings to lower case before
        tokenizing.

    delimiters: list[string], optional
        A list of delimiter characters for tokenization. By default, the list 
        is defined to be the list of space characters. The user can define 
        any custom list of single-character delimiters. Alternatively, setting
        `delimiters=None` will use a Penn treebank-style tokenization that 
        separates individual punctuation marks and detects positive and negative
        real numbers, phone numbers with no spaces, urls, and emails. The 
        Penn treebank-style tokenization also attempts to separate contractions
        and possessives. For instance, "don't" would be tokenized as 
        ["do", "n\'t"].

    output_column_prefix : str, optional
        The prefix to use for the column name of each transformed column.
        When provided, the transformation will add columns to the input data,
        where the new name is "`output_column_prefix`.original_column_name".
        If `output_column_prefix=None` (default), then the output column name
        is the same as the original feature column name.

    Returns
    -------
    out : Tokenizer
        A Tokenizer object which is initialized with the defined parameters.

    Notes
    -----
    This implementation of Tokenizer applies regular expressions to the natural
    language text to capture a high-recall set of valid text patterns.

    If the SFrame to be transformed already contains a column with the 
    designated output column name, then that column will be replaced with the 
    new output. In particular, this means that `output_column_prefix=None` will
    overwrite the original feature columns.

    References
    ----------
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    See Also
    --------
    graphlab.toolkits.text_analytics.tokenize,
    graphlab.toolkits.feature_engineering._word_counter.WordCounter,
    graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter,
    graphlab.toolkits.feature_engineering._tfidf.TFIDF,
    graphlab.toolkits.feature_engineering.create

    Examples
    --------

    .. sourcecode:: python

        >>> import graphlab
        >>> from graphlab.toolkits.feature_engineering import *

        # Create a sample dataset
        >>> sf = graphlab.SFrame({
        ...    'docs': ["This is a document!", "This one's also a document."]})

        # Construct a tokenizer with default options.
        >>> tokenizer = Tokenizer()

        # Transform the data using the tokenizer.
        >>> tokenized_sf = tokenizer.fit_transform(sf)
        >>> tokenized_sf
        Columns:
            docs    list

        Rows: 2

        Data:
        +-----------------------------------+
        |                docs               |
        +-----------------------------------+
        |      [This, is, a, document!]     |
        | [This, one's, also, a, document.] |
        +-----------------------------------+
        [2 rows x 1 columns]

        # Convert to lower case and use Penn treebank-style tokenization.
        >>> ptb_tokenizer = Tokenizer(to_lower=True, delimiters=None)
        >>> tokenized_sf = ptb_tokenizer.fit_transform(sf)
        >>> tokenized_sf
        Columns:
            docs    list

        Rows: 2

        Data:
        +---------------------------------------+
        |                  docs                 |
        +---------------------------------------+
        |       [this, is, a, document, !]      |
        | [this, one, 's, also, a, document, .] |
        +---------------------------------------+
        [2 rows x 1 columns]

        # Tokenize only a single column 'docs'.
        >>> tokenizer = Tokenizer(features = ['docs'])
        >>> tokenizer['features']
        ['docs']

        # Tokenize all columns except 'docs'.
        >>> tokenizer = Tokenizer(excluded_features = ['docs'])
        >>> tokenizer['features']  # `features` are set to `None`


    '''

    _fit_examples_doc = _fit_examples_doc
    _transform_examples_doc = _transform_examples_doc
    _fit_transform_examples_doc = _fit_transform_examples_doc

    get_default_options = staticmethod(
        _get_default_options_wrapper(
            '_Tokenizer', 'toolkits.feature_engineering._tokenizer',
            'Tokenizer', True))

    def __init__(self,
                 features=None,
                 excluded_features=None,
                 to_lower=False,
                 delimiters=["\r", "\v", "\n", "\f", "\t", " "],
                 output_column_prefix=None):

        # Process and make a copy of the features, exclude.
        _features, _exclude = _internal_utils.process_features(
            features, excluded_features)

        # Type checking
        _raise_error_if_not_of_type(features, [list, str, _NoneType])
        _raise_error_if_not_of_type(excluded_features, [list, str, _NoneType])
        _raise_error_if_not_of_type(to_lower, [bool])
        _raise_error_if_not_of_type(delimiters, [list, _NoneType])
        _raise_error_if_not_of_type(output_column_prefix, [str, _NoneType])

        if delimiters is not None:
            for delim in delimiters:
                _raise_error_if_not_of_type(delim, str, "delimiters")
                if (len(delim) != 1):
                    raise ValueError(
                        "Delimiters must be single-character strings")

        # Set up options
        opts = {
            'features': features,
            'to_lower': to_lower,
            'delimiters': delimiters,
            'output_column_prefix': output_column_prefix
        }
        if _exclude:
            opts['exclude'] = True
            opts['features'] = _exclude
        else:
            opts['exclude'] = False
            opts['features'] = _features

        # Initialize object
        proxy = _gl.extensions._Tokenizer()
        proxy.init_transformer(opts)
        super(Tokenizer, self).__init__(proxy, self.__class__)

    def _get_summary_struct(self):
        """
        Returns a structured description of the model, including (where relevant)
        the schema of the training data, description of the training data,
        training statistics, and model hyperparameters.

        Returns
        -------
        sections : list (of list of tuples)
            A list of summary sections.
              Each section is a list.
                Each item in a section list is a tuple of the form:
                  ('<label>','<field>')
        section_titles: list
            A list of section titles.
              The order matches that of the 'sections' object.
        """
        _features = _precomputed_field(
            _internal_utils.pretty_print_list(self.get('features')))

        fields = [("Features", _features),
                  ("Convert strings to lower case", 'to_lower'),
                  ("Delimiters", "delimiters"),
                  ("Output column prefix", 'output_column_prefix')]
        section_titles = ['Model fields']

        return ([fields], section_titles)

    def __repr__(self):
        (sections, section_titles) = self._get_summary_struct()
        return _toolkit_repr_print(self, sections, section_titles, width=30)

    @classmethod
    def _get_instance_and_data(cls):
        sf = _gl.SFrame({'docs': ["this is a test", "this is another test"]})
        encoder = _gl.feature_engineering.Tokenizer('docs')
        return encoder.fit(sf), sf
    opts = {'dataset': observation_data,
            'user_id': user_id,
            'item_id': item_id,
            'target': target,
            'user_data': user_data,
            'item_data': item_data,
            'nearest_items': _graphlab.SFrame(),
            'model': model_proxy,
            'random_seed': 1}

    response = _graphlab.toolkits._main.run('recsys_train', opts, verbose)
    return PopularityRecommender(response['model'])


get_default_options = _get_default_options_wrapper(
                          'popularity', 
                          'recommender.popularity_recommender', 
                          'PopularityRecommender')


class PopularityRecommender(_Recommender):
    """
    The Popularity Model ranks an item according to its overall popularity.

    When making recommendations, the items are scored by the number of times
    they are seen in the training set. The item scores are the same for all
    users, so the recommendations are not tailored to individuals.
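
    A minimal usage sketch (the data and column names are illustrative
    assumptions; as with the other recommenders, the model is built with the
    module-level create function rather than constructed directly):

    .. sourcecode:: python

        >>> sf = graphlab.SFrame({'user_id': ['u1', 'u2', 'u2'],
        ...                       'item_id': ['i1', 'i1', 'i2']})
        >>> m = graphlab.recommender.popularity_recommender.create(
        ...         sf, user_id='user_id', item_id='item_id')
        >>> m.recommend()   # item scores are the same for every user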

    The Popularity Recommender is simple and fast and provides a reasonable baseline.
    It can work well when observation data is sparse. It can be used as a
    "background" model for new users.
_BOOSTED_TREES_MODEL_PARAMS_KEYS = [
    'step_size', 'max_depth', 'max_iterations', 'min_child_weight',
    'min_loss_reduction', 'row_subsample'
]
_BOOSTED_TREE_TRAINING_PARAMS_KEYS = [
    'objective', 'training_time', 'training_error', 'validation_error',
    'evaluation_metric'
]
_BOOSTED_TREE_TRAINING_DATA_PARAMS_KEYS = [
    'target', 'features', 'num_features', 'num_examples',
    'num_validation_examples'
]

get_default_options = _get_default_options_wrapper('boosted_trees_regression',
                                                   'boosted_trees_regression',
                                                   'BoostedTreesRegression')


class BoostedTreesRegression(_SupervisedLearningModel, _TreeModelMixin):
    """
    Encapsulates gradient boosted trees for regression tasks.

    The prediction is based on a collection of base learners, `regression trees
    <http://en.wikipedia.org/wiki/Decision_tree_learning>`_.


    Unlike linear models such as linear regression, the gradient boosted
    trees model is able to model non-linear interactions between the
    features and the target using decision trees as the subroutine.
    It is good for handling numerical features and categorical features with
# Python 2/3 compatibility: use the built-in zip/range on Python 3 and
# itertools.izip/xrange on Python 2.
if _sys.version_info.major == 3:
    _izip = zip
    _xrange = range
else:
    from itertools import izip as _izip
    _xrange = xrange


import operator as _operator
import array as _array
from graphlab.toolkits._model import _get_default_options_wrapper



get_default_options = _get_default_options_wrapper(
                          'cgs_topic_model',
                          'topic_model',
                          'TopicModel')

def create(dataset,
           num_topics=10,
           initial_topics=None,
           alpha=None,
           beta=.1,
           num_iterations=10,
           num_burnin=5,
           associations=None,
           verbose=False,
           print_interval=10,
           validation_set=None,
           method='auto'):
    """
Example #48
0
from graphlab.toolkits.text_analytics._util import _check_input
from graphlab.toolkits.text_analytics._util import random_split as _random_split
from graphlab.toolkits._internal_utils import (
    _check_categorical_option_type,
    _map_unity_proxy_to_object,
    _precomputed_field,
    _toolkit_repr_print,
)
from graphlab.toolkits._model_workflow import _collect_model_workflow

from itertools import izip as _izip
import array as _array
from graphlab.toolkits._model import _get_default_options_wrapper


get_default_options = _get_default_options_wrapper("cgs_topic_model", "topic_model", "TopicModel")


def create(
    dataset,
    num_topics=10,
    initial_topics=None,
    alpha=None,
    beta=0.1,
    num_iterations=10,
    num_burnin=5,
    associations=None,
    verbose=False,
    print_interval=10,
    validation_set=None,
    method="auto",