def get_libodbc_path():
    """
    Return the first path that GraphLab Create will search for libodbc.so.

    The value is read from the ``GRAPHLAB_LIBODBC_PREFIX`` entry of the
    runtime configuration returned by ``gl.get_runtime_config()``.
    """
    return gl.get_runtime_config()['GRAPHLAB_LIBODBC_PREFIX']
Example #2
0
def create(data=None,
           target=None,
           features=None,
           method='auto',
           validation_set=None):
    """
    Build a sentiment analysis model for a collection of documents.

    Two modes are supported, selected by the `target` argument:

    * `target` given: a logistic classifier is trained on `data`.
    * `target` omitted: a pretrained model is loaded (downloaded on first
      use and then cached locally).

    The pretrained model was trained on a large collection of product review
    data from both Amazon and Yelp datasets. Predicted scores are between 0
    and 1, where higher scores indicate more positive predicted sentiment.
    It is a :class:`~graphlab.logistic_classifier.LogisticClassifier` model
    trained using a bag-of-words representation of the text data, using
    ratings less than 3 as negative sentiment and ratings of more than 3 as
    positive sentiment.

    Parameters
    ----------
    data: SFrame, optional
      Contains at least one column that contains the text data of interest.
      This can be unstructured text data, such as that appearing in forums,
      user-generated reviews, and so on. Not required when the pretrained
      model is used; required whenever `target` is provided.

    target: str, optional
      The column name containing numeric sentiment scores for each document.
      If provided, a sentiment model will be trained for the provided data
      set. If not provided, a pre-trained model will be used.

    features: list of str, optional
      The column names of interest containing text data. When training and
      no value is given, every column of `data` is used. The `target` column
      is always removed from the list when training.

    method: str, optional
      Method to use for feature engineering and modeling. Currently only
      bag-of-words and logistic classifier ('bow-logistic') is available;
      'auto' selects it.

    validation_set : SFrame, optional
      A dataset for monitoring the model's generalization performance.
      This is ignored if no value is provided to the `target` argument.

    Returns
    -------
    out : :class:`~SentimentAnalysisModel`

    Raises
    ------
    ValueError
      If `method` is neither 'auto' nor 'bow-logistic', or if `target` is
      provided while `data` is None.

    Examples
    --------

    Train a sentiment analysis classifier on text data when ratings data
    is available.

    >>> import graphlab as gl
    >>> data = gl.SFrame({'rating': [1, 5], 'text': ['hate it', 'love it']})
    >>> m = gl.sentiment_analysis.create(data, 'rating', features=['text'])
    >>> m.predict_row({'text': 'really love it'})
    >>> m.predict_row({'text': 'really hate it'})

    Without ratings data, the pretrained model can be used as a starting
    point.

    >>> m = gl.sentiment_analysis.create(data, features=['text'])
    >>> m.predict(data)

    Predictions may also be evaluated against known sentiment scores.

    >>> m.evaluate(data)
    """
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    logger = _logging.getLogger(__name__)

    # 'auto' is an alias for the single supported method.
    if method == 'auto':
        method = 'bow-logistic'
    elif method != 'bow-logistic':
        raise ValueError("Unsupported method provided.")

    if target is not None:
        # Training mode: a classifier is fit on the caller's data.
        if data is None:
            raise ValueError(
                "The data argument is required when a target column is provided."
            )

        # Start from every column unless told otherwise, and never treat the
        # target column as a feature.
        candidates = data.column_names() if features is None else features
        features = [name for name in candidates if name != target]

        # The target column is transformed into a column named 'like'.
        label_column = 'like'
        labeled = _transform_with_target(data, target, label_column)

        feature_extractor = _default_feature_extractor
        train = feature_extractor(labeled)

        # The validation set goes through the same two steps as the
        # training set before being handed to the classifier.
        extra_args = {}
        if validation_set is not None:
            labeled_validation = _transform_with_target(
                validation_set, target, label_column)
            extra_args['validation_set'] = feature_extractor(
                labeled_validation)

        classifier = _gl.logistic_classifier.create(
            train,
            target=label_column,
            features=features,
            l2_penalty=.2,
            **extra_args)
        num_rows = data.num_rows()

    else:
        # Pretrained mode. Model names follow the format [name]/[version].
        model_name = 'sentiment-combined/1'

        # The model is cached under [tmp dir]/model_cache/.
        cache_root = _gl.get_runtime_config()['GRAPHLAB_CACHE_FILE_LOCATIONS']
        model_local_path = _os.path.join(cache_root, 'model_cache', model_name)
        model_remote_path = 'https://static.turi.com/products/graphlab-create/resources/models/python2.7/sentiment-analysis/' + model_name

        feature_extractor = _feature_extractor_for_pretrained
        if _os.path.exists(model_local_path):
            classifier = _gl.load_model(model_local_path)
        else:
            # First use: fetch the remote copy and save it to the cache.
            logger.info('Downloading pretrained model...')
            classifier = _gl.load_model(model_remote_path)
            classifier.save(model_local_path)

        # No training rows are associated with the pretrained model.
        num_rows = 0

    model = SentimentAnalysisModel()
    state = {
        'target': target,
        'features': features,
        'method': method,
        'num_rows': num_rows,
        'feature_extractor': feature_extractor,
        'classifier': classifier,
    }
    model.__proxy__.update(state)
    return model
Example #3
0
def get_libodbc_path():
    """
    Return the first path that GraphLab Create will search for libodbc.so.

    The value is read from the ``GRAPHLAB_LIBODBC_PREFIX`` entry of the
    runtime configuration returned by ``gl.get_runtime_config()``.
    """
    return gl.get_runtime_config()['GRAPHLAB_LIBODBC_PREFIX']
def create(data, target=None, features=None, method='auto', validation_set=None):
    """
    Build a sentiment analysis model for a collection of documents.

    Two modes are supported, selected by the `target` argument:

    * `target` given: a logistic classifier is trained on `data`.
    * `target` omitted: a pretrained model is loaded (downloaded on first
      use and then cached locally).

    The pretrained model was trained on a large collection of product review
    data from both Amazon and Yelp datasets. Predicted scores are between 0
    and 1, where higher scores indicate more positive predicted sentiment.
    It is a :class:`~graphlab.logistic_classifier.LogisticClassifier` model
    trained using a bag-of-words representation of the text data, using
    ratings less than 3 as negative sentiment and ratings of more than 3 as
    positive sentiment.

    .. warning::
        This toolkit is currently in beta, and feedback is welcome!
        (The contact address appears obfuscated as "[email protected]"
        in this copy of the source.)

    Parameters
    ----------
    data: SFrame
      Contains at least one column that contains the text data of interest.
      This can be unstructured text data, such as that appearing in forums,
      user-generated reviews, and so on. Its row count is recorded on the
      returned model in both modes.

    target: str, optional
      The column name containing numeric sentiment scores for each document.
      If provided, a sentiment model will be trained for this data set.
      If not provided, a pre-trained model will be used.

    features: list of str, optional
      The column names of interest containing text data. When no value is
      given, every column of `data` is used. The `target` column is removed
      from the list when training.

    method: str, optional
      Method to use for feature engineering and modeling. Currently only
      bag-of-words and logistic classifier ('bow-logistic') is available;
      'auto' selects it.

    validation_set : SFrame, optional
      A dataset for monitoring the model's generalization performance.
      This is ignored if no value is provided to the `target` argument.

    Returns
    -------
    out : :class:`~SentimentAnalysisModel`

    Raises
    ------
    ValueError
      If `method` is neither 'auto' nor 'bow-logistic'.

    Examples
    --------

    Train a sentiment analysis classifier on text data when ratings data
    is available.

    >>> import graphlab as gl
    >>> data = gl.SFrame({'rating': [1, 5], 'text': ['hate it', 'love it']})
    >>> m = gl.sentiment_analysis.create(data, 'rating', features=['text'])
    >>> m.predict_row({'text': 'really love it'})
    >>> m.predict_row({'text': 'really hate it'})

    Without ratings data, the pretrained model can be used as a starting
    point.

    >>> m = gl.sentiment_analysis.create(data, features=['text'])
    >>> m.predict(data)

    Predictions may also be evaluated against known sentiment scores.

    >>> m.evaluate(data)
    """
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    logger = _logging.getLogger(__name__)

    # 'auto' is an alias for the single supported method.
    if method == 'auto':
        method = 'bow-logistic'
    elif method != 'bow-logistic':
        raise ValueError("Unsupported method provided.")

    # With no explicit feature list, fall back to every column of the data.
    if features is None:
        features = data.column_names()

    if target is not None:
        # Training mode: never treat the target column as a feature.
        features = [name for name in features if name != target]

        # The target column is transformed into a column named 'like'.
        label_column = 'like'
        labeled = _transform_with_target(data, target, label_column)

        feature_extractor = _default_feature_extractor
        train = feature_extractor(labeled)

        # The validation set goes through the same two steps as the
        # training set before being handed to the classifier.
        extra_args = {}
        if validation_set is not None:
            labeled_validation = _transform_with_target(
                validation_set, target, label_column)
            extra_args['validation_set'] = feature_extractor(
                labeled_validation)

        classifier = _gl.logistic_classifier.create(
            train,
            target=label_column,
            features=features,
            l2_penalty=.2,
            **extra_args)
    else:
        # Pretrained mode. Model names follow the format [name]/[version].
        model_name = 'sentiment-combined/1'

        # The model is cached under [tmp dir]/model_cache/.
        cache_root = _gl.get_runtime_config()['GRAPHLAB_CACHE_FILE_LOCATIONS']
        model_local_path = _os.path.join(cache_root, 'model_cache', model_name)
        model_remote_path = 's3://dato-models/sentiment-analysis/' + model_name

        feature_extractor = _feature_extractor_for_pretrained
        if _os.path.exists(model_local_path):
            classifier = _gl.load_model(model_local_path)
        else:
            # First use: fetch the remote copy and save it to the cache.
            logger.info('Downloading pretrained model...')
            classifier = _gl.load_model(model_remote_path)
            classifier.save(model_local_path)

    model = SentimentAnalysisModel()
    model._state.update({
        'target': target,
        'features': features,
        'method': method,
        'num_rows': data.num_rows(),
        'feature_extractor': feature_extractor,
        'classifier': classifier,
    })
    return model