def get_libodbc_path():
    """
    Return the first path that GraphLab Create will search for libodbc.so.

    The value is read from the ``GRAPHLAB_LIBODBC_PREFIX`` entry of the
    runtime configuration.
    """
    return gl.get_runtime_config()['GRAPHLAB_LIBODBC_PREFIX']
def create(data=None, target=None, features=None, method='auto',
           validation_set=None):
    """
    Build a sentiment analysis model for a collection of documents.

    If ``target`` is given, a classifier is trained on ``data``. If it is
    omitted, a pretrained model is loaded instead. The pretrained model was
    trained on a large collection of product review data from both Amazon
    and Yelp datasets. Predicted scores are between 0 and 1, where higher
    scores indicate more positive predicted sentiment.

    The model is a :class:`~graphlab.logistic_classifier.LogisticClassifier`
    model trained using a bag-of-words representation of the text data,
    using ratings less than 3 as negative sentiment and ratings of more than
    3 as positive sentiment.

    Parameters
    ----------
    data: SFrame, optional
      Contains at least one column that contains the text data of interest.
      This can be unstructured text data, such as that appearing in forums,
      user-generated reviews, and so on. This is not required when using a
      pre-trained model.

    target: str, optional
      The column name containing numeric sentiment scores for each document.
      If provided, a sentiment model will be trained for the provided data
      set. If not provided, a pre-trained model will be used.

    features: list of str, optional
      The column names of interest containing text data. Each provided column
      must be str type. When training without an explicit list, every column
      of ``data`` other than the target column is used.

    method: str, optional
      Method to use for feature engineering and modeling. Currently only
      bag-of-words and logistic classifier ('bow-logistic') is available;
      'auto' selects it.

    validation_set : SFrame, optional
      A dataset for monitoring the model's generalization performance.
      This is ignored if no value is provided to the `target` argument.

    Returns
    -------
    out : :class:`~SentimentAnalysisModel`

    Raises
    ------
    ValueError
      If ``method`` is not supported, or if ``target`` is given without
      ``data``.

    Examples
    --------
    You can train a sentiment analysis classifier on text data when you have
    ratings data available.

    >>> import graphlab as gl
    >>> data = gl.SFrame({'rating': [1, 5], 'text': ['hate it', 'love it']})
    >>> m = gl.sentiment_analysis.create(data, 'rating', features=['text'])
    >>> m.predict_row({'text': 'really love it'})
    >>> m.predict_row({'text': 'really hate it'})

    If you do not have ratings data, we provide a pretrained model for you to
    use as a starting point.

    >>> m = gl.sentiment_analysis.create(data, features=['text'])
    >>> m.predict(data)

    You may also evaluate predictions against known sentiment scores.

    >>> m.evaluate(data)
    """
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    log = _logging.getLogger(__name__)

    # Resolve 'auto' to the only supported method; reject anything else.
    if method == 'auto':
        method = 'bow-logistic'
    elif method != 'bow-logistic':
        raise ValueError("Unsupported method provided.")

    if target is not None:
        # Training path: fit a fresh classifier on the caller's data.
        if data is None:
            raise ValueError(
                "The data argument is required when a target column is provided."
            )

        # Fall back to every column, then make sure the target column is
        # never treated as a feature.
        if features is None:
            features = data.column_names()
        features = [name for name in features if name != target]

        # The raw target is mapped onto an internal label column.
        label_column = 'like'
        feature_extractor = _default_feature_extractor
        training_data = feature_extractor(
            _transform_with_target(data, target, label_column))

        # The validation set goes through the same preparation as the
        # training set.
        extra_args = {}
        if validation_set is not None:
            prepared = _transform_with_target(validation_set, target,
                                              label_column)
            extra_args['validation_set'] = feature_extractor(prepared)

        classifier = _gl.logistic_classifier.create(training_data,
                                                    target=label_column,
                                                    features=features,
                                                    l2_penalty=.2,
                                                    **extra_args)
        row_count = data.num_rows()
    else:
        # Pretrained path. Model names follow the [name]/[version] format.
        pretrained_name = 'sentiment-combined/1'

        # Cached copies live under [tmp dir]/model_cache/.
        cache_root = _gl.get_runtime_config()['GRAPHLAB_CACHE_FILE_LOCATIONS']
        local_copy = _os.path.join(cache_root, 'model_cache', pretrained_name)
        remote_copy = 'https://static.turi.com/products/graphlab-create/resources/models/python2.7/sentiment-analysis/' + pretrained_name

        feature_extractor = _feature_extractor_for_pretrained
        if _os.path.exists(local_copy):
            classifier = _gl.load_model(local_copy)
        else:
            # First use: fetch the model and keep a local copy for next time.
            log.info('Downloading pretrained model...')
            classifier = _gl.load_model(remote_copy)
            classifier.save(local_copy)
        row_count = 0

    model = SentimentAnalysisModel()
    state = {
        'target': target,
        'features': features,
        'method': method,
        'num_rows': row_count,
        'feature_extractor': feature_extractor,
        'classifier': classifier,
    }
    model.__proxy__.update(state)
    return model
def create(data, target=None, features=None, method='auto',
           validation_set=None):
    """
    Build a sentiment analysis model for a collection of documents.

    If ``target`` is given, a classifier is trained on ``data``. If it is
    omitted, a pretrained model is loaded instead. The pretrained model was
    trained on a large collection of product review data from both Amazon
    and Yelp datasets. Predicted scores are between 0 and 1, where higher
    scores indicate more positive predicted sentiment.

    The model is a :class:`~graphlab.logistic_classifier.LogisticClassifier`
    model trained using a bag-of-words representation of the text data,
    using ratings less than 3 as negative sentiment and ratings of more than
    3 as positive sentiment.

    .. warning:: This toolkit is currently in beta, and feedback is welcome!

    Parameters
    ----------
    data: SFrame
      Contains at least one column that contains the text data of interest.
      This can be unstructured text data, such as that appearing in forums,
      user-generated reviews, and so on.

    target: str, optional
      The column name containing numeric sentiment scores for each document.
      If provided, a sentiment model will be trained for this data set. If
      not provided, a pre-trained model will be used.

    features: list of str, optional
      The column names of interest containing text data. Each provided column
      must be str type. Defaults to all columns of ``data``; when training,
      the target column is removed from the list.

    method: str, optional
      Method to use for feature engineering and modeling. Currently only
      bag-of-words and logistic classifier ('bow-logistic') is available;
      'auto' selects it.

    validation_set : SFrame, optional
      A dataset for monitoring the model's generalization performance.
      This is ignored if no value is provided to the `target` argument.

    Returns
    -------
    out : :class:`~SentimentAnalysisModel`

    Raises
    ------
    ValueError
      If ``method`` is not supported.

    Examples
    --------
    You can train a sentiment analysis classifier on text data when you have
    ratings data available.

    >>> import graphlab as gl
    >>> data = gl.SFrame({'rating': [1, 5], 'text': ['hate it', 'love it']})
    >>> m = gl.sentiment_analysis.create(data, 'rating', features=['text'])
    >>> m.predict_row({'text': 'really love it'})
    >>> m.predict_row({'text': 'really hate it'})

    If you do not have ratings data, we provide a pretrained model for you to
    use as a starting point.

    >>> m = gl.sentiment_analysis.create(data, features=['text'])
    >>> m.predict(data)

    You may also evaluate predictions against known sentiment scores.

    >>> m.evaluate(data)
    """
    _mt._get_metric_tracker().track('{}.create'.format(__name__))
    log = _logging.getLogger(__name__)

    # Resolve 'auto' to the only supported method; reject anything else.
    if method == 'auto':
        method = 'bow-logistic'
    elif method != 'bow-logistic':
        raise ValueError("Unsupported method provided.")

    # With no explicit feature list, start from every column of the data.
    if features is None:
        features = data.column_names()

    if target is not None:
        # Training path: the target column must never be used as a feature.
        features = [name for name in features if name != target]

        # The raw target is mapped onto an internal label column.
        label_column = 'like'
        feature_extractor = _default_feature_extractor
        training_data = feature_extractor(
            _transform_with_target(data, target, label_column))

        # The validation set goes through the same preparation as the
        # training set.
        extra_args = {}
        if validation_set is not None:
            prepared = _transform_with_target(validation_set, target,
                                              label_column)
            extra_args['validation_set'] = feature_extractor(prepared)

        classifier = _gl.logistic_classifier.create(training_data,
                                                    target=label_column,
                                                    features=features,
                                                    l2_penalty=.2,
                                                    **extra_args)
    else:
        # Pretrained path. Model names follow the [name]/[version] format.
        pretrained_name = 'sentiment-combined/1'

        # Cached copies live under [tmp dir]/model_cache/.
        cache_root = _gl.get_runtime_config()['GRAPHLAB_CACHE_FILE_LOCATIONS']
        local_copy = _os.path.join(cache_root, 'model_cache', pretrained_name)
        remote_copy = 's3://dato-models/sentiment-analysis/' + pretrained_name

        feature_extractor = _feature_extractor_for_pretrained
        if _os.path.exists(local_copy):
            classifier = _gl.load_model(local_copy)
        else:
            # First use: fetch the model and keep a local copy for next time.
            log.info('Downloading pretrained model...')
            classifier = _gl.load_model(remote_copy)
            classifier.save(local_copy)

    model = SentimentAnalysisModel()
    state = {
        'target': target,
        'features': features,
        'method': method,
        'num_rows': data.num_rows(),
        'feature_extractor': feature_extractor,
        'classifier': classifier,
    }
    model._state.update(state)
    return model